Add namespace aware DOM/SAX parsing for XML Sitemaps (#176)

* Add namespace aware DOM/SAX parsing for XML Sitemaps. RSS and Atom parsing is also namespace aware, but finding elements is left "relaxed" by only matching on the element "localName".
* Lenient namespacing in non strict mode + applied formatting
* Introduced separate field strictNamespace to sitemapparsers + added test to saxparser
* Fixes Javadoc
* Fixes the fix for the Javadoc
* Allow to set strictNamespace in SiteMapTester
- Fix strict namespace handling in SiteMapParserSAX:
  - pass strictNamespace from DelegatorHandler to delegates
  - ignore text if inside an element of invalid namespace
  - use SAX parser in unit test
  - set exception and pass it to calling DelegatorHandler if namespace does not match
2024-05-20 18:36:03 +02:00 · 2017-10-17 10:47:17 +01:00 · 2017-10-17 10:47:17 +01:00 · 6adb771b72
parent 5e60792a0b
commit 6adb771b72
15 changed files with 369 additions and 71 deletions
--- a/src/main/java/crawlercommons/domains/EffectiveTldFinder.java
+++ b/src/main/java/crawlercommons/domains/EffectiveTldFinder.java
@ -79,10 +79,9 @@ public class EffectiveTldFinder {
        domains = new HashMap<>();
        try {
            if (null == effectiveTldDataStream && null != this.getClass().getResource(ETLD_DATA)) {
-              effectiveTldDataStream = this.getClass().getResourceAsStream(ETLD_DATA);
+                effectiveTldDataStream = this.getClass().getResourceAsStream(ETLD_DATA);
            }
-            BufferedReader input = new BufferedReader(new InputStreamReader(
-                    effectiveTldDataStream, StandardCharsets.UTF_8));
+            BufferedReader input = new BufferedReader(new InputStreamReader(effectiveTldDataStream, StandardCharsets.UTF_8));
            String line = null;
            while (null != (line = input.readLine())) {
                if (line.length() == 0 || (line.length() > 1 && line.startsWith(COMMENT))) {
--- a/src/main/java/crawlercommons/sitemaps/AbstractSiteMap.java
+++ b/src/main/java/crawlercommons/sitemaps/AbstractSiteMap.java
@ -67,12 +67,8 @@ public abstract class AbstractSiteMap {
    private static final ThreadLocal<DateFormat[]> RSS_DATE_FORMATS = new ThreadLocal<DateFormat[]>() {
        @Override
        protected DateFormat[] initialValue() {
-            return new DateFormat[] {
-                    new SimpleDateFormat("EEE, dd MMM yy HH:mm:ss Z", Locale.ROOT),
-                    new SimpleDateFormat("dd MMM yy HH:mm:ss Z", Locale.ROOT),
-                    new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss Z", Locale.ROOT),
-                    new SimpleDateFormat("dd MMM yyyy HH:mm:ss Z", Locale.ROOT)
-            };
+            return new DateFormat[] { new SimpleDateFormat("EEE, dd MMM yy HH:mm:ss Z", Locale.ROOT), new SimpleDateFormat("dd MMM yy HH:mm:ss Z", Locale.ROOT),
+                            new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss Z", Locale.ROOT), new SimpleDateFormat("dd MMM yyyy HH:mm:ss Z", Locale.ROOT) };
        }
    };

@ -205,13 +201,13 @@ public abstract class AbstractSiteMap {
    }

    /**
-     * Converts pubDate of RSS to the string representation which could be parsed
-     * in {@link #convertToDate(String)} method.
-     *
+     * Converts pubDate of RSS to the string representation which could be
+     * parsed in {@link #convertToDate(String)} method.
+     * 
     * @param pubDate
     *            - date time of pubDate in RFC822
-     * @return converted to &quot;yyyy-MM-dd'T'HH:mm:ssZ&quot; format or original value if it doesn't
-     *         follow the RFC822
+     * @return converted to &quot;yyyy-MM-dd'T'HH:mm:ssZ&quot; format or
+     *         original value if it doesn't follow the RFC822
     */
    public static String normalizeRSSTimestamp(String pubDate) {
        if (pubDate == null) {
--- a/src/main/java/crawlercommons/sitemaps/Namespace.java
+++ b/src/main/java/crawlercommons/sitemaps/Namespace.java
@ -0,0 +1,38 @@
+/**
+ * Copyright 2016 Crawler-Commons
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package crawlercommons.sitemaps;
+
+/**
+ * supported sitemap formats:
+ * https://www.sitemaps.org/protocol.html#otherformats
+ */
+public class Namespace {
+
+    public static final String SITEMAP = "http://www.sitemaps.org/schemas/sitemap/0.9";
+
+    /**
+     * RSS and Atom sitemap formats do not have a strict definition. But if we
+     * do not parse as namespace aware, then RSS/Atom files that choose to use
+     * namespaces will break. The relaxed compromise for RSS/Atom is to always
+     * parse as "namespace aware", but to match elements only by their
+     * localName, accepting any element namespace.
+     */
+    public static final String RSS_2_0 = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
+    public static final String ATOM_0_3 = "http://purl.org/atom/ns#";
+    public static final String ATOM_1_0 = "http://www.w3.org/2005/Atom";
+
+}
--- a/src/main/java/crawlercommons/sitemaps/SiteMapParser.java
+++ b/src/main/java/crawlercommons/sitemaps/SiteMapParser.java
@ -87,12 +87,19 @@ public class SiteMapParser {
    /**
     * True (by default) meaning that invalid URLs should be rejected, as the
     * official docs allow the siteMapURLs to be only under the base url:
-     * http://www.sitemaps.org/protocol.html#location
+     * http://www.sitemaps.org/protocol.html#location
+     * Namespace checking is configured via {@link #setStrictNamespace(boolean)}.
     */
    protected boolean strict = true;

+    /**
+     * If true, only elements in the namespace defined by the sitemap
+     * specification are accepted; if false (the default), any namespace is.
+     **/
+    protected boolean strictNamespace = false;
+
    public SiteMapParser() {
-      //default constructor
+        // default constructor
    }

    public SiteMapParser(boolean strict) {
@ -107,6 +114,22 @@ public class SiteMapParser {
        return strict;
    }

+    /**
+     * @return whether the parser allows any namespace or just the one from the
+     *         specification
+     */
+    public boolean isStrictNamespace() {
+        return strictNamespace;
+    }
+
+    /**
+     * Sets whether the parser accepts only the namespace from the
+     * specification (true) or any namespace (false).
+     */
+    public void setStrictNamespace(boolean s) {
+        strictNamespace = s;
+    }
+
    /**
     * Returns a SiteMap or SiteMapIndex given an online sitemap URL
     *
@ -234,7 +257,8 @@ public class SiteMapParser {
                }
                throw new UnknownFormatException("Can't parse a gzipped sitemap with the embedded MediaType of: " + embeddedType + " (at: " + url + ")");
            }
-            mediaType = MEDIA_TYPE_REGISTRY.getSupertype(mediaType); // Check parent
+            mediaType = MEDIA_TYPE_REGISTRY.getSupertype(mediaType); // Check
+                                                                     // parent
        }

        throw new UnknownFormatException("Can't parse a sitemap with the MediaType of: " + contentType + " (at: " + url + ")");
@ -352,16 +376,22 @@ public class SiteMapParser {
     *             {@link org.xml.sax.InputSource}
     */
    protected AbstractSiteMap processXml(URL sitemapUrl, InputSource is) throws UnknownFormatException {
-
        Document doc = null;

        try {
            DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
-            // disable validation and avoid that remote DTDs, schemas, etc. are fetched
+
+            // disable validation and avoid that remote DTDs, schemas, etc. are
+            // fetched
            dbf.setValidating(false);
+
+            // support an explicitly named namespace.
+            dbf.setNamespaceAware(true);
+
            dbf.setXIncludeAware(false);
            dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
            DocumentBuilder db = dbf.newDocumentBuilder();
+
            db.setEntityResolver(new EntityResolver() {
                // noop entity resolver, does not fetch remote content
                @Override
@ -369,14 +399,17 @@ public class SiteMapParser {
                    return new InputSource(new StringReader(""));
                }
            });
+
            db.setErrorHandler(new ErrorHandler() {
                public void warning(SAXParseException e) throws SAXException {
                    LOG.warn("Warning parsing XML: {}", e.toString());
                }
+
                public void fatalError(SAXParseException e) throws SAXException {
                    LOG.error("Fatal error parsing XML: {}", e.toString());
                    throw e;
                }
+
                public void error(SAXParseException e) throws SAXException {
                    LOG.error("Error parsing XML: {}", e.toString());
                    throw e;
@ -389,14 +422,14 @@ public class SiteMapParser {
        }

        // See if this is a sitemap index
-        NodeList nodeList = doc.getElementsByTagName("sitemapindex");
+        NodeList nodeList = doc.getElementsByTagNameNS("*", "sitemapindex");
        if (nodeList.getLength() > 0) {
-            nodeList = doc.getElementsByTagName("sitemap");
+            nodeList = doc.getElementsByTagNameNS("*", "sitemap");
            return parseSitemapIndex(sitemapUrl, nodeList);
-        } else if (doc.getElementsByTagName("urlset").getLength() > 0) {
+        } else if (doc.getElementsByTagNameNS("*", "urlset").getLength() > 0) {
            // This is a regular Sitemap
            return parseXmlSitemap(sitemapUrl, doc);
-        } else if (doc.getElementsByTagName("link").getLength() > 0) {
+        } else if (doc.getElementsByTagNameNS("*", "link").getLength() > 0) {
            // Could be RSS or Atom
            return parseSyndicationFormat(sitemapUrl, doc);
        }
@ -436,7 +469,12 @@ public class SiteMapParser {
        SiteMap sitemap = new SiteMap(sitemapUrl);
        sitemap.setType(SitemapType.XML);

-        NodeList list = doc.getElementsByTagName("url");
+        String namespace = Namespace.SITEMAP;
+        if (!strictNamespace) {
+            namespace = "*";
+        }
+
+        NodeList list = doc.getElementsByTagNameNS(namespace, "url");

        // Loop through the <url>s
        for (int i = 0; i < list.getLength(); i++) {
@ -444,10 +482,10 @@ public class SiteMapParser {
            Node n = list.item(i);
            if (n.getNodeType() == Node.ELEMENT_NODE) {
                Element elem = (Element) n;
-                String lastMod = getElementValue(elem, "lastmod");
-                String changeFreq = getElementValue(elem, "changefreq");
-                String priority = getElementValue(elem, "priority");
-                String loc = getElementValue(elem, "loc");
+                String lastMod = getElementValue(namespace, elem, "lastmod");
+                String changeFreq = getElementValue(namespace, elem, "changefreq");
+                String priority = getElementValue(namespace, elem, "priority");
+                String loc = getElementValue(namespace, elem, "loc");

                addUrlIntoSitemap(loc, sitemap, lastMod, changeFreq, priority, i);
            }
@ -496,7 +534,12 @@ public class SiteMapParser {

            if (firstNode.getNodeType() == Node.ELEMENT_NODE) {
                Element elem = (Element) firstNode;
-                String loc = getElementValue(elem, "loc");
+                String loc = null;
+                String namespace = Namespace.SITEMAP;
+                if (!strictNamespace) {
+                    namespace = "*";
+                }
+                loc = getElementValue(namespace, elem, "loc");

                // try the text content when no loc element
                // has been specified
@ -506,7 +549,7 @@ public class SiteMapParser {

                try {
                    URL sitemapUrl = new URL(loc);
-                    String lastmod = getElementValue(elem, "lastmod");
+                    String lastmod = getElementValue(namespace, elem, "lastmod");
                    Date lastModified = SiteMap.convertToDate(lastmod);

                    // Right now we are not worried about sitemapUrls that point
@ -543,7 +586,7 @@ public class SiteMapParser {
        SiteMap sitemap = new SiteMap(sitemapUrl);

        // See if this is an Atom feed by looking for "feed" element
-        NodeList list = doc.getElementsByTagName("feed");
+        NodeList list = doc.getElementsByTagNameNS("*", "feed");
        if (list.getLength() > 0) {
            parseAtom(sitemap, (Element) list.item(0), doc);
            sitemap.setProcessed(true);
@ -557,7 +600,7 @@ public class SiteMapParser {
            // See https://github.com/crawler-commons/crawler-commons/issues/87
            // and also RSS 1.0 specification
            // http://web.resource.org/rss/1.0/spec
-            list = doc.getElementsByTagName("channel");
+            list = doc.getElementsByTagNameNS("*", "channel");
            if (list.getLength() > 0) {
                parseRSS(sitemap, doc);
                sitemap.setProcessed(true);
@ -620,7 +663,7 @@ public class SiteMapParser {
        String lastMod = getElementValue(elem, "modified");
        LOG.debug("lastMod = {}", lastMod);

-        NodeList list = doc.getElementsByTagName("entry");
+        NodeList list = doc.getElementsByTagNameNS("*", "entry");

        // Loop through the <entry>s
        for (int i = 0; i < list.getLength() && i < MAX_URLS; i++) {
@ -691,7 +734,7 @@ public class SiteMapParser {

        LOG.debug("Parsing RSS doc");
        sitemap.setType(SitemapType.RSS);
-        NodeList list = doc.getElementsByTagName("channel");
+        NodeList list = doc.getElementsByTagNameNS("*", "channel");
        Element elem = (Element) list.item(0);

        // Treat publication date as last mod (Tue, 10 Jun 2003 04:00:00 GMT)
@ -699,7 +742,7 @@ public class SiteMapParser {
        LOG.debug("channel's lastMod = {}", channelLastMod);
        sitemap.setLastModified(channelLastMod);

-        list = doc.getElementsByTagName("item");
+        list = doc.getElementsByTagNameNS("*", "item");
        // Loop through the <item>s
        for (int i = 0; i < list.getLength() && i < MAX_URLS; i++) {

@ -715,15 +758,17 @@ public class SiteMapParser {
    }

    /**
-     * Get the element's textual content.
+     * Get the element's textual content. Find element under parent element,
+     * with namespaceURI and element local-name "elementName".
     * 
+     * @param namespaceURI
     * @param elem
     * @param elementName
     * @return The element value
     */
-    protected String getElementValue(Element elem, String elementName) {
+    protected String getElementValue(String namespaceURI, Element elem, String elementName) {

-        NodeList list = elem.getElementsByTagName(elementName);
+        NodeList list = elem.getElementsByTagNameNS(namespaceURI, elementName);
        if (list == null)
            return null;
        Element e = (Element) list.item(0);
@ -733,6 +778,21 @@ public class SiteMapParser {
        return null;
    }

+    /**
+     * Get the element's textual content. This will match any namespace
+     * (elementName is the localName).
+     * 
+     * @param elem
+     *            The element is a child of "elem"
+     * @param elementName
+     *            The element name is "elementName".
+     * @return The element value
+     */
+    protected String getElementValue(Element elem, String elementName) {
+
+        return getElementValue("*", elem, elementName);
+    }
+
    /**
     * Get the element's attribute value.
     * 
@ -743,7 +803,7 @@ public class SiteMapParser {
     */
    protected String getElementAttributeValue(Element elem, String elementName, String attributeName) {

-        NodeList list = elem.getElementsByTagName(elementName);
+        NodeList list = elem.getElementsByTagNameNS("*", elementName);
        Element e = (Element) list.item(0);
        if (e != null) {
            return e.getAttribute(attributeName);
--- a/src/main/java/crawlercommons/sitemaps/SiteMapParserSAX.java
+++ b/src/main/java/crawlercommons/sitemaps/SiteMapParserSAX.java
@ -88,6 +88,12 @@ public class SiteMapParserSAX extends SiteMapParser {

    private boolean allowPartial = false;

+    /**
+     * If true, only elements in the namespace defined by the sitemap
+     * specification are accepted; if false (the default), any namespace is.
+     **/
+    protected boolean strictNamespace = false;
+
    public SiteMapParserSAX() {
        this(true, false);
    }
@ -109,6 +115,22 @@ public class SiteMapParserSAX extends SiteMapParser {
        return strict;
    }

+    /**
+     * @return whether the parser allows any namespace or just the one from the
+     *         specification
+     */
+    public boolean isStrictNamespace() {
+        return strictNamespace;
+    }
+
+    /**
+     * Sets the parser to allow any namespace or just the one from the
+     * specification
+     */
+    public void setStrictNamespace(boolean s) {
+        strictNamespace = s;
+    }
+
    /**
     * Returns a SiteMap or SiteMapIndex given an online sitemap URL
     *
@ -236,7 +258,8 @@ public class SiteMapParserSAX extends SiteMapParser {
                }
                throw new UnknownFormatException("Can't parse a gzipped sitemap with the embedded MediaType of: " + embeddedType + " (at: " + url + ")");
            }
-            mediaType = MEDIA_TYPE_REGISTRY.getSupertype(mediaType); // Check parent
+            mediaType = MEDIA_TYPE_REGISTRY.getSupertype(mediaType); // Check
+                                                                     // parent
        }

        throw new UnknownFormatException("Can't parse a sitemap with the MediaType of: " + contentType + " (at: " + url + ")");
@ -356,9 +379,15 @@ public class SiteMapParserSAX extends SiteMapParser {
    protected AbstractSiteMap processXml(URL sitemapUrl, InputSource is) throws UnknownFormatException {

        SAXParserFactory factory = SAXParserFactory.newInstance();
-        // disable validation and avoid that remote DTDs, schemas, etc. are fetched
+
+        // disable validation and avoid that remote DTDs, schemas, etc. are
+        // fetched
        factory.setValidating(false);
        factory.setXIncludeAware(false);
+
+        // support the use of an explicit namespace.
+        factory.setNamespaceAware(true);
+
        try {
            factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
        } catch (Exception e) {
@ -374,9 +403,14 @@ public class SiteMapParserSAX extends SiteMapParser {
                    return new InputSource(new StringReader(""));
                }
            });
+            handler.setStrictNamespace(isStrictNamespace());
            saxParser.parse(is, handler);
            AbstractSiteMap sitemap = handler.getSiteMap();
            if (sitemap == null) {
+                UnknownFormatException ex = handler.getException();
+                if (ex != null) {
+                    throw ex;
+                }
                throw new UnknownFormatException("Unknown XML format for: " + sitemapUrl);
            }
            return sitemap;
--- a/src/main/java/crawlercommons/sitemaps/SiteMapTester.java
+++ b/src/main/java/crawlercommons/sitemaps/SiteMapTester.java
@ -44,6 +44,8 @@ public class SiteMapTester {
            LOG.error("               bypass automatic MIME type detection");
            LOG.error("Java properties:");
            LOG.error("  sitemap.useSax  if true use SAX parser to process sitemaps");
+            LOG.error("  sitemap.strictNamespace");
+            LOG.error("                  if true sitemaps are required to use the standard namespace URI");
        } else {
            URL url = new URL(args[0]);
            String mt = (args.length > 1) ? args[1] : null;
@ -60,6 +62,7 @@ public class SiteMapTester {
        byte[] content = IOUtils.toByteArray(url);

        boolean useSaxParser = new Boolean(System.getProperty("sitemap.useSax"));
+        boolean strictNamespace = new Boolean(System.getProperty("sitemap.strictNamespace"));

        LOG.info("Parsing {} {} using {} parser", url, ((mt != null && !mt.isEmpty()) ? "as MIME type " + mt : ""), (useSaxParser ? "SAX" : "DOM"));

@ -67,6 +70,7 @@ public class SiteMapTester {
        if (useSaxParser) {
            p = saxParser;
        }
+        p.setStrictNamespace(strictNamespace);

        AbstractSiteMap sm = null;
        // guesses the mimetype
--- a/src/main/java/crawlercommons/sitemaps/sax/AtomHandler.java
+++ b/src/main/java/crawlercommons/sitemaps/sax/AtomHandler.java
@ -78,11 +78,11 @@ class AtomHandler extends DelegatorHandler {
    }

    public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
-        if ("entry".equals(qName)) {
+        if ("entry".equals(localName)) {
            loc = null;
            lastMod = null;
            rel = null;
-        } else if ("link".equals(qName)) {
+        } else if ("link".equals(localName)) {
            String href = attributes.getValue("href");
            if (href == null)
                return;
@ -91,8 +91,10 @@ class AtomHandler extends DelegatorHandler {
            String r = attributes.getValue("rel");
            if (loc == null || (!valid && v) || (rel != null && r == null)) {
                // - first link, or in case of multiple links:
-                // - (for a strict parser only) this link is valid and the first one is not valid
-                // - has no rel attribute while the first one does (e.g., rel="edit", rel="alternate")
+                // - (for a strict parser only) this link is valid and the first
+                // one is not valid
+                // - has no rel attribute while the first one does (e.g.,
+                // rel="edit", rel="alternate")
                try {
                    loc = new URL(href);
                    rel = r;
@ -114,9 +116,9 @@ class AtomHandler extends DelegatorHandler {
    }

    public void characters(char[] ch, int start, int length) throws SAXException {
-        String qName = super.currentElement();
+        String localName = super.currentElement();
        String value = String.valueOf(ch, start, length);
-        if ("updated".equals(qName)) {
+        if ("updated".equals(localName)) {
            lastMod = value;
        }
    }
--- a/src/main/java/crawlercommons/sitemaps/sax/DelegatorHandler.java
+++ b/src/main/java/crawlercommons/sitemaps/sax/DelegatorHandler.java
@ -25,6 +25,7 @@ import org.xml.sax.SAXParseException;
 import org.xml.sax.helpers.DefaultHandler;

 import crawlercommons.sitemaps.AbstractSiteMap;
+import crawlercommons.sitemaps.Namespace;
 import crawlercommons.sitemaps.UnknownFormatException;

 /**
@ -37,6 +38,7 @@ public class DelegatorHandler extends DefaultHandler {
    private DelegatorHandler delegate;
    private URL url;
    private boolean strict;
+    private boolean strictNamespace;
    private UnknownFormatException exception;

    protected DelegatorHandler(LinkedList<String> elementStack, boolean strict) {
@ -58,11 +60,27 @@ public class DelegatorHandler extends DefaultHandler {
        return strict;
    }

+    /**
+     * @return whether the parser allows any namespace or just the one from the
+     *         specification
+     */
+    public boolean isStrictNamespace() {
+        return strictNamespace;
+    }
+
+    /**
+     * Sets the parser to allow any namespace or just the one from the
+     * specification
+     */
+    public void setStrictNamespace(boolean s) {
+        strictNamespace = s;
+    }
+
    protected void setException(UnknownFormatException exception) {
        this.exception = exception;
    }

-    protected UnknownFormatException getException() {
+    public UnknownFormatException getException() {
        return exception;
    }

@ -70,7 +88,7 @@ public class DelegatorHandler extends DefaultHandler {
        if (elementStack.isEmpty() || delegate == null) {
            startRootElement(uri, localName, qName, attributes);
        } else {
-            elementStack.push(qName);
+            elementStack.push(localName);
        }
        if (delegate != null) {
            delegate.startElement(uri, localName, qName, attributes);
@ -78,23 +96,32 @@ public class DelegatorHandler extends DefaultHandler {
    }

    private void startRootElement(String uri, String localName, String qName, Attributes attributes) {
-        elementStack.push(qName);
-        if ("sitemapindex".equals(qName)) {
-            delegate = new XMLIndexHandler(url, elementStack, strict);
-        } else if ("urlset".equals(qName)) {
-            delegate = new XMLHandler(url, elementStack, strict);
-        } else if ("feed".equals(qName)) {
+        elementStack.push(localName);
+
+        if ("feed".equals(localName)) {
            delegate = new AtomHandler(url, elementStack, strict);
        }
-        // See if it is a RSS feed by looking for a "channel" element. This
-        // avoids the issue
+        // See if it is an RSS feed by looking for an element with the
+        // localName "channel".
+        // This avoids the issue
        // of having the outer tag named <rdf:RDF> that was causing this code to
        // fail. Inside of
        // the <rss> or <rdf> tag is a <channel> tag, so we can use that.
        // See https://github.com/crawler-commons/crawler-commons/issues/87
        // and also RSS 1.0 specification http://web.resource.org/rss/1.0/spec
-        else if ("channel".equals(qName)) {
+        else if ("channel".equals(localName)) {
            delegate = new RSSHandler(url, elementStack, strict);
+        } else if (isStrictNamespace() && !Namespace.SITEMAP.equals(uri)) {
+            setException(new UnknownFormatException("Namespace " + uri + " does not match standard namespace " + Namespace.SITEMAP));
+            return;
+        } else if ("sitemapindex".equals(localName)) {
+            delegate = new XMLIndexHandler(url, elementStack, strict);
+        } else if ("urlset".equals(localName)) {
+            delegate = new XMLHandler(url, elementStack, strict);
+        }
+        if (delegate != null) {
+            // configure delegate
+            delegate.setStrictNamespace(isStrictNamespace());
        }
    }

--- a/src/main/java/crawlercommons/sitemaps/sax/XMLHandler.java
+++ b/src/main/java/crawlercommons/sitemaps/sax/XMLHandler.java
@ -29,6 +29,7 @@ import org.xml.sax.SAXParseException;

 import crawlercommons.sitemaps.AbstractSiteMap;
 import crawlercommons.sitemaps.AbstractSiteMap.SitemapType;
+import crawlercommons.sitemaps.Namespace;
 import crawlercommons.sitemaps.SiteMap;
 import crawlercommons.sitemaps.SiteMapURL;

@ -61,6 +62,7 @@ class XMLHandler extends DelegatorHandler {
    private String changeFreq;
    private String priority;
    private int i = 0;
+    private boolean currentElementNamespaceIsValid;

    XMLHandler(URL url, LinkedList<String> elementStack, boolean strict) {
        super(elementStack, strict);
@ -70,8 +72,14 @@ class XMLHandler extends DelegatorHandler {
    }

    public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
+        if (isStrictNamespace() && !Namespace.SITEMAP.equals(uri)) {
+            currentElementNamespaceIsValid = false;
+            return;
+        }
+        currentElementNamespaceIsValid = true;
+
        // flush any unclosed or missing URL element
-        if (loc.length() > 0 && ("loc".equals(qName) || "url".equals(qName))) {
+        if (loc.length() > 0 && ("loc".equals(localName) || "url".equals(localName))) {
            // check whether loc isn't white space only
            for (int i = 0; i < loc.length(); i++) {
                if (!Character.isWhitespace(loc.charAt(i))) {
@ -80,7 +88,7 @@ class XMLHandler extends DelegatorHandler {
                }
            }
            loc = new StringBuilder();
-            if ("url".equals(qName)) {
+            if ("url".equals(localName)) {
                // reset also attributes
                lastMod = null;
                changeFreq = null;
@ -90,23 +98,29 @@ class XMLHandler extends DelegatorHandler {
    }

    public void endElement(String uri, String localName, String qName) throws SAXException {
-        if ("url".equals(qName) && "urlset".equals(currentElementParent())) {
+        if (isStrictNamespace() && !Namespace.SITEMAP.equals(uri)) {
+            return;
+        }
+        if ("url".equals(localName) && "urlset".equals(currentElementParent())) {
            maybeAddSiteMapUrl();
-        } else if ("urlset".equals(qName)) {
+        } else if ("urlset".equals(localName)) {
            sitemap.setProcessed(true);
        }
    }

    public void characters(char[] ch, int start, int length) throws SAXException {
-        String qName = super.currentElement();
+        if (isStrictNamespace() && !currentElementNamespaceIsValid) {
+            return;
+        }
+        String localName = super.currentElement();
        String value = String.valueOf(ch, start, length);
-        if ("loc".equals(qName) || "url".equals(qName)) {
+        if ("loc".equals(localName) || "url".equals(localName)) {
            loc.append(value);
-        } else if ("changefreq".equals(qName)) {
+        } else if ("changefreq".equals(localName)) {
            changeFreq = value;
-        } else if ("lastmod".equals(qName)) {
+        } else if ("lastmod".equals(localName)) {
            lastMod = value;
-        } else if ("priority".equals(qName)) {
+        } else if ("priority".equals(localName)) {
            priority = value;
        }
    }
--- a/src/main/java/crawlercommons/sitemaps/sax/XMLIndexHandler.java
+++ b/src/main/java/crawlercommons/sitemaps/sax/XMLIndexHandler.java
@ -28,6 +28,7 @@ import org.xml.sax.SAXException;
 import org.xml.sax.SAXParseException;

 import crawlercommons.sitemaps.AbstractSiteMap;
+import crawlercommons.sitemaps.Namespace;
 import crawlercommons.sitemaps.SiteMap;
 import crawlercommons.sitemaps.SiteMapIndex;
 import crawlercommons.sitemaps.AbstractSiteMap.SitemapType;
@ -70,6 +71,9 @@ class XMLIndexHandler extends DelegatorHandler {
    }

    public void endElement(String uri, String localName, String qName) throws SAXException {
+        if (isStrictNamespace() && !Namespace.SITEMAP.equals(uri)) {
+            return;
+        }
        if ("sitemap".equals(currentElement())) {
            maybeAddSiteMap();
        } else if ("sitemapindex".equals(currentElement())) {
--- a/src/test/java/crawlercommons/filters/basic/BasicURLNormalizerTest.java
+++ b/src/test/java/crawlercommons/filters/basic/BasicURLNormalizerTest.java
@ -111,8 +111,7 @@ public class BasicURLNormalizerTest {
        normalizeTest("http://foo.com:81/", "http://foo.com:81/");
        // check that empty port is removed
        normalizeTest("http://example.com:/", "http://example.com/");
-        normalizeTest("https://example.com:/foobar.html",
-                "https://example.com/foobar.html");
+        normalizeTest("https://example.com:/foobar.html", "https://example.com/foobar.html");

        // check that null path is normalized
        normalizeTest("http://foo.com", "http://foo.com/");
--- a/src/test/java/crawlercommons/sitemaps/SiteMapParserSAXTest.java
+++ b/src/test/java/crawlercommons/sitemaps/SiteMapParserSAXTest.java
@ -36,6 +36,8 @@ import org.junit.runners.JUnit4;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

+import crawlercommons.sitemaps.AbstractSiteMap.SitemapType;
+
 import static org.junit.Assert.*;

@RunWith(JUnit4.class)
@ -92,6 +94,50 @@ public class SiteMapParserSAXTest {
        assertEquals("http://www.example.com/dynsitemap?date=lastyear&all=false", currentSiteMap.getUrl().toString());
    }

+    @Test
+    public void testSitemapWithNamespace() throws UnknownFormatException, IOException {
+        SiteMapParser parser = new SiteMapParserSAX();
+        parser.setStrictNamespace(true);
+        byte[] content = getResourceAsBytes("src/test/resources/sitemaps/sitemap.ns.xml");
+        // sitemap.ns.xml binds the sitemap namespace to the prefix "sit"
+        URL url = new URL("http://www.example.com/sitemap.ns.xml");
+        AbstractSiteMap asm = parser.parseSiteMap(content, url);
+        assertEquals(SitemapType.XML, asm.getType());
+        assertTrue(asm instanceof SiteMap);
+        assertTrue(asm.isProcessed());
+        SiteMap sm = (SiteMap) asm;
+        // strict mode must still return both prefixed <sit:url> entries
+        assertEquals(2, sm.getSiteMapUrls().size());
+        assertEquals(SiteMapURL.ChangeFrequency.DAILY, sm.getSiteMapUrls().iterator().next().getChangeFrequency());
+    }
+
+    @Test
+    public void testSitemapWithWrongNamespace() throws UnknownFormatException, IOException {
+        SiteMapParser parser = new SiteMapParserSAX();
+        parser.setStrictNamespace(true);
+        // sitemap.badns.xml declares a google.com URI as its default namespace
+        byte[] content = getResourceAsBytes("src/test/resources/sitemaps/sitemap.badns.xml");
+
+        URL url = new URL("http://www.example.com/sitemap.badns.xml");
+        AbstractSiteMap asm;
+        try {
+            asm = parser.parseSiteMap(content, url);
+            fail("Expected an UnknownFormatException because of wrong namespace");
+        } catch (UnknownFormatException e) {
+            assertTrue(e.getMessage().contains("does not match standard namespace"));
+        }
+
+        // try again in lenient mode
+        parser.setStrictNamespace(false);
+        asm = parser.parseSiteMap(content, url);
+        assertEquals(SitemapType.XML, asm.getType());
+        assertTrue(asm instanceof SiteMap);
+        assertTrue(asm.isProcessed());
+        SiteMap sm = (SiteMap) asm;
+        // lenient mode: both URLs are expected despite the namespace
+        assertEquals(2, sm.getSiteMapUrls().size());
+    }
+
    @Test
    public void testFullDateFormat() {
        SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm+hh:00", Locale.ROOT);
--- a/src/test/java/crawlercommons/sitemaps/SiteMapParserTest.java
+++ b/src/test/java/crawlercommons/sitemaps/SiteMapParserTest.java
@ -96,6 +96,49 @@ public class SiteMapParserTest {
        assertEquals("http://www.example.com/dynsitemap?date=lastyear&all=false", currentSiteMap.getUrl().toString());
    }

+    @Test
+    public void testSitemapWithNamespace() throws UnknownFormatException, IOException {
+        SiteMapParser parser = new SiteMapParser();
+        byte[] content = getResourceAsBytes("src/test/resources/sitemaps/sitemap.ns.xml");
+        // NOTE(review): strict namespace mode is not enabled here - confirm
+        URL url = new URL("http://www.example.com/sitemap.ns.xml");
+        AbstractSiteMap asm = parser.parseSiteMap(content, url);
+        assertEquals(SitemapType.XML, asm.getType());
+        assertEquals(true, asm instanceof SiteMap);
+        assertEquals(true, asm.isProcessed());
+        SiteMap sm = (SiteMap) asm;
+        // both prefixed <sit:url> entries of the fixture must be returned
+        assertEquals(2, sm.getSiteMapUrls().size());
+        assertEquals(SiteMapURL.ChangeFrequency.DAILY, sm.getSiteMapUrls().iterator().next().getChangeFrequency());
+    }
+
+    @Test
+    public void testSitemapWithWrongNamespace() throws UnknownFormatException, IOException {
+        SiteMapParser parser = new SiteMapParser();
+        parser.setStrictNamespace(true);
+        // sitemap.badns.xml declares a google.com URI as its default namespace
+        byte[] content = getResourceAsBytes("src/test/resources/sitemaps/sitemap.badns.xml");
+
+        URL url = new URL("http://www.example.com/sitemap.badns.xml");
+        AbstractSiteMap asm = parser.parseSiteMap(content, url);
+        assertEquals(SitemapType.XML, asm.getType());
+        assertEquals(true, asm instanceof SiteMap);
+        assertEquals(true, asm.isProcessed());
+        SiteMap sm = (SiteMap) asm;
+        // strict mode: parsing succeeds but no URL is expected
+        assertEquals(0, sm.getSiteMapUrls().size());
+
+        // try again in lenient mode
+        parser.setStrictNamespace(false);
+        asm = parser.parseSiteMap(content, url);
+        assertEquals(SitemapType.XML, asm.getType());
+        assertEquals(true, asm instanceof SiteMap);
+        assertEquals(true, asm.isProcessed());
+        sm = (SiteMap) asm;
+        // lenient mode: both URLs are expected despite the namespace
+        assertEquals(2, sm.getSiteMapUrls().size());
+    }
+
    @Test
    public void testFullDateFormat() {
        SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm+hh:00", Locale.ROOT);
@ -364,7 +407,12 @@ public class SiteMapParserTest {
        AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
        assertSame("Not an RSS", SitemapType.RSS, asm.getType());
        assertNotNull("GMT timestamp not parsed", asm.getLastModified());
-        assertEquals("GMT timestamp", 1483619690000L, asm.getLastModified().getTime()); // Thu, 05 Jan 17 12:34:50 GMT
+        assertEquals("GMT timestamp", 1483619690000L, asm.getLastModified().getTime()); // Thu,
+                                                                                        // 05
+                                                                                        // Jan
+                                                                                        // 17
+                                                                                        // 12:34:50
+                                                                                        // GMT

        SiteMap rss = (SiteMap) asm;
        assertEquals("Incorrect items count", 7, rss.getSiteMapUrls().size());
@ -474,7 +522,7 @@ public class SiteMapParserTest {

    /**
     * Read a test resource file and return its content as byte array.
-     *
+     * 
     * @param resourceName
     *            path to the resource file
     * @return byte content of the file
--- a/src/test/resources/sitemaps/sitemap.badns.xml
+++ b/src/test/resources/sitemaps/sitemap.badns.xml
@ -0,0 +1,16 @@
+<?xml version="1.0" encoding="UTF-8"?> <!-- negative fixture: the default namespace is a google.com URI, not the sitemaps.org one -->
+<urlset xmlns="http://www.google.com/schemas/sitemap/0.9"
+	xmlns:xhtml="http://www.w3.org/1999/xhtml">
+	<url>
+		<loc>http://www.example.com/1</loc>
+		<changefreq>daily</changefreq>
+	</url>
+	<url>
+		<loc>
+			http://www.example.com/2
+		</loc>
+		<changefreq>
+			daily
+		</changefreq>
+	</url>
+</urlset>
--- a/src/test/resources/sitemaps/sitemap.ns.xml
+++ b/src/test/resources/sitemaps/sitemap.ns.xml
@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?> <!-- fixture: all sitemap elements use the "sit" prefix bound to the sitemaps.org namespace -->
+<sit:urlset xmlns:image="http://www.google.com/schemas/sitemap-image/1.1" xmlns:video="http://www.google.com/schemas/sitemap-video/1.1" xmlns:sit="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xhtml="http://www.w3.org/1999/xhtml" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.w3.org/1999/xhtml link.xsd http://www.google.com/schemas/sitemap-video/1.1 video.xsd http://www.sitemaps.org/schemas/sitemap/0.9 sitemap.xsd http://www.google.com/schemas/sitemap-image/1.1 image.xsd">
+    <sit:url>
+        <sit:loc>http://www.example.com/1</sit:loc>
+        <sit:changefreq>daily</sit:changefreq>
+    </sit:url>
+    <sit:url>
+        <sit:loc>http://www.example.com/2</sit:loc>
+        <sit:changefreq>daily</sit:changefreq>
+    </sit:url>
+</sit:urlset>