Sitemap: removed DOM parser, fixes #177 (#181)

2024-05-09 23:56:04 +02:00 · 2017-10-27 07:48:22 +01:00 · 2017-10-27 07:48:22 +01:00 · ee69049db0
parent d2de87cf92
commit ee69049db0
6 changed files with 87 additions and 1521 deletions
--- a/src/main/java/crawlercommons/sitemaps/SiteMapParser.java
+++ b/src/main/java/crawlercommons/sitemaps/SiteMapParser.java
@ -29,12 +29,12 @@ import java.io.StringReader;
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.util.ArrayList;
-import java.util.Date;
 import java.util.List;
 import java.util.zip.GZIPInputStream;

-import javax.xml.parsers.DocumentBuilder;
-import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;

 import org.apache.commons.io.FilenameUtils;
 import org.apache.commons.io.IOUtils;
@ -44,17 +44,12 @@ import org.apache.tika.mime.MediaType;
 import org.apache.tika.mime.MediaTypeRegistry;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-import org.w3c.dom.Document;
-import org.w3c.dom.Element;
-import org.w3c.dom.Node;
-import org.w3c.dom.NodeList;
 import org.xml.sax.EntityResolver;
-import org.xml.sax.ErrorHandler;
 import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;
-import org.xml.sax.SAXParseException;

 import crawlercommons.sitemaps.AbstractSiteMap.SitemapType;
+import crawlercommons.sitemaps.sax.DelegatorHandler;

 public class SiteMapParser {
    public static final Logger LOG = LoggerFactory.getLogger(SiteMapParser.class);
@ -87,11 +82,12 @@ public class SiteMapParser {
    /**
     * True (by default) meaning that invalid URLs should be rejected, as the
     * official docs allow the siteMapURLs to be only under the base url:
-     * http://www.sitemaps.org/protocol.html#location Also checks that the
-     * correct namespace is used.
+     * http://www.sitemaps.org/protocol.html#location
     */
    protected boolean strict = true;

+    private boolean allowPartial = false;
+
    /**
     * Indicates whether the parser should work with the namespace from the
     * specifications or any namespace. Defaults to false.
@ -99,11 +95,16 @@ public class SiteMapParser {
    protected boolean strictNamespace = false;

    public SiteMapParser() {
-        // default constructor
+        this(true, false);
    }

    public SiteMapParser(boolean strict) {
+        this(strict, false);
+    }
+
+    public SiteMapParser(boolean strict, boolean allowPartial) {
        this.strict = strict;
+        this.allowPartial = allowPartial;
    }

    /**
@ -124,7 +125,7 @@ public class SiteMapParser {

    /**
     * Sets the parser to allow any namespace or just the one from the
-     *         specification
+     * specification
     */
    public void setStrictNamespace(boolean s) {
        strictNamespace = s;
@ -376,442 +377,65 @@ public class SiteMapParser {
     *             {@link org.xml.sax.InputSource}
     */
    protected AbstractSiteMap processXml(URL sitemapUrl, InputSource is) throws UnknownFormatException {
-        Document doc = null;
+
+        SAXParserFactory factory = SAXParserFactory.newInstance();
+
+        // disable validation and prevent remote DTDs, schemas, etc. from
+        // being fetched
+        factory.setValidating(false);
+        factory.setXIncludeAware(false);
+
+        // support the use of an explicit namespace.
+        factory.setNamespaceAware(true);

        try {
-            DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
-
-            // disable validation and avoid that remote DTDs, schemas, etc. are
-            // fetched
-            dbf.setValidating(false);
-
-            // support an explicitly named namespace.
-            dbf.setNamespaceAware(true);
-
-            dbf.setXIncludeAware(false);
-            dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
-            DocumentBuilder db = dbf.newDocumentBuilder();
-
-            db.setEntityResolver(new EntityResolver() {
+            factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
+        } catch (Exception e) { // keep e as the cause so its stack trace is not lost
+            throw new RuntimeException("Failed to configure XML parser: " + e.toString(), e);
+        }
+        DelegatorHandler handler = new DelegatorHandler(sitemapUrl, strict);
+        try {
+            SAXParser saxParser = factory.newSAXParser();
+            saxParser.getXMLReader().setEntityResolver(new EntityResolver() {
                // noop entity resolver, does not fetch remote content
                @Override
                public InputSource resolveEntity(String publicId, String systemId) {
                    return new InputSource(new StringReader(""));
                }
            });
-
-            db.setErrorHandler(new ErrorHandler() {
-                public void warning(SAXParseException e) throws SAXException {
-                    LOG.warn("Warning parsing XML: {}", e.toString());
+            handler.setStrictNamespace(isStrictNamespace());
+            saxParser.parse(is, handler);
+            AbstractSiteMap sitemap = handler.getSiteMap();
+            if (sitemap == null) {
+                UnknownFormatException ex = handler.getException();
+                if (ex != null) {
+                    throw ex;
                }
-
-                public void fatalError(SAXParseException e) throws SAXException {
-                    LOG.error("Fatal error parsing XML: {}", e.toString());
-                    throw e;
-                }
-
-                public void error(SAXParseException e) throws SAXException {
-                    LOG.error("Error parsing XML: {}", e.toString());
-                    throw e;
-                }
-            });
-            doc = db.parse(is);
-        } catch (Exception e) {
-            LOG.debug(e.toString(), e);
-            throw new UnknownFormatException("Error parsing XML for: " + sitemapUrl);
-        }
-
-        // See if this is a sitemap index
-        NodeList nodeList = doc.getElementsByTagNameNS("*", "sitemapindex");
-        if (nodeList.getLength() > 0) {
-            nodeList = doc.getElementsByTagNameNS("*", "sitemap");
-            return parseSitemapIndex(sitemapUrl, nodeList);
-        } else if (doc.getElementsByTagNameNS("*", "urlset").getLength() > 0) {
-            // This is a regular Sitemap
-            return parseXmlSitemap(sitemapUrl, doc);
-        } else if (doc.getElementsByTagNameNS("*", "link").getLength() > 0) {
-            // Could be RSS or Atom
-            return parseSyndicationFormat(sitemapUrl, doc);
-        }
-
-        throw new UnknownFormatException("Unknown XML format for: " + sitemapUrl);
-    }
-
-    /**
-     * Parse XML that contains a valid Sitemap. Example of a Sitemap:
-     * 
-     * <pre>
-     * {@code 
-     * <?xml version="1.0" encoding="UTF-8"?> 
-     *   <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> 
-     *     <url>
-     *       <loc>http://www.example.com/</loc> 
-     *       <lastmod>lastmod>2005-01-01</lastmod>
-     *       <changefreq>monthly</changefreq>
-     *       <priority>0.8</priority>
-     *     </url> 
-     *     <url>
-     *       <loc>http://www.example.com/catalog?item=12&amp;desc=vacation_hawaii</loc>
-     *       <changefreq>weekly</changefreq>
-     *     </url>
-     *   </urlset>
-     * }
-     * </pre>
-     * 
-     * @param sitemapUrl
-     *            a sitemap {@link java.net.URL}
-     * @param doc
-     *            a {@link org.w3c.dom.Document} sitemap snippet
-     * @return The sitemap
-     */
-    protected SiteMap parseXmlSitemap(URL sitemapUrl, Document doc) {
-
-        SiteMap sitemap = new SiteMap(sitemapUrl);
-        sitemap.setType(SitemapType.XML);
-
-        String namespace = Namespace.SITEMAP;
-        if (!strictNamespace) {
-            namespace = "*";
-        }
-
-        NodeList list = doc.getElementsByTagNameNS(namespace, "url");
-
-        // Loop through the <url>s
-        for (int i = 0; i < list.getLength(); i++) {
-
-            Node n = list.item(i);
-            if (n.getNodeType() == Node.ELEMENT_NODE) {
-                Element elem = (Element) n;
-                String lastMod = getElementValue(namespace, elem, "lastmod");
-                String changeFreq = getElementValue(namespace, elem, "changefreq");
-                String priority = getElementValue(namespace, elem, "priority");
-                String loc = getElementValue(namespace, elem, "loc");
-
-                addUrlIntoSitemap(loc, sitemap, lastMod, changeFreq, priority, i);
+                throw new UnknownFormatException("Unknown XML format for: " + sitemapUrl);
            }
-        }
-
-        sitemap.setProcessed(true);
-        return sitemap;
-    }
-
-    /**
-     * Parse XML that contains a Sitemap Index. Example Sitemap Index:
-     * 
-     * <pre>
-     * {@code
-     * <?xml version="1.0" encoding="UTF-8"?> 
-     *   <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
-     *     <sitemap>
-     *       <loc>http://www.example.com/sitemap1.xml.gz</loc>
-     *       <lastmod>2004-10-01T18:23:17+00:00</lastmod>
-     *     </sitemap>
-     *     <sitemap>
-     *       <loc>http://www.example.com/sitemap2.xml.gz</loc>
-     *       <lastmod>2005-01-01</lastmod>
-     *     </sitemap>
-     *   </sitemapindex>
-     * }
-     * </pre>
-     * 
-     * @param url
-     *            - URL of Sitemap Index
-     * @param nodeList
-     *            a {@link org.w3c.dom.NodeList} backing the sitemap
-     * @return The site map index
-     */
-    protected SiteMapIndex parseSitemapIndex(URL url, NodeList nodeList) {
-
-        LOG.debug("Parsing Sitemap Index");
-
-        SiteMapIndex sitemapIndex = new SiteMapIndex(url);
-        sitemapIndex.setType(SitemapType.INDEX);
-
-        // Loop through the <sitemap>s
-        for (int i = 0; i < nodeList.getLength() && i < MAX_URLS; i++) {
-
-            Node firstNode = nodeList.item(i);
-
-            if (firstNode.getNodeType() == Node.ELEMENT_NODE) {
-                Element elem = (Element) firstNode;
-                String loc = null;
-                String namespace = Namespace.SITEMAP;
-                if (!strictNamespace) {
-                    namespace = "*";
-                }
-                loc = getElementValue(namespace, elem, "loc");
-
-                // try the text content when no loc element
-                // has been specified
-                if (loc == null) {
-                    loc = elem.getTextContent().trim();
-                }
-
-                try {
-                    URL sitemapUrl = new URL(loc);
-                    String lastmod = getElementValue(namespace, elem, "lastmod");
-                    Date lastModified = SiteMap.convertToDate(lastmod);
-
-                    // Right now we are not worried about sitemapUrls that point
-                    // to different websites.
-
-                    SiteMap s = new SiteMap(sitemapUrl, lastModified);
-                    sitemapIndex.addSitemap(s);
-                    LOG.debug("  {}. {}", (i + 1), s);
-                } catch (MalformedURLException e) {
-                    LOG.trace("Don't create an entry with a bad URL", e);
-                    LOG.debug("Bad url: [{}]", loc);
-                }
-            }
-        }
-        sitemapIndex.setProcessed(true);
-        return sitemapIndex;
-    }
-
-    /**
-     * Parse the XML document, looking for a <b>feed</b> element to determine if
-     * it's an <b>Atom doc</b> <b>rss</b> to determine if it's an <b>RSS
-     * doc</b>.
-     * 
-     * @param sitemapUrl
-     *            the URL location of the Sitemap
-     * @param doc
-     *            - XML document to parse
-     * @return The sitemap
-     * @throws UnknownFormatException
-     *             if XML does not appear to be Atom or RSS
-     */
-    protected SiteMap parseSyndicationFormat(URL sitemapUrl, Document doc) throws UnknownFormatException {
-
-        SiteMap sitemap = new SiteMap(sitemapUrl);
-
-        // See if this is an Atom feed by looking for "feed" element
-        NodeList list = doc.getElementsByTagNameNS("*", "feed");
-        if (list.getLength() > 0) {
-            parseAtom(sitemap, (Element) list.item(0), doc);
-            sitemap.setProcessed(true);
            return sitemap;
-        } else {
-            // See if it is a RSS feed by looking for a "channel" element. This
-            // avoids the issue
-            // of having the outer tag named <rdf:RDF> that was causing this
-            // code to fail. Inside of
-            // the <rss> or <rdf> tag is a <channel> tag, so we can use that.
-            // See https://github.com/crawler-commons/crawler-commons/issues/87
-            // and also RSS 1.0 specification
-            // http://web.resource.org/rss/1.0/spec
-            list = doc.getElementsByTagNameNS("*", "channel");
-            if (list.getLength() > 0) {
-                parseRSS(sitemap, doc);
+        } catch (IOException e) {
+            LOG.warn("Error parsing sitemap {}: {}", sitemapUrl, e.getMessage());
+            UnknownFormatException ufe = new UnknownFormatException("Failed to parse " + sitemapUrl);
+            ufe.initCause(e);
+            throw ufe;
+        } catch (SAXException e) {
+            LOG.warn("Error parsing sitemap {}: {}", sitemapUrl, e.getMessage());
+            AbstractSiteMap sitemap = handler.getSiteMap();
+            if (allowPartial && sitemap != null) { // lenient mode: keep what was parsed before the error
+                LOG.warn("Processed broken/partial sitemap for '{}'", sitemapUrl);
                sitemap.setProcessed(true);
                return sitemap;
            } else {
-                throw new UnknownFormatException("Unknown syndication format at " + sitemapUrl);
+                UnknownFormatException ufe = new UnknownFormatException("Failed to parse " + sitemapUrl);
+                ufe.initCause(e);
+                throw ufe;
            }
+        } catch (ParserConfigurationException e) {
+            throw new IllegalStateException(e);
        }
    }

-    /**
-     * <p>
-     * Parse the XML document which is assumed to be in Atom format. Atom 1.0
-     * example:
-     * </p>
-     * 
-     * <pre>
-     * {@code
-     * <?xml version="1.0" encoding="utf-8"?>
-     *   <feed xmlns="http://www.w3.org/2005/Atom">
-     *     <title>Example Feed</title>
-     *     <subtitle>A subtitle.</subtitle>
-     *     <link href="http://example.org/feed/" rel="self"/>
-     *     <link href="http://example.org/"/>
-     *     <modified>2003-12-13T18:30:02Z</modified>
-     *     <author>
-     *       <name>John Doe</name>
-     *       <email>johndoe@example.com</email>
-     *     </author>
-     *     <id>urn:uuid:60a76c80-d399-11d9-b91C-0003939e0af6</id>
-     *     <entry>
-     *       <title>Atom-Powered Robots Run Amok</title>
-     *       <link href="http://example.org/2003/12/13/atom03"/>
-     *       <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
-     *       <updated>2003-12-13T18:30:02Z</updated>
-     *       <summary>Some text.</summary>
-     *     </entry>
-     *     ...
-     *   </feed>
-     * }
-     * </pre>
-     * 
-     * @param sitemap
-     *            a {@link crawlercommons.sitemaps.SiteMap} backing the Atom
-     *            feed
-     * @param elem
-     *            {@link org.w3c.dom.Element}'s to populate from the Sitemap
-     * @param doc
-     *            {@link org.w3c.dom.Document} to populate with the parse output
-     */
-    protected void parseAtom(SiteMap sitemap, Element elem, Document doc) {
-
-        // Grab items from <feed><entry><link href="URL" /></entry></feed>
-        // Use lastmod date from <feed><modified>DATE</modified></feed>
-
-        LOG.debug("Parsing Atom XML");
-
-        sitemap.setType(SitemapType.ATOM);
-
-        String lastMod = getElementValue(elem, "modified");
-        LOG.debug("lastMod = {}", lastMod);
-
-        NodeList list = doc.getElementsByTagNameNS("*", "entry");
-
-        // Loop through the <entry>s
-        for (int i = 0; i < list.getLength() && i < MAX_URLS; i++) {
-
-            Node n = list.item(i);
-            if (n.getNodeType() == Node.ELEMENT_NODE) {
-                elem = (Element) n;
-                String href = getElementAttributeValue(elem, "link", "href");
-
-                addUrlIntoSitemap(href, sitemap, lastMod, null, null, i);
-            }
-        }
-    }
-
-    /**
-     * Parse XML document which is assumed to be in RSS format. RSS 2.0 example:
-     * 
-     * <pre>
-     * {@code
-     * <?xml version="1.0"?>
-     *   <rss version="2.0">
-     *     <channel> 
-     *       <title>Lift Off News</title>
-     *       <link>http://liftoff.msfc.nasa.gov/</link>
-     *       <description>Liftoff to Space Exploration.</description>
-     *       <language>en-us</language>
-     *       <pubDate>Tue, 10 Jun 2003 04:00:00 GMT</pubDate>
-     *       <lastBuildDate>Tue, 10 Jun 2003 09:41:01 GMT</lastBuildDate>
-     *       <docs>http://blogs.law.harvard.edu/tech/rss</docs>
-     *       <generator>Weblog Editor 2.0</generator>
-     *       <managingEditor>editor@example.com</managingEditor>
-     *       <webMaster>webmaster@example.com</webMaster>
-     *       <ttl>5</ttl>
-     *       <item>
-     *         <title>Star City</title>
-     *         <link>http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp</link>
-     *         <description>How do Americans get ready to work with Russians aboard the
-     *         International Space Station? They take a crash course in culture,
-     *         language and protocol at Russia's Star City.
-     *         </description>
-     *         <pubDate>Tue, 03 Jun 2003 09:39:21 GMT</pubDate>
-     *         <guid>http://liftoff.msfc.nasa.gov/2003/06/03.html#item573</guid>
-     *       </item>
-     *       <item>
-     *         <title>Space Exploration</title>
-     *         <link>http://liftoff.msfc.nasa.gov/</link>
-     *         <description>Sky watchers in Europe, Asia, and parts of Alaska and Canada 
-     *         will experience a partial eclipse of the Sun on Saturday, May 31.
-     *         </description>
-     *         <pubDate>Fri, 30 May 2003 11:06:42 GMT</pubDate>
-     *         <guid>http://liftoff.msfc.nasa.gov/2003/05/30.html#item572</guid>
-     *       </item>
-     *     </channel>
-     *   </rss>
-     * }
-     * </pre>
-     * 
-     * @param sitemap
-     *            a {@link crawlercommons.sitemaps.SiteMap} object to populate
-     *            with the RCC content
-     * @param doc
-     *            {@link org.w3c.dom.Document} to populate with the parse output
-     */
-    protected void parseRSS(SiteMap sitemap, Document doc) {
-
-        // Grab items from <item><link>URL</link></item>
-        // and last modified date from <pubDate>DATE</pubDate>
-
-        LOG.debug("Parsing RSS doc");
-        sitemap.setType(SitemapType.RSS);
-        NodeList list = doc.getElementsByTagNameNS("*", "channel");
-        Element elem = (Element) list.item(0);
-
-        // Treat publication date as last mod (Tue, 10 Jun 2003 04:00:00 GMT)
-        String channelLastMod = AbstractSiteMap.normalizeRSSTimestamp(getElementValue(elem, "pubDate"));
-        LOG.debug("channel's lastMod = {}", channelLastMod);
-        sitemap.setLastModified(channelLastMod);
-
-        list = doc.getElementsByTagNameNS("*", "item");
-        // Loop through the <item>s
-        for (int i = 0; i < list.getLength() && i < MAX_URLS; i++) {
-
-            Node n = list.item(i);
-            if (n.getNodeType() == Node.ELEMENT_NODE) {
-                elem = (Element) n;
-                String link = getElementValue(elem, "link");
-                String itemLastMod = AbstractSiteMap.normalizeRSSTimestamp(getElementValue(elem, "pubDate"));
-
-                addUrlIntoSitemap(link, sitemap, itemLastMod, null, null, i);
-            }
-        }
-    }
-
-    /**
-     * Get the element's textual content. Find element under parent element,
-     * with namespaceURI and element local-name "elementName".
-     * 
-     * @param namespaceURI
-     * @param elem
-     * @param elementName
-     * @return The element value
-     */
-    protected String getElementValue(String namespaceURI, Element elem, String elementName) {
-
-        NodeList list = elem.getElementsByTagNameNS(namespaceURI, elementName);
-        if (list == null)
-            return null;
-        Element e = (Element) list.item(0);
-        if (e != null) {
-            return e.getTextContent();
-        }
-        return null;
-    }
-
-    /**
-     * Get the element's textual content. This will match any namespace
-     * (elementName is the localName).
-     * 
-     * @param elem
-     *            The element is a child of "elem"
-     * @param elementName
-     *            The element name is "elementName".
-     * @return The element value
-     */
-    protected String getElementValue(Element elem, String elementName) {
-
-        return getElementValue("*", elem, elementName);
-    }
-
-    /**
-     * Get the element's attribute value.
-     * 
-     * @param elem
-     * @param elementName
-     * @param attributeName
-     * @return The element attribute value
-     */
-    protected String getElementAttributeValue(Element elem, String elementName, String attributeName) {
-
-        NodeList list = elem.getElementsByTagNameNS("*", elementName);
-        Element e = (Element) list.item(0);
-        if (e != null) {
-            return e.getAttribute(attributeName);
-        }
-
-        return null;
-    }
-
    /**
     * Adds the given URL to the given sitemap while showing the relevant logs
     * 
--- a/src/main/java/crawlercommons/sitemaps/SiteMapParserSAX.java
+++ b/src/main/java/crawlercommons/sitemaps/SiteMapParserSAX.java
@ -1,515 +0,0 @@
-/**
- * Copyright 2016 Crawler-Commons
- * 
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package crawlercommons.sitemaps;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.apache.tika.mime.MediaType.APPLICATION_XML;
-import static org.apache.tika.mime.MediaType.TEXT_PLAIN;
-
-import java.io.BufferedReader;
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.io.StringReader;
-import java.net.MalformedURLException;
-import java.net.URL;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.zip.GZIPInputStream;
-
-import javax.xml.parsers.ParserConfigurationException;
-import javax.xml.parsers.SAXParser;
-import javax.xml.parsers.SAXParserFactory;
-
-import org.apache.commons.io.FilenameUtils;
-import org.apache.commons.io.IOUtils;
-import org.apache.commons.io.input.BOMInputStream;
-import org.apache.tika.Tika;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.mime.MediaTypeRegistry;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.xml.sax.EntityResolver;
-import org.xml.sax.InputSource;
-import org.xml.sax.SAXException;
-
-import crawlercommons.sitemaps.AbstractSiteMap.SitemapType;
-import crawlercommons.sitemaps.sax.DelegatorHandler;
-
-public class SiteMapParserSAX extends SiteMapParser {
-    public static final Logger LOG = LoggerFactory.getLogger(SiteMapParserSAX.class);
-
-    /**
-     * According to the specs, 50K URLs per Sitemap is the max
-     */
-    private static final int MAX_URLS = 50000;
-
-    /**
-     * Sitemaps (including sitemap index files) &quot;must be no larger than
-     * 50MB (52,428,800 bytes)&quot; as specified in the
-     * <a href="https://www.sitemaps.org/protocol.html#index">Sitemaps XML
-     * format</a> (before Nov. 2016 the limit has been 10MB).
-     */
-    public static final int MAX_BYTES_ALLOWED = 52428800;
-
-    /* Tika's MediaType components */
-    private static final Tika TIKA = new Tika();
-    private static final MediaTypeRegistry MEDIA_TYPE_REGISTRY = MediaTypeRegistry.getDefaultRegistry();
-
-    private static final List<MediaType> XML_MEDIA_TYPES = new ArrayList<>();
-    private static final List<MediaType> TEXT_MEDIA_TYPES = new ArrayList<>();
-    private static final List<MediaType> GZ_MEDIA_TYPES = new ArrayList<>();
-
-    static {
-        initMediaTypes();
-    }
-
-    /**
-     * True (by default) meaning that invalid URLs should be rejected, as the
-     * official docs allow the siteMapURLs to be only under the base url:
-     * http://www.sitemaps.org/protocol.html#location
-     */
-    protected boolean strict = true;
-
-    private boolean allowPartial = false;
-
-    /**
-     * Indicates whether the parser should work with the namespace from the
-     * specifications or any namespace. Defaults to false.
-     **/
-    protected boolean strictNamespace = false;
-
-    public SiteMapParserSAX() {
-        this(true, false);
-    }
-
-    public SiteMapParserSAX(boolean strict) {
-        this(strict, false);
-    }
-
-    public SiteMapParserSAX(boolean strict, boolean allowPartial) {
-        this.strict = strict;
-        this.allowPartial = allowPartial;
-    }
-
-    /**
-     * @return whether invalid URLs will be rejected (where invalid means that
-     *         the url is not under the base url)
-     */
-    public boolean isStrict() {
-        return strict;
-    }
-
-    /**
-     * @return whether the parser allows any namespace or just the one from the
-     *         specification
-     */
-    public boolean isStrictNamespace() {
-        return strictNamespace;
-    }
-
-    /**
-     * Sets the parser to allow any namespace or just the one from the
-     * specification
-     */
-    public void setStrictNamespace(boolean s) {
-        strictNamespace = s;
-    }
-
-    /**
-     * Returns a SiteMap or SiteMapIndex given an online sitemap URL
-     *
-     * Please note that this method is a static method which goes online and
-     * fetches the sitemap then parses it
-     *
-     * This method is a convenience method for a user who has a sitemap URL and
-     * wants a "Keep it simple" way to parse it.
-     * 
-     * @param onlineSitemapUrl
-     *            URL of the online sitemap
-     * @return Extracted SiteMap/SiteMapIndex or null if the onlineSitemapUrl is
-     *         null
-     * @throws UnknownFormatException
-     *             if there is an error parsing the sitemap
-     * @throws IOException
-     *             if there is an error reading in the site map
-     *             {@link java.net.URL}
-     */
-    public AbstractSiteMap parseSiteMap(URL onlineSitemapUrl) throws UnknownFormatException, IOException {
-        if (onlineSitemapUrl == null) {
-            return null;
-        }
-        byte[] bytes = IOUtils.toByteArray(onlineSitemapUrl);
-        return parseSiteMap(bytes, onlineSitemapUrl);
-    }
-
-    /**
-     * Returns a processed copy of an unprocessed sitemap object, i.e. transfer
-     * the value of getLastModified(). Please note that the sitemap input stays
-     * unchanged. Note that contentType is assumed to be correct; in general it
-     * is more robust to use the method that doesn't take a contentType, but
-     * instead detects this using Tika.
-     * 
-     * @param contentType
-     *            MIME type of content
-     * @param content
-     *            raw bytes of sitemap file
-     * @param sitemap
-     *            an {@link crawlercommons.sitemaps.AbstractSiteMap}
-     *            implementation
-     * @return Extracted SiteMap/SiteMapIndex
-     * @throws UnknownFormatException
-     *             if there is an error parsing the sitemap
-     * @throws IOException
-     *             if there is an error reading in the site map
-     *             {@link java.net.URL}
-     */
-    public AbstractSiteMap parseSiteMap(String contentType, byte[] content, final AbstractSiteMap sitemap) throws UnknownFormatException, IOException {
-        AbstractSiteMap asmCopy = parseSiteMap(contentType, content, sitemap.getUrl());
-        asmCopy.setLastModified(sitemap.getLastModified());
-        return asmCopy;
-    }
-
-    /**
-     * Parse a sitemap, given the content bytes and the URL.
-     * 
-     * @param content
-     *            raw bytes of sitemap file
-     * @param url
-     *            URL to sitemap file
-     * @return Extracted SiteMap/SiteMapIndex
-     * @throws UnknownFormatException
-     *             if there is an error parsing the sitemap
-     * @throws IOException
-     *             if there is an error reading in the site map
-     *             {@link java.net.URL}
-     */
-    public AbstractSiteMap parseSiteMap(byte[] content, URL url) throws UnknownFormatException, IOException {
-        if (url == null) {
-            return null;
-        }
-        String filename = FilenameUtils.getName(url.getPath());
-        String contentType = TIKA.detect(content, filename);
-        return parseSiteMap(contentType, content, url);
-    }
-
-    /**
-     * Parse a sitemap, given the MIME type, the content bytes, and the URL.
-     * Note that contentType is assumed to be correct; in general it is more
-     * robust to use the method that doesn't take a contentType, but instead
-     * detects this using Tika.
-     * 
-     * @param contentType
-     *            MIME type of content
-     * @param content
-     *            raw bytes of sitemap file
-     * @param url
-     *            URL to sitemap file
-     * @return Extracted SiteMap/SiteMapIndex
-     * @throws UnknownFormatException
-     *             if there is an error parsing the sitemap
-     * @throws IOException
-     *             if there is an error reading in the site map
-     *             {@link java.net.URL}
-     */
-    public AbstractSiteMap parseSiteMap(String contentType, byte[] content, URL url) throws UnknownFormatException, IOException {
-        MediaType mediaType = MediaType.parse(contentType);
-
-        // Octet-stream is the father of all binary types
-        while (mediaType != null && !mediaType.equals(MediaType.OCTET_STREAM)) {
-            if (XML_MEDIA_TYPES.contains(mediaType)) {
-                return processXml(url, content);
-            } else if (TEXT_MEDIA_TYPES.contains(mediaType)) {
-                return processText(url, content);
-            } else if (GZ_MEDIA_TYPES.contains(mediaType)) {
-                InputStream decompressed;
-                MediaType embeddedType;
-                try {
-                    decompressed = new GZIPInputStream(new ByteArrayInputStream(content));
-                    embeddedType = MediaType.parse(TIKA.detect(decompressed));
-                } catch (Exception e) {
-                    UnknownFormatException err = new UnknownFormatException("Failed to detect embedded MediaType of gzipped sitemap: " + url + ", caused by " + e);
-                    err.initCause(e);
-                    throw err;
-                }
-                if (XML_MEDIA_TYPES.contains(embeddedType)) {
-                    return processGzippedXML(url, content);
-                } else if (TEXT_MEDIA_TYPES.contains(embeddedType)) {
-                    // re-open decompressed stream and parse as text
-                    decompressed = new GZIPInputStream(new ByteArrayInputStream(content));
-                    return processText(url, decompressed);
-                } else if (GZ_MEDIA_TYPES.contains(embeddedType)) {
-                    throw new UnknownFormatException("Can't parse gzip recursively: " + url);
-                }
-                throw new UnknownFormatException("Can't parse a gzipped sitemap with the embedded MediaType of: " + embeddedType + " (at: " + url + ")");
-            }
-            mediaType = MEDIA_TYPE_REGISTRY.getSupertype(mediaType); // Check
-                                                                     // parent
-        }
-
-        throw new UnknownFormatException("Can't parse a sitemap with the MediaType of: " + contentType + " (at: " + url + ")");
-    }
-
-    /**
-     * Parse the given XML content.
-     * 
-     * @param sitemapUrl
-     *            URL to sitemap file
-     * @param xmlContent
-     *            the byte[] backing the sitemapUrl
-     * @return The site map
-     * @throws UnknownFormatException
-     *             if there is an error parsing the sitemap
-     */
-    protected AbstractSiteMap processXml(URL sitemapUrl, byte[] xmlContent) throws UnknownFormatException {
-
-        BOMInputStream bomIs = new BOMInputStream(new ByteArrayInputStream(xmlContent));
-        InputSource is = new InputSource();
-        is.setCharacterStream(new BufferedReader(new InputStreamReader(bomIs, UTF_8)));
-
-        return processXml(sitemapUrl, is);
-    }
-
-    /**
-     * Process a text-based Sitemap. Text sitemaps only list URLs but no
-     * priorities, last mods, etc.
-     * 
-     * @param sitemapUrl
-     *            URL to sitemap file
-     * @param content
-     *            the byte[] backing the sitemapUrl
-     * @return The site map
-     * @throws IOException
-     *             if there is an error reading in the site map content
-     */
-    protected SiteMap processText(URL sitemapUrl, byte[] content) throws IOException {
-        return processText(sitemapUrl, new ByteArrayInputStream(content));
-    }
-
-    /**
-     * Process a text-based Sitemap. Text sitemaps only list URLs but no
-     * priorities, last mods, etc.
-     *
-     * @param sitemapUrl
-     *            URL to sitemap file
-     * @param stream
-     *            content stream
-     * @return The site map
-     * @throws IOException
-     *             if there is an error reading in the site map content
-     */
-    protected SiteMap processText(URL sitemapUrl, InputStream stream) throws IOException {
-        LOG.debug("Processing textual Sitemap");
-
-        SiteMap textSiteMap = new SiteMap(sitemapUrl);
-        textSiteMap.setType(SitemapType.TEXT);
-
-        BOMInputStream bomIs = new BOMInputStream(stream);
-        @SuppressWarnings("resource")
-        BufferedReader reader = new BufferedReader(new InputStreamReader(bomIs, UTF_8));
-
-        String line;
-        int i = 1;
-        while ((line = reader.readLine()) != null) {
-            if (line.length() > 0 && i <= MAX_URLS) {
-                addUrlIntoSitemap(line, textSiteMap, null, null, null, i++);
-            }
-        }
-        textSiteMap.setProcessed(true);
-
-        return textSiteMap;
-    }
-
-    /**
-     * Decompress the gzipped content and process the resulting XML Sitemap.
-     * 
-     * @param url
-     *            - URL of the gzipped content
-     * @param response
-     *            - Gzipped content
-     * @return the site map
-     * @throws UnknownFormatException
-     *             if there is an error parsing the gzip
-     * @throws IOException
-     *             if there is an error reading in the gzip {@link java.net.URL}
-     */
-    protected AbstractSiteMap processGzippedXML(URL url, byte[] response) throws IOException, UnknownFormatException {
-
-        LOG.debug("Processing gzipped XML");
-
-        InputStream is = new ByteArrayInputStream(response);
-
-        // Remove .gz ending
-        String xmlUrl = url.toString().replaceFirst("\\.gz$", "");
-        LOG.debug("XML url = {}", xmlUrl);
-
-        BOMInputStream decompressed = new BOMInputStream(new GZIPInputStream(is));
-        InputSource in = new InputSource(decompressed);
-        in.setSystemId(xmlUrl);
-        return processXml(url, in);
-    }
-
-    /**
-     * Parse the given XML content.
-     * 
-     * @param sitemapUrl
-     *            a sitemap {@link java.net.URL}
-     * @param is
-     *            an {@link org.xml.sax.InputSource} backing the sitemap
-     * @return the site map
-     * @throws UnknownFormatException
-     *             if there is an error parsing the
-     *             {@link org.xml.sax.InputSource}
-     */
-    protected AbstractSiteMap processXml(URL sitemapUrl, InputSource is) throws UnknownFormatException {
-
-        SAXParserFactory factory = SAXParserFactory.newInstance();
-
-        // disable validation and avoid that remote DTDs, schemas, etc. are
-        // fetched
-        factory.setValidating(false);
-        factory.setXIncludeAware(false);
-
-        // support the use of an explicit namespace.
-        factory.setNamespaceAware(true);
-
-        try {
-            factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
-        } catch (Exception e) {
-            throw new RuntimeException("Failed to configure XML parser: " + e.toString());
-        }
-        DelegatorHandler handler = new DelegatorHandler(sitemapUrl, strict);
-        try {
-            SAXParser saxParser = factory.newSAXParser();
-            saxParser.getXMLReader().setEntityResolver(new EntityResolver() {
-                // noop entity resolver, does not fetch remote content
-                @Override
-                public InputSource resolveEntity(String publicId, String systemId) {
-                    return new InputSource(new StringReader(""));
-                }
-            });
-            handler.setStrictNamespace(isStrictNamespace());
-            saxParser.parse(is, handler);
-            AbstractSiteMap sitemap = handler.getSiteMap();
-            if (sitemap == null) {
-                UnknownFormatException ex = handler.getException();
-                if (ex != null) {
-                    throw ex;
-                }
-                throw new UnknownFormatException("Unknown XML format for: " + sitemapUrl);
-            }
-            return sitemap;
-        } catch (IOException e) {
-            LOG.warn("Error parsing sitemap {}: {}", sitemapUrl, e.getMessage());
-            UnknownFormatException ufe = new UnknownFormatException("Failed to parse " + sitemapUrl);
-            ufe.initCause(e);
-            throw ufe;
-        } catch (SAXException e) {
-            LOG.warn("Error parsing sitemap {}: {}", sitemapUrl, e.getMessage());
-            AbstractSiteMap sitemap = handler.getSiteMap();
-            if (allowPartial && sitemap != null) {
-                LOG.warn("Processed broken/partial sitemap for '" + sitemapUrl + "'");
-                sitemap.setProcessed(true);
-                return sitemap;
-            } else {
-                UnknownFormatException ufe = new UnknownFormatException("Failed to parse " + sitemapUrl);
-                ufe.initCause(e);
-                throw ufe;
-            }
-        } catch (ParserConfigurationException e) {
-            throw new IllegalStateException(e);
-        }
-    }
-
-    /**
-     * Adds the given URL to the given sitemap while showing the relevant logs
-     * 
-     * @param urlStr
-     *            an URL string to add to the
-     *            {@link crawlercommons.sitemaps.SiteMap}
-     * @param siteMap
-     *            the sitemap to add URL(s) to
-     * @param lastMod
-     *            last time the {@link crawlercommons.sitemaps.SiteMapURL} was
-     *            modified
-     * @param changeFreq
-     *            the {@link crawlercommons.sitemaps.SiteMapURL} change frquency
-     * @param priority
-     *            priority of this {@link crawlercommons.sitemaps.SiteMapURL}
-     * @param urlIndex
-     *            index position to which this entry has been added
-     */
-    protected void addUrlIntoSitemap(String urlStr, SiteMap siteMap, String lastMod, String changeFreq, String priority, int urlIndex) {
-        try {
-            URL url = new URL(urlStr); // Checking the URL
-            boolean valid = urlIsValid(siteMap.getBaseUrl(), url.toString());
-
-            if (valid || !strict) {
-                SiteMapURL sUrl = new SiteMapURL(url.toString(), lastMod, changeFreq, priority, valid);
-                siteMap.addSiteMapUrl(sUrl);
-                LOG.debug("  {}. {}", urlIndex + 1, sUrl);
-            } else {
-                LOG.warn("URL: {} is excluded from the sitemap as it is not a valid url = not under the base url: {}", url.toExternalForm(), siteMap.getBaseUrl());
-            }
-        } catch (MalformedURLException e) {
-            LOG.warn("Bad url: [{}]", urlStr);
-            LOG.trace("Can't create a sitemap entry with a bad URL", e);
-        }
-    }
-
-    /**
-     * See if testUrl is under sitemapBaseUrl. Only URLs under sitemapBaseUrl
-     * are valid.
-     * 
-     * @param sitemapBaseUrl
-     * @param testUrl
-     * @return true if testUrl is under sitemapBaseUrl, false otherwise
-     */
-    public static boolean urlIsValid(String sitemapBaseUrl, String testUrl) {
-        boolean ret = false;
-
-        // Don't try a comparison if the URL is too short to match
-        if (sitemapBaseUrl != null && sitemapBaseUrl.length() <= testUrl.length()) {
-            String u = testUrl.substring(0, sitemapBaseUrl.length());
-            ret = sitemapBaseUrl.equals(u);
-        }
-
-        return ret;
-    }
-
-    /**
-     * Performs a one time intialization of Tika's Media-Type components and
-     * media type collection constants <br/>
-     * Please note that this is a private static method which is called once per
-     * CLASS (not per instance / object)
-     */
-    private static void initMediaTypes() {
-        /* XML media types (and all aliases) */
-        XML_MEDIA_TYPES.add(APPLICATION_XML);
-        XML_MEDIA_TYPES.addAll(MEDIA_TYPE_REGISTRY.getAliases(APPLICATION_XML));
-
-        /* TEXT media types (and all aliases) */
-        TEXT_MEDIA_TYPES.add(TEXT_PLAIN);
-        TEXT_MEDIA_TYPES.addAll(MEDIA_TYPE_REGISTRY.getAliases(TEXT_PLAIN));
-
-        /* GZIP media types (and all aliases) */
-        MediaType gzipMediaType = MediaType.parse("application/gzip");
-        GZ_MEDIA_TYPES.add(gzipMediaType);
-        GZ_MEDIA_TYPES.addAll(MEDIA_TYPE_REGISTRY.getAliases(gzipMediaType));
-    }
-}
--- a/src/main/java/crawlercommons/sitemaps/SiteMapTester.java
+++ b/src/main/java/crawlercommons/sitemaps/SiteMapTester.java
@ -31,8 +31,7 @@ import org.slf4j.LoggerFactory;
 public class SiteMapTester {

    private static final Logger LOG = LoggerFactory.getLogger(SiteMapTester.class);
-    private static SiteMapParser parser = new SiteMapParser(false);
-    private static SiteMapParser saxParser = new SiteMapParserSAX(false, true);
+    private static SiteMapParser saxParser = new SiteMapParser(false, true);

    public static void main(String[] args) throws IOException, UnknownFormatException {
        if (args.length < 1) {
@ -43,7 +42,6 @@ public class SiteMapTester {
            LOG.error("  MIME_TYPE    force processing sitemap as MIME type,");
            LOG.error("               bypass automatic MIME type detection");
            LOG.error("Java properties:");
-            LOG.error("  sitemap.useSax  if true use SAX parser to process sitemaps");
            LOG.error("  sitemap.strictNamespace");
            LOG.error("                  if true sitemaps are required to use the standard namespace URI");
        } else {
@ -61,23 +59,17 @@ public class SiteMapTester {
    private static void parse(URL url, String mt) throws IOException, UnknownFormatException {
        byte[] content = IOUtils.toByteArray(url);

-        boolean useSaxParser = new Boolean(System.getProperty("sitemap.useSax"));
+        LOG.info("Parsing {} {}", url, ((mt != null && !mt.isEmpty()) ? "as MIME type " + mt : ""));
+
        boolean strictNamespace = new Boolean(System.getProperty("sitemap.strictNamespace"));
-
-        LOG.info("Parsing {} {} using {} parser", url, ((mt != null && !mt.isEmpty()) ? "as MIME type " + mt : ""), (useSaxParser ? "SAX" : "DOM"));
-
-        SiteMapParser p = parser;
-        if (useSaxParser) {
-            p = saxParser;
-        }
-        p.setStrictNamespace(strictNamespace);
+        saxParser.setStrictNamespace(strictNamespace);

        AbstractSiteMap sm = null;
        // guesses the mimetype
        if (mt == null || mt.equals("")) {
-            sm = p.parseSiteMap(content, url);
+            sm = saxParser.parseSiteMap(content, url);
        } else {
-            sm = p.parseSiteMap(mt, content, url);
+            sm = saxParser.parseSiteMap(mt, content, url);
        }

        if (sm.isIndex()) {
--- a/src/test/java/crawlercommons/domains/EffectiveTldFinderTest.java
+++ b/src/test/java/crawlercommons/domains/EffectiveTldFinderTest.java
@ -177,7 +177,8 @@ public class EffectiveTldFinderTest {
        assertEquals("xn--80abbembcyvesfij3at4loa4ff.xn--p1ai", ad);
        // rare but possible mixed use of UTF-8 and Punycode
        ad = EffectiveTldFinder.getAssignedDomain("xn--90a1af.бесплатныеобъявления.рф");
-        // assertEquals("xn--80abbembcyvesfij3at4loa4ff.xn--p1ai", ad); // TODO #179
+        // TODO #179
+        // assertEquals("xn--80abbembcyvesfij3at4loa4ff.xn--p1ai", ad);
    }

    @Test
--- a/src/test/java/crawlercommons/sitemaps/SiteMapParserSAXTest.java
+++ b/src/test/java/crawlercommons/sitemaps/SiteMapParserSAXTest.java
@ -1,534 +0,0 @@
-/**
- * Copyright 2016 Crawler-Commons
- * 
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package crawlercommons.sitemaps;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.net.URL;
-import java.text.SimpleDateFormat;
-import java.util.Date;
-import java.util.Iterator;
-import java.util.Locale;
-
-import org.apache.commons.io.IOUtils;
-import org.junit.After;
-import org.junit.Before;
-import org.junit.Test;
-import org.junit.runner.RunWith;
-import org.junit.runners.JUnit4;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import crawlercommons.sitemaps.AbstractSiteMap.SitemapType;
-
-import static org.junit.Assert.*;
-
-@RunWith(JUnit4.class)
-public class SiteMapParserSAXTest {
-
-    private static final Logger LOG = LoggerFactory.getLogger(SiteMapParserSAXTest.class);
-
-    @Before
-    public void setUp() throws Exception {
-    }
-
-    @After
-    public void tearDown() throws Exception {
-    }
-
-    @Test
-    public void testSitemapIndex() throws UnknownFormatException, IOException {
-        SiteMapParser parser = new SiteMapParserSAX();
-        String contentType = "text/xml";
-        StringBuilder scontent = new StringBuilder(1024);
-        scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n").append("<sitemapindex xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n").append(" <sitemap>\n")
-                        .append("  <loc>http://www.example.com/sitemap1.xml.gz</loc>\n").append("  <lastmod><![CDATA[2004-10-01T18:23:17+00:00]]></lastmod>\n").append(" </sitemap>\n")
-                        .append("<sitemap>\n").append("  <loc>http://www.example.com/sitemap2.xml.gz</loc>\n").append("  <lastmod>2005-01-01</lastmod>\n").append(" </sitemap>\n")
-                        .append("<sitemap>\n").append("  <loc>http://www.example.com/dynsitemap?date=now&amp;all=true</loc>\n").append(" </sitemap>\n").append("<sitemap>\n")
-                        .append("  <loc>http://www.example.com/dynsitemap<![CDATA[?date=lastyear&all=false]]></loc>\n").append(" </sitemap>\n").append("</sitemapindex>");
-        byte[] content = scontent.toString().getBytes(UTF_8);
-        URL url = new URL("http://www.example.com/sitemapindex.xml");
-
-        AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
-        assertEquals(true, asm.isIndex());
-        assertEquals(true, asm instanceof SiteMapIndex);
-
-        SiteMapIndex smi = (SiteMapIndex) asm;
-        assertEquals(4, smi.getSitemaps().size());
-
-        AbstractSiteMap currentSiteMap = smi.getSitemap(new URL("http://www.example.com/sitemap1.xml.gz"));
-        assertNotNull(currentSiteMap);
-        assertEquals("http://www.example.com/sitemap1.xml.gz", currentSiteMap.getUrl().toString());
-        assertEquals(SiteMap.convertToDate("2004-10-01T18:23:17+00:00"), currentSiteMap.getLastModified());
-
-        assertTrue(currentSiteMap.toString().contains("T18:23"));
-
-        currentSiteMap = smi.getSitemap(new URL("http://www.example.com/sitemap2.xml.gz"));
-        assertNotNull(currentSiteMap);
-        assertEquals("http://www.example.com/sitemap2.xml.gz", currentSiteMap.getUrl().toString());
-        assertEquals(SiteMap.convertToDate("2005-01-01"), currentSiteMap.getLastModified());
-
-        currentSiteMap = smi.getSitemap(new URL("http://www.example.com/dynsitemap?date=now&all=true"));
-        assertNotNull("<loc> with entities not found", currentSiteMap);
-        assertEquals("http://www.example.com/dynsitemap?date=now&all=true", currentSiteMap.getUrl().toString());
-
-        currentSiteMap = smi.getSitemap(new URL("http://www.example.com/dynsitemap?date=lastyear&all=false"));
-        assertNotNull("<loc> with CDATA not found", currentSiteMap);
-        assertEquals("http://www.example.com/dynsitemap?date=lastyear&all=false", currentSiteMap.getUrl().toString());
-    }
-
-    @Test
-    public void testSitemapWithNamespace() throws UnknownFormatException, IOException {
-        SiteMapParser parser = new SiteMapParserSAX();
-        parser.setStrictNamespace(true);
-        byte[] content = getResourceAsBytes("src/test/resources/sitemaps/sitemap.ns.xml");
-
-        URL url = new URL("http://www.example.com/sitemap.ns.xml");
-        AbstractSiteMap asm = parser.parseSiteMap(content, url);
-        assertEquals(SitemapType.XML, asm.getType());
-        assertEquals(true, asm instanceof SiteMap);
-        assertEquals(true, asm.isProcessed());
-        SiteMap sm = (SiteMap) asm;
-
-        assertEquals(2, sm.getSiteMapUrls().size());
-        assertEquals(SiteMapURL.ChangeFrequency.DAILY, sm.getSiteMapUrls().iterator().next().getChangeFrequency());
-    }
-
-    @Test
-    public void testSitemapWithWrongNamespace() throws UnknownFormatException, IOException {
-        SiteMapParser parser = new SiteMapParserSAX();
-        parser.setStrictNamespace(true);
-
-        byte[] content = getResourceAsBytes("src/test/resources/sitemaps/sitemap.badns.xml");
-
-        URL url = new URL("http://www.example.com/sitemap.badns.xml");
-        AbstractSiteMap asm;
-        try {
-            asm = parser.parseSiteMap(content, url);
-            fail("Expected an UnknownFormatException because of wrong namespace");
-        } catch (UnknownFormatException e) {
-            assertTrue(e.getMessage().contains("does not match standard namespace"));
-        }
-
-        // try again in lenient mode
-        parser.setStrictNamespace(false);
-        asm = parser.parseSiteMap(content, url);
-        assertEquals(SitemapType.XML, asm.getType());
-        assertEquals(true, asm instanceof SiteMap);
-        assertEquals(true, asm.isProcessed());
-        SiteMap sm = (SiteMap) asm;
-
-        assertEquals(2, sm.getSiteMapUrls().size());
-    }
-
-    @Test
-    public void testFullDateFormat() {
-        SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm+hh:00", Locale.ROOT);
-        Date date = new Date();
-        LOG.info(format.format(date));
-        LOG.info(SiteMap.getFullDateFormat().format(date));
-    }
-
-    @Test
-    public void testSitemapTXT() throws UnknownFormatException, IOException {
-        SiteMapParser parser = new SiteMapParserSAX();
-        String contentType = "text/plain";
-        String scontent = "http://www.example.com/catalog?item=1\nhttp://www.example.com/catalog?item=11";
-        byte[] content = scontent.getBytes(UTF_8);
-        URL url = new URL("http://www.example.com/sitemap.txt");
-
-        AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
-        assertEquals(false, asm.isIndex());
-        assertEquals(true, asm instanceof SiteMap);
-
-        SiteMap sm = (SiteMap) asm;
-        assertEquals(2, sm.getSiteMapUrls().size());
-    }
-
-    @Test
-    public void testSitemapTXTWithXMLExt() throws UnknownFormatException, IOException {
-        SiteMapParser parser = new SiteMapParserSAX();
-        String scontent = "http://www.example.com/catalog?item=1\nhttp://www.example.com/catalog?item=11";
-        byte[] content = scontent.getBytes(UTF_8);
-        URL url = new URL("http://www.example.com/sitemap.xml");
-        String contentType = "text/plain";
-
-        AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
-        assertEquals(false, asm.isIndex());
-        assertEquals(true, asm instanceof SiteMap);
-
-        SiteMap sm = (SiteMap) asm;
-        assertEquals(2, sm.getSiteMapUrls().size());
-    }
-
-    @Test
-    public void testSitemapXML() throws UnknownFormatException, IOException {
-        SiteMapParser parser = new SiteMapParserSAX();
-        String contentType = "text/xml";
-        byte[] content = getXMLSitemapAsBytes();
-        URL url = new URL("http://www.example.com/sitemap.xml");
-
-        AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
-        assertEquals(false, asm.isIndex());
-        assertEquals(true, asm instanceof SiteMap);
-
-        SiteMap sm = (SiteMap) asm;
-        assertEquals(5, sm.getSiteMapUrls().size());
-
-        SiteMapURL[] found = sm.getSiteMapUrls().toArray(new SiteMapURL[5]);
-        for (int i = 0; i < found.length; i++) {
-            assertEquals(sitemapURLs[i].replaceAll("&amp;", "&"), found[i].getUrl().toExternalForm());
-        }
-    }
-
-    @Test
-    public void testSitemapXMLMediaTypes() throws UnknownFormatException, IOException {
-        SiteMapParser parser = new SiteMapParserSAX();
-        byte[] content = getXMLSitemapAsBytes();
-        URL url = new URL("http://www.example.com/sitemap.nonXmlExt");
-
-        final String[] XML_CONTENT_TYPES = new String[] { "text/xml", "application/x-xml", "application/xml", "application/atom+xml", "application/rss+xml" };
-        for (String contentType : XML_CONTENT_TYPES) {
-            AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
-            assertEquals(false, asm.isIndex());
-            assertEquals(true, asm instanceof SiteMap);
-            SiteMap sm = (SiteMap) asm;
-            assertEquals(5, sm.getSiteMapUrls().size());
-            SiteMapURL[] found = sm.getSiteMapUrls().toArray(new SiteMapURL[5]);
-            for (int i = 0; i < found.length; i++) {
-                assertEquals(sitemapURLs[i].replaceAll("&amp;", "&"), found[i].getUrl().toExternalForm());
-            }
-        }
-    }
-
-    /**
-     * This Sitemap contains badly formatted XML and can't be read
-     */
-    @Test(expected = UnknownFormatException.class)
-    public void testSitemapParserBrokenXml() throws IOException, UnknownFormatException {
-        SiteMapParser parser = new SiteMapParserSAX();
-        String contentType = "text/xml";
-        StringBuilder scontent = new StringBuilder(1024);
-        scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>").append("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">")
-                        .append("<url><!-- This file is not a valid XML file --></url>").append("<url><loc> http://cs.harding.edu/fmccown/sitemaps/something.html</loc>")
-                        .append("</url><!-- missing opening url tag --></url></urlset>");
-        byte[] content = scontent.toString().getBytes(UTF_8);
-        URL url = new URL("http://www.example.com/sitemapindex.xml");
-
-        parser.parseSiteMap(contentType, content, url); // This Sitemap contains
-                                                        // badly formatted XML
-                                                        // and can't be read
-    }
-
-    @Test
-    public void testMissingLocSitemapIndexFile() throws UnknownFormatException, IOException {
-        SiteMapParser parser = new SiteMapParserSAX();
-        byte[] content = getResourceAsBytes("src/test/resources/sitemaps/sitemap.index.xml");
-
-        URL url = new URL("http://www.example.com/sitemap.index.xml");
-        AbstractSiteMap asm = parser.parseSiteMap(content, url);
-        assertEquals(true, asm.isIndex());
-        assertEquals(true, asm instanceof SiteMapIndex);
-        SiteMapIndex sm = (SiteMapIndex) asm;
-        assertEquals(15, sm.getSitemaps().size());
-        String sitemap = "https://example.com/sitemap.jss?portalCode=10260&lang=en";
-        assertNotNull("Sitemap " + sitemap + " not found in sitemap index", sm.getSitemap(new URL(sitemap)));
-    }
-
-    @Test
-    public void testSitemapGZ() throws UnknownFormatException, IOException {
-        SiteMapParser parser = new SiteMapParserSAX();
-        String contentType = "application/gzip";
-        byte[] content = getResourceAsBytes("src/test/resources/sitemaps/xmlSitemap.gz");
-
-        URL url = new URL("http://www.example.com/sitemap.xml.gz");
-        AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
-        assertEquals(false, asm.isIndex());
-        assertEquals(true, asm instanceof SiteMap);
-        SiteMap sm = (SiteMap) asm;
-        assertEquals(5, sm.getSiteMapUrls().size());
-    }
-
-    @Test
-    public void testSitemapTextGZ() throws UnknownFormatException, IOException {
-        SiteMapParser parser = new SiteMapParserSAX();
-        String contentType = "application/gzip";
-        byte[] content = this.getResourceAsBytes("src/test/resources/sitemaps/sitemap.txt.gz");
-
-        URL url = new URL("http://www.example.com/sitemap.txt.gz");
-        AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
-        assertEquals(false, asm.isIndex());
-        assertEquals(true, asm instanceof SiteMap);
-        SiteMap sm = (SiteMap) asm;
-        assertEquals(5, sm.getSiteMapUrls().size());
-    }
-
-    @Test
-    public void testSitemapGZMediaTypes() throws UnknownFormatException, IOException {
-        SiteMapParser parser = new SiteMapParserSAX();
-        byte[] content = getResourceAsBytes("src/test/resources/sitemaps/xmlSitemap.gz");
-
-        final String[] GZ_CONTENT_TYPES = new String[] { "application/gzip", "application/x-gzip", "application/x-gunzip", "application/gzipped", "application/gzip-compressed", "gzip/document" };
-        for (String contentType : GZ_CONTENT_TYPES) {
-            URL url = new URL("http://www.example.com/sitemap");
-            AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
-            assertEquals(false, asm.isIndex());
-            assertEquals(true, asm instanceof SiteMap);
-            SiteMap sm = (SiteMap) asm;
-            assertEquals(5, sm.getSiteMapUrls().size());
-        }
-    }
-
-    @Test(expected = UnknownFormatException.class)
-    public void testSitemapWithOctetMediaType() throws UnknownFormatException, IOException {
-        SiteMapParser parser = new SiteMapParserSAX();
-        String contentType = "application/octet-stream";
-        byte[] content = getXMLSitemapAsBytes();
-        URL url = new URL("http://www.example.com/sitemap");
-
-        AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
-        assertEquals(false, asm.isIndex());
-        assertEquals(true, asm instanceof SiteMap);
-
-        SiteMap sm = (SiteMap) asm;
-        assertEquals(5, sm.getSiteMapUrls().size());
-
-        SiteMapURL[] found = sm.getSiteMapUrls().toArray(new SiteMapURL[5]);
-        for (int i = 0; i < found.length; i++) {
-            assertEquals(sitemapURLs[i], found[i].getUrl().toExternalForm());
-        }
-    }
-
-    @Test
-    public void testLenientParser() throws UnknownFormatException, IOException {
-        SiteMapParser parser = new SiteMapParserSAX();
-        String contentType = "text/xml";
-        StringBuilder scontent = new StringBuilder(1024);
-        scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>").append("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">").append("<url>")
-                        .append("<loc>http://www.example.com/</loc>").append("</url>").append("</urlset>");
-        byte[] content = scontent.toString().getBytes(UTF_8);
-
-        URL url = new URL("http://www.example.com/subsection/sitemap.xml");
-        AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
-        assertEquals(false, asm.isIndex());
-        assertEquals(true, asm instanceof SiteMap);
-
-        SiteMap sm = (SiteMap) asm;
-        assertEquals(0, sm.getSiteMapUrls().size());
-
-        // Now try again with lenient parsing. We should get one invalid URL
-        parser = new SiteMapParserSAX(false);
-        asm = parser.parseSiteMap(contentType, content, url);
-        assertEquals(false, asm.isIndex());
-        assertEquals(true, asm instanceof SiteMap);
-
-        sm = (SiteMap) asm;
-        assertEquals(1, sm.getSiteMapUrls().size());
-        assertFalse(sm.getSiteMapUrls().iterator().next().isValid());
-    }
-
-    @Test
-    public void testAtomFormat() throws UnknownFormatException, IOException {
-        SiteMapParser parser = new SiteMapParserSAX();
-        byte[] content = getResourceAsBytes("src/test/resources/sitemaps/atom.xml");
-        URL url = new URL("http://example.org/atom.xml");
-
-        SiteMap sm = (SiteMap) parser.parseSiteMap(content, url);
-        assertEquals(1, sm.getSiteMapUrls().size());
-        assertEquals(new URL("http://example.org/2003/12/13/atom03"), sm.getSiteMapUrls().iterator().next().getUrl());
-    }
-
-    /**
-     * Test processing RSS 1.0 sitemaps, which don't have an <rss> tag. E.g.
-     * http://rss.slashdot.org/slashdot/slashdotMain?format=xml
-     * 
-     * See https://github.com/crawler-commons/crawler-commons/issues/87
-     * 
-     * @throws IOException
-     * @throws UnknownFormatException
-     */
-    @Test
-    public void testRSS10SyndicationFormat() throws UnknownFormatException, IOException {
-        SiteMapParser parser = new SiteMapParserSAX();
-
-        String contentType = "text/xml";
-        URL url = new URL("http://www.example.com/sitemapindex.xml");
-        StringBuilder scontent = new StringBuilder(1024);
-        scontent.append("<?xml version=\"1.0\"?>")
-                        .append("<rdf:RDF xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"  xmlns=\"http://purl.org/rss/1.0/\">")
-                        .append("<channel rdf:about=\"http://www.xml.com/xml/news.rss\">")
-                        .append("<title>XML.com</title>")
-                        .append("<link>http://www.example.com/pub</link>")
-                        .append("<description>example.com</description>")
-                        .append("<image rdf:resource=\"http://www.example.com/universal/images/xml_tiny.gif\" />")
-                        .append("<items><rdf:Seq><rdf:li resource=\"http://www.example.com/pub/2000/08/09/xslt/xslt.html\" />")
-                        .append("<rdf:li resource=\"http://www.example.com/pub/2000/08/09/rdfdb/index.html\" /></rdf:Seq></items></channel>")
-                        .append("<image rdf:about=\"http://www.example.com/universal/images/xml_tiny.gif\"><title>XML.com</title><link>http://www.xml.com</link>")
-                        .append("<url>http://www.example.com/universal/images/xml_tiny.gif</url></image>")
-                        .append("<item rdf:about=\"http://www.example.com/pub/2000/08/09/xslt/xslt.html\"><title>Processing Inclusions with XSLT</title>")
-                        .append("<link>http://www.example.com/pub/2000/08/09/xslt/xslt.html</link>")
-                        .append("<description>Processing document inclusions with general XML tools can be problematic. This article proposes a way of preserving inclusion"
-                                        + "information through SAX-based processing. </description> </item> </rdf:RDF>");
-        byte[] content = scontent.toString().getBytes(UTF_8);
-        AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
-        assertEquals(false, asm.isIndex());
-        assertEquals(true, asm instanceof SiteMap);
-
-        SiteMap sm = (SiteMap) asm;
-        assertEquals(1, sm.getSiteMapUrls().size());
-        assertEquals("http://www.example.com/pub/2000/08/09/xslt/xslt.html", sm.getSiteMapUrls().iterator().next().getUrl().toString());
-    }
-
-    @Test
-    public void testRSSPubDate() throws UnknownFormatException, IOException {
-        SiteMapParser parser = new SiteMapParserSAX();
-        String contentType = "text/xml";
-        byte[] content = getResourceAsBytes("src/test/resources/rss/xmlRss_pubDate.xml");
-        URL url = new URL("http://www.example.com/rss.xml");
-
-        AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
-        assertSame("Not an RSS", SitemapType.RSS, asm.getType());
-        assertNotNull("GMT timestamp not parsed", asm.getLastModified());
-        long pubDate = 1483619690000L; // Thu, 05 Jan 17 12:34:50 GMT
-        assertEquals("GMT timestamp", pubDate, asm.getLastModified().getTime());
-        SiteMap rss = (SiteMap) asm;
-        assertEquals("Incorrect items count", 7, rss.getSiteMapUrls().size());
-        Iterator<SiteMapURL> it = rss.getSiteMapUrls().iterator();
-        assertPubDate("Local differental offset", "article_1", pubDate + 1000, it);
-        assertPubDate("Short year", "article_2", pubDate + 2000, it);
-        assertPubDate("No weekday", "article_3", pubDate + 3000, it);
-        assertPubDate("No weekday and short year", "article_4", pubDate + 4000, it);
-        assertPubDate("No time zone(incorrect)", "article_5", null, it);
-        assertPubDate("Empty field", "article_6", null, it);
-        assertPubDate("Missed field", "article_7", null, it);
-    }
-
-    private static void assertPubDate(String message, String path, Long pubDate, Iterator<SiteMapURL> it) {
-        assertTrue(message + " item missed", it.hasNext());
-        SiteMapURL url = it.next();
-        assertEquals(message + " link", "http://www.example.com/" + path, url.getUrl().toString());
-        if (pubDate == null) {
-            assertNull(message + " pubDate not NULL", url.getLastModified());
-        } else {
-            assertNotNull(message + " pubDate is missing", url.getLastModified());
-            assertEquals(message + " pub date", pubDate.longValue(), url.getLastModified().getTime());
-        }
-    }
-
-    @Test
-    public void testPartialSitemapsAllowed() throws UnknownFormatException, IOException {
-
-        SiteMapParser parser = new SiteMapParserSAX(false, true);
-        String contentType = "text/xml";
-        StringBuilder scontent = new StringBuilder(1024);
-        scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>").append("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">").append("<url>")
-                        .append("<loc>http://www.example.com/</lo");
-
-        byte[] content = scontent.toString().getBytes(UTF_8);
-
-        URL url = new URL("http://www.example.com/subsection/sitemap.xml");
-
-        AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
-        assertEquals(false, asm.isIndex());
-        assertEquals(true, asm instanceof SiteMap);
-
-        SiteMap sm = (SiteMap) asm;
-        assertEquals(1, sm.getSiteMapUrls().size());
-        assertFalse(sm.getSiteMapUrls().iterator().next().isValid());
-    }
-
-    @Test
-    public void testUrlLocUrl() throws UnknownFormatException, IOException {
-        SiteMapParser parser = new SiteMapParserSAX(false);
-        String contentType = "text/xml";
-        StringBuilder scontent = new StringBuilder(1024);
-        scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>").append("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">").append("<url>").append("<loc>").append("<url>")
-                        .append("<![CDATA[").append("http://jobs.optistaffing.com/EXPERIENCED-DISPATCHER-NEEDED-NOW----Jobs-in-Vancouver-WA/2333221").append("]]>").append("</url>").append("</loc>")
-                        .append("<lastmod>2015-04-28</lastmod>").append("<changefreq>daily</changefreq>").append("</url>").append("</urlset>");
-
-        byte[] content = scontent.toString().getBytes(UTF_8);
-
-        URL url = new URL("http://www.example.com/subsection/sitemap.xml");
-        AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
-        assertEquals(false, asm.isIndex());
-        assertEquals(true, asm instanceof SiteMap);
-
-        SiteMap sm = (SiteMap) asm;
-        assertEquals(1, sm.getSiteMapUrls().size());
-        assertFalse(sm.getSiteMapUrls().iterator().next().isValid());
-    }
-
-    @Test
-    public void testPartialSitemapIndicesAllowed() throws UnknownFormatException, IOException {
-
-        SiteMapParser parser = new SiteMapParserSAX(false, true);
-        String contentType = "text/xml";
-        StringBuilder scontent = new StringBuilder(1024);
-        scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>").append("<sitemapindex xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">")
-                        .append("<sitemap><loc>http://www.example.com/sitemap1.xml.gz</loc><las");
-        byte[] content = scontent.toString().getBytes(UTF_8);
-
-        URL url = new URL("http://www.example.com/subsection/sitemap.xml");
-
-        AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
-        assertEquals(true, asm.isIndex());
-        assertEquals(true, asm instanceof SiteMapIndex);
-
-        SiteMapIndex smi = (SiteMapIndex) asm;
-        assertEquals(1, smi.getSitemaps().size());
-    }
-
-    /**
-     * Returns a good simple default XML sitemap as a byte array
-     */
-    private byte[] getXMLSitemapAsBytes() {
-        StringBuilder scontent = new StringBuilder(1024);
-        scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>").append("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">");
-        scontent.append("<url>  <loc>").append(sitemapURLs[0]).append("</loc>  <lastmod>2005-01-01</lastmod>").append("  <changefreq>monthly</changefreq>").append("  <priority>0.8</priority>")
-                        .append("</url>");
-        scontent.append("<url>  <loc>").append(sitemapURLs[1]).append("</loc>  <changefreq>weekly</changefreq>").append("</url>");
-        scontent.append("<url>  <loc>").append(sitemapURLs[2]).append("</loc>  <lastmod>2004-12-23</lastmod>").append("  <changefreq>weekly</changefreq>").append("</url>");
-        scontent.append("<url>  <loc>").append(sitemapURLs[3]).append("</loc>  <lastmod>2004-12-23T18:00:15+00:00</lastmod>").append("  <priority>0.3</priority>").append("</url>");
-        scontent.append("<url>  <loc><url><![CDATA[").append(sitemapURLs[4]).append("]]></url></loc>  <lastmod>2004-11-23</lastmod>").append("</url>");
-        scontent.append("</urlset>");
-
-        return scontent.toString().getBytes(UTF_8);
-    }
-
-    /**
-     * Read a test resource file and return its content as byte array.
-     * 
-     * @param resourceName
-     *            path to the resource file
-     * @return byte content of the file
-     * @throws IOException
-     */
-    private byte[] getResourceAsBytes(String resourceName) throws IOException {
-        File file = new File(resourceName);
-        InputStream is = new FileInputStream(file);
-        return IOUtils.toByteArray(is);
-    }
-
-    private static String[] sitemapURLs = new String[] { "http://www.example.com/", "http://www.example.com/catalog?item=12&amp;desc=vacation_hawaii",
-                    "http://www.example.com/catalog?item=73&amp;desc=vacation_new_zealand", "http://www.example.com/catalog?item=74&amp;desc=vacation_newfoundland",
-                    "http://www.example.com/catalog?item=83&desc=vacation_usa" };
-
-}
--- a/src/test/java/crawlercommons/sitemaps/SiteMapParserTest.java
+++ b/src/test/java/crawlercommons/sitemaps/SiteMapParserTest.java
@ -17,6 +17,13 @@
 package crawlercommons.sitemaps;

 import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertSame;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;

 import java.io.File;
 import java.io.FileInputStream;
@ -31,7 +38,6 @@ import java.util.Locale;
 import org.apache.commons.io.IOUtils;
 import org.junit.After;
 import org.junit.Before;
-import org.junit.Ignore;
 import org.junit.Test;
 import org.junit.runner.RunWith;
 import org.junit.runners.JUnit4;
@ -40,8 +46,6 @@ import org.slf4j.LoggerFactory;

 import crawlercommons.sitemaps.AbstractSiteMap.SitemapType;

-import static org.junit.Assert.*;
-
 @RunWith(JUnit4.class)
 public class SiteMapParserTest {

@ -99,6 +103,7 @@ public class SiteMapParserTest {
    @Test
    public void testSitemapWithNamespace() throws UnknownFormatException, IOException {
        SiteMapParser parser = new SiteMapParser();
+        parser.setStrictNamespace(true);
        byte[] content = getResourceAsBytes("src/test/resources/sitemaps/sitemap.ns.xml");

        URL url = new URL("http://www.example.com/sitemap.ns.xml");
@ -120,13 +125,13 @@ public class SiteMapParserTest {
        byte[] content = getResourceAsBytes("src/test/resources/sitemaps/sitemap.badns.xml");

        URL url = new URL("http://www.example.com/sitemap.badns.xml");
-        AbstractSiteMap asm = parser.parseSiteMap(content, url);
-        assertEquals(SitemapType.XML, asm.getType());
-        assertEquals(true, asm instanceof SiteMap);
-        assertEquals(true, asm.isProcessed());
-        SiteMap sm = (SiteMap) asm;
-
-        assertEquals(0, sm.getSiteMapUrls().size());
+        AbstractSiteMap asm;
+        try {
+            asm = parser.parseSiteMap(content, url);
+            fail("Expected an UnknownFormatException because of wrong namespace");
+        } catch (UnknownFormatException e) {
+            assertTrue(e.getMessage().contains("does not match standard namespace"));
+        }

        // try again in lenient mode
        parser.setStrictNamespace(false);
@ -134,7 +139,7 @@ public class SiteMapParserTest {
        assertEquals(SitemapType.XML, asm.getType());
        assertEquals(true, asm instanceof SiteMap);
        assertEquals(true, asm.isProcessed());
-        sm = (SiteMap) asm;
+        SiteMap sm = (SiteMap) asm;

        assertEquals(2, sm.getSiteMapUrls().size());
    }
@ -407,20 +412,15 @@ public class SiteMapParserTest {
        AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
        assertSame("Not an RSS", SitemapType.RSS, asm.getType());
        assertNotNull("GMT timestamp not parsed", asm.getLastModified());
-        assertEquals("GMT timestamp", 1483619690000L, asm.getLastModified().getTime()); // Thu,
-                                                                                        // 05
-                                                                                        // Jan
-                                                                                        // 17
-                                                                                        // 12:34:50
-                                                                                        // GMT
-
+        long pubDate = 1483619690000L; // Thu, 05 Jan 17 12:34:50 GMT
+        assertEquals("GMT timestamp", pubDate, asm.getLastModified().getTime());
        SiteMap rss = (SiteMap) asm;
        assertEquals("Incorrect items count", 7, rss.getSiteMapUrls().size());
        Iterator<SiteMapURL> it = rss.getSiteMapUrls().iterator();
-        assertPubDate("Local differental offset", "article_1", 1483619691000L, it);
-        assertPubDate("Short year", "article_2", 1483619692000L, it);
-        assertPubDate("No weekday", "article_3", 1483619693000L, it);
-        assertPubDate("No weekday and short year", "article_4", 1483619694000L, it);
+        assertPubDate("Local differental offset", "article_1", pubDate + 1000, it);
+        assertPubDate("Short year", "article_2", pubDate + 2000, it);
+        assertPubDate("No weekday", "article_3", pubDate + 3000, it);
+        assertPubDate("No weekday and short year", "article_4", pubDate + 4000, it);
        assertPubDate("No time zone(incorrect)", "article_5", null, it);
        assertPubDate("Empty field", "article_6", null, it);
        assertPubDate("Missed field", "article_7", null, it);
@ -438,11 +438,10 @@ public class SiteMapParserTest {
        }
    }

-    @Ignore("fails for DOM-based parser")
    @Test
    public void testPartialSitemapsAllowed() throws UnknownFormatException, IOException {

-        SiteMapParser parser = new SiteMapParser();
+        SiteMapParser parser = new SiteMapParser(false, true);
        String contentType = "text/xml";
        StringBuilder scontent = new StringBuilder(1024);
        scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>").append("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">").append("<url>")
@ -482,11 +481,10 @@ public class SiteMapParserTest {
        assertFalse(sm.getSiteMapUrls().iterator().next().isValid());
    }

-    @Ignore("fails for DOM-based parser")
    @Test
    public void testPartialSitemapIndicesAllowed() throws UnknownFormatException, IOException {

-        SiteMapParser parser = new SiteMapParser();
+        SiteMapParser parser = new SiteMapParser(false, true);
        String contentType = "text/xml";
        StringBuilder scontent = new StringBuilder(1024);
        scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>").append("<sitemapindex xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">")