Merge pull request #198 from crawler-commons/kkrugler_no-tika

Remove Tika dependency
2024-05-24 19:36:06 +02:00 · 2018-04-02 12:55:40 -07:00 · 2018-04-02 12:55:40 -07:00 · 14153c4eae
parent 5132651a6e 165888ba7e
commit 14153c4eae
8 changed files with 363 additions and 125 deletions
--- a/pom.xml
+++ b/pom.xml
@ -316,7 +316,6 @@
 		<!-- Dependencies -->

 		<commons-io.version>2.4</commons-io.version>
-		<tika-core.version>1.17</tika-core.version>
 		<slf4j-api.version>1.7.7</slf4j-api.version>
 		<junit.version>4.7</junit.version>
 		<slf4j-log4j12.version>1.7.7</slf4j-log4j12.version>
@ -358,12 +357,6 @@
 			<version>${commons-io.version}</version>
 		</dependency>

-		<dependency>
-			<groupId>org.apache.tika</groupId>
-			<artifactId>tika-core</artifactId>
-			<version>${tika-core.version}</version>
-		</dependency>
-
 		<dependency>
 			<groupId>org.slf4j</groupId>
 			<artifactId>slf4j-api</artifactId>
--- a/src/main/java/crawlercommons/mimetypes/MimeTypeDetector.java
+++ b/src/main/java/crawlercommons/mimetypes/MimeTypeDetector.java
@ -0,0 +1,231 @@
+package crawlercommons.mimetypes;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+
+public class MimeTypeDetector {
+
+    /** Mime-types treated as XML; the first entry is the canonical name. */
+    private static final String[] XML_MIMETYPES = new String[] {
+                    "application/xml",
+                    "application/x-xml",
+                    "text/xml",
+                    "application/atom+xml",
+                    "application/rss+xml",
+                    "text/rss"
+                    };
+
+    /** Mime-types treated as plain text; the first entry is the canonical name. */
+    private static final String[] TEXT_MIMETYPES = new String[] {
+                    "text/plain"
+                    };
+
+    /** Mime-types treated as gzip; the first entry is the canonical name. */
+    private static final String[] GZIP_MIMETYPES = new String[] {
+                    "application/gzip",
+                    "application/gzip-compressed",
+                    "application/gzipped",
+                    "application/x-gzip",
+                    "application/x-gzip-compressed",
+                    "application/x-gunzip",
+                    "gzip/document"
+                    };
+
+    /** All alias groups, scanned when normalizing a declared content type. */
+    private static final String[][] MIMETYPES = {
+                    XML_MIMETYPES,
+                    TEXT_MIMETYPES,
+                    GZIP_MIMETYPES
+                    };
+
+    /** UTF-8 byte order mark, prepended to text patterns on request. */
+    private static final byte[] UTF8_BOM = {
+                    (byte) 0xEF,
+                    (byte) 0xBB,
+                    (byte) 0xBF
+                    };
+
+    /**
+     * Pairs a mime-type with the byte sequence that content of that type is
+     * expected to start with.
+     */
+    private static class MimeTypeEntry {
+        private String mimeType;
+        private byte[] pattern;
+
+        /** Creates an entry from the UTF-8 bytes of a text pattern, without a BOM. */
+        public MimeTypeEntry(String mimeType, String pattern) {
+            this(mimeType, pattern, false);
+        }
+
+        /**
+         * Creates an entry from the UTF-8 bytes of a text pattern.
+         *
+         * @param mimeType
+         *            mime-type reported when the pattern matches
+         * @param pattern
+         *            leading text to look for
+         * @param addBOM
+         *            when true, the UTF-8 byte order mark is placed in front
+         *            of the pattern bytes
+         */
+        public MimeTypeEntry(String mimeType, String pattern, boolean addBOM) {
+            this.mimeType = mimeType;
+
+            byte[] text = pattern.getBytes(StandardCharsets.UTF_8);
+            byte[] prefix = addBOM ? UTF8_BOM : new byte[0];
+
+            // Concatenate the (possibly empty) prefix and the text bytes.
+            byte[] combined = new byte[prefix.length + text.length];
+            System.arraycopy(prefix, 0, combined, 0, prefix.length);
+            System.arraycopy(text, 0, combined, prefix.length, text.length);
+            this.pattern = combined;
+        }
+
+        /**
+         * Creates an entry from raw byte values given as ints; only the low
+         * eight bits of each value are used.
+         */
+        public MimeTypeEntry(String mimeType, int... pattern) {
+            this.mimeType = mimeType;
+            this.pattern = makeBytePattern(pattern);
+        }
+
+        /** Narrows each int to a byte, keeping its low eight bits. */
+        private byte[] makeBytePattern(int[] pattern) {
+            byte[] bytes = new byte[pattern.length];
+            int next = 0;
+            for (int value : pattern) {
+                bytes[next++] = (byte) (value & 0xFF);
+            }
+
+            return bytes;
+        }
+
+        public String getMimeType() {
+            return mimeType;
+        }
+
+        public byte[] getPattern() {
+            return pattern;
+        }
+    }
+
+    private List<MimeTypeEntry> mimeTypes;
+    private int maxPatternLength;
+
+    /**
+     * Builds the table of leading-byte patterns used for detection and records
+     * the length of the longest pattern, which is how many bytes
+     * {@link #detect(InputStream)} reads from a stream.
+     */
+    public MimeTypeDetector() {
+        mimeTypes = new ArrayList<>();
+
+        // Add all text patterns without and with a BOM.
+        mimeTypes.add(new MimeTypeEntry(XML_MIMETYPES[0], "<?xml"));
+        mimeTypes.add(new MimeTypeEntry(XML_MIMETYPES[0], "<?xml", true));
+        mimeTypes.add(new MimeTypeEntry(XML_MIMETYPES[0], "<?XML"));
+        mimeTypes.add(new MimeTypeEntry(XML_MIMETYPES[0], "<?XML", true));
+        mimeTypes.add(new MimeTypeEntry(XML_MIMETYPES[0], "<!--"));
+        mimeTypes.add(new MimeTypeEntry(XML_MIMETYPES[0], "<!--", true));
+
+        mimeTypes.add(new MimeTypeEntry(TEXT_MIMETYPES[0], "http://"));
+        mimeTypes.add(new MimeTypeEntry(TEXT_MIMETYPES[0], "http://", true));
+        mimeTypes.add(new MimeTypeEntry(TEXT_MIMETYPES[0], "https://"));
+        mimeTypes.add(new MimeTypeEntry(TEXT_MIMETYPES[0], "https://", true));
+
+        // The gzip magic number is the raw byte pair 0x1F 0x8B. It has to be
+        // registered as bytes: the string form "\037\213" goes through UTF-8
+        // encoding, where U+008B becomes the two bytes 0xC2 0x8B, yielding a
+        // three-byte pattern that does not match a gzip header.
+        mimeTypes.add(new MimeTypeEntry(GZIP_MIMETYPES[0], 0x1F, 0x8B));
+
+        maxPatternLength = 0;
+        for (MimeTypeEntry entry : mimeTypes) {
+            maxPatternLength = Math.max(maxPatternLength, entry.getPattern().length);
+        }
+    }
+
+    /**
+     * Detects the mime-type of a complete byte array by checking its leading
+     * bytes against the known patterns.
+     *
+     * @param content
+     *            bytes to inspect
+     * @return the canonical mime-type of the first matching pattern, or null
+     *         when no pattern matches
+     */
+    public String detect(byte[] content) {
+        // Same scan as the ranged overload, applied to the whole array.
+        return detect(content, 0, content.length);
+    }
+
+    /**
+     * Detects the mime-type of a region of a byte array.
+     *
+     * @param content
+     *            buffer holding the bytes to inspect
+     * @param offset
+     *            index of the first byte of the region
+     * @param length
+     *            number of bytes in the region
+     * @return the canonical mime-type of the first matching pattern, or null
+     *         when no pattern matches
+     */
+    public String detect(byte[] content, int offset, int length) {
+        String detected = null;
+
+        // Patterns are tried in registration order; the first hit wins.
+        for (int i = 0; i < mimeTypes.size() && detected == null; i++) {
+            MimeTypeEntry candidate = mimeTypes.get(i);
+            if (patternMatches(candidate.getPattern(), content, offset, length)) {
+                detected = candidate.getMimeType();
+            }
+        }
+
+        return detected;
+    }
+
+    /**
+     * Tells whether the region of {@code content} starting at {@code offset}
+     * begins with {@code pattern}. A region shorter than the pattern never
+     * matches.
+     */
+    private boolean patternMatches(byte[] pattern, byte[] content, int offset, int length) {
+        boolean fits = pattern.length <= length;
+
+        // Advance while the bytes agree; stopping early means a mismatch.
+        int matched = 0;
+        while (fits && matched < pattern.length && pattern[matched] == content[offset + matched]) {
+            matched++;
+        }
+
+        return fits && matched == pattern.length;
+    }
+
+    /**
+     * Detects the mime-type of a stream from its leading bytes, then rewinds
+     * the stream so the caller can read it from the start.
+     *
+     * @param is
+     *            stream to inspect; must support mark/reset
+     * @return the canonical mime-type of the first matching pattern, or null
+     *         when no pattern matches
+     * @throws IllegalArgumentException
+     *             if the stream does not support mark/reset
+     * @throws IOException
+     *             if reading from or resetting the stream fails
+     */
+    public String detect(InputStream is) throws IOException {
+        if (!is.markSupported()) {
+            throw new IllegalArgumentException("Can't detect mime type for input stream that doesn't support mark/reset");
+        }
+
+        is.mark(maxPatternLength);
+        byte[] content = new byte[maxPatternLength];
+
+        try {
+            // A single read() may legally return fewer bytes than requested
+            // even when more data follows, so keep reading until the buffer
+            // is full or the stream ends.
+            int contentLength = 0;
+            while (contentLength < content.length) {
+                int bytesRead = is.read(content, contentLength, content.length - contentLength);
+                if (bytesRead < 0) {
+                    break;
+                }
+                contentLength += bytesRead;
+            }
+
+            return detect(content, 0, contentLength);
+        } finally {
+            is.reset();
+        }
+    }
+
+    /**
+     * @param mimeType
+     *            mime-type to test, may be null
+     * @return true if the mime-type is one of the known XML mime-types
+     */
+    public boolean isXml(String mimeType) {
+        boolean known = false;
+
+        if (mimeType != null) {
+            for (int i = 0; i < XML_MIMETYPES.length && !known; i++) {
+                known = XML_MIMETYPES[i].equals(mimeType);
+            }
+        }
+
+        return known;
+    }
+
+    /**
+     * @param mimeType
+     *            mime-type to test, may be null
+     * @return true if the mime-type is one of the known plain-text mime-types
+     */
+    public boolean isText(String mimeType) {
+        boolean known = false;
+
+        if (mimeType != null) {
+            for (int i = 0; i < TEXT_MIMETYPES.length && !known; i++) {
+                known = TEXT_MIMETYPES[i].equals(mimeType);
+            }
+        }
+
+        return known;
+    }
+
+    /**
+     * @param mimeType
+     *            mime-type to test, may be null
+     * @return true if the mime-type is one of the known gzip mime-types
+     */
+    public boolean isGzip(String mimeType) {
+        boolean known = false;
+
+        if (mimeType != null) {
+            for (int i = 0; i < GZIP_MIMETYPES.length && !known; i++) {
+                known = GZIP_MIMETYPES[i].equals(mimeType);
+            }
+        }
+
+        return known;
+    }
+
+    /**
+     * Maps a declared content type to the canonical mime-type of its alias
+     * group, falling back to content-based detection when the declared type is
+     * missing or not recognized.
+     *
+     * @param contentType
+     *            declared content type, possibly carrying parameters such as
+     *            {@code "text/xml; charset=UTF-8"}; may be null
+     * @param content
+     *            bytes used for detection when the declared type is unknown
+     * @return the canonical mime-type, or null if neither the declared type
+     *         nor the content is recognized
+     */
+    public String normalize(String contentType, byte[] content) {
+        if (contentType != null) {
+            // Drop any parameters ("; charset=...") and surrounding
+            // whitespace so only the type/subtype is compared.
+            String normalizedContentType = contentType;
+            int parametersStart = normalizedContentType.indexOf(';');
+            if (parametersStart >= 0) {
+                normalizedContentType = normalizedContentType.substring(0, parametersStart);
+            }
+            normalizedContentType = normalizedContentType.trim().toLowerCase(Locale.ROOT);
+
+            for (String[] aliases : MIMETYPES) {
+                for (String alias : aliases) {
+                    if (normalizedContentType.equals(alias)) {
+                        return aliases[0];
+                    }
+                }
+            }
+        }
+
+        // Unknown or absent declared type: decide from the content itself.
+        return detect(content);
+    }
+
+}
--- a/src/main/java/crawlercommons/sitemaps/SiteMapParser.java
+++ b/src/main/java/crawlercommons/sitemaps/SiteMapParser.java
@ -17,9 +17,8 @@
 package crawlercommons.sitemaps;

 import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.apache.tika.mime.MediaType.APPLICATION_XML;
-import static org.apache.tika.mime.MediaType.TEXT_PLAIN;

+import java.io.BufferedInputStream;
 import java.io.BufferedReader;
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
@ -28,26 +27,22 @@ import java.io.InputStreamReader;
 import java.io.StringReader;
 import java.net.MalformedURLException;
 import java.net.URL;
-import java.util.ArrayList;
-import java.util.List;
+import java.util.Locale;
 import java.util.zip.GZIPInputStream;

 import javax.xml.parsers.ParserConfigurationException;
 import javax.xml.parsers.SAXParser;
 import javax.xml.parsers.SAXParserFactory;

-import org.apache.commons.io.FilenameUtils;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.io.input.BOMInputStream;
-import org.apache.tika.Tika;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.mime.MediaTypeRegistry;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.xml.sax.EntityResolver;
 import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;

+import crawlercommons.mimetypes.MimeTypeDetector;
 import crawlercommons.sitemaps.AbstractSiteMap.SitemapType;
 import crawlercommons.sitemaps.sax.DelegatorHandler;

@ -67,18 +62,6 @@ public class SiteMapParser {
     */
    public static final int MAX_BYTES_ALLOWED = 52428800;

-    /* Tika's MediaType components */
-    private static final Tika TIKA = new Tika();
-    private static final MediaTypeRegistry MEDIA_TYPE_REGISTRY = MediaTypeRegistry.getDefaultRegistry();
-
-    private static final List<MediaType> XML_MEDIA_TYPES = new ArrayList<>();
-    private static final List<MediaType> TEXT_MEDIA_TYPES = new ArrayList<>();
-    private static final List<MediaType> GZ_MEDIA_TYPES = new ArrayList<>();
-
-    static {
-        initMediaTypes();
-    }
-
    /**
     * True (by default) meaning that invalid URLs should be rejected, as the
     * official docs allow the siteMapURLs to be only under the base url:
@ -94,6 +77,8 @@ public class SiteMapParser {
     **/
    protected boolean strictNamespace = false;

+    private MimeTypeDetector mimeTypeDetector;
+
    public SiteMapParser() {
        this(true, false);
    }
@ -105,6 +90,8 @@ public class SiteMapParser {
    public SiteMapParser(boolean strict, boolean allowPartial) {
        this.strict = strict;
        this.allowPartial = allowPartial;
+
+        this.mimeTypeDetector = new MimeTypeDetector();
    }

    /**
@ -203,8 +190,8 @@ public class SiteMapParser {
        if (url == null) {
            return null;
        }
-        String filename = FilenameUtils.getName(url.getPath());
-        String contentType = TIKA.detect(content, filename);
+
+        String contentType = mimeTypeDetector.detect(content);
        return parseSiteMap(contentType, content, url);
    }

@ -228,41 +215,28 @@ public class SiteMapParser {
     *             {@link java.net.URL}
     */
    public AbstractSiteMap parseSiteMap(String contentType, byte[] content, URL url) throws UnknownFormatException, IOException {
-        MediaType mediaType = MediaType.parse(contentType);
+        String mimeType = mimeTypeDetector.normalize(contentType, content);

-        // Octet-stream is the father of all binary types
-        while (mediaType != null && !mediaType.equals(MediaType.OCTET_STREAM)) {
-            if (XML_MEDIA_TYPES.contains(mediaType)) {
-                return processXml(url, content);
-            } else if (TEXT_MEDIA_TYPES.contains(mediaType)) {
-                return processText(url, content);
-            } else if (GZ_MEDIA_TYPES.contains(mediaType)) {
-                InputStream decompressed;
-                MediaType embeddedType;
-                try {
-                    decompressed = new GZIPInputStream(new ByteArrayInputStream(content));
-                    embeddedType = MediaType.parse(TIKA.detect(decompressed));
-                } catch (Exception e) {
-                    UnknownFormatException err = new UnknownFormatException("Failed to detect embedded MediaType of gzipped sitemap: " + url + ", caused by " + e);
-                    err.initCause(e);
-                    throw err;
-                }
-                if (XML_MEDIA_TYPES.contains(embeddedType)) {
+        if (mimeTypeDetector.isXml(mimeType)) {
+            return processXml(url, content);
+        } else if (mimeTypeDetector.isText(mimeType)) {
+            return processText(url, content);
+        } else if (mimeTypeDetector.isGzip(mimeType)) {
+            try (InputStream decompressed = new BufferedInputStream(new GZIPInputStream(new ByteArrayInputStream(content)))) {
+                String compressedType = mimeTypeDetector.detect(decompressed);
+                if (mimeTypeDetector.isXml(compressedType)) {
                    return processGzippedXML(url, content);
-                } else if (TEXT_MEDIA_TYPES.contains(embeddedType)) {
-                    // re-open decompressed stream and parse as text
-                    decompressed = new GZIPInputStream(new ByteArrayInputStream(content));
+                } else if (mimeTypeDetector.isText(compressedType)) {
                    return processText(url, decompressed);
-                } else if (GZ_MEDIA_TYPES.contains(embeddedType)) {
-                    throw new UnknownFormatException("Can't parse gzip recursively: " + url);
                }
-                throw new UnknownFormatException("Can't parse a gzipped sitemap with the embedded MediaType of: " + embeddedType + " (at: " + url + ")");
+            } catch (Exception e) {
+                String msg = String.format(Locale.ROOT, "Failed to detect embedded MediaType of gzipped sitemap '%s'", url);
+                throw new UnknownFormatException(msg, e);
            }
-            mediaType = MEDIA_TYPE_REGISTRY.getSupertype(mediaType); // Check
-                                                                     // parent
        }

-        throw new UnknownFormatException("Can't parse a sitemap with the MediaType of: " + contentType + " (at: " + url + ")");
+        String msg = String.format(Locale.ROOT, "Can't parse a sitemap with MediaType '%s' from '%s'", contentType, url);
+        throw new UnknownFormatException(msg);
    }

    /**
@ -491,25 +465,4 @@ public class SiteMapParser {

        return ret;
    }
-
-    /**
-     * Performs a one time intialization of Tika's Media-Type components and
-     * media type collection constants <br/>
-     * Please note that this is a private static method which is called once per
-     * CLASS (not per instance / object)
-     */
-    private static void initMediaTypes() {
-        /* XML media types (and all aliases) */
-        XML_MEDIA_TYPES.add(APPLICATION_XML);
-        XML_MEDIA_TYPES.addAll(MEDIA_TYPE_REGISTRY.getAliases(APPLICATION_XML));
-
-        /* TEXT media types (and all aliases) */
-        TEXT_MEDIA_TYPES.add(TEXT_PLAIN);
-        TEXT_MEDIA_TYPES.addAll(MEDIA_TYPE_REGISTRY.getAliases(TEXT_PLAIN));
-
-        /* GZIP media types (and all aliases) */
-        MediaType gzipMediaType = MediaType.parse("application/gzip");
-        GZ_MEDIA_TYPES.add(gzipMediaType);
-        GZ_MEDIA_TYPES.addAll(MEDIA_TYPE_REGISTRY.getAliases(gzipMediaType));
-    }
 }
--- a/src/main/java/crawlercommons/sitemaps/UnknownFormatException.java
+++ b/src/main/java/crawlercommons/sitemaps/UnknownFormatException.java
@ -19,33 +19,15 @@ package crawlercommons.sitemaps;
@SuppressWarnings("serial")
 public class UnknownFormatException extends Exception {

-    private final String error;
-
-    /** Default constructor - initializes instance variable to unknown */
    public UnknownFormatException() {
        super();
-        error = "unknown";
    }

-    /**
-     * Constructor receives some kind of message that is saved in an instance
-     * variable.
-     * 
-     * @param err
-     *            a String object to use within the Execption
-     */
-    public UnknownFormatException(String err) {
-        super(err);
-        error = err;
+    public UnknownFormatException(String message) {
+        super(message);
    }

-    /**
-     * public method, callable by exception catcher. It returns the error
-     * message.
-     * 
-     * @return a populated Exception as a String
-     */
-    public String getError() {
-        return error;
+    public UnknownFormatException(String message, Throwable cause) {
+        super(message, cause);
    }
 }
--- a/src/test/java/crawlercommons/mimetypes/MimeTypeDetectorTest.java
+++ b/src/test/java/crawlercommons/mimetypes/MimeTypeDetectorTest.java
@ -0,0 +1,60 @@
+package crawlercommons.mimetypes;
+
+import static org.junit.Assert.*;
+
+import java.io.IOException;
+
+import org.apache.commons.io.IOUtils;
+import org.junit.Test;
+
+public class MimeTypeDetectorTest {
+
+    /** An Atom feed must be classified as XML and as nothing else. */
+    @Test
+    public void testXMLDetection() throws Exception {
+        MimeTypeDetector detector = new MimeTypeDetector();
+        String detected = detector.detect(getSitemap("atom.xml"));
+
+        assertTrue(detector.isXml(detected));
+        assertFalse(detector.isText(detected));
+        assertFalse(detector.isGzip(detected));
+    }
+
+    /** Text sitemaps, with or without a BOM, must be classified as text only. */
+    @Test
+    public void testTextDetection() throws IOException {
+        MimeTypeDetector detector = new MimeTypeDetector();
+
+        for (String filename : new String[] { "sitemap.txt", "sitemap-with-bom.txt" }) {
+            String detected = detector.detect(getSitemap(filename));
+
+            assertFalse(detector.isXml(detected));
+            assertTrue(detector.isText(detected));
+            assertFalse(detector.isGzip(detected));
+        }
+    }
+
+    /** Gzipped sitemaps must be classified as gzip regardless of their payload. */
+    @Test
+    public void testGzipDetection() throws IOException {
+        MimeTypeDetector detector = new MimeTypeDetector();
+
+        for (String filename : new String[] { "xmlSitemap.gz", "sitemap.txt.gz" }) {
+            String detected = detector.detect(getSitemap(filename));
+
+            assertFalse(detector.isXml(detected));
+            assertFalse(detector.isText(detected));
+            assertTrue(detector.isGzip(detected));
+        }
+    }
+
+    /**
+     * Loads a test sitemap from the {@code /sitemaps/} classpath directory.
+     *
+     * @param filename
+     *            name of the resource inside {@code /sitemaps/}
+     * @return the full content of the resource
+     * @throws IOException
+     *             if the resource is missing or cannot be read
+     */
+    private byte[] getSitemap(String filename) throws IOException {
+        String resource = "/sitemaps/" + filename;
+
+        // Close the stream once read, and fail with a clear message rather
+        // than a NullPointerException when the resource does not exist.
+        try (java.io.InputStream is = MimeTypeDetectorTest.class.getResourceAsStream(resource)) {
+            if (is == null) {
+                throw new IOException("Test resource not found: " + resource);
+            }
+            return IOUtils.toByteArray(is);
+        }
+    }
+
+}
--- a/src/test/java/crawlercommons/sitemaps/SiteMapParserTest.java
+++ b/src/test/java/crawlercommons/sitemaps/SiteMapParserTest.java
@ -30,6 +30,7 @@ import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URL;
+import java.nio.charset.StandardCharsets;
 import java.text.SimpleDateFormat;
 import java.util.Date;
 import java.util.Iterator;
@ -184,6 +185,22 @@ public class SiteMapParserTest {
        assertEquals(2, sm.getSiteMapUrls().size());
    }

+    /**
+     * A text sitemap served with an unrecognized content type must still be
+     * parsed as a plain (non-index) sitemap with both URLs.
+     */
+    @Test
+    public void testSitemapTXTWithWrongMimeType() throws UnknownFormatException, IOException {
+        String declaredType = "application/bogus";
+        URL sitemapUrl = new URL("http://www.example.com/sitemap.xml");
+        byte[] body = "http://www.example.com/catalog?item=1\nhttp://www.example.com/catalog?item=11".getBytes(UTF_8);
+
+        SiteMapParser parser = new SiteMapParser();
+        AbstractSiteMap parsed = parser.parseSiteMap(declaredType, body, sitemapUrl);
+
+        assertEquals(false, parsed.isIndex());
+        assertEquals(true, parsed instanceof SiteMap);
+        assertEquals(2, ((SiteMap) parsed).getSiteMapUrls().size());
+    }
+
    @Test
    public void testSitemapXML() throws UnknownFormatException, IOException {
        SiteMapParser parser = new SiteMapParser();
@ -200,7 +217,7 @@ public class SiteMapParserTest {

        SiteMapURL[] found = sm.getSiteMapUrls().toArray(new SiteMapURL[5]);
        for (int i = 0; i < found.length; i++) {
-            assertEquals(sitemapURLs[i].replaceAll("&amp;", "&"), found[i].getUrl().toExternalForm());
+            assertEquals(SITEMAP_URLS[i].replaceAll("&amp;", "&"), found[i].getUrl().toExternalForm());
        }
    }

@ -219,7 +236,7 @@ public class SiteMapParserTest {
            assertEquals(5, sm.getSiteMapUrls().size());
            SiteMapURL[] found = sm.getSiteMapUrls().toArray(new SiteMapURL[5]);
            for (int i = 0; i < found.length; i++) {
-                assertEquals(sitemapURLs[i].replaceAll("&amp;", "&"), found[i].getUrl().toExternalForm());
+                assertEquals(SITEMAP_URLS[i].replaceAll("&amp;", "&"), found[i].getUrl().toExternalForm());
            }
        }
    }
@ -303,23 +320,13 @@ public class SiteMapParserTest {
    }

    @Test(expected = UnknownFormatException.class)
-    public void testSitemapWithOctetMediaType() throws UnknownFormatException, IOException {
+    public void testSitemapWithInvalidContent() throws UnknownFormatException, IOException {
        SiteMapParser parser = new SiteMapParser();
        String contentType = "application/octet-stream";
-        byte[] content = getXMLSitemapAsBytes();
+        byte[] content = "this is a bogus sitemap".getBytes(StandardCharsets.UTF_8);
        URL url = new URL("http://www.example.com/sitemap");

-        AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
-        assertEquals(false, asm.isIndex());
-        assertEquals(true, asm instanceof SiteMap);
-
-        SiteMap sm = (SiteMap) asm;
-        assertEquals(5, sm.getSiteMapUrls().size());
-
-        SiteMapURL[] found = sm.getSiteMapUrls().toArray(new SiteMapURL[5]);
-        for (int i = 0; i < found.length; i++) {
-            assertEquals(sitemapURLs[i], found[i].getUrl().toExternalForm());
-        }
+        parser.parseSiteMap(contentType, content, url);
    }

    @Test
@ -327,8 +334,12 @@ public class SiteMapParserTest {
        SiteMapParser parser = new SiteMapParser();
        String contentType = "text/xml";
        StringBuilder scontent = new StringBuilder(1024);
-        scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>").append("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">").append("<url>")
-                        .append("<loc>http://www.example.com/</loc>").append("</url>").append("</urlset>");
+        scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>")
+        	.append("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">")
+        	.append("<url>")
+            .append("<loc>http://www.example.com/</loc>")
+            .append("</url>")
+            .append("</urlset>");
        byte[] content = scontent.toString().getBytes(UTF_8);

        URL url = new URL("http://www.example.com/subsection/sitemap.xml");
@ -507,12 +518,12 @@ public class SiteMapParserTest {
    private byte[] getXMLSitemapAsBytes() {
        StringBuilder scontent = new StringBuilder(1024);
        scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>").append("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">");
-        scontent.append("<url>  <loc>").append(sitemapURLs[0]).append("</loc>  <lastmod>2005-01-01</lastmod>").append("  <changefreq>monthly</changefreq>").append("  <priority>0.8</priority>")
+        scontent.append("<url>  <loc>").append(SITEMAP_URLS[0]).append("</loc>  <lastmod>2005-01-01</lastmod>").append("  <changefreq>monthly</changefreq>").append("  <priority>0.8</priority>")
                        .append("</url>");
-        scontent.append("<url>  <loc>").append(sitemapURLs[1]).append("</loc>  <changefreq>weekly</changefreq>").append("</url>");
-        scontent.append("<url>  <loc>").append(sitemapURLs[2]).append("</loc>  <lastmod>2004-12-23</lastmod>").append("  <changefreq>weekly</changefreq>").append("</url>");
-        scontent.append("<url>  <loc>").append(sitemapURLs[3]).append("</loc>  <lastmod>2004-12-23T18:00:15+00:00</lastmod>").append("  <priority>0.3</priority>").append("</url>");
-        scontent.append("<url>  <loc><url><![CDATA[").append(sitemapURLs[4]).append("]]></url></loc>  <lastmod>2004-11-23</lastmod>").append("</url>");
+        scontent.append("<url>  <loc>").append(SITEMAP_URLS[1]).append("</loc>  <changefreq>weekly</changefreq>").append("</url>");
+        scontent.append("<url>  <loc>").append(SITEMAP_URLS[2]).append("</loc>  <lastmod>2004-12-23</lastmod>").append("  <changefreq>weekly</changefreq>").append("</url>");
+        scontent.append("<url>  <loc>").append(SITEMAP_URLS[3]).append("</loc>  <lastmod>2004-12-23T18:00:15+00:00</lastmod>").append("  <priority>0.3</priority>").append("</url>");
+        scontent.append("<url>  <loc><url><![CDATA[").append(SITEMAP_URLS[4]).append("]]></url></loc>  <lastmod>2004-11-23</lastmod>").append("</url>");
        scontent.append("</urlset>");

        return scontent.toString().getBytes(UTF_8);
@ -532,8 +543,12 @@ public class SiteMapParserTest {
        return IOUtils.toByteArray(is);
    }

-    private static String[] sitemapURLs = new String[] { "http://www.example.com/", "http://www.example.com/catalog?item=12&amp;desc=vacation_hawaii",
-                    "http://www.example.com/catalog?item=73&amp;desc=vacation_new_zealand", "http://www.example.com/catalog?item=74&amp;desc=vacation_newfoundland",
-                    "http://www.example.com/catalog?item=83&desc=vacation_usa" };
+    // URLs written into the generated XML sitemap. Some entries use the XML
+    // entity "&amp;", which the tests above replace with "&" before comparing
+    // against parsed URLs.
+    private static final String[] SITEMAP_URLS = new String[] {
+                    "http://www.example.com/",
+                    "http://www.example.com/catalog?item=12&amp;desc=vacation_hawaii",
+                    "http://www.example.com/catalog?item=73&amp;desc=vacation_new_zealand",
+                    "http://www.example.com/catalog?item=74&amp;desc=vacation_newfoundland",
+                    "http://www.example.com/catalog?item=83&desc=vacation_usa"
+    };

 }
--- a/src/test/resources/sitemaps/sitemap-with-bom.txt
+++ b/src/test/resources/sitemaps/sitemap-with-bom.txt
@ -0,0 +1,2 @@
+http://www.domain.com/page1.html
+http://www.domain.com/page2.html
--- a/src/test/resources/sitemaps/sitemap.txt
+++ b/src/test/resources/sitemaps/sitemap.txt
@ -0,0 +1,2 @@
+http://www.domain.com/page1.html
+http://www.domain.com/page2.html