Merge pull request #200 from sebastian-nagel/cc-198-fix-regressions

Improve MIME detection for sitemaps
2024-09-24 09:40:41 +02:00 · 2018-04-25 09:19:27 +02:00 · 2018-04-25 09:19:27 +02:00 · a9277acde2
commit a9277acde2
parent 8a34e25c41 a6b3178fc7
5 changed files with 99 additions and 32 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -6,6 +6,7 @@ Current Development 0.10-SNAPSHOT (yyyy-mm-dd)
 - Add main() to EffectiveTldFinder (sebastian-nagel) #187
 - Handle new suffixes in PaidLevelDomain (kkrugler) #183
 - Remove Tika dependency (kkrugler) #199
+- Improve MIME detection for sitemaps (sebastian-nagel) #200

 Release 0.9 (2017-10-27)
 - [Sitemaps] Removed DOM-based sitemap parser (jnioche) #177
--- a/src/main/java/crawlercommons/mimetypes/MimeTypeDetector.java
+++ b/src/main/java/crawlercommons/mimetypes/MimeTypeDetector.java
@@ -15,7 +15,8 @@ public class MimeTypeDetector {
                    "text/xml",
                    "application/atom+xml",
                    "application/rss+xml",
-                    "text/rss"
+                    "text/rss",
+                    "application/rdf+xml"
                    };

    private static String[] TEXT_MIMETYPES = new String[] {
@@ -44,25 +45,31 @@ public class MimeTypeDetector {
                    (byte) 0xBF
                    };

+    private static final int LEADING_WHITESPACE_MAX_SKIP = 32;
+
+    private final static boolean[] spaceCharacters = new boolean[256];
+    static {
+        spaceCharacters[0x09] = true; // \t - character tabulation (ht)
+        spaceCharacters[0x0a] = true; // \n - line feed (lf)
+        spaceCharacters[0x0b] = true; // line tabulation (vt)
+        spaceCharacters[0x0c] = true; // form feed (ff)
+        spaceCharacters[0x0d] = true; // \r - carriage return (cr)
+        spaceCharacters[0x20] = true; // space
+    }
+
    private static class MimeTypeEntry {
        private String mimeType;
        private byte[] pattern;
+        private boolean isTextPattern;

        public MimeTypeEntry(String mimeType, String pattern) {
            this(mimeType, pattern, false);
        }

-        public MimeTypeEntry(String mimeType, String pattern, boolean addBOM) {
+        public MimeTypeEntry(String mimeType, String pattern, boolean isTextPattern) {
            this.mimeType = mimeType;
-
-            byte[] patternBytes = pattern.getBytes(StandardCharsets.UTF_8);
-            if (addBOM) {
-                this.pattern = new byte[UTF8_BOM.length + patternBytes.length];
-                System.arraycopy(UTF8_BOM, 0, this.pattern, 0, UTF8_BOM.length);
-                System.arraycopy(patternBytes, 0, this.pattern, UTF8_BOM.length, patternBytes.length);
-            } else {
-                this.pattern = patternBytes;
-            }
+            this.isTextPattern = isTextPattern;
+            this.pattern = pattern.getBytes(StandardCharsets.UTF_8);
        }

        public MimeTypeEntry(String mimeType, int... pattern) {
@@ -95,16 +102,16 @@ public class MimeTypeDetector {
        mimeTypes = new ArrayList<>();

        // Add all text patterns without and with a BOM.
-        mimeTypes.add(new MimeTypeEntry(XML_MIMETYPES[0], "<?xml"));
        mimeTypes.add(new MimeTypeEntry(XML_MIMETYPES[0], "<?xml", true));
-        mimeTypes.add(new MimeTypeEntry(XML_MIMETYPES[0], "<?XML"));
        mimeTypes.add(new MimeTypeEntry(XML_MIMETYPES[0], "<?XML", true));
-        mimeTypes.add(new MimeTypeEntry(XML_MIMETYPES[0], "<!--"));
        mimeTypes.add(new MimeTypeEntry(XML_MIMETYPES[0], "<!--", true));
+        mimeTypes.add(new MimeTypeEntry(XML_MIMETYPES[0], "<urlset", true));
+        mimeTypes.add(new MimeTypeEntry(XML_MIMETYPES[0], "<sitemapindex", true));
+        mimeTypes.add(new MimeTypeEntry(XML_MIMETYPES[0], "<rss", true));
+        mimeTypes.add(new MimeTypeEntry(XML_MIMETYPES[0], "<feed", true));
+        mimeTypes.add(new MimeTypeEntry(XML_MIMETYPES[0], "<rdf", true));

-        mimeTypes.add(new MimeTypeEntry(TEXT_MIMETYPES[0], "http://"));
        mimeTypes.add(new MimeTypeEntry(TEXT_MIMETYPES[0], "http://", true));
-        mimeTypes.add(new MimeTypeEntry(TEXT_MIMETYPES[0], "https://"));
        mimeTypes.add(new MimeTypeEntry(TEXT_MIMETYPES[0], "https://", true));

        mimeTypes.add(new MimeTypeEntry(GZIP_MIMETYPES[0], "\037\213"));
@@ -112,26 +119,39 @@ public class MimeTypeDetector {

        maxPatternLength = 0;
        for (MimeTypeEntry entry : mimeTypes) {
-            maxPatternLength = Math.max(maxPatternLength, entry.getPattern().length);
+            int length = entry.getPattern().length;
+            if (entry.isTextPattern)
+                length += LEADING_WHITESPACE_MAX_SKIP;
+            maxPatternLength = Math.max(maxPatternLength, length);
        }
    }

    public String detect(byte[] content) {
-        for (MimeTypeEntry entry : mimeTypes) {
-            if (patternMatches(entry.getPattern(), content, 0, content.length)) {
-                return entry.getMimeType();
-            }
+        return detect(content, content.length);
    }

-        // No mime-type detected.
-        return null;
-    }
+    public String detect(byte[] content, int length) {
+        int offsetText = -1;

-    public String detect(byte[] content, int offset, int length) {
        for (MimeTypeEntry entry : mimeTypes) {
-            if (patternMatches(entry.getPattern(), content, offset, length)) {
+            if (entry.isTextPattern) {
+                if (offsetText == -1) {
+                    offsetText = 0;
+                    while (patternMatches(UTF8_BOM, content, offsetText, length) && offsetText < content.length) {
+                        offsetText += UTF8_BOM.length;
+                    }
+                    while (offsetText < content.length && spaceCharacters[content[offsetText] & 0xFF]) {
+                        offsetText++;
+                    }
+                }
+                if (patternMatches(entry.getPattern(), content, offsetText, (length-offsetText))) {
                    return entry.getMimeType();
                }
+            } else {
+                if (patternMatches(entry.getPattern(), content, 0, length)) {
+                    return entry.getMimeType();
+                }
+            }
        }

        // No mime-type detected.
@@ -143,7 +163,7 @@ public class MimeTypeDetector {
            return false;
        }

-        for (int i = 0; i < pattern.length; i++) {
+        for (int i = 0; i < pattern.length && (offset + i) < content.length; i++) {
            if (pattern[i] != content[offset + i]) {
                return false;
            }
@@ -162,7 +182,7 @@ public class MimeTypeDetector {

        try {
            int contentLength = is.read(content);
-            return detect(content, 0, contentLength);
+            return detect(content, contentLength);
        } finally {
            is.reset();
        }
--- a/src/main/java/crawlercommons/sitemaps/SiteMapParser.java
+++ b/src/main/java/crawlercommons/sitemaps/SiteMapParser.java
@@ -192,6 +192,10 @@ public class SiteMapParser {
        }

        String contentType = mimeTypeDetector.detect(content);
+        if (contentType == null) {
+            String msg = String.format(Locale.ROOT, "Failed to detect MediaType of sitemap '%s'", url);
+            throw new UnknownFormatException(msg);
+        }
        return parseSiteMap(contentType, content, url);
    }

@@ -217,6 +221,7 @@ public class SiteMapParser {
    public AbstractSiteMap parseSiteMap(String contentType, byte[] content, URL url) throws UnknownFormatException, IOException {
        String mimeType = mimeTypeDetector.normalize(contentType, content);

+        String msg;
        if (mimeTypeDetector.isXml(mimeType)) {
            return processXml(url, content);
        } else if (mimeTypeDetector.isText(mimeType)) {
@@ -228,14 +233,19 @@ public class SiteMapParser {
                    return processGzippedXML(url, content);
                } else if (mimeTypeDetector.isText(compressedType)) {
                    return processText(url, decompressed);
+                } else if (compressedType == null) {
+                    msg = String.format(Locale.ROOT, "Failed to detect embedded MediaType of gzipped sitemap '%s'", url);
+                } else {
+                    msg = String.format(Locale.ROOT, "Can't parse a sitemap with MediaType '%s' (embedded in %s) from '%s'", compressedType, contentType, url);
                }
            } catch (Exception e) {
-                String msg = String.format(Locale.ROOT, "Failed to detect embedded MediaType of gzipped sitemap '%s'", url);
+                msg = String.format(Locale.ROOT, "Failed to detect embedded MediaType of gzipped sitemap '%s'", url);
                throw new UnknownFormatException(msg, e);
            }
+        } else {
+            msg = String.format(Locale.ROOT, "Can't parse a sitemap with MediaType '%s' from '%s'", contentType, url);
        }

-        String msg = String.format(Locale.ROOT, "Can't parse a sitemap with MediaType '%s' from '%s'", contentType, url);
        throw new UnknownFormatException(msg);
    }

--- a/src/test/java/crawlercommons/mimetypes/MimeTypeDetectorTest.java
+++ b/src/test/java/crawlercommons/mimetypes/MimeTypeDetectorTest.java
@@ -53,6 +53,33 @@ public class MimeTypeDetectorTest {
        assertTrue(detector.isGzip(mimeType));
    }

+    @Test
+    public void testLeadingSpace() throws IOException {
+        MimeTypeDetector detector = new MimeTypeDetector();
+
+        byte[] whitespace = { (byte) 0x20, (byte) 0x0a };
+        byte[] content = getSitemap("sitemap.txt");
+        byte[] wscontent = new byte[whitespace.length + content.length];
+        System.arraycopy(whitespace, 0, wscontent, 0, whitespace.length);
+        System.arraycopy(content, 0, wscontent, whitespace.length, content.length);
+
+        String mimeType = detector.detect(wscontent);
+        assertFalse(detector.isXml(mimeType));
+        assertTrue(detector.isText(mimeType));
+        assertFalse(detector.isGzip(mimeType));
+
+        content = getSitemap("sitemap-with-bom.txt");
+        wscontent = new byte[whitespace.length + content.length];
+        System.arraycopy(content, 0, wscontent, 0, 3);
+        System.arraycopy(whitespace, 0, wscontent, 3, whitespace.length);
+        System.arraycopy(content, 3, wscontent, (3 + whitespace.length), (content.length - 3));
+
+        mimeType = detector.detect(wscontent);
+        assertFalse(detector.isXml(mimeType));
+        assertTrue(detector.isText(mimeType));
+        assertFalse(detector.isGzip(mimeType));
+    }
+
    private byte[] getSitemap(String filename) throws IOException {
        return IOUtils.toByteArray(MimeTypeDetectorTest.class.getResourceAsStream("/sitemaps/" + filename));
    }
--- a/src/test/java/crawlercommons/sitemaps/SiteMapParserTest.java
+++ b/src/test/java/crawlercommons/sitemaps/SiteMapParserTest.java
@@ -411,6 +411,15 @@ public class SiteMapParserTest {
        SiteMap sm = (SiteMap) asm;
        assertEquals(1, sm.getSiteMapUrls().size());
        assertEquals("http://www.example.com/pub/2000/08/09/xslt/xslt.html", sm.getSiteMapUrls().iterator().next().getUrl().toString());
+
+        // Test RDF content type
+        contentType = "application/rdf+xml";
+        asm = parser.parseSiteMap(contentType, content, url);
+        assertEquals(1, ((SiteMap) asm).getSiteMapUrls().size());
+
+        // Test without content type
+        asm = parser.parseSiteMap(content, url);
+        assertEquals(1, ((SiteMap) asm).getSiteMapUrls().size());
    }

    @Test