diff --git a/src/main/java/crawlercommons/mimetypes/MimeTypeDetector.java b/src/main/java/crawlercommons/mimetypes/MimeTypeDetector.java index bd61382..74119a1 100644 --- a/src/main/java/crawlercommons/mimetypes/MimeTypeDetector.java +++ b/src/main/java/crawlercommons/mimetypes/MimeTypeDetector.java @@ -44,25 +44,33 @@ public class MimeTypeDetector { (byte) 0xBF }; + private static final int LEADING_WHITESPACE_MAX_SKIP = 16; + + private final static boolean[] spaceCharacters = new boolean[256]; + static { + spaceCharacters[0x09] = true; // \t - character tabulation (ht) + spaceCharacters[0x0a] = true; // \n - line feed (lf) + spaceCharacters[0x0b] = true; // line tabulation (vt) + spaceCharacters[0x0c] = true; // form feed (ff) + spaceCharacters[0x0d] = true; // \r - carriage return (cr) + spaceCharacters[0x20] = true; // space + } + private static class MimeTypeEntry { private String mimeType; private byte[] pattern; + private boolean allowBOM; + private boolean allowLeadingSpace; public MimeTypeEntry(String mimeType, String pattern) { - this(mimeType, pattern, false); + this(mimeType, pattern, false, false); } - public MimeTypeEntry(String mimeType, String pattern, boolean addBOM) { + public MimeTypeEntry(String mimeType, String pattern, boolean allowBOM, boolean allowLeadingSpace) { this.mimeType = mimeType; - - byte[] patternBytes = pattern.getBytes(StandardCharsets.UTF_8); - if (addBOM) { - this.pattern = new byte[UTF8_BOM.length + patternBytes.length]; - System.arraycopy(UTF8_BOM, 0, this.pattern, 0, UTF8_BOM.length); - System.arraycopy(patternBytes, 0, this.pattern, UTF8_BOM.length, patternBytes.length); - } else { - this.pattern = patternBytes; - } + this.allowBOM = allowBOM; + this.allowLeadingSpace = allowLeadingSpace; + this.pattern = pattern.getBytes(StandardCharsets.UTF_8); } public MimeTypeEntry(String mimeType, int... pattern) { @@ -95,43 +103,64 @@ public class MimeTypeDetector { mimeTypes = new ArrayList<>(); // Add all text patterns without and with a BOM. - mimeTypes.add(new MimeTypeEntry(XML_MIMETYPES[0], " 0 ? offsetBOM : 0); + while (offsetSpace < content.length && spaceCharacters[content[offsetSpace] & 0xFF]) { + offsetSpace++; + } + } + if (patternMatches(entry.getPattern(), content, offsetSpace, length)) { + return entry.getMimeType(); + } + } } // No mime-type detected. @@ -143,7 +172,7 @@ public class MimeTypeDetector { return false; } - for (int i = 0; i < pattern.length; i++) { + for (int i = 0; i < pattern.length && (offset + i) < content.length; i++) { if (pattern[i] != content[offset + i]) { return false; } diff --git a/src/main/java/crawlercommons/sitemaps/SiteMapParser.java b/src/main/java/crawlercommons/sitemaps/SiteMapParser.java index 3720eb2..b87884d 100644 --- a/src/main/java/crawlercommons/sitemaps/SiteMapParser.java +++ b/src/main/java/crawlercommons/sitemaps/SiteMapParser.java @@ -192,6 +192,10 @@ public class SiteMapParser { } String contentType = mimeTypeDetector.detect(content); + if (contentType == null) { + String msg = String.format(Locale.ROOT, "Failed to detect MediaType of sitemap '%s'", url); + throw new UnknownFormatException(msg); + } return parseSiteMap(contentType, content, url); } diff --git a/src/test/java/crawlercommons/mimetypes/MimeTypeDetectorTest.java b/src/test/java/crawlercommons/mimetypes/MimeTypeDetectorTest.java index bcfa999..7716bb7 100644 --- a/src/test/java/crawlercommons/mimetypes/MimeTypeDetectorTest.java +++ b/src/test/java/crawlercommons/mimetypes/MimeTypeDetectorTest.java @@ -53,6 +53,33 @@ public class MimeTypeDetectorTest { assertTrue(detector.isGzip(mimeType)); } + @Test + public void testLeadingSpace() throws IOException { + MimeTypeDetector detector = new MimeTypeDetector(); + + byte[] whitespace = { (byte) 0x20, (byte) 0x0a }; + byte[] content = getSitemap("sitemap.txt"); + byte[] wscontent = new byte[whitespace.length + content.length]; + System.arraycopy(whitespace, 0, wscontent, 0, whitespace.length); + System.arraycopy(content, 0, wscontent, whitespace.length, content.length); + + String mimeType = detector.detect(wscontent); + assertFalse(detector.isXml(mimeType)); + assertTrue(detector.isText(mimeType)); + assertFalse(detector.isGzip(mimeType)); + + content = getSitemap("sitemap-with-bom.txt"); + wscontent = new byte[whitespace.length + content.length]; + System.arraycopy(content, 0, wscontent, 0, 3); + System.arraycopy(whitespace, 0, wscontent, 3, whitespace.length); + System.arraycopy(content, 3, wscontent, (3 + whitespace.length), (content.length - 3)); + + mimeType = detector.detect(wscontent); + assertFalse(detector.isXml(mimeType)); + assertTrue(detector.isText(mimeType)); + assertFalse(detector.isGzip(mimeType)); + } + private byte[] getSitemap(String filename) throws IOException { return IOUtils.toByteArray(MimeTypeDetectorTest.class.getResourceAsStream("/sitemaps/" + filename)); }