1
0
Fork 0
mirror of https://github.com/crawler-commons/crawler-commons synced 2024-05-25 11:46:03 +02:00

Fix MIME detection for sitemaps:

- avoid NPE if no MIME type has been detected
- allow optional leading white space before MIME patterns
  (after optional BOM)
This commit is contained in:
Sebastian Nagel 2018-04-10 19:17:14 +02:00
parent 12155888bc
commit 34c19d811a
3 changed files with 91 additions and 31 deletions

View File

@ -44,25 +44,33 @@ public class MimeTypeDetector {
(byte) 0xBF
};
private static final int LEADING_WHITESPACE_MAX_SKIP = 16;
private final static boolean[] spaceCharacters = new boolean[256];
static {
spaceCharacters[0x09] = true; // \t - character tabulation (ht)
spaceCharacters[0x0a] = true; // \n - line feed (lf)
spaceCharacters[0x0b] = true; // line tabulation (vt)
spaceCharacters[0x0c] = true; // form feed (ff)
spaceCharacters[0x0d] = true; // \r - carriage return (cr)
spaceCharacters[0x20] = true; // space
}
private static class MimeTypeEntry {
private String mimeType;
private byte[] pattern;
private boolean allowBOM;
private boolean allowLeadingSpace;
public MimeTypeEntry(String mimeType, String pattern) {
this(mimeType, pattern, false);
this(mimeType, pattern, false, false);
}
public MimeTypeEntry(String mimeType, String pattern, boolean addBOM) {
public MimeTypeEntry(String mimeType, String pattern, boolean allowBOM, boolean allowLeadingSpace) {
this.mimeType = mimeType;
byte[] patternBytes = pattern.getBytes(StandardCharsets.UTF_8);
if (addBOM) {
this.pattern = new byte[UTF8_BOM.length + patternBytes.length];
System.arraycopy(UTF8_BOM, 0, this.pattern, 0, UTF8_BOM.length);
System.arraycopy(patternBytes, 0, this.pattern, UTF8_BOM.length, patternBytes.length);
} else {
this.pattern = patternBytes;
}
this.allowBOM = allowBOM;
this.allowLeadingSpace = allowLeadingSpace;
this.pattern = pattern.getBytes(StandardCharsets.UTF_8);
}
public MimeTypeEntry(String mimeType, int... pattern) {
@ -95,43 +103,64 @@ public class MimeTypeDetector {
mimeTypes = new ArrayList<>();
// Add all text patterns without and with a BOM.
mimeTypes.add(new MimeTypeEntry(XML_MIMETYPES[0], "<?xml"));
mimeTypes.add(new MimeTypeEntry(XML_MIMETYPES[0], "<?xml", true));
mimeTypes.add(new MimeTypeEntry(XML_MIMETYPES[0], "<?XML"));
mimeTypes.add(new MimeTypeEntry(XML_MIMETYPES[0], "<?XML", true));
mimeTypes.add(new MimeTypeEntry(XML_MIMETYPES[0], "<!--"));
mimeTypes.add(new MimeTypeEntry(XML_MIMETYPES[0], "<!--", true));
mimeTypes.add(new MimeTypeEntry(XML_MIMETYPES[0], "<?xml", true, true));
mimeTypes.add(new MimeTypeEntry(XML_MIMETYPES[0], "<?XML", true, true));
mimeTypes.add(new MimeTypeEntry(XML_MIMETYPES[0], "<!--", true, true));
mimeTypes.add(new MimeTypeEntry(XML_MIMETYPES[0], "<urlset", true, true));
mimeTypes.add(new MimeTypeEntry(XML_MIMETYPES[0], "<sitemapindex", true, true));
mimeTypes.add(new MimeTypeEntry(XML_MIMETYPES[0], "<rss", true, true));
mimeTypes.add(new MimeTypeEntry(XML_MIMETYPES[0], "<feed", true, true));
mimeTypes.add(new MimeTypeEntry(TEXT_MIMETYPES[0], "http://"));
mimeTypes.add(new MimeTypeEntry(TEXT_MIMETYPES[0], "http://", true));
mimeTypes.add(new MimeTypeEntry(TEXT_MIMETYPES[0], "https://"));
mimeTypes.add(new MimeTypeEntry(TEXT_MIMETYPES[0], "https://", true));
mimeTypes.add(new MimeTypeEntry(TEXT_MIMETYPES[0], "http://", true, true));
mimeTypes.add(new MimeTypeEntry(TEXT_MIMETYPES[0], "https://", true, true));
mimeTypes.add(new MimeTypeEntry(GZIP_MIMETYPES[0], "\037\213"));
mimeTypes.add(new MimeTypeEntry(GZIP_MIMETYPES[0], 0x1F, 0x8B));
maxPatternLength = 0;
for (MimeTypeEntry entry : mimeTypes) {
maxPatternLength = Math.max(maxPatternLength, entry.getPattern().length);
int length = entry.getPattern().length;
if (entry.allowBOM)
length += UTF8_BOM.length;
if (entry.allowLeadingSpace)
length += LEADING_WHITESPACE_MAX_SKIP;
maxPatternLength = Math.max(maxPatternLength, length);
}
}
public String detect(byte[] content) {
for (MimeTypeEntry entry : mimeTypes) {
if (patternMatches(entry.getPattern(), content, 0, content.length)) {
return entry.getMimeType();
}
}
// No mime-type detected.
return null;
return detect(content, 0, content.length);
}
public String detect(byte[] content, int offset, int length) {
int offsetBOM = -1;
int offsetSpace = -1;
for (MimeTypeEntry entry : mimeTypes) {
if (patternMatches(entry.getPattern(), content, offset, length)) {
return entry.getMimeType();
}
if (entry.allowBOM) {
if (offsetBOM == -1) {
offsetBOM = offset;
while (patternMatches(UTF8_BOM, content, offsetBOM, length) && offsetBOM < content.length) {
offsetBOM += UTF8_BOM.length;
}
}
if (patternMatches(entry.getPattern(), content, offsetBOM, length)) {
return entry.getMimeType();
}
}
if (entry.allowLeadingSpace) {
if (offsetSpace == -1) {
offsetSpace = (offsetBOM > 0 ? offsetBOM : 0);
while (offsetSpace < content.length && spaceCharacters[content[offsetSpace] & 0xFF]) {
offsetSpace++;
}
}
if (patternMatches(entry.getPattern(), content, offsetSpace, length)) {
return entry.getMimeType();
}
}
}
// No mime-type detected.
@ -143,7 +172,7 @@ public class MimeTypeDetector {
return false;
}
for (int i = 0; i < pattern.length; i++) {
for (int i = 0; i < pattern.length && (offset + i) < content.length; i++) {
if (pattern[i] != content[offset + i]) {
return false;
}

View File

@ -192,6 +192,10 @@ public class SiteMapParser {
}
String contentType = mimeTypeDetector.detect(content);
if (contentType == null) {
String msg = String.format(Locale.ROOT, "Failed to detect MediaType of sitemap '%s'", url);
throw new UnknownFormatException(msg);
}
return parseSiteMap(contentType, content, url);
}

View File

@ -53,6 +53,33 @@ public class MimeTypeDetectorTest {
assertTrue(detector.isGzip(mimeType));
}
@Test
public void testLeadingSpace() throws IOException {
MimeTypeDetector detector = new MimeTypeDetector();
byte[] whitespace = { (byte) 0x20, (byte) 0x0a };
byte[] content = getSitemap("sitemap.txt");
byte[] wscontent = new byte[whitespace.length + content.length];
System.arraycopy(whitespace, 0, wscontent, 0, whitespace.length);
System.arraycopy(content, 0, wscontent, whitespace.length, content.length);
String mimeType = detector.detect(wscontent);
assertFalse(detector.isXml(mimeType));
assertTrue(detector.isText(mimeType));
assertFalse(detector.isGzip(mimeType));
content = getSitemap("sitemap-with-bom.txt");
wscontent = new byte[whitespace.length + content.length];
System.arraycopy(content, 0, wscontent, 0, 3);
System.arraycopy(whitespace, 0, wscontent, 3, whitespace.length);
System.arraycopy(content, 3, wscontent, (3 + whitespace.length), (content.length - 3));
mimeType = detector.detect(wscontent);
assertFalse(detector.isXml(mimeType));
assertTrue(detector.isText(mimeType));
assertFalse(detector.isGzip(mimeType));
}
private byte[] getSitemap(String filename) throws IOException {
return IOUtils.toByteArray(MimeTypeDetectorTest.class.getResourceAsStream("/sitemaps/" + filename));
}