mirror of
https://github.com/crawler-commons/crawler-commons
synced 2024-09-24 09:40:41 +02:00
Merge pull request #200 from sebastian-nagel/cc-198-fix-regressions
Improve MIME detection for sitemaps
This commit is contained in:
commit
a9277acde2
@ -6,6 +6,7 @@ Current Development 0.10-SNAPSHOT (yyyy-mm-dd)
|
||||
- Add main() to EffectiveTldFinder (sebastian-nagel) #187
|
||||
- Handle new suffixes in PaidLevelDomain (kkrugler) #183
|
||||
- Remove Tika dependency (kkrugler) #199
|
||||
- Improve MIME detection for sitemaps (sebastian-nagel) #200
|
||||
|
||||
Release 0.9 (2017-10-27)
|
||||
- [Sitemaps] Removed DOM-based sitemap parser (jnioche) #177
|
||||
|
@ -15,7 +15,8 @@ public class MimeTypeDetector {
|
||||
"text/xml",
|
||||
"application/atom+xml",
|
||||
"application/rss+xml",
|
||||
"text/rss"
|
||||
"text/rss",
|
||||
"application/rdf+xml"
|
||||
};
|
||||
|
||||
private static String[] TEXT_MIMETYPES = new String[] {
|
||||
@ -44,25 +45,31 @@ public class MimeTypeDetector {
|
||||
(byte) 0xBF
|
||||
};
|
||||
|
||||
private static final int LEADING_WHITESPACE_MAX_SKIP = 32;
|
||||
|
||||
private final static boolean[] spaceCharacters = new boolean[256];
|
||||
static {
|
||||
spaceCharacters[0x09] = true; // \t - character tabulation (ht)
|
||||
spaceCharacters[0x0a] = true; // \n - line feed (lf)
|
||||
spaceCharacters[0x0b] = true; // line tabulation (vt)
|
||||
spaceCharacters[0x0c] = true; // form feed (ff)
|
||||
spaceCharacters[0x0d] = true; // \r - carriage return (cr)
|
||||
spaceCharacters[0x20] = true; // space
|
||||
}
|
||||
|
||||
private static class MimeTypeEntry {
|
||||
private String mimeType;
|
||||
private byte[] pattern;
|
||||
private boolean isTextPattern;
|
||||
|
||||
public MimeTypeEntry(String mimeType, String pattern) {
|
||||
this(mimeType, pattern, false);
|
||||
}
|
||||
|
||||
public MimeTypeEntry(String mimeType, String pattern, boolean addBOM) {
|
||||
public MimeTypeEntry(String mimeType, String pattern, boolean isTextPattern) {
|
||||
this.mimeType = mimeType;
|
||||
|
||||
byte[] patternBytes = pattern.getBytes(StandardCharsets.UTF_8);
|
||||
if (addBOM) {
|
||||
this.pattern = new byte[UTF8_BOM.length + patternBytes.length];
|
||||
System.arraycopy(UTF8_BOM, 0, this.pattern, 0, UTF8_BOM.length);
|
||||
System.arraycopy(patternBytes, 0, this.pattern, UTF8_BOM.length, patternBytes.length);
|
||||
} else {
|
||||
this.pattern = patternBytes;
|
||||
}
|
||||
this.isTextPattern = isTextPattern;
|
||||
this.pattern = pattern.getBytes(StandardCharsets.UTF_8);
|
||||
}
|
||||
|
||||
public MimeTypeEntry(String mimeType, int... pattern) {
|
||||
@ -95,16 +102,16 @@ public class MimeTypeDetector {
|
||||
mimeTypes = new ArrayList<>();
|
||||
|
||||
// Add all text patterns without and with a BOM.
|
||||
mimeTypes.add(new MimeTypeEntry(XML_MIMETYPES[0], "<?xml"));
|
||||
mimeTypes.add(new MimeTypeEntry(XML_MIMETYPES[0], "<?xml", true));
|
||||
mimeTypes.add(new MimeTypeEntry(XML_MIMETYPES[0], "<?XML"));
|
||||
mimeTypes.add(new MimeTypeEntry(XML_MIMETYPES[0], "<?XML", true));
|
||||
mimeTypes.add(new MimeTypeEntry(XML_MIMETYPES[0], "<!--"));
|
||||
mimeTypes.add(new MimeTypeEntry(XML_MIMETYPES[0], "<!--", true));
|
||||
mimeTypes.add(new MimeTypeEntry(XML_MIMETYPES[0], "<urlset", true));
|
||||
mimeTypes.add(new MimeTypeEntry(XML_MIMETYPES[0], "<sitemapindex", true));
|
||||
mimeTypes.add(new MimeTypeEntry(XML_MIMETYPES[0], "<rss", true));
|
||||
mimeTypes.add(new MimeTypeEntry(XML_MIMETYPES[0], "<feed", true));
|
||||
mimeTypes.add(new MimeTypeEntry(XML_MIMETYPES[0], "<rdf", true));
|
||||
|
||||
mimeTypes.add(new MimeTypeEntry(TEXT_MIMETYPES[0], "http://"));
|
||||
mimeTypes.add(new MimeTypeEntry(TEXT_MIMETYPES[0], "http://", true));
|
||||
mimeTypes.add(new MimeTypeEntry(TEXT_MIMETYPES[0], "https://"));
|
||||
mimeTypes.add(new MimeTypeEntry(TEXT_MIMETYPES[0], "https://", true));
|
||||
|
||||
mimeTypes.add(new MimeTypeEntry(GZIP_MIMETYPES[0], "\037\213"));
|
||||
@ -112,26 +119,39 @@ public class MimeTypeDetector {
|
||||
|
||||
maxPatternLength = 0;
|
||||
for (MimeTypeEntry entry : mimeTypes) {
|
||||
maxPatternLength = Math.max(maxPatternLength, entry.getPattern().length);
|
||||
int length = entry.getPattern().length;
|
||||
if (entry.isTextPattern)
|
||||
length += LEADING_WHITESPACE_MAX_SKIP;
|
||||
maxPatternLength = Math.max(maxPatternLength, length);
|
||||
}
|
||||
}
|
||||
|
||||
public String detect(byte[] content) {
|
||||
for (MimeTypeEntry entry : mimeTypes) {
|
||||
if (patternMatches(entry.getPattern(), content, 0, content.length)) {
|
||||
return entry.getMimeType();
|
||||
}
|
||||
return detect(content, content.length);
|
||||
}
|
||||
|
||||
// No mime-type detected.
|
||||
return null;
|
||||
}
|
||||
public String detect(byte[] content, int length) {
|
||||
int offsetText = -1;
|
||||
|
||||
public String detect(byte[] content, int offset, int length) {
|
||||
for (MimeTypeEntry entry : mimeTypes) {
|
||||
if (patternMatches(entry.getPattern(), content, offset, length)) {
|
||||
if (entry.isTextPattern) {
|
||||
if (offsetText == -1) {
|
||||
offsetText = 0;
|
||||
while (patternMatches(UTF8_BOM, content, offsetText, length) && offsetText < content.length) {
|
||||
offsetText += UTF8_BOM.length;
|
||||
}
|
||||
while (offsetText < content.length && spaceCharacters[content[offsetText] & 0xFF]) {
|
||||
offsetText++;
|
||||
}
|
||||
}
|
||||
if (patternMatches(entry.getPattern(), content, offsetText, (length-offsetText))) {
|
||||
return entry.getMimeType();
|
||||
}
|
||||
} else {
|
||||
if (patternMatches(entry.getPattern(), content, 0, length)) {
|
||||
return entry.getMimeType();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// No mime-type detected.
|
||||
@ -143,7 +163,7 @@ public class MimeTypeDetector {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int i = 0; i < pattern.length; i++) {
|
||||
for (int i = 0; i < pattern.length && (offset + i) < content.length; i++) {
|
||||
if (pattern[i] != content[offset + i]) {
|
||||
return false;
|
||||
}
|
||||
@ -162,7 +182,7 @@ public class MimeTypeDetector {
|
||||
|
||||
try {
|
||||
int contentLength = is.read(content);
|
||||
return detect(content, 0, contentLength);
|
||||
return detect(content, contentLength);
|
||||
} finally {
|
||||
is.reset();
|
||||
}
|
||||
|
@ -192,6 +192,10 @@ public class SiteMapParser {
|
||||
}
|
||||
|
||||
String contentType = mimeTypeDetector.detect(content);
|
||||
if (contentType == null) {
|
||||
String msg = String.format(Locale.ROOT, "Failed to detect MediaType of sitemap '%s'", url);
|
||||
throw new UnknownFormatException(msg);
|
||||
}
|
||||
return parseSiteMap(contentType, content, url);
|
||||
}
|
||||
|
||||
@ -217,6 +221,7 @@ public class SiteMapParser {
|
||||
public AbstractSiteMap parseSiteMap(String contentType, byte[] content, URL url) throws UnknownFormatException, IOException {
|
||||
String mimeType = mimeTypeDetector.normalize(contentType, content);
|
||||
|
||||
String msg;
|
||||
if (mimeTypeDetector.isXml(mimeType)) {
|
||||
return processXml(url, content);
|
||||
} else if (mimeTypeDetector.isText(mimeType)) {
|
||||
@ -228,14 +233,19 @@ public class SiteMapParser {
|
||||
return processGzippedXML(url, content);
|
||||
} else if (mimeTypeDetector.isText(compressedType)) {
|
||||
return processText(url, decompressed);
|
||||
} else if (compressedType == null) {
|
||||
msg = String.format(Locale.ROOT, "Failed to detect embedded MediaType of gzipped sitemap '%s'", url);
|
||||
} else {
|
||||
msg = String.format(Locale.ROOT, "Can't parse a sitemap with MediaType '%s' (embedded in %s) from '%s'", compressedType, contentType, url);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
String msg = String.format(Locale.ROOT, "Failed to detect embedded MediaType of gzipped sitemap '%s'", url);
|
||||
msg = String.format(Locale.ROOT, "Failed to detect embedded MediaType of gzipped sitemap '%s'", url);
|
||||
throw new UnknownFormatException(msg, e);
|
||||
}
|
||||
} else {
|
||||
msg = String.format(Locale.ROOT, "Can't parse a sitemap with MediaType '%s' from '%s'", contentType, url);
|
||||
}
|
||||
|
||||
String msg = String.format(Locale.ROOT, "Can't parse a sitemap with MediaType '%s' from '%s'", contentType, url);
|
||||
throw new UnknownFormatException(msg);
|
||||
}
|
||||
|
||||
|
@ -53,6 +53,33 @@ public class MimeTypeDetectorTest {
|
||||
assertTrue(detector.isGzip(mimeType));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLeadingSpace() throws IOException {
|
||||
MimeTypeDetector detector = new MimeTypeDetector();
|
||||
|
||||
byte[] whitespace = { (byte) 0x20, (byte) 0x0a };
|
||||
byte[] content = getSitemap("sitemap.txt");
|
||||
byte[] wscontent = new byte[whitespace.length + content.length];
|
||||
System.arraycopy(whitespace, 0, wscontent, 0, whitespace.length);
|
||||
System.arraycopy(content, 0, wscontent, whitespace.length, content.length);
|
||||
|
||||
String mimeType = detector.detect(wscontent);
|
||||
assertFalse(detector.isXml(mimeType));
|
||||
assertTrue(detector.isText(mimeType));
|
||||
assertFalse(detector.isGzip(mimeType));
|
||||
|
||||
content = getSitemap("sitemap-with-bom.txt");
|
||||
wscontent = new byte[whitespace.length + content.length];
|
||||
System.arraycopy(content, 0, wscontent, 0, 3);
|
||||
System.arraycopy(whitespace, 0, wscontent, 3, whitespace.length);
|
||||
System.arraycopy(content, 3, wscontent, (3 + whitespace.length), (content.length - 3));
|
||||
|
||||
mimeType = detector.detect(wscontent);
|
||||
assertFalse(detector.isXml(mimeType));
|
||||
assertTrue(detector.isText(mimeType));
|
||||
assertFalse(detector.isGzip(mimeType));
|
||||
}
|
||||
|
||||
private byte[] getSitemap(String filename) throws IOException {
|
||||
return IOUtils.toByteArray(MimeTypeDetectorTest.class.getResourceAsStream("/sitemaps/" + filename));
|
||||
}
|
||||
|
@ -411,6 +411,15 @@ public class SiteMapParserTest {
|
||||
SiteMap sm = (SiteMap) asm;
|
||||
assertEquals(1, sm.getSiteMapUrls().size());
|
||||
assertEquals("http://www.example.com/pub/2000/08/09/xslt/xslt.html", sm.getSiteMapUrls().iterator().next().getUrl().toString());
|
||||
|
||||
// Test RDF content type
|
||||
contentType = "application/rdf+xml";
|
||||
asm = parser.parseSiteMap(contentType, content, url);
|
||||
assertEquals(1, ((SiteMap) asm).getSiteMapUrls().size());
|
||||
|
||||
// Test without content type
|
||||
asm = parser.parseSiteMap(content, url);
|
||||
assertEquals(1, ((SiteMap) asm).getSiteMapUrls().size());
|
||||
}
|
||||
|
||||
@Test
|
||||
|
Loading…
Reference in New Issue
Block a user