1
0
Fork 0
mirror of https://github.com/crawler-commons/crawler-commons synced 2024-05-24 19:36:06 +02:00

Merge pull request #198 from crawler-commons/kkrugler_no-tika

Remove Tika dependency
This commit is contained in:
Ken Krugler 2018-04-02 12:55:40 -07:00 committed by GitHub
commit 14153c4eae
Signed by: GitHub
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 363 additions and 125 deletions

View File

@ -316,7 +316,6 @@
<!-- Dependencies -->
<commons-io.version>2.4</commons-io.version>
<tika-core.version>1.17</tika-core.version>
<slf4j-api.version>1.7.7</slf4j-api.version>
<junit.version>4.7</junit.version>
<slf4j-log4j12.version>1.7.7</slf4j-log4j12.version>
@ -358,12 +357,6 @@
<version>${commons-io.version}</version>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>${tika-core.version}</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>

View File

@ -0,0 +1,231 @@
package crawlercommons.mimetypes;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
/**
 * Lightweight mime-type detector for the few content types a sitemap can
 * have: XML, plain text (a list of URLs) and gzip.
 *
 * <p>
 * Detection compares the leading bytes of the content against a fixed list
 * of byte patterns. Textual patterns are registered both without and with a
 * leading UTF-8 byte order mark.
 * </p>
 */
public class MimeTypeDetector {

    // For every group below, element 0 is the canonical mime-type and the
    // remaining elements are aliases that normalize() maps onto it.
    private static final String[] XML_MIMETYPES = {
        "application/xml",
        "application/x-xml",
        "text/xml",
        "application/atom+xml",
        "application/rss+xml",
        "text/rss"
    };

    private static final String[] TEXT_MIMETYPES = {
        "text/plain"
    };

    private static final String[] GZIP_MIMETYPES = {
        "application/gzip",
        "application/gzip-compressed",
        "application/gzipped",
        "application/x-gzip",
        "application/x-gzip-compressed",
        "application/x-gunzip",
        "gzip/document"
    };

    private static final String[][] MIMETYPES = {
        XML_MIMETYPES,
        TEXT_MIMETYPES,
        GZIP_MIMETYPES
    };

    private static final byte[] UTF8_BOM = {
        (byte) 0xEF,
        (byte) 0xBB,
        (byte) 0xBF
    };

    /** Associates a mime-type with the byte prefix that identifies it. */
    private static final class MimeTypeEntry {
        private final String mimeType;
        private final byte[] pattern;

        /** Textual pattern (UTF-8 encoded) without a byte order mark. */
        public MimeTypeEntry(String mimeType, String pattern) {
            this(mimeType, pattern, false);
        }

        /**
         * Textual pattern (UTF-8 encoded), optionally preceded by the UTF-8
         * byte order mark.
         */
        public MimeTypeEntry(String mimeType, String pattern, boolean addBOM) {
            this.mimeType = mimeType;

            byte[] patternBytes = pattern.getBytes(StandardCharsets.UTF_8);
            if (addBOM) {
                this.pattern = new byte[UTF8_BOM.length + patternBytes.length];
                System.arraycopy(UTF8_BOM, 0, this.pattern, 0, UTF8_BOM.length);
                System.arraycopy(patternBytes, 0, this.pattern, UTF8_BOM.length, patternBytes.length);
            } else {
                this.pattern = patternBytes;
            }
        }

        /**
         * Binary pattern; each int supplies one byte (only its low 8 bits
         * are used).
         */
        public MimeTypeEntry(String mimeType, int... pattern) {
            this.mimeType = mimeType;
            this.pattern = makeBytePattern(pattern);
        }

        private static byte[] makeBytePattern(int[] pattern) {
            byte[] result = new byte[pattern.length];
            for (int i = 0; i < pattern.length; i++) {
                result[i] = (byte) (pattern[i] & 0xFF);
            }

            return result;
        }

        public String getMimeType() {
            return mimeType;
        }

        public byte[] getPattern() {
            return pattern;
        }
    }

    private final List<MimeTypeEntry> mimeTypes;
    private final int maxPatternLength;

    /** Creates a detector with the built-in XML, text and gzip patterns. */
    public MimeTypeDetector() {
        mimeTypes = new ArrayList<>();

        // Add all text patterns without and with a BOM.
        mimeTypes.add(new MimeTypeEntry(XML_MIMETYPES[0], "<?xml"));
        mimeTypes.add(new MimeTypeEntry(XML_MIMETYPES[0], "<?xml", true));
        mimeTypes.add(new MimeTypeEntry(XML_MIMETYPES[0], "<?XML"));
        mimeTypes.add(new MimeTypeEntry(XML_MIMETYPES[0], "<?XML", true));
        mimeTypes.add(new MimeTypeEntry(XML_MIMETYPES[0], "<!--"));
        mimeTypes.add(new MimeTypeEntry(XML_MIMETYPES[0], "<!--", true));

        mimeTypes.add(new MimeTypeEntry(TEXT_MIMETYPES[0], "http://"));
        mimeTypes.add(new MimeTypeEntry(TEXT_MIMETYPES[0], "http://", true));
        mimeTypes.add(new MimeTypeEntry(TEXT_MIMETYPES[0], "https://"));
        mimeTypes.add(new MimeTypeEntry(TEXT_MIMETYPES[0], "https://", true));

        // The gzip magic number must be registered as raw bytes. Going
        // through a String would UTF-8 encode U+008B as 0xC2 0x8B, giving
        // the pattern 1F C2 8B, which is not the gzip signature.
        mimeTypes.add(new MimeTypeEntry(GZIP_MIMETYPES[0], 0x1F, 0x8B));

        int longest = 0;
        for (MimeTypeEntry entry : mimeTypes) {
            longest = Math.max(longest, entry.getPattern().length);
        }
        maxPatternLength = longest;
    }

    /**
     * Detects the mime-type from the leading bytes of <code>content</code>.
     *
     * @param content
     *            bytes to inspect, may be null
     * @return the canonical mime-type of the first matching pattern, or null
     *         if <code>content</code> is null or no pattern matches
     */
    public String detect(byte[] content) {
        if (content == null) {
            return null;
        }

        return detect(content, 0, content.length);
    }

    /**
     * Detects the mime-type from the bytes of <code>content</code> starting
     * at <code>offset</code>.
     *
     * @param content
     *            buffer to inspect, may be null
     * @param offset
     *            index of the first byte to compare
     * @param length
     *            number of valid bytes available from <code>offset</code>
     * @return the canonical mime-type of the first matching pattern, or null
     *         if nothing matches (including when the requested window does
     *         not fit inside <code>content</code>)
     */
    public String detect(byte[] content, int offset, int length) {
        if (content == null) {
            return null;
        }

        for (MimeTypeEntry entry : mimeTypes) {
            if (patternMatches(entry.getPattern(), content, offset, length)) {
                return entry.getMimeType();
            }
        }

        // No mime-type detected.
        return null;
    }

    private boolean patternMatches(byte[] pattern, byte[] content, int offset, int length) {
        // A pattern can only match if it fits completely into both the
        // caller-supplied window and the actual array; checking here avoids
        // an ArrayIndexOutOfBoundsException for inconsistent arguments.
        if (offset < 0 || pattern.length > length || pattern.length > content.length - offset) {
            return false;
        }

        for (int i = 0; i < pattern.length; i++) {
            if (pattern[i] != content[offset + i]) {
                return false;
            }
        }

        return true;
    }

    /**
     * Detects the mime-type from the first bytes of a stream. The stream is
     * marked before reading and reset afterwards.
     *
     * @param is
     *            stream to inspect, must support mark/reset
     * @return the canonical mime-type of the first matching pattern, or null
     *         if no pattern matches
     * @throws IllegalArgumentException
     *             if the stream does not support mark/reset
     * @throws IOException
     *             if reading from or resetting the stream fails
     */
    public String detect(InputStream is) throws IOException {
        if (!is.markSupported()) {
            throw new IllegalArgumentException("Can't detect mime type for input stream that doesn't support mark/reset");
        }

        is.mark(maxPatternLength);
        byte[] content = new byte[maxPatternLength];

        try {
            // A single read() may legally return fewer bytes than requested
            // even though more follow, so keep reading until the buffer is
            // full or the stream is exhausted.
            int contentLength = 0;
            while (contentLength < content.length) {
                int numRead = is.read(content, contentLength, content.length - contentLength);
                if (numRead <= 0) {
                    break;
                }
                contentLength += numRead;
            }

            return detect(content, 0, contentLength);
        } finally {
            is.reset();
        }
    }

    /**
     * @return true if <code>mimeType</code> is exactly one of the known XML
     *         mime-types, false otherwise (also for null)
     */
    public boolean isXml(String mimeType) {
        return contains(XML_MIMETYPES, mimeType);
    }

    /**
     * @return true if <code>mimeType</code> is exactly one of the known
     *         plain text mime-types, false otherwise (also for null)
     */
    public boolean isText(String mimeType) {
        return contains(TEXT_MIMETYPES, mimeType);
    }

    /**
     * @return true if <code>mimeType</code> is exactly one of the known gzip
     *         mime-types, false otherwise (also for null)
     */
    public boolean isGzip(String mimeType) {
        return contains(GZIP_MIMETYPES, mimeType);
    }

    /** Exact (case-sensitive) membership test; null is never a member. */
    private static boolean contains(String[] group, String mimeType) {
        if (mimeType == null) {
            return false;
        }

        for (String candidate : group) {
            if (mimeType.equals(candidate)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Maps a declared content type onto the canonical mime-type of its
     * group, falling back to content-based detection when the declared type
     * is missing or not one of the known types.
     *
     * @param contentType
     *            declared content type, compared case-insensitively; may be
     *            null
     * @param content
     *            content used for detection if <code>contentType</code> is
     *            not recognized
     * @return the canonical mime-type, or null if neither the declared type
     *         nor the content is recognized
     */
    public String normalize(String contentType, byte[] content) {
        // The declared type may be null, e.g. when it is itself the result
        // of a failed detect() call.
        if (contentType != null) {
            String normalizedContentType = contentType.toLowerCase(Locale.ROOT);
            for (String[] group : MIMETYPES) {
                for (String mimeType : group) {
                    if (normalizedContentType.equals(mimeType)) {
                        return group[0];
                    }
                }
            }
        }

        return detect(content);
    }
}

View File

@ -17,9 +17,8 @@
package crawlercommons.sitemaps;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.apache.tika.mime.MediaType.APPLICATION_XML;
import static org.apache.tika.mime.MediaType.TEXT_PLAIN;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
@ -28,26 +27,22 @@ import java.io.InputStreamReader;
import java.io.StringReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.zip.GZIPInputStream;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.BOMInputStream;
import org.apache.tika.Tika;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.EntityResolver;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import crawlercommons.mimetypes.MimeTypeDetector;
import crawlercommons.sitemaps.AbstractSiteMap.SitemapType;
import crawlercommons.sitemaps.sax.DelegatorHandler;
@ -67,18 +62,6 @@ public class SiteMapParser {
*/
public static final int MAX_BYTES_ALLOWED = 52428800;
/* Tika's MediaType components */
private static final Tika TIKA = new Tika();
private static final MediaTypeRegistry MEDIA_TYPE_REGISTRY = MediaTypeRegistry.getDefaultRegistry();
private static final List<MediaType> XML_MEDIA_TYPES = new ArrayList<>();
private static final List<MediaType> TEXT_MEDIA_TYPES = new ArrayList<>();
private static final List<MediaType> GZ_MEDIA_TYPES = new ArrayList<>();
static {
initMediaTypes();
}
/**
* True (by default) meaning that invalid URLs should be rejected, as the
* official docs allow the siteMapURLs to be only under the base url:
@ -94,6 +77,8 @@ public class SiteMapParser {
**/
protected boolean strictNamespace = false;
private MimeTypeDetector mimeTypeDetector;
public SiteMapParser() {
this(true, false);
}
@ -105,6 +90,8 @@ public class SiteMapParser {
public SiteMapParser(boolean strict, boolean allowPartial) {
this.strict = strict;
this.allowPartial = allowPartial;
this.mimeTypeDetector = new MimeTypeDetector();
}
/**
@ -203,8 +190,8 @@ public class SiteMapParser {
if (url == null) {
return null;
}
String filename = FilenameUtils.getName(url.getPath());
String contentType = TIKA.detect(content, filename);
String contentType = mimeTypeDetector.detect(content);
return parseSiteMap(contentType, content, url);
}
@ -228,41 +215,28 @@ public class SiteMapParser {
* {@link java.net.URL}
*/
public AbstractSiteMap parseSiteMap(String contentType, byte[] content, URL url) throws UnknownFormatException, IOException {
MediaType mediaType = MediaType.parse(contentType);
String mimeType = mimeTypeDetector.normalize(contentType, content);
// Octet-stream is the father of all binary types
while (mediaType != null && !mediaType.equals(MediaType.OCTET_STREAM)) {
if (XML_MEDIA_TYPES.contains(mediaType)) {
return processXml(url, content);
} else if (TEXT_MEDIA_TYPES.contains(mediaType)) {
return processText(url, content);
} else if (GZ_MEDIA_TYPES.contains(mediaType)) {
InputStream decompressed;
MediaType embeddedType;
try {
decompressed = new GZIPInputStream(new ByteArrayInputStream(content));
embeddedType = MediaType.parse(TIKA.detect(decompressed));
} catch (Exception e) {
UnknownFormatException err = new UnknownFormatException("Failed to detect embedded MediaType of gzipped sitemap: " + url + ", caused by " + e);
err.initCause(e);
throw err;
}
if (XML_MEDIA_TYPES.contains(embeddedType)) {
if (mimeTypeDetector.isXml(mimeType)) {
return processXml(url, content);
} else if (mimeTypeDetector.isText(mimeType)) {
return processText(url, content);
} else if (mimeTypeDetector.isGzip(mimeType)) {
try (InputStream decompressed = new BufferedInputStream(new GZIPInputStream(new ByteArrayInputStream(content)))) {
String compressedType = mimeTypeDetector.detect(decompressed);
if (mimeTypeDetector.isXml(compressedType)) {
return processGzippedXML(url, content);
} else if (TEXT_MEDIA_TYPES.contains(embeddedType)) {
// re-open decompressed stream and parse as text
decompressed = new GZIPInputStream(new ByteArrayInputStream(content));
} else if (mimeTypeDetector.isText(compressedType)) {
return processText(url, decompressed);
} else if (GZ_MEDIA_TYPES.contains(embeddedType)) {
throw new UnknownFormatException("Can't parse gzip recursively: " + url);
}
throw new UnknownFormatException("Can't parse a gzipped sitemap with the embedded MediaType of: " + embeddedType + " (at: " + url + ")");
} catch (Exception e) {
String msg = String.format(Locale.ROOT, "Failed to detect embedded MediaType of gzipped sitemap '%s'", url);
throw new UnknownFormatException(msg, e);
}
mediaType = MEDIA_TYPE_REGISTRY.getSupertype(mediaType); // Check
// parent
}
throw new UnknownFormatException("Can't parse a sitemap with the MediaType of: " + contentType + " (at: " + url + ")");
String msg = String.format(Locale.ROOT, "Can't parse a sitemap with MediaType '%s' from '%s'", contentType, url);
throw new UnknownFormatException(msg);
}
/**
@ -491,25 +465,4 @@ public class SiteMapParser {
return ret;
}
/**
* Performs a one time intialization of Tika's Media-Type components and
* media type collection constants <br/>
* Please note that this is a private static method which is called once per
* CLASS (not per instance / object)
*/
private static void initMediaTypes() {
/* XML media types (and all aliases) */
XML_MEDIA_TYPES.add(APPLICATION_XML);
XML_MEDIA_TYPES.addAll(MEDIA_TYPE_REGISTRY.getAliases(APPLICATION_XML));
/* TEXT media types (and all aliases) */
TEXT_MEDIA_TYPES.add(TEXT_PLAIN);
TEXT_MEDIA_TYPES.addAll(MEDIA_TYPE_REGISTRY.getAliases(TEXT_PLAIN));
/* GZIP media types (and all aliases) */
MediaType gzipMediaType = MediaType.parse("application/gzip");
GZ_MEDIA_TYPES.add(gzipMediaType);
GZ_MEDIA_TYPES.addAll(MEDIA_TYPE_REGISTRY.getAliases(gzipMediaType));
}
}

View File

@ -19,33 +19,15 @@ package crawlercommons.sitemaps;
@SuppressWarnings("serial")
public class UnknownFormatException extends Exception {
private final String error;
/** Default constructor - initializes instance variable to unknown */
public UnknownFormatException() {
super();
error = "unknown";
}
/**
* Constructor receives some kind of message that is saved in an instance
* variable.
*
* @param err
* a String object to use within the Execption
*/
public UnknownFormatException(String err) {
super(err);
error = err;
public UnknownFormatException(String message) {
super(message);
}
/**
* public method, callable by exception catcher. It returns the error
* message.
*
* @return a populated Exception as a String
*/
public String getError() {
return error;
public UnknownFormatException(String message, Throwable cause) {
super(message, cause);
}
}

View File

@ -0,0 +1,60 @@
package crawlercommons.mimetypes;
import static org.junit.Assert.*;
import java.io.IOException;
import org.apache.commons.io.IOUtils;
import org.junit.Test;
/**
 * Tests content-based detection in {@link MimeTypeDetector} against the
 * sitemap fixtures stored under <code>/sitemaps/</code> on the test
 * classpath.
 */
public class MimeTypeDetectorTest {

    @Test
    public void testXMLDetection() throws Exception {
        MimeTypeDetector detector = new MimeTypeDetector();

        assertDetected(detector, "atom.xml", true, false, false);
    }

    @Test
    public void testTextDetection() throws IOException {
        MimeTypeDetector detector = new MimeTypeDetector();

        assertDetected(detector, "sitemap.txt", false, true, false);
        assertDetected(detector, "sitemap-with-bom.txt", false, true, false);
    }

    @Test
    public void testGzipDetection() throws IOException {
        MimeTypeDetector detector = new MimeTypeDetector();

        assertDetected(detector, "xmlSitemap.gz", false, false, true);
        assertDetected(detector, "sitemap.txt.gz", false, false, true);
    }

    /**
     * Runs detection on the named fixture and checks which of the three
     * type predicates accept the detected mime-type.
     */
    private void assertDetected(MimeTypeDetector detector, String filename, boolean xml, boolean text, boolean gzip) throws IOException {
        byte[] content = getSitemap(filename);
        String mimeType = detector.detect(content);

        assertEquals(filename + " isXml", xml, detector.isXml(mimeType));
        assertEquals(filename + " isText", text, detector.isText(mimeType));
        assertEquals(filename + " isGzip", gzip, detector.isGzip(mimeType));
    }

    /**
     * Loads a fixture from <code>/sitemaps/</code> on the classpath.
     *
     * @param filename
     *            name of the fixture file
     * @return the complete file content
     * @throws IOException
     *             if the resource cannot be read
     */
    private byte[] getSitemap(String filename) throws IOException {
        String resource = "/sitemaps/" + filename;

        // Close the stream when done, and fail with the resource name
        // rather than an NPE inside IOUtils if the fixture is missing.
        try (java.io.InputStream is = MimeTypeDetectorTest.class.getResourceAsStream(resource)) {
            assertNotNull("Missing test resource: " + resource, is);
            return IOUtils.toByteArray(is);
        }
    }
}

View File

@ -30,6 +30,7 @@ import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Iterator;
@ -184,6 +185,22 @@ public class SiteMapParserTest {
assertEquals(2, sm.getSiteMapUrls().size());
}
@Test
public void testSitemapTXTWithWrongMimeType() throws UnknownFormatException, IOException {
SiteMapParser parser = new SiteMapParser();
String scontent = "http://www.example.com/catalog?item=1\nhttp://www.example.com/catalog?item=11";
byte[] content = scontent.getBytes(UTF_8);
URL url = new URL("http://www.example.com/sitemap.xml");
String contentType = "application/bogus";
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
assertEquals(false, asm.isIndex());
assertEquals(true, asm instanceof SiteMap);
SiteMap sm = (SiteMap) asm;
assertEquals(2, sm.getSiteMapUrls().size());
}
@Test
public void testSitemapXML() throws UnknownFormatException, IOException {
SiteMapParser parser = new SiteMapParser();
@ -200,7 +217,7 @@ public class SiteMapParserTest {
SiteMapURL[] found = sm.getSiteMapUrls().toArray(new SiteMapURL[5]);
for (int i = 0; i < found.length; i++) {
assertEquals(sitemapURLs[i].replaceAll("&amp;", "&"), found[i].getUrl().toExternalForm());
assertEquals(SITEMAP_URLS[i].replaceAll("&amp;", "&"), found[i].getUrl().toExternalForm());
}
}
@ -219,7 +236,7 @@ public class SiteMapParserTest {
assertEquals(5, sm.getSiteMapUrls().size());
SiteMapURL[] found = sm.getSiteMapUrls().toArray(new SiteMapURL[5]);
for (int i = 0; i < found.length; i++) {
assertEquals(sitemapURLs[i].replaceAll("&amp;", "&"), found[i].getUrl().toExternalForm());
assertEquals(SITEMAP_URLS[i].replaceAll("&amp;", "&"), found[i].getUrl().toExternalForm());
}
}
}
@ -303,23 +320,13 @@ public class SiteMapParserTest {
}
@Test(expected = UnknownFormatException.class)
public void testSitemapWithOctetMediaType() throws UnknownFormatException, IOException {
public void testSitemapWithInvalidContent() throws UnknownFormatException, IOException {
SiteMapParser parser = new SiteMapParser();
String contentType = "application/octet-stream";
byte[] content = getXMLSitemapAsBytes();
byte[] content = "this is a bogus sitemap".getBytes(StandardCharsets.UTF_8);
URL url = new URL("http://www.example.com/sitemap");
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
assertEquals(false, asm.isIndex());
assertEquals(true, asm instanceof SiteMap);
SiteMap sm = (SiteMap) asm;
assertEquals(5, sm.getSiteMapUrls().size());
SiteMapURL[] found = sm.getSiteMapUrls().toArray(new SiteMapURL[5]);
for (int i = 0; i < found.length; i++) {
assertEquals(sitemapURLs[i], found[i].getUrl().toExternalForm());
}
parser.parseSiteMap(contentType, content, url);
}
@Test
@ -327,8 +334,12 @@ public class SiteMapParserTest {
SiteMapParser parser = new SiteMapParser();
String contentType = "text/xml";
StringBuilder scontent = new StringBuilder(1024);
scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>").append("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">").append("<url>")
.append("<loc>http://www.example.com/</loc>").append("</url>").append("</urlset>");
scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>")
.append("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">")
.append("<url>")
.append("<loc>http://www.example.com/</loc>")
.append("</url>")
.append("</urlset>");
byte[] content = scontent.toString().getBytes(UTF_8);
URL url = new URL("http://www.example.com/subsection/sitemap.xml");
@ -507,12 +518,12 @@ public class SiteMapParserTest {
private byte[] getXMLSitemapAsBytes() {
StringBuilder scontent = new StringBuilder(1024);
scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>").append("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">");
scontent.append("<url> <loc>").append(sitemapURLs[0]).append("</loc> <lastmod>2005-01-01</lastmod>").append(" <changefreq>monthly</changefreq>").append(" <priority>0.8</priority>")
scontent.append("<url> <loc>").append(SITEMAP_URLS[0]).append("</loc> <lastmod>2005-01-01</lastmod>").append(" <changefreq>monthly</changefreq>").append(" <priority>0.8</priority>")
.append("</url>");
scontent.append("<url> <loc>").append(sitemapURLs[1]).append("</loc> <changefreq>weekly</changefreq>").append("</url>");
scontent.append("<url> <loc>").append(sitemapURLs[2]).append("</loc> <lastmod>2004-12-23</lastmod>").append(" <changefreq>weekly</changefreq>").append("</url>");
scontent.append("<url> <loc>").append(sitemapURLs[3]).append("</loc> <lastmod>2004-12-23T18:00:15+00:00</lastmod>").append(" <priority>0.3</priority>").append("</url>");
scontent.append("<url> <loc><url><![CDATA[").append(sitemapURLs[4]).append("]]></url></loc> <lastmod>2004-11-23</lastmod>").append("</url>");
scontent.append("<url> <loc>").append(SITEMAP_URLS[1]).append("</loc> <changefreq>weekly</changefreq>").append("</url>");
scontent.append("<url> <loc>").append(SITEMAP_URLS[2]).append("</loc> <lastmod>2004-12-23</lastmod>").append(" <changefreq>weekly</changefreq>").append("</url>");
scontent.append("<url> <loc>").append(SITEMAP_URLS[3]).append("</loc> <lastmod>2004-12-23T18:00:15+00:00</lastmod>").append(" <priority>0.3</priority>").append("</url>");
scontent.append("<url> <loc><url><![CDATA[").append(SITEMAP_URLS[4]).append("]]></url></loc> <lastmod>2004-11-23</lastmod>").append("</url>");
scontent.append("</urlset>");
return scontent.toString().getBytes(UTF_8);
@ -532,8 +543,12 @@ public class SiteMapParserTest {
return IOUtils.toByteArray(is);
}
private static String[] sitemapURLs = new String[] { "http://www.example.com/", "http://www.example.com/catalog?item=12&amp;desc=vacation_hawaii",
"http://www.example.com/catalog?item=73&amp;desc=vacation_new_zealand", "http://www.example.com/catalog?item=74&amp;desc=vacation_newfoundland",
"http://www.example.com/catalog?item=83&desc=vacation_usa" };
private static String[] SITEMAP_URLS = new String[] {
"http://www.example.com/",
"http://www.example.com/catalog?item=12&amp;desc=vacation_hawaii",
"http://www.example.com/catalog?item=73&amp;desc=vacation_new_zealand",
"http://www.example.com/catalog?item=74&amp;desc=vacation_newfoundland",
"http://www.example.com/catalog?item=83&desc=vacation_usa"
};
}

View File

@ -0,0 +1,2 @@
http://www.domain.com/page1.html
http://www.domain.com/page2.html

View File

@ -0,0 +1,2 @@
http://www.domain.com/page1.html
http://www.domain.com/page2.html