mirror of
https://github.com/crawler-commons/crawler-commons
synced 2024-05-24 19:36:06 +02:00
Merge pull request #198 from crawler-commons/kkrugler_no-tika
Remove Tika dependency
This commit is contained in:
commit
14153c4eae
7
pom.xml
7
pom.xml
|
@ -316,7 +316,6 @@
|
|||
<!-- Dependencies -->
|
||||
|
||||
<commons-io.version>2.4</commons-io.version>
|
||||
<tika-core.version>1.17</tika-core.version>
|
||||
<slf4j-api.version>1.7.7</slf4j-api.version>
|
||||
<junit.version>4.7</junit.version>
|
||||
<slf4j-log4j12.version>1.7.7</slf4j-log4j12.version>
|
||||
|
@ -358,12 +357,6 @@
|
|||
<version>${commons-io.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.tika</groupId>
|
||||
<artifactId>tika-core</artifactId>
|
||||
<version>${tika-core.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.slf4j</groupId>
|
||||
<artifactId>slf4j-api</artifactId>
|
||||
|
|
|
@ -0,0 +1,231 @@
|
|||
package crawlercommons.mimetypes;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
|
||||
/**
 * Minimal mime-type detector for sitemap content.
 *
 * <p>
 * Detection compares the leading bytes of the content against a small, fixed
 * list of signatures (XML declaration / comment, {@code http(s)://} URL
 * prefixes, gzip magic number). Text signatures are registered both with and
 * without a leading UTF-8 byte order mark. The first signature that matches
 * wins; the reported mime type is always the canonical (first) entry of its
 * family.
 * </p>
 */
public class MimeTypeDetector {

    /** XML family; the first entry is the canonical type. */
    private static final String[] XML_MIMETYPES = {
        "application/xml",
        "application/x-xml",
        "text/xml",
        "application/atom+xml",
        "application/rss+xml",
        "text/rss"
    };

    /** Plain-text family; the first entry is the canonical type. */
    private static final String[] TEXT_MIMETYPES = {
        "text/plain"
    };

    /** Gzip family; the first entry is the canonical type. */
    private static final String[] GZIP_MIMETYPES = {
        "application/gzip",
        "application/gzip-compressed",
        "application/gzipped",
        "application/x-gzip",
        "application/x-gzip-compressed",
        "application/x-gunzip",
        "gzip/document"
    };

    /** All known families, consulted in order by {@link #normalize(String, byte[])}. */
    private static final String[][] MIMETYPES = {
        XML_MIMETYPES,
        TEXT_MIMETYPES,
        GZIP_MIMETYPES
    };

    /** UTF-8 byte order mark. */
    private static final byte[] UTF8_BOM = {
        (byte) 0xEF,
        (byte) 0xBB,
        (byte) 0xBF
    };

    /** Associates a mime type with the byte signature expected at the start of the content. */
    private static class MimeTypeEntry {
        private final String mimeType;
        private final byte[] pattern;

        /**
         * @param mimeType
         *            mime type reported when the pattern matches
         * @param pattern
         *            textual signature, encoded as UTF-8
         */
        public MimeTypeEntry(String mimeType, String pattern) {
            this(mimeType, pattern, false);
        }

        /**
         * @param mimeType
         *            mime type reported when the pattern matches
         * @param pattern
         *            textual signature, encoded as UTF-8
         * @param addBOM
         *            when true the signature is prefixed with the UTF-8 BOM
         */
        public MimeTypeEntry(String mimeType, String pattern, boolean addBOM) {
            this.mimeType = mimeType;

            byte[] patternBytes = pattern.getBytes(StandardCharsets.UTF_8);
            if (addBOM) {
                byte[] withBom = new byte[UTF8_BOM.length + patternBytes.length];
                System.arraycopy(UTF8_BOM, 0, withBom, 0, UTF8_BOM.length);
                System.arraycopy(patternBytes, 0, withBom, UTF8_BOM.length, patternBytes.length);
                this.pattern = withBom;
            } else {
                this.pattern = patternBytes;
            }
        }

        /**
         * @param mimeType
         *            mime type reported when the pattern matches
         * @param pattern
         *            binary signature; only the low 8 bits of each value are
         *            used
         */
        public MimeTypeEntry(String mimeType, int... pattern) {
            this.mimeType = mimeType;
            this.pattern = makeBytePattern(pattern);
        }

        /** Narrows each int to a byte, keeping only its low 8 bits. */
        private static byte[] makeBytePattern(int[] pattern) {
            byte[] result = new byte[pattern.length];
            for (int i = 0; i < pattern.length; i++) {
                result[i] = (byte) (pattern[i] & 0xFF);
            }

            return result;
        }

        public String getMimeType() {
            return mimeType;
        }

        public byte[] getPattern() {
            return pattern;
        }
    }

    /** Registered signatures, checked in insertion order. */
    private final List<MimeTypeEntry> mimeTypes;

    /** Length of the longest signature, i.e. how many bytes a stream probe needs. */
    private final int maxPatternLength;

    /** Creates a detector with the built-in XML, text and gzip signatures. */
    public MimeTypeDetector() {
        mimeTypes = new ArrayList<>();

        // Add all text patterns without and with a BOM.
        addTextPattern(XML_MIMETYPES[0], "<?xml");
        addTextPattern(XML_MIMETYPES[0], "<?XML");
        addTextPattern(XML_MIMETYPES[0], "<!--");

        addTextPattern(TEXT_MIMETYPES[0], "http://");
        addTextPattern(TEXT_MIMETYPES[0], "https://");

        // The gzip magic number has to be registered as raw bytes: a String
        // such as "\037\213" is UTF-8 encoded to 1F C2 8B, which is not the
        // gzip signature.
        mimeTypes.add(new MimeTypeEntry(GZIP_MIMETYPES[0], 0x1F, 0x8B));

        int longest = 0;
        for (MimeTypeEntry entry : mimeTypes) {
            longest = Math.max(longest, entry.getPattern().length);
        }
        maxPatternLength = longest;
    }

    /** Registers a textual signature twice: as-is and prefixed with the UTF-8 BOM. */
    private void addTextPattern(String mimeType, String pattern) {
        mimeTypes.add(new MimeTypeEntry(mimeType, pattern));
        mimeTypes.add(new MimeTypeEntry(mimeType, pattern, true));
    }

    /**
     * Detects the mime type from the leading bytes of <code>content</code>.
     *
     * @param content
     *            content to inspect; may be null
     * @return the canonical mime type of the first matching signature, or null
     *         if <code>content</code> is null or no signature matches
     */
    public String detect(byte[] content) {
        if (content == null) {
            return null;
        }

        return detect(content, 0, content.length);
    }

    /**
     * Detects the mime type from the bytes of <code>content</code> starting at
     * <code>offset</code>.
     *
     * @param content
     *            buffer holding the content; may be null
     * @param offset
     *            index of the first content byte in the buffer
     * @param length
     *            number of valid content bytes starting at <code>offset</code>
     * @return the canonical mime type of the first matching signature, or null
     *         if no signature matches
     */
    public String detect(byte[] content, int offset, int length) {
        if (content == null) {
            return null;
        }

        for (MimeTypeEntry entry : mimeTypes) {
            if (patternMatches(entry.getPattern(), content, offset, length)) {
                return entry.getMimeType();
            }
        }

        // No mime-type detected.
        return null;
    }

    /**
     * Checks whether <code>content</code> starts with <code>pattern</code> at
     * <code>offset</code>. A pattern that does not fit into the declared
     * length or into the buffer itself is reported as a non-match instead of
     * indexing past the end of the array.
     */
    private boolean patternMatches(byte[] pattern, byte[] content, int offset, int length) {
        if (offset < 0 || pattern.length > length || pattern.length > content.length - offset) {
            return false;
        }

        for (int i = 0; i < pattern.length; i++) {
            if (pattern[i] != content[offset + i]) {
                return false;
            }
        }

        return true;
    }

    /**
     * Detects the mime type from the first bytes of a stream. The stream is
     * marked before reading and reset afterwards, so the probed bytes can be
     * read again by the caller.
     *
     * @param is
     *            stream to probe; must support mark/reset
     * @return the canonical mime type of the first matching signature, or null
     *         if no signature matches (including an empty stream)
     * @throws IllegalArgumentException
     *             if the stream does not support mark/reset
     * @throws IOException
     *             if reading from or resetting the stream fails
     */
    public String detect(InputStream is) throws IOException {
        if (!is.markSupported()) {
            throw new IllegalArgumentException("Can't detect mime type for input stream that doesn't support mark/reset");
        }

        is.mark(maxPatternLength);
        byte[] content = new byte[maxPatternLength];

        try {
            // A single read() may legitimately return fewer bytes than
            // requested, so keep reading until the probe buffer is full or
            // the stream ends.
            int contentLength = 0;
            while (contentLength < content.length) {
                int read = is.read(content, contentLength, content.length - contentLength);
                if (read < 0) {
                    break;
                }
                contentLength += read;
            }

            return detect(content, 0, contentLength);
        } finally {
            is.reset();
        }
    }

    /**
     * @param mimeType
     *            mime type to check; may be null
     * @return true if <code>mimeType</code> exactly equals one of the known
     *         XML mime types
     */
    public boolean isXml(String mimeType) {
        return contains(XML_MIMETYPES, mimeType);
    }

    /**
     * @param mimeType
     *            mime type to check; may be null
     * @return true if <code>mimeType</code> exactly equals one of the known
     *         plain-text mime types
     */
    public boolean isText(String mimeType) {
        return contains(TEXT_MIMETYPES, mimeType);
    }

    /**
     * @param mimeType
     *            mime type to check; may be null
     * @return true if <code>mimeType</code> exactly equals one of the known
     *         gzip mime types
     */
    public boolean isGzip(String mimeType) {
        return contains(GZIP_MIMETYPES, mimeType);
    }

    /** Exact (case-sensitive) membership test; a null mime type is never a member. */
    private static boolean contains(String[] family, String mimeType) {
        if (mimeType == null) {
            return false;
        }

        for (String candidate : family) {
            if (mimeType.equals(candidate)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Maps a declared content type onto the canonical mime type of its family,
     * falling back to content sniffing when the declared type is missing or
     * not one of the known types.
     *
     * @param contentType
     *            declared content type; compared case-insensitively, may be
     *            null
     * @param content
     *            content used for detection when <code>contentType</code> is
     *            not recognized
     * @return the canonical mime type, or null if neither the declared type
     *         nor the content is recognized
     */
    public String normalize(String contentType, byte[] content) {
        if (contentType != null) {
            String normalizedContentType = contentType.toLowerCase(Locale.ROOT);
            for (String[] family : MIMETYPES) {
                for (String mimeType : family) {
                    if (normalizedContentType.equals(mimeType)) {
                        return family[0];
                    }
                }
            }
        }

        return detect(content);
    }

}
|
|
@ -17,9 +17,8 @@
|
|||
package crawlercommons.sitemaps;
|
||||
|
||||
import static java.nio.charset.StandardCharsets.UTF_8;
|
||||
import static org.apache.tika.mime.MediaType.APPLICATION_XML;
|
||||
import static org.apache.tika.mime.MediaType.TEXT_PLAIN;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
|
@ -28,26 +27,22 @@ import java.io.InputStreamReader;
|
|||
import java.io.StringReader;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
import javax.xml.parsers.SAXParser;
|
||||
import javax.xml.parsers.SAXParserFactory;
|
||||
|
||||
import org.apache.commons.io.FilenameUtils;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.io.input.BOMInputStream;
|
||||
import org.apache.tika.Tika;
|
||||
import org.apache.tika.mime.MediaType;
|
||||
import org.apache.tika.mime.MediaTypeRegistry;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.xml.sax.EntityResolver;
|
||||
import org.xml.sax.InputSource;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
import crawlercommons.mimetypes.MimeTypeDetector;
|
||||
import crawlercommons.sitemaps.AbstractSiteMap.SitemapType;
|
||||
import crawlercommons.sitemaps.sax.DelegatorHandler;
|
||||
|
||||
|
@ -67,18 +62,6 @@ public class SiteMapParser {
|
|||
*/
|
||||
public static final int MAX_BYTES_ALLOWED = 52428800;
|
||||
|
||||
/* Tika's MediaType components */
|
||||
private static final Tika TIKA = new Tika();
|
||||
private static final MediaTypeRegistry MEDIA_TYPE_REGISTRY = MediaTypeRegistry.getDefaultRegistry();
|
||||
|
||||
private static final List<MediaType> XML_MEDIA_TYPES = new ArrayList<>();
|
||||
private static final List<MediaType> TEXT_MEDIA_TYPES = new ArrayList<>();
|
||||
private static final List<MediaType> GZ_MEDIA_TYPES = new ArrayList<>();
|
||||
|
||||
static {
|
||||
initMediaTypes();
|
||||
}
|
||||
|
||||
/**
|
||||
* True (by default) meaning that invalid URLs should be rejected, as the
|
||||
* official docs allow the siteMapURLs to be only under the base url:
|
||||
|
@ -94,6 +77,8 @@ public class SiteMapParser {
|
|||
**/
|
||||
protected boolean strictNamespace = false;
|
||||
|
||||
private MimeTypeDetector mimeTypeDetector;
|
||||
|
||||
public SiteMapParser() {
|
||||
this(true, false);
|
||||
}
|
||||
|
@ -105,6 +90,8 @@ public class SiteMapParser {
|
|||
public SiteMapParser(boolean strict, boolean allowPartial) {
|
||||
this.strict = strict;
|
||||
this.allowPartial = allowPartial;
|
||||
|
||||
this.mimeTypeDetector = new MimeTypeDetector();
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -203,8 +190,8 @@ public class SiteMapParser {
|
|||
if (url == null) {
|
||||
return null;
|
||||
}
|
||||
String filename = FilenameUtils.getName(url.getPath());
|
||||
String contentType = TIKA.detect(content, filename);
|
||||
|
||||
String contentType = mimeTypeDetector.detect(content);
|
||||
return parseSiteMap(contentType, content, url);
|
||||
}
|
||||
|
||||
|
@ -228,41 +215,28 @@ public class SiteMapParser {
|
|||
* {@link java.net.URL}
|
||||
*/
|
||||
public AbstractSiteMap parseSiteMap(String contentType, byte[] content, URL url) throws UnknownFormatException, IOException {
|
||||
MediaType mediaType = MediaType.parse(contentType);
|
||||
String mimeType = mimeTypeDetector.normalize(contentType, content);
|
||||
|
||||
// Octet-stream is the father of all binary types
|
||||
while (mediaType != null && !mediaType.equals(MediaType.OCTET_STREAM)) {
|
||||
if (XML_MEDIA_TYPES.contains(mediaType)) {
|
||||
return processXml(url, content);
|
||||
} else if (TEXT_MEDIA_TYPES.contains(mediaType)) {
|
||||
return processText(url, content);
|
||||
} else if (GZ_MEDIA_TYPES.contains(mediaType)) {
|
||||
InputStream decompressed;
|
||||
MediaType embeddedType;
|
||||
try {
|
||||
decompressed = new GZIPInputStream(new ByteArrayInputStream(content));
|
||||
embeddedType = MediaType.parse(TIKA.detect(decompressed));
|
||||
} catch (Exception e) {
|
||||
UnknownFormatException err = new UnknownFormatException("Failed to detect embedded MediaType of gzipped sitemap: " + url + ", caused by " + e);
|
||||
err.initCause(e);
|
||||
throw err;
|
||||
}
|
||||
if (XML_MEDIA_TYPES.contains(embeddedType)) {
|
||||
if (mimeTypeDetector.isXml(mimeType)) {
|
||||
return processXml(url, content);
|
||||
} else if (mimeTypeDetector.isText(mimeType)) {
|
||||
return processText(url, content);
|
||||
} else if (mimeTypeDetector.isGzip(mimeType)) {
|
||||
try (InputStream decompressed = new BufferedInputStream(new GZIPInputStream(new ByteArrayInputStream(content)))) {
|
||||
String compressedType = mimeTypeDetector.detect(decompressed);
|
||||
if (mimeTypeDetector.isXml(compressedType)) {
|
||||
return processGzippedXML(url, content);
|
||||
} else if (TEXT_MEDIA_TYPES.contains(embeddedType)) {
|
||||
// re-open decompressed stream and parse as text
|
||||
decompressed = new GZIPInputStream(new ByteArrayInputStream(content));
|
||||
} else if (mimeTypeDetector.isText(compressedType)) {
|
||||
return processText(url, decompressed);
|
||||
} else if (GZ_MEDIA_TYPES.contains(embeddedType)) {
|
||||
throw new UnknownFormatException("Can't parse gzip recursively: " + url);
|
||||
}
|
||||
throw new UnknownFormatException("Can't parse a gzipped sitemap with the embedded MediaType of: " + embeddedType + " (at: " + url + ")");
|
||||
} catch (Exception e) {
|
||||
String msg = String.format(Locale.ROOT, "Failed to detect embedded MediaType of gzipped sitemap '%s'", url);
|
||||
throw new UnknownFormatException(msg, e);
|
||||
}
|
||||
mediaType = MEDIA_TYPE_REGISTRY.getSupertype(mediaType); // Check
|
||||
// parent
|
||||
}
|
||||
|
||||
throw new UnknownFormatException("Can't parse a sitemap with the MediaType of: " + contentType + " (at: " + url + ")");
|
||||
String msg = String.format(Locale.ROOT, "Can't parse a sitemap with MediaType '%s' from '%s'", contentType, url);
|
||||
throw new UnknownFormatException(msg);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -491,25 +465,4 @@ public class SiteMapParser {
|
|||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* Performs a one time initialization of Tika's Media-Type components and
|
||||
* media type collection constants <br/>
|
||||
* Please note that this is a private static method which is called once per
|
||||
* CLASS (not per instance / object)
|
||||
*/
|
||||
private static void initMediaTypes() {
|
||||
/* XML media types (and all aliases) */
|
||||
XML_MEDIA_TYPES.add(APPLICATION_XML);
|
||||
XML_MEDIA_TYPES.addAll(MEDIA_TYPE_REGISTRY.getAliases(APPLICATION_XML));
|
||||
|
||||
/* TEXT media types (and all aliases) */
|
||||
TEXT_MEDIA_TYPES.add(TEXT_PLAIN);
|
||||
TEXT_MEDIA_TYPES.addAll(MEDIA_TYPE_REGISTRY.getAliases(TEXT_PLAIN));
|
||||
|
||||
/* GZIP media types (and all aliases) */
|
||||
MediaType gzipMediaType = MediaType.parse("application/gzip");
|
||||
GZ_MEDIA_TYPES.add(gzipMediaType);
|
||||
GZ_MEDIA_TYPES.addAll(MEDIA_TYPE_REGISTRY.getAliases(gzipMediaType));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -19,33 +19,15 @@ package crawlercommons.sitemaps;
|
|||
@SuppressWarnings("serial")
|
||||
public class UnknownFormatException extends Exception {
|
||||
|
||||
private final String error;
|
||||
|
||||
/** Default constructor - initializes instance variable to unknown */
|
||||
public UnknownFormatException() {
|
||||
super();
|
||||
error = "unknown";
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructor receives some kind of message that is saved in an instance
|
||||
* variable.
|
||||
*
|
||||
* @param err
|
||||
* a String object to use within the Exception
|
||||
*/
|
||||
public UnknownFormatException(String err) {
|
||||
super(err);
|
||||
error = err;
|
||||
public UnknownFormatException(String message) {
|
||||
super(message);
|
||||
}
|
||||
|
||||
/**
|
||||
* public method, callable by exception catcher. It returns the error
|
||||
* message.
|
||||
*
|
||||
* @return a populated Exception as a String
|
||||
*/
|
||||
public String getError() {
|
||||
return error;
|
||||
public UnknownFormatException(String message, Throwable cause) {
|
||||
super(message, cause);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,60 @@
|
|||
package crawlercommons.mimetypes;
|
||||
|
||||
import static org.junit.Assert.*;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.junit.Test;
|
||||
|
||||
public class MimeTypeDetectorTest {
|
||||
|
||||
@Test
|
||||
public void testXMLDetection() throws Exception {
|
||||
MimeTypeDetector detector = new MimeTypeDetector();
|
||||
|
||||
byte[] content = getSitemap("atom.xml");
|
||||
String mimeType = detector.detect(content);
|
||||
assertTrue(detector.isXml(mimeType));
|
||||
assertFalse(detector.isText(mimeType));
|
||||
assertFalse(detector.isGzip(mimeType));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testTextDetection() throws IOException {
|
||||
MimeTypeDetector detector = new MimeTypeDetector();
|
||||
byte[] content = getSitemap("sitemap.txt");
|
||||
String mimeType = detector.detect(content);
|
||||
assertFalse(detector.isXml(mimeType));
|
||||
assertTrue(detector.isText(mimeType));
|
||||
assertFalse(detector.isGzip(mimeType));
|
||||
|
||||
content = getSitemap("sitemap-with-bom.txt");
|
||||
mimeType = detector.detect(content);
|
||||
assertFalse(detector.isXml(mimeType));
|
||||
assertTrue(detector.isText(mimeType));
|
||||
assertFalse(detector.isGzip(mimeType));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGzipDetection() throws IOException {
|
||||
MimeTypeDetector detector = new MimeTypeDetector();
|
||||
|
||||
byte[] content = getSitemap("xmlSitemap.gz");
|
||||
String mimeType = detector.detect(content);
|
||||
assertFalse(detector.isXml(mimeType));
|
||||
assertFalse(detector.isText(mimeType));
|
||||
assertTrue(detector.isGzip(mimeType));
|
||||
|
||||
content = getSitemap("sitemap.txt.gz");
|
||||
mimeType = detector.detect(content);
|
||||
assertFalse(detector.isXml(mimeType));
|
||||
assertFalse(detector.isText(mimeType));
|
||||
assertTrue(detector.isGzip(mimeType));
|
||||
}
|
||||
|
||||
private byte[] getSitemap(String filename) throws IOException {
|
||||
return IOUtils.toByteArray(MimeTypeDetectorTest.class.getResourceAsStream("/sitemaps/" + filename));
|
||||
}
|
||||
|
||||
}
|
|
@ -30,6 +30,7 @@ import java.io.FileInputStream;
|
|||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.net.URL;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.Date;
|
||||
import java.util.Iterator;
|
||||
|
@ -184,6 +185,22 @@ public class SiteMapParserTest {
|
|||
assertEquals(2, sm.getSiteMapUrls().size());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSitemapTXTWithWrongMimeType() throws UnknownFormatException, IOException {
|
||||
SiteMapParser parser = new SiteMapParser();
|
||||
String scontent = "http://www.example.com/catalog?item=1\nhttp://www.example.com/catalog?item=11";
|
||||
byte[] content = scontent.getBytes(UTF_8);
|
||||
URL url = new URL("http://www.example.com/sitemap.xml");
|
||||
String contentType = "application/bogus";
|
||||
|
||||
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
|
||||
assertEquals(false, asm.isIndex());
|
||||
assertEquals(true, asm instanceof SiteMap);
|
||||
|
||||
SiteMap sm = (SiteMap) asm;
|
||||
assertEquals(2, sm.getSiteMapUrls().size());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSitemapXML() throws UnknownFormatException, IOException {
|
||||
SiteMapParser parser = new SiteMapParser();
|
||||
|
@ -200,7 +217,7 @@ public class SiteMapParserTest {
|
|||
|
||||
SiteMapURL[] found = sm.getSiteMapUrls().toArray(new SiteMapURL[5]);
|
||||
for (int i = 0; i < found.length; i++) {
|
||||
assertEquals(sitemapURLs[i].replaceAll("&", "&"), found[i].getUrl().toExternalForm());
|
||||
assertEquals(SITEMAP_URLS[i].replaceAll("&", "&"), found[i].getUrl().toExternalForm());
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -219,7 +236,7 @@ public class SiteMapParserTest {
|
|||
assertEquals(5, sm.getSiteMapUrls().size());
|
||||
SiteMapURL[] found = sm.getSiteMapUrls().toArray(new SiteMapURL[5]);
|
||||
for (int i = 0; i < found.length; i++) {
|
||||
assertEquals(sitemapURLs[i].replaceAll("&", "&"), found[i].getUrl().toExternalForm());
|
||||
assertEquals(SITEMAP_URLS[i].replaceAll("&", "&"), found[i].getUrl().toExternalForm());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -303,23 +320,13 @@ public class SiteMapParserTest {
|
|||
}
|
||||
|
||||
@Test(expected = UnknownFormatException.class)
|
||||
public void testSitemapWithOctetMediaType() throws UnknownFormatException, IOException {
|
||||
public void testSitemapWithInvalidContent() throws UnknownFormatException, IOException {
|
||||
SiteMapParser parser = new SiteMapParser();
|
||||
String contentType = "application/octet-stream";
|
||||
byte[] content = getXMLSitemapAsBytes();
|
||||
byte[] content = "this is a bogus sitemap".getBytes(StandardCharsets.UTF_8);
|
||||
URL url = new URL("http://www.example.com/sitemap");
|
||||
|
||||
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
|
||||
assertEquals(false, asm.isIndex());
|
||||
assertEquals(true, asm instanceof SiteMap);
|
||||
|
||||
SiteMap sm = (SiteMap) asm;
|
||||
assertEquals(5, sm.getSiteMapUrls().size());
|
||||
|
||||
SiteMapURL[] found = sm.getSiteMapUrls().toArray(new SiteMapURL[5]);
|
||||
for (int i = 0; i < found.length; i++) {
|
||||
assertEquals(sitemapURLs[i], found[i].getUrl().toExternalForm());
|
||||
}
|
||||
parser.parseSiteMap(contentType, content, url);
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -327,8 +334,12 @@ public class SiteMapParserTest {
|
|||
SiteMapParser parser = new SiteMapParser();
|
||||
String contentType = "text/xml";
|
||||
StringBuilder scontent = new StringBuilder(1024);
|
||||
scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>").append("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">").append("<url>")
|
||||
.append("<loc>http://www.example.com/</loc>").append("</url>").append("</urlset>");
|
||||
scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>")
|
||||
.append("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">")
|
||||
.append("<url>")
|
||||
.append("<loc>http://www.example.com/</loc>")
|
||||
.append("</url>")
|
||||
.append("</urlset>");
|
||||
byte[] content = scontent.toString().getBytes(UTF_8);
|
||||
|
||||
URL url = new URL("http://www.example.com/subsection/sitemap.xml");
|
||||
|
@ -507,12 +518,12 @@ public class SiteMapParserTest {
|
|||
private byte[] getXMLSitemapAsBytes() {
|
||||
StringBuilder scontent = new StringBuilder(1024);
|
||||
scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>").append("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">");
|
||||
scontent.append("<url> <loc>").append(sitemapURLs[0]).append("</loc> <lastmod>2005-01-01</lastmod>").append(" <changefreq>monthly</changefreq>").append(" <priority>0.8</priority>")
|
||||
scontent.append("<url> <loc>").append(SITEMAP_URLS[0]).append("</loc> <lastmod>2005-01-01</lastmod>").append(" <changefreq>monthly</changefreq>").append(" <priority>0.8</priority>")
|
||||
.append("</url>");
|
||||
scontent.append("<url> <loc>").append(sitemapURLs[1]).append("</loc> <changefreq>weekly</changefreq>").append("</url>");
|
||||
scontent.append("<url> <loc>").append(sitemapURLs[2]).append("</loc> <lastmod>2004-12-23</lastmod>").append(" <changefreq>weekly</changefreq>").append("</url>");
|
||||
scontent.append("<url> <loc>").append(sitemapURLs[3]).append("</loc> <lastmod>2004-12-23T18:00:15+00:00</lastmod>").append(" <priority>0.3</priority>").append("</url>");
|
||||
scontent.append("<url> <loc><url><![CDATA[").append(sitemapURLs[4]).append("]]></url></loc> <lastmod>2004-11-23</lastmod>").append("</url>");
|
||||
scontent.append("<url> <loc>").append(SITEMAP_URLS[1]).append("</loc> <changefreq>weekly</changefreq>").append("</url>");
|
||||
scontent.append("<url> <loc>").append(SITEMAP_URLS[2]).append("</loc> <lastmod>2004-12-23</lastmod>").append(" <changefreq>weekly</changefreq>").append("</url>");
|
||||
scontent.append("<url> <loc>").append(SITEMAP_URLS[3]).append("</loc> <lastmod>2004-12-23T18:00:15+00:00</lastmod>").append(" <priority>0.3</priority>").append("</url>");
|
||||
scontent.append("<url> <loc><url><![CDATA[").append(SITEMAP_URLS[4]).append("]]></url></loc> <lastmod>2004-11-23</lastmod>").append("</url>");
|
||||
scontent.append("</urlset>");
|
||||
|
||||
return scontent.toString().getBytes(UTF_8);
|
||||
|
@ -532,8 +543,12 @@ public class SiteMapParserTest {
|
|||
return IOUtils.toByteArray(is);
|
||||
}
|
||||
|
||||
private static String[] sitemapURLs = new String[] { "http://www.example.com/", "http://www.example.com/catalog?item=12&desc=vacation_hawaii",
|
||||
"http://www.example.com/catalog?item=73&desc=vacation_new_zealand", "http://www.example.com/catalog?item=74&desc=vacation_newfoundland",
|
||||
"http://www.example.com/catalog?item=83&desc=vacation_usa" };
|
||||
private static String[] SITEMAP_URLS = new String[] {
|
||||
"http://www.example.com/",
|
||||
"http://www.example.com/catalog?item=12&desc=vacation_hawaii",
|
||||
"http://www.example.com/catalog?item=73&desc=vacation_new_zealand",
|
||||
"http://www.example.com/catalog?item=74&desc=vacation_newfoundland",
|
||||
"http://www.example.com/catalog?item=83&desc=vacation_usa"
|
||||
};
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,2 @@
|
|||
http://www.domain.com/page1.html
|
||||
http://www.domain.com/page2.html
|
|
@ -0,0 +1,2 @@
|
|||
http://www.domain.com/page1.html
|
||||
http://www.domain.com/page2.html
|
Loading…
Reference in New Issue