1
0
mirror of https://github.com/crawler-commons/crawler-commons synced 2024-09-27 18:29:49 +02:00

Issue 47: [Sitemaps] SiteMapParser Tika detection doesn't work well on some cases

new Tika().detect(URL) -- Will solve the mentioned problem.

BUT it will cause our library to fetch the sitemap twice.


A better solution should be sought.
Maybe use new Tika().detect(bytes, filename);
This commit is contained in:
avraham2@gmail.com 2014-08-19 19:08:27 +00:00
parent 19e2918aca
commit 983cce7c07
2 changed files with 16 additions and 1 deletions

View File

@ -1,6 +1,7 @@
Crawler-Commons Change Log
Release 0.5
- Issue 47: [Sitemaps] SiteMapParser Tika detection doesn't work well on some cases (Avi Hayun)
- Issue 40: [Sitemaps] Add Tika MediaType Support (Avi Hayun)
- Issue 39: [Sitemaps] Add the Parser a convenience method with only a URL argument (Avi Hayun via lewismc)
- Issue 42: [Sitemaps] Add more JUnit tests (Avi Hayun via lewismc)

View File

@ -31,6 +31,7 @@ import java.util.zip.GZIPInputStream;
import javax.xml.parsers.DocumentBuilderFactory;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.BOMInputStream;
import org.apache.tika.Tika;
@ -68,6 +69,7 @@ public class SiteMapParser {
// Static initializer: runs once when the class is loaded. initMediaTypes() is defined outside this excerpt.
static {
initMediaTypes();
}
// Shared Tika detector; left null here and created on first use by parseSiteMap(URL).
private static Tika tika;
/** True (by default) if invalid URLs should be rejected */
private boolean strict;
@ -91,10 +93,22 @@ public class SiteMapParser {
* Returns a SiteMap or SiteMapIndex given an online sitemap URL<br/>
* Please note that this method goes online and fetches the sitemap then parses it<br/><br/>
* This method is a convenience method for a user who has a sitemap URL and wants a "Keep it simple" way to parse it.
*
* @param onlineSitemapUrl URL of the online sitemap
* @return AbstractSiteMap object or null if the onlineSitemapUrl is null
**/
public AbstractSiteMap parseSiteMap(URL onlineSitemapUrl) throws UnknownFormatException, IOException {
    if (onlineSitemapUrl == null) {
        return null;
    }
    // Create the shared Tika detector on first use.
    if (tika == null) {
        tika = new Tika();
    }
    // Download the sitemap exactly once. Detection below runs on these bytes
    // instead of on the URL, because URL-based detection would fetch the
    // sitemap a second time.
    byte[] bytes = IOUtils.toByteArray(onlineSitemapUrl);
    // The file name taken from the URL path is passed to Tika together with the content.
    String filename = FilenameUtils.getName(onlineSitemapUrl.getPath());
    String contentType = tika.detect(bytes, filename);
    return parseSiteMap(contentType, bytes, onlineSitemapUrl);
}