mirror of
https://github.com/crawler-commons/crawler-commons
synced 2024-05-09 23:56:04 +02:00
parent
d2de87cf92
commit
ee69049db0
|
@ -29,12 +29,12 @@ import java.io.StringReader;
|
|||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
|
||||
import javax.xml.parsers.DocumentBuilder;
|
||||
import javax.xml.parsers.DocumentBuilderFactory;
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
import javax.xml.parsers.SAXParser;
|
||||
import javax.xml.parsers.SAXParserFactory;
|
||||
|
||||
import org.apache.commons.io.FilenameUtils;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
|
@ -44,17 +44,12 @@ import org.apache.tika.mime.MediaType;
|
|||
import org.apache.tika.mime.MediaTypeRegistry;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.w3c.dom.Document;
|
||||
import org.w3c.dom.Element;
|
||||
import org.w3c.dom.Node;
|
||||
import org.w3c.dom.NodeList;
|
||||
import org.xml.sax.EntityResolver;
|
||||
import org.xml.sax.ErrorHandler;
|
||||
import org.xml.sax.InputSource;
|
||||
import org.xml.sax.SAXException;
|
||||
import org.xml.sax.SAXParseException;
|
||||
|
||||
import crawlercommons.sitemaps.AbstractSiteMap.SitemapType;
|
||||
import crawlercommons.sitemaps.sax.DelegatorHandler;
|
||||
|
||||
public class SiteMapParser {
|
||||
public static final Logger LOG = LoggerFactory.getLogger(SiteMapParser.class);
|
||||
|
@ -87,11 +82,12 @@ public class SiteMapParser {
|
|||
/**
|
||||
* True (by default) meaning that invalid URLs should be rejected, as the
|
||||
* official docs allow the siteMapURLs to be only under the base url:
|
||||
* http://www.sitemaps.org/protocol.html#location Also checks that the
|
||||
* correct namespace is used.
|
||||
* http://www.sitemaps.org/protocol.html#location
|
||||
*/
|
||||
protected boolean strict = true;
|
||||
|
||||
private boolean allowPartial = false;
|
||||
|
||||
/**
|
||||
* Indicates whether the parser should work with the namespace from the
|
||||
* specifications or any namespace. Defaults to false.
|
||||
|
@ -99,11 +95,16 @@ public class SiteMapParser {
|
|||
protected boolean strictNamespace = false;
|
||||
|
||||
/**
 * Creates a parser with the strict flag set to {@code true} and partial
 * parsing disabled.
 */
public SiteMapParser() {
    // same defaults as before: strict = true, allowPartial = false
    this(true);
}
|
||||
|
||||
/**
 * Creates a parser with partial parsing disabled.
 *
 * @param strict
 *            value for the strict flag (whether invalid URLs should be
 *            rejected)
 */
public SiteMapParser(boolean strict) {
    this(strict, false);
}
|
||||
|
||||
/**
 * Creates a parser with both flags given explicitly.
 *
 * @param strict
 *            value for the strict flag (whether invalid URLs should be
 *            rejected)
 * @param allowPartial
 *            whether a sitemap that was only partially parsed before an
 *            XML error may still be returned
 */
public SiteMapParser(boolean strict, boolean allowPartial) {
    this.allowPartial = allowPartial;
    this.strict = strict;
}
|
||||
|
||||
/**
|
||||
|
@ -124,7 +125,7 @@ public class SiteMapParser {
|
|||
|
||||
/**
|
||||
* Sets the parser to allow any namespace or just the one from the
|
||||
* specification
|
||||
* specification
|
||||
*/
|
||||
public void setStrictNamespace(boolean s) {
|
||||
strictNamespace = s;
|
||||
|
@ -376,442 +377,65 @@ public class SiteMapParser {
|
|||
* {@link org.xml.sax.InputSource}
|
||||
*/
|
||||
protected AbstractSiteMap processXml(URL sitemapUrl, InputSource is) throws UnknownFormatException {
|
||||
Document doc = null;
|
||||
|
||||
SAXParserFactory factory = SAXParserFactory.newInstance();
|
||||
|
||||
// disable validation and avoid that remote DTDs, schemas, etc. are
|
||||
// fetched
|
||||
factory.setValidating(false);
|
||||
factory.setXIncludeAware(false);
|
||||
|
||||
// support the use of an explicit namespace.
|
||||
factory.setNamespaceAware(true);
|
||||
|
||||
try {
|
||||
DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
|
||||
|
||||
// disable validation and avoid that remote DTDs, schemas, etc. are
|
||||
// fetched
|
||||
dbf.setValidating(false);
|
||||
|
||||
// support an explicitly named namespace.
|
||||
dbf.setNamespaceAware(true);
|
||||
|
||||
dbf.setXIncludeAware(false);
|
||||
dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
|
||||
DocumentBuilder db = dbf.newDocumentBuilder();
|
||||
|
||||
db.setEntityResolver(new EntityResolver() {
|
||||
factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException("Failed to configure XML parser: " + e.toString());
|
||||
}
|
||||
DelegatorHandler handler = new DelegatorHandler(sitemapUrl, strict);
|
||||
try {
|
||||
SAXParser saxParser = factory.newSAXParser();
|
||||
saxParser.getXMLReader().setEntityResolver(new EntityResolver() {
|
||||
// noop entity resolver, does not fetch remote content
|
||||
@Override
|
||||
public InputSource resolveEntity(String publicId, String systemId) {
|
||||
return new InputSource(new StringReader(""));
|
||||
}
|
||||
});
|
||||
|
||||
db.setErrorHandler(new ErrorHandler() {
|
||||
public void warning(SAXParseException e) throws SAXException {
|
||||
LOG.warn("Warning parsing XML: {}", e.toString());
|
||||
handler.setStrictNamespace(isStrictNamespace());
|
||||
saxParser.parse(is, handler);
|
||||
AbstractSiteMap sitemap = handler.getSiteMap();
|
||||
if (sitemap == null) {
|
||||
UnknownFormatException ex = handler.getException();
|
||||
if (ex != null) {
|
||||
throw ex;
|
||||
}
|
||||
|
||||
public void fatalError(SAXParseException e) throws SAXException {
|
||||
LOG.error("Fatal error parsing XML: {}", e.toString());
|
||||
throw e;
|
||||
}
|
||||
|
||||
public void error(SAXParseException e) throws SAXException {
|
||||
LOG.error("Error parsing XML: {}", e.toString());
|
||||
throw e;
|
||||
}
|
||||
});
|
||||
doc = db.parse(is);
|
||||
} catch (Exception e) {
|
||||
LOG.debug(e.toString(), e);
|
||||
throw new UnknownFormatException("Error parsing XML for: " + sitemapUrl);
|
||||
}
|
||||
|
||||
// See if this is a sitemap index
|
||||
NodeList nodeList = doc.getElementsByTagNameNS("*", "sitemapindex");
|
||||
if (nodeList.getLength() > 0) {
|
||||
nodeList = doc.getElementsByTagNameNS("*", "sitemap");
|
||||
return parseSitemapIndex(sitemapUrl, nodeList);
|
||||
} else if (doc.getElementsByTagNameNS("*", "urlset").getLength() > 0) {
|
||||
// This is a regular Sitemap
|
||||
return parseXmlSitemap(sitemapUrl, doc);
|
||||
} else if (doc.getElementsByTagNameNS("*", "link").getLength() > 0) {
|
||||
// Could be RSS or Atom
|
||||
return parseSyndicationFormat(sitemapUrl, doc);
|
||||
}
|
||||
|
||||
throw new UnknownFormatException("Unknown XML format for: " + sitemapUrl);
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse XML that contains a valid Sitemap. Example of a Sitemap:
|
||||
*
|
||||
* <pre>
|
||||
* {@code
|
||||
* <?xml version="1.0" encoding="UTF-8"?>
|
||||
* <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
||||
* <url>
|
||||
* <loc>http://www.example.com/</loc>
|
||||
* <lastmod>lastmod>2005-01-01</lastmod>
|
||||
* <changefreq>monthly</changefreq>
|
||||
* <priority>0.8</priority>
|
||||
* </url>
|
||||
* <url>
|
||||
* <loc>http://www.example.com/catalog?item=12&desc=vacation_hawaii</loc>
|
||||
* <changefreq>weekly</changefreq>
|
||||
* </url>
|
||||
* </urlset>
|
||||
* }
|
||||
* </pre>
|
||||
*
|
||||
* @param sitemapUrl
|
||||
* a sitemap {@link java.net.URL}
|
||||
* @param doc
|
||||
* a {@link org.w3c.dom.Document} sitemap snippet
|
||||
* @return The sitemap
|
||||
*/
|
||||
protected SiteMap parseXmlSitemap(URL sitemapUrl, Document doc) {
|
||||
|
||||
SiteMap sitemap = new SiteMap(sitemapUrl);
|
||||
sitemap.setType(SitemapType.XML);
|
||||
|
||||
String namespace = Namespace.SITEMAP;
|
||||
if (!strictNamespace) {
|
||||
namespace = "*";
|
||||
}
|
||||
|
||||
NodeList list = doc.getElementsByTagNameNS(namespace, "url");
|
||||
|
||||
// Loop through the <url>s
|
||||
for (int i = 0; i < list.getLength(); i++) {
|
||||
|
||||
Node n = list.item(i);
|
||||
if (n.getNodeType() == Node.ELEMENT_NODE) {
|
||||
Element elem = (Element) n;
|
||||
String lastMod = getElementValue(namespace, elem, "lastmod");
|
||||
String changeFreq = getElementValue(namespace, elem, "changefreq");
|
||||
String priority = getElementValue(namespace, elem, "priority");
|
||||
String loc = getElementValue(namespace, elem, "loc");
|
||||
|
||||
addUrlIntoSitemap(loc, sitemap, lastMod, changeFreq, priority, i);
|
||||
throw new UnknownFormatException("Unknown XML format for: " + sitemapUrl);
|
||||
}
|
||||
}
|
||||
|
||||
sitemap.setProcessed(true);
|
||||
return sitemap;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse XML that contains a Sitemap Index. Example Sitemap Index:
|
||||
*
|
||||
* <pre>
|
||||
* {@code
|
||||
* <?xml version="1.0" encoding="UTF-8"?>
|
||||
* <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
||||
* <sitemap>
|
||||
* <loc>http://www.example.com/sitemap1.xml.gz</loc>
|
||||
* <lastmod>2004-10-01T18:23:17+00:00</lastmod>
|
||||
* </sitemap>
|
||||
* <sitemap>
|
||||
* <loc>http://www.example.com/sitemap2.xml.gz</loc>
|
||||
* <lastmod>2005-01-01</lastmod>
|
||||
* </sitemap>
|
||||
* </sitemapindex>
|
||||
* }
|
||||
* </pre>
|
||||
*
|
||||
* @param url
|
||||
* - URL of Sitemap Index
|
||||
* @param nodeList
|
||||
* a {@link org.w3c.dom.NodeList} backing the sitemap
|
||||
* @return The site map index
|
||||
*/
|
||||
protected SiteMapIndex parseSitemapIndex(URL url, NodeList nodeList) {
|
||||
|
||||
LOG.debug("Parsing Sitemap Index");
|
||||
|
||||
SiteMapIndex sitemapIndex = new SiteMapIndex(url);
|
||||
sitemapIndex.setType(SitemapType.INDEX);
|
||||
|
||||
// Loop through the <sitemap>s
|
||||
for (int i = 0; i < nodeList.getLength() && i < MAX_URLS; i++) {
|
||||
|
||||
Node firstNode = nodeList.item(i);
|
||||
|
||||
if (firstNode.getNodeType() == Node.ELEMENT_NODE) {
|
||||
Element elem = (Element) firstNode;
|
||||
String loc = null;
|
||||
String namespace = Namespace.SITEMAP;
|
||||
if (!strictNamespace) {
|
||||
namespace = "*";
|
||||
}
|
||||
loc = getElementValue(namespace, elem, "loc");
|
||||
|
||||
// try the text content when no loc element
|
||||
// has been specified
|
||||
if (loc == null) {
|
||||
loc = elem.getTextContent().trim();
|
||||
}
|
||||
|
||||
try {
|
||||
URL sitemapUrl = new URL(loc);
|
||||
String lastmod = getElementValue(namespace, elem, "lastmod");
|
||||
Date lastModified = SiteMap.convertToDate(lastmod);
|
||||
|
||||
// Right now we are not worried about sitemapUrls that point
|
||||
// to different websites.
|
||||
|
||||
SiteMap s = new SiteMap(sitemapUrl, lastModified);
|
||||
sitemapIndex.addSitemap(s);
|
||||
LOG.debug(" {}. {}", (i + 1), s);
|
||||
} catch (MalformedURLException e) {
|
||||
LOG.trace("Don't create an entry with a bad URL", e);
|
||||
LOG.debug("Bad url: [{}]", loc);
|
||||
}
|
||||
}
|
||||
}
|
||||
sitemapIndex.setProcessed(true);
|
||||
return sitemapIndex;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse the XML document, looking for a <b>feed</b> element to determine if
|
||||
* it's an <b>Atom doc</b>, or for a <b>channel</b> element to determine if it's an <b>RSS
|
||||
* doc</b>.
|
||||
*
|
||||
* @param sitemapUrl
|
||||
* the URL location of the Sitemap
|
||||
* @param doc
|
||||
* - XML document to parse
|
||||
* @return The sitemap
|
||||
* @throws UnknownFormatException
|
||||
* if XML does not appear to be Atom or RSS
|
||||
*/
|
||||
protected SiteMap parseSyndicationFormat(URL sitemapUrl, Document doc) throws UnknownFormatException {
|
||||
|
||||
SiteMap sitemap = new SiteMap(sitemapUrl);
|
||||
|
||||
// See if this is an Atom feed by looking for "feed" element
|
||||
NodeList list = doc.getElementsByTagNameNS("*", "feed");
|
||||
if (list.getLength() > 0) {
|
||||
parseAtom(sitemap, (Element) list.item(0), doc);
|
||||
sitemap.setProcessed(true);
|
||||
return sitemap;
|
||||
} else {
|
||||
// See if it is a RSS feed by looking for a "channel" element. This
|
||||
// avoids the issue
|
||||
// of having the outer tag named <rdf:RDF> that was causing this
|
||||
// code to fail. Inside of
|
||||
// the <rss> or <rdf> tag is a <channel> tag, so we can use that.
|
||||
// See https://github.com/crawler-commons/crawler-commons/issues/87
|
||||
// and also RSS 1.0 specification
|
||||
// http://web.resource.org/rss/1.0/spec
|
||||
list = doc.getElementsByTagNameNS("*", "channel");
|
||||
if (list.getLength() > 0) {
|
||||
parseRSS(sitemap, doc);
|
||||
} catch (IOException e) {
|
||||
LOG.warn("Error parsing sitemap {}: {}", sitemapUrl, e.getMessage());
|
||||
UnknownFormatException ufe = new UnknownFormatException("Failed to parse " + sitemapUrl);
|
||||
ufe.initCause(e);
|
||||
throw ufe;
|
||||
} catch (SAXException e) {
|
||||
LOG.warn("Error parsing sitemap {}: {}", sitemapUrl, e.getMessage());
|
||||
AbstractSiteMap sitemap = handler.getSiteMap();
|
||||
if (allowPartial && sitemap != null) {
|
||||
LOG.warn("Processed broken/partial sitemap for '" + sitemapUrl + "'");
|
||||
sitemap.setProcessed(true);
|
||||
return sitemap;
|
||||
} else {
|
||||
throw new UnknownFormatException("Unknown syndication format at " + sitemapUrl);
|
||||
UnknownFormatException ufe = new UnknownFormatException("Failed to parse " + sitemapUrl);
|
||||
ufe.initCause(e);
|
||||
throw ufe;
|
||||
}
|
||||
} catch (ParserConfigurationException e) {
|
||||
throw new IllegalStateException(e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* Parse the XML document which is assumed to be in Atom format. Atom 1.0
|
||||
* example:
|
||||
* </p>
|
||||
*
|
||||
* <pre>
|
||||
* {@code
|
||||
* <?xml version="1.0" encoding="utf-8"?>
|
||||
* <feed xmlns="http://www.w3.org/2005/Atom">
|
||||
* <title>Example Feed</title>
|
||||
* <subtitle>A subtitle.</subtitle>
|
||||
* <link href="http://example.org/feed/" rel="self"/>
|
||||
* <link href="http://example.org/"/>
|
||||
* <modified>2003-12-13T18:30:02Z</modified>
|
||||
* <author>
|
||||
* <name>John Doe</name>
|
||||
* <email>johndoe@example.com</email>
|
||||
* </author>
|
||||
* <id>urn:uuid:60a76c80-d399-11d9-b91C-0003939e0af6</id>
|
||||
* <entry>
|
||||
* <title>Atom-Powered Robots Run Amok</title>
|
||||
* <link href="http://example.org/2003/12/13/atom03"/>
|
||||
* <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
|
||||
* <updated>2003-12-13T18:30:02Z</updated>
|
||||
* <summary>Some text.</summary>
|
||||
* </entry>
|
||||
* ...
|
||||
* </feed>
|
||||
* }
|
||||
* </pre>
|
||||
*
|
||||
* @param sitemap
|
||||
* a {@link crawlercommons.sitemaps.SiteMap} backing the Atom
|
||||
* feed
|
||||
* @param elem
|
||||
* {@link org.w3c.dom.Element}'s to populate from the Sitemap
|
||||
* @param doc
|
||||
* {@link org.w3c.dom.Document} to populate with the parse output
|
||||
*/
|
||||
protected void parseAtom(SiteMap sitemap, Element elem, Document doc) {
|
||||
|
||||
// Grab items from <feed><entry><link href="URL" /></entry></feed>
|
||||
// Use lastmod date from <feed><modified>DATE</modified></feed>
|
||||
|
||||
LOG.debug("Parsing Atom XML");
|
||||
|
||||
sitemap.setType(SitemapType.ATOM);
|
||||
|
||||
String lastMod = getElementValue(elem, "modified");
|
||||
LOG.debug("lastMod = {}", lastMod);
|
||||
|
||||
NodeList list = doc.getElementsByTagNameNS("*", "entry");
|
||||
|
||||
// Loop through the <entry>s
|
||||
for (int i = 0; i < list.getLength() && i < MAX_URLS; i++) {
|
||||
|
||||
Node n = list.item(i);
|
||||
if (n.getNodeType() == Node.ELEMENT_NODE) {
|
||||
elem = (Element) n;
|
||||
String href = getElementAttributeValue(elem, "link", "href");
|
||||
|
||||
addUrlIntoSitemap(href, sitemap, lastMod, null, null, i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse XML document which is assumed to be in RSS format. RSS 2.0 example:
|
||||
*
|
||||
* <pre>
|
||||
* {@code
|
||||
* <?xml version="1.0"?>
|
||||
* <rss version="2.0">
|
||||
* <channel>
|
||||
* <title>Lift Off News</title>
|
||||
* <link>http://liftoff.msfc.nasa.gov/</link>
|
||||
* <description>Liftoff to Space Exploration.</description>
|
||||
* <language>en-us</language>
|
||||
* <pubDate>Tue, 10 Jun 2003 04:00:00 GMT</pubDate>
|
||||
* <lastBuildDate>Tue, 10 Jun 2003 09:41:01 GMT</lastBuildDate>
|
||||
* <docs>http://blogs.law.harvard.edu/tech/rss</docs>
|
||||
* <generator>Weblog Editor 2.0</generator>
|
||||
* <managingEditor>editor@example.com</managingEditor>
|
||||
* <webMaster>webmaster@example.com</webMaster>
|
||||
* <ttl>5</ttl>
|
||||
* <item>
|
||||
* <title>Star City</title>
|
||||
* <link>http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp</link>
|
||||
* <description>How do Americans get ready to work with Russians aboard the
|
||||
* International Space Station? They take a crash course in culture,
|
||||
* language and protocol at Russia's Star City.
|
||||
* </description>
|
||||
* <pubDate>Tue, 03 Jun 2003 09:39:21 GMT</pubDate>
|
||||
* <guid>http://liftoff.msfc.nasa.gov/2003/06/03.html#item573</guid>
|
||||
* </item>
|
||||
* <item>
|
||||
* <title>Space Exploration</title>
|
||||
* <link>http://liftoff.msfc.nasa.gov/</link>
|
||||
* <description>Sky watchers in Europe, Asia, and parts of Alaska and Canada
|
||||
* will experience a partial eclipse of the Sun on Saturday, May 31.
|
||||
* </description>
|
||||
* <pubDate>Fri, 30 May 2003 11:06:42 GMT</pubDate>
|
||||
* <guid>http://liftoff.msfc.nasa.gov/2003/05/30.html#item572</guid>
|
||||
* </item>
|
||||
* </channel>
|
||||
* </rss>
|
||||
* }
|
||||
* </pre>
|
||||
*
|
||||
* @param sitemap
|
||||
* a {@link crawlercommons.sitemaps.SiteMap} object to populate
|
||||
* with the RCC content
|
||||
* @param doc
|
||||
* {@link org.w3c.dom.Document} to populate with the parse output
|
||||
*/
|
||||
protected void parseRSS(SiteMap sitemap, Document doc) {
|
||||
|
||||
// Grab items from <item><link>URL</link></item>
|
||||
// and last modified date from <pubDate>DATE</pubDate>
|
||||
|
||||
LOG.debug("Parsing RSS doc");
|
||||
sitemap.setType(SitemapType.RSS);
|
||||
NodeList list = doc.getElementsByTagNameNS("*", "channel");
|
||||
Element elem = (Element) list.item(0);
|
||||
|
||||
// Treat publication date as last mod (Tue, 10 Jun 2003 04:00:00 GMT)
|
||||
String channelLastMod = AbstractSiteMap.normalizeRSSTimestamp(getElementValue(elem, "pubDate"));
|
||||
LOG.debug("channel's lastMod = {}", channelLastMod);
|
||||
sitemap.setLastModified(channelLastMod);
|
||||
|
||||
list = doc.getElementsByTagNameNS("*", "item");
|
||||
// Loop through the <item>s
|
||||
for (int i = 0; i < list.getLength() && i < MAX_URLS; i++) {
|
||||
|
||||
Node n = list.item(i);
|
||||
if (n.getNodeType() == Node.ELEMENT_NODE) {
|
||||
elem = (Element) n;
|
||||
String link = getElementValue(elem, "link");
|
||||
String itemLastMod = AbstractSiteMap.normalizeRSSTimestamp(getElementValue(elem, "pubDate"));
|
||||
|
||||
addUrlIntoSitemap(link, sitemap, itemLastMod, null, null, i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the element's textual content. Find element under parent element,
|
||||
* with namespaceURI and element local-name "elementName".
|
||||
*
|
||||
* @param namespaceURI
|
||||
* @param elem
|
||||
* @param elementName
|
||||
* @return The element value
|
||||
*/
|
||||
protected String getElementValue(String namespaceURI, Element elem, String elementName) {
|
||||
|
||||
NodeList list = elem.getElementsByTagNameNS(namespaceURI, elementName);
|
||||
if (list == null)
|
||||
return null;
|
||||
Element e = (Element) list.item(0);
|
||||
if (e != null) {
|
||||
return e.getTextContent();
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the element's textual content. This will match any namespace
|
||||
* (elementName is the localName).
|
||||
*
|
||||
* @param elem
|
||||
* The element is a child of "elem"
|
||||
* @param elementName
|
||||
* The element name is "elementName".
|
||||
* @return The element value
|
||||
*/
|
||||
protected String getElementValue(Element elem, String elementName) {
|
||||
|
||||
return getElementValue("*", elem, elementName);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the element's attribute value.
|
||||
*
|
||||
* @param elem
|
||||
* @param elementName
|
||||
* @param attributeName
|
||||
* @return The element attribute value
|
||||
*/
|
||||
protected String getElementAttributeValue(Element elem, String elementName, String attributeName) {
|
||||
|
||||
NodeList list = elem.getElementsByTagNameNS("*", elementName);
|
||||
Element e = (Element) list.item(0);
|
||||
if (e != null) {
|
||||
return e.getAttribute(attributeName);
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds the given URL to the given sitemap while showing the relevant logs
|
||||
*
|
||||
|
|
|
@ -1,515 +0,0 @@
|
|||
/**
|
||||
* Copyright 2016 Crawler-Commons
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package crawlercommons.sitemaps;
|
||||
|
||||
import static java.nio.charset.StandardCharsets.UTF_8;
|
||||
import static org.apache.tika.mime.MediaType.APPLICATION_XML;
|
||||
import static org.apache.tika.mime.MediaType.TEXT_PLAIN;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.StringReader;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
import javax.xml.parsers.SAXParser;
|
||||
import javax.xml.parsers.SAXParserFactory;
|
||||
|
||||
import org.apache.commons.io.FilenameUtils;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.io.input.BOMInputStream;
|
||||
import org.apache.tika.Tika;
|
||||
import org.apache.tika.mime.MediaType;
|
||||
import org.apache.tika.mime.MediaTypeRegistry;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.xml.sax.EntityResolver;
|
||||
import org.xml.sax.InputSource;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
import crawlercommons.sitemaps.AbstractSiteMap.SitemapType;
|
||||
import crawlercommons.sitemaps.sax.DelegatorHandler;
|
||||
|
||||
public class SiteMapParserSAX extends SiteMapParser {
|
||||
public static final Logger LOG = LoggerFactory.getLogger(SiteMapParserSAX.class);
|
||||
|
||||
/**
|
||||
* According to the specs, 50K URLs per Sitemap is the max
|
||||
*/
|
||||
private static final int MAX_URLS = 50000;
|
||||
|
||||
/**
|
||||
* Sitemaps (including sitemap index files) "must be no larger than
|
||||
* 50MB (52,428,800 bytes)" as specified in the
|
||||
* <a href="https://www.sitemaps.org/protocol.html#index">Sitemaps XML
|
||||
* format</a> (before Nov. 2016 the limit has been 10MB).
|
||||
*/
|
||||
public static final int MAX_BYTES_ALLOWED = 52428800;
|
||||
|
||||
/* Tika's MediaType components */
|
||||
private static final Tika TIKA = new Tika();
|
||||
private static final MediaTypeRegistry MEDIA_TYPE_REGISTRY = MediaTypeRegistry.getDefaultRegistry();
|
||||
|
||||
private static final List<MediaType> XML_MEDIA_TYPES = new ArrayList<>();
|
||||
private static final List<MediaType> TEXT_MEDIA_TYPES = new ArrayList<>();
|
||||
private static final List<MediaType> GZ_MEDIA_TYPES = new ArrayList<>();
|
||||
|
||||
static {
|
||||
initMediaTypes();
|
||||
}
|
||||
|
||||
/**
|
||||
* True (by default) meaning that invalid URLs should be rejected, as the
|
||||
* official docs allow the siteMapURLs to be only under the base url:
|
||||
* http://www.sitemaps.org/protocol.html#location
|
||||
*/
|
||||
protected boolean strict = true;
|
||||
|
||||
private boolean allowPartial = false;
|
||||
|
||||
/**
|
||||
* Indicates whether the parser should work with the namespace from the
|
||||
* specifications or any namespace. Defaults to false.
|
||||
**/
|
||||
protected boolean strictNamespace = false;
|
||||
|
||||
/**
 * Creates a parser with the strict flag set to {@code true} and partial
 * parsing disabled.
 */
public SiteMapParserSAX() {
    // same defaults as before: strict = true, allowPartial = false
    this(true);
}
|
||||
|
||||
/**
 * Creates a parser with partial parsing disabled.
 *
 * @param strict
 *            value for the strict flag (whether invalid URLs should be
 *            rejected)
 */
public SiteMapParserSAX(boolean strict) {
    this(strict, false);
}
|
||||
|
||||
/**
 * Creates a parser with both flags given explicitly.
 *
 * @param strict
 *            value for the strict flag (whether invalid URLs should be
 *            rejected)
 * @param allowPartial
 *            value for the allowPartial flag; presumably controls whether
 *            partially parsed sitemaps are accepted (its use is not
 *            visible here)
 */
public SiteMapParserSAX(boolean strict, boolean allowPartial) {
    this.allowPartial = allowPartial;
    this.strict = strict;
}
|
||||
|
||||
/**
|
||||
* @return whether invalid URLs will be rejected (where invalid means that
|
||||
* the url is not under the base url)
|
||||
*/
|
||||
public boolean isStrict() {
|
||||
return strict;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return whether the parser allows any namespace or just the one from the
|
||||
* specification
|
||||
*/
|
||||
public boolean isStrictNamespace() {
|
||||
return strictNamespace;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the parser to allow any namespace or just the one from the
|
||||
* specification
|
||||
*/
|
||||
public void setStrictNamespace(boolean s) {
|
||||
strictNamespace = s;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a SiteMap or SiteMapIndex given an online sitemap URL
|
||||
*
|
||||
* Please note that this method is a static method which goes online and
|
||||
* fetches the sitemap then parses it
|
||||
*
|
||||
* This method is a convenience method for a user who has a sitemap URL and
|
||||
* wants a "Keep it simple" way to parse it.
|
||||
*
|
||||
* @param onlineSitemapUrl
|
||||
* URL of the online sitemap
|
||||
* @return Extracted SiteMap/SiteMapIndex or null if the onlineSitemapUrl is
|
||||
* null
|
||||
* @throws UnknownFormatException
|
||||
* if there is an error parsing the sitemap
|
||||
* @throws IOException
|
||||
* if there is an error reading in the site map
|
||||
* {@link java.net.URL}
|
||||
*/
|
||||
public AbstractSiteMap parseSiteMap(URL onlineSitemapUrl) throws UnknownFormatException, IOException {
|
||||
if (onlineSitemapUrl == null) {
|
||||
return null;
|
||||
}
|
||||
byte[] bytes = IOUtils.toByteArray(onlineSitemapUrl);
|
||||
return parseSiteMap(bytes, onlineSitemapUrl);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a processed copy of an unprocessed sitemap object, i.e. transfer
|
||||
* the value of getLastModified(). Please note that the sitemap input stays
|
||||
* unchanged. Note that contentType is assumed to be correct; in general it
|
||||
* is more robust to use the method that doesn't take a contentType, but
|
||||
* instead detects this using Tika.
|
||||
*
|
||||
* @param contentType
|
||||
* MIME type of content
|
||||
* @param content
|
||||
* raw bytes of sitemap file
|
||||
* @param sitemap
|
||||
* an {@link crawlercommons.sitemaps.AbstractSiteMap}
|
||||
* implementation
|
||||
* @return Extracted SiteMap/SiteMapIndex
|
||||
* @throws UnknownFormatException
|
||||
* if there is an error parsing the sitemap
|
||||
* @throws IOException
|
||||
* if there is an error reading in the site map
|
||||
* {@link java.net.URL}
|
||||
*/
|
||||
public AbstractSiteMap parseSiteMap(String contentType, byte[] content, final AbstractSiteMap sitemap) throws UnknownFormatException, IOException {
|
||||
AbstractSiteMap asmCopy = parseSiteMap(contentType, content, sitemap.getUrl());
|
||||
asmCopy.setLastModified(sitemap.getLastModified());
|
||||
return asmCopy;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse a sitemap, given the content bytes and the URL.
|
||||
*
|
||||
* @param content
|
||||
* raw bytes of sitemap file
|
||||
* @param url
|
||||
* URL to sitemap file
|
||||
* @return Extracted SiteMap/SiteMapIndex
|
||||
* @throws UnknownFormatException
|
||||
* if there is an error parsing the sitemap
|
||||
* @throws IOException
|
||||
* if there is an error reading in the site map
|
||||
* {@link java.net.URL}
|
||||
*/
|
||||
public AbstractSiteMap parseSiteMap(byte[] content, URL url) throws UnknownFormatException, IOException {
|
||||
if (url == null) {
|
||||
return null;
|
||||
}
|
||||
String filename = FilenameUtils.getName(url.getPath());
|
||||
String contentType = TIKA.detect(content, filename);
|
||||
return parseSiteMap(contentType, content, url);
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse a sitemap, given the MIME type, the content bytes, and the URL.
|
||||
* Note that contentType is assumed to be correct; in general it is more
|
||||
* robust to use the method that doesn't take a contentType, but instead
|
||||
* detects this using Tika.
|
||||
*
|
||||
* @param contentType
|
||||
* MIME type of content
|
||||
* @param content
|
||||
* raw bytes of sitemap file
|
||||
* @param url
|
||||
* URL to sitemap file
|
||||
* @return Extracted SiteMap/SiteMapIndex
|
||||
* @throws UnknownFormatException
|
||||
* if there is an error parsing the sitemap
|
||||
* @throws IOException
|
||||
* if there is an error reading in the site map
|
||||
* {@link java.net.URL}
|
||||
*/
|
||||
public AbstractSiteMap parseSiteMap(String contentType, byte[] content, URL url) throws UnknownFormatException, IOException {
|
||||
MediaType mediaType = MediaType.parse(contentType);
|
||||
|
||||
// Octet-stream is the father of all binary types
|
||||
while (mediaType != null && !mediaType.equals(MediaType.OCTET_STREAM)) {
|
||||
if (XML_MEDIA_TYPES.contains(mediaType)) {
|
||||
return processXml(url, content);
|
||||
} else if (TEXT_MEDIA_TYPES.contains(mediaType)) {
|
||||
return processText(url, content);
|
||||
} else if (GZ_MEDIA_TYPES.contains(mediaType)) {
|
||||
InputStream decompressed;
|
||||
MediaType embeddedType;
|
||||
try {
|
||||
decompressed = new GZIPInputStream(new ByteArrayInputStream(content));
|
||||
embeddedType = MediaType.parse(TIKA.detect(decompressed));
|
||||
} catch (Exception e) {
|
||||
UnknownFormatException err = new UnknownFormatException("Failed to detect embedded MediaType of gzipped sitemap: " + url + ", caused by " + e);
|
||||
err.initCause(e);
|
||||
throw err;
|
||||
}
|
||||
if (XML_MEDIA_TYPES.contains(embeddedType)) {
|
||||
return processGzippedXML(url, content);
|
||||
} else if (TEXT_MEDIA_TYPES.contains(embeddedType)) {
|
||||
// re-open decompressed stream and parse as text
|
||||
decompressed = new GZIPInputStream(new ByteArrayInputStream(content));
|
||||
return processText(url, decompressed);
|
||||
} else if (GZ_MEDIA_TYPES.contains(embeddedType)) {
|
||||
throw new UnknownFormatException("Can't parse gzip recursively: " + url);
|
||||
}
|
||||
throw new UnknownFormatException("Can't parse a gzipped sitemap with the embedded MediaType of: " + embeddedType + " (at: " + url + ")");
|
||||
}
|
||||
mediaType = MEDIA_TYPE_REGISTRY.getSupertype(mediaType); // Check
|
||||
// parent
|
||||
}
|
||||
|
||||
throw new UnknownFormatException("Can't parse a sitemap with the MediaType of: " + contentType + " (at: " + url + ")");
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse the given XML content.
|
||||
*
|
||||
* @param sitemapUrl
|
||||
* URL to sitemap file
|
||||
* @param xmlContent
|
||||
* the byte[] backing the sitemapUrl
|
||||
* @return The site map
|
||||
* @throws UnknownFormatException
|
||||
* if there is an error parsing the sitemap
|
||||
*/
|
||||
protected AbstractSiteMap processXml(URL sitemapUrl, byte[] xmlContent) throws UnknownFormatException {
|
||||
|
||||
BOMInputStream bomIs = new BOMInputStream(new ByteArrayInputStream(xmlContent));
|
||||
InputSource is = new InputSource();
|
||||
is.setCharacterStream(new BufferedReader(new InputStreamReader(bomIs, UTF_8)));
|
||||
|
||||
return processXml(sitemapUrl, is);
|
||||
}
|
||||
|
||||
/**
|
||||
* Process a text-based Sitemap. Text sitemaps only list URLs but no
|
||||
* priorities, last mods, etc.
|
||||
*
|
||||
* @param sitemapUrl
|
||||
* URL to sitemap file
|
||||
* @param content
|
||||
* the byte[] backing the sitemapUrl
|
||||
* @return The site map
|
||||
* @throws IOException
|
||||
* if there is an error reading in the site map content
|
||||
*/
|
||||
protected SiteMap processText(URL sitemapUrl, byte[] content) throws IOException {
|
||||
return processText(sitemapUrl, new ByteArrayInputStream(content));
|
||||
}
|
||||
|
||||
/**
|
||||
* Process a text-based Sitemap. Text sitemaps only list URLs but no
|
||||
* priorities, last mods, etc.
|
||||
*
|
||||
* @param sitemapUrl
|
||||
* URL to sitemap file
|
||||
* @param stream
|
||||
* content stream
|
||||
* @return The site map
|
||||
* @throws IOException
|
||||
* if there is an error reading in the site map content
|
||||
*/
|
||||
protected SiteMap processText(URL sitemapUrl, InputStream stream) throws IOException {
|
||||
LOG.debug("Processing textual Sitemap");
|
||||
|
||||
SiteMap textSiteMap = new SiteMap(sitemapUrl);
|
||||
textSiteMap.setType(SitemapType.TEXT);
|
||||
|
||||
BOMInputStream bomIs = new BOMInputStream(stream);
|
||||
@SuppressWarnings("resource")
|
||||
BufferedReader reader = new BufferedReader(new InputStreamReader(bomIs, UTF_8));
|
||||
|
||||
String line;
|
||||
int i = 1;
|
||||
while ((line = reader.readLine()) != null) {
|
||||
if (line.length() > 0 && i <= MAX_URLS) {
|
||||
addUrlIntoSitemap(line, textSiteMap, null, null, null, i++);
|
||||
}
|
||||
}
|
||||
textSiteMap.setProcessed(true);
|
||||
|
||||
return textSiteMap;
|
||||
}
|
||||
|
||||
/**
|
||||
* Decompress the gzipped content and process the resulting XML Sitemap.
|
||||
*
|
||||
* @param url
|
||||
* - URL of the gzipped content
|
||||
* @param response
|
||||
* - Gzipped content
|
||||
* @return the site map
|
||||
* @throws UnknownFormatException
|
||||
* if there is an error parsing the gzip
|
||||
* @throws IOException
|
||||
* if there is an error reading in the gzip {@link java.net.URL}
|
||||
*/
|
||||
protected AbstractSiteMap processGzippedXML(URL url, byte[] response) throws IOException, UnknownFormatException {
|
||||
|
||||
LOG.debug("Processing gzipped XML");
|
||||
|
||||
InputStream is = new ByteArrayInputStream(response);
|
||||
|
||||
// Remove .gz ending
|
||||
String xmlUrl = url.toString().replaceFirst("\\.gz$", "");
|
||||
LOG.debug("XML url = {}", xmlUrl);
|
||||
|
||||
BOMInputStream decompressed = new BOMInputStream(new GZIPInputStream(is));
|
||||
InputSource in = new InputSource(decompressed);
|
||||
in.setSystemId(xmlUrl);
|
||||
return processXml(url, in);
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse the given XML content.
|
||||
*
|
||||
* @param sitemapUrl
|
||||
* a sitemap {@link java.net.URL}
|
||||
* @param is
|
||||
* an {@link org.xml.sax.InputSource} backing the sitemap
|
||||
* @return the site map
|
||||
* @throws UnknownFormatException
|
||||
* if there is an error parsing the
|
||||
* {@link org.xml.sax.InputSource}
|
||||
*/
|
||||
protected AbstractSiteMap processXml(URL sitemapUrl, InputSource is) throws UnknownFormatException {
|
||||
|
||||
SAXParserFactory factory = SAXParserFactory.newInstance();
|
||||
|
||||
// disable validation and avoid that remote DTDs, schemas, etc. are
|
||||
// fetched
|
||||
factory.setValidating(false);
|
||||
factory.setXIncludeAware(false);
|
||||
|
||||
// support the use of an explicit namespace.
|
||||
factory.setNamespaceAware(true);
|
||||
|
||||
try {
|
||||
factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException("Failed to configure XML parser: " + e.toString());
|
||||
}
|
||||
DelegatorHandler handler = new DelegatorHandler(sitemapUrl, strict);
|
||||
try {
|
||||
SAXParser saxParser = factory.newSAXParser();
|
||||
saxParser.getXMLReader().setEntityResolver(new EntityResolver() {
|
||||
// noop entity resolver, does not fetch remote content
|
||||
@Override
|
||||
public InputSource resolveEntity(String publicId, String systemId) {
|
||||
return new InputSource(new StringReader(""));
|
||||
}
|
||||
});
|
||||
handler.setStrictNamespace(isStrictNamespace());
|
||||
saxParser.parse(is, handler);
|
||||
AbstractSiteMap sitemap = handler.getSiteMap();
|
||||
if (sitemap == null) {
|
||||
UnknownFormatException ex = handler.getException();
|
||||
if (ex != null) {
|
||||
throw ex;
|
||||
}
|
||||
throw new UnknownFormatException("Unknown XML format for: " + sitemapUrl);
|
||||
}
|
||||
return sitemap;
|
||||
} catch (IOException e) {
|
||||
LOG.warn("Error parsing sitemap {}: {}", sitemapUrl, e.getMessage());
|
||||
UnknownFormatException ufe = new UnknownFormatException("Failed to parse " + sitemapUrl);
|
||||
ufe.initCause(e);
|
||||
throw ufe;
|
||||
} catch (SAXException e) {
|
||||
LOG.warn("Error parsing sitemap {}: {}", sitemapUrl, e.getMessage());
|
||||
AbstractSiteMap sitemap = handler.getSiteMap();
|
||||
if (allowPartial && sitemap != null) {
|
||||
LOG.warn("Processed broken/partial sitemap for '" + sitemapUrl + "'");
|
||||
sitemap.setProcessed(true);
|
||||
return sitemap;
|
||||
} else {
|
||||
UnknownFormatException ufe = new UnknownFormatException("Failed to parse " + sitemapUrl);
|
||||
ufe.initCause(e);
|
||||
throw ufe;
|
||||
}
|
||||
} catch (ParserConfigurationException e) {
|
||||
throw new IllegalStateException(e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds the given URL to the given sitemap while showing the relevant logs
|
||||
*
|
||||
* @param urlStr
|
||||
* an URL string to add to the
|
||||
* {@link crawlercommons.sitemaps.SiteMap}
|
||||
* @param siteMap
|
||||
* the sitemap to add URL(s) to
|
||||
* @param lastMod
|
||||
* last time the {@link crawlercommons.sitemaps.SiteMapURL} was
|
||||
* modified
|
||||
* @param changeFreq
|
||||
* the {@link crawlercommons.sitemaps.SiteMapURL} change frquency
|
||||
* @param priority
|
||||
* priority of this {@link crawlercommons.sitemaps.SiteMapURL}
|
||||
* @param urlIndex
|
||||
* index position to which this entry has been added
|
||||
*/
|
||||
protected void addUrlIntoSitemap(String urlStr, SiteMap siteMap, String lastMod, String changeFreq, String priority, int urlIndex) {
|
||||
try {
|
||||
URL url = new URL(urlStr); // Checking the URL
|
||||
boolean valid = urlIsValid(siteMap.getBaseUrl(), url.toString());
|
||||
|
||||
if (valid || !strict) {
|
||||
SiteMapURL sUrl = new SiteMapURL(url.toString(), lastMod, changeFreq, priority, valid);
|
||||
siteMap.addSiteMapUrl(sUrl);
|
||||
LOG.debug(" {}. {}", urlIndex + 1, sUrl);
|
||||
} else {
|
||||
LOG.warn("URL: {} is excluded from the sitemap as it is not a valid url = not under the base url: {}", url.toExternalForm(), siteMap.getBaseUrl());
|
||||
}
|
||||
} catch (MalformedURLException e) {
|
||||
LOG.warn("Bad url: [{}]", urlStr);
|
||||
LOG.trace("Can't create a sitemap entry with a bad URL", e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* See if testUrl is under sitemapBaseUrl. Only URLs under sitemapBaseUrl
|
||||
* are valid.
|
||||
*
|
||||
* @param sitemapBaseUrl
|
||||
* @param testUrl
|
||||
* @return true if testUrl is under sitemapBaseUrl, false otherwise
|
||||
*/
|
||||
public static boolean urlIsValid(String sitemapBaseUrl, String testUrl) {
|
||||
boolean ret = false;
|
||||
|
||||
// Don't try a comparison if the URL is too short to match
|
||||
if (sitemapBaseUrl != null && sitemapBaseUrl.length() <= testUrl.length()) {
|
||||
String u = testUrl.substring(0, sitemapBaseUrl.length());
|
||||
ret = sitemapBaseUrl.equals(u);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* Performs a one time intialization of Tika's Media-Type components and
|
||||
* media type collection constants <br/>
|
||||
* Please note that this is a private static method which is called once per
|
||||
* CLASS (not per instance / object)
|
||||
*/
|
||||
private static void initMediaTypes() {
|
||||
/* XML media types (and all aliases) */
|
||||
XML_MEDIA_TYPES.add(APPLICATION_XML);
|
||||
XML_MEDIA_TYPES.addAll(MEDIA_TYPE_REGISTRY.getAliases(APPLICATION_XML));
|
||||
|
||||
/* TEXT media types (and all aliases) */
|
||||
TEXT_MEDIA_TYPES.add(TEXT_PLAIN);
|
||||
TEXT_MEDIA_TYPES.addAll(MEDIA_TYPE_REGISTRY.getAliases(TEXT_PLAIN));
|
||||
|
||||
/* GZIP media types (and all aliases) */
|
||||
MediaType gzipMediaType = MediaType.parse("application/gzip");
|
||||
GZ_MEDIA_TYPES.add(gzipMediaType);
|
||||
GZ_MEDIA_TYPES.addAll(MEDIA_TYPE_REGISTRY.getAliases(gzipMediaType));
|
||||
}
|
||||
}
|
|
@ -31,8 +31,7 @@ import org.slf4j.LoggerFactory;
|
|||
public class SiteMapTester {
|
||||
|
||||
private static final Logger LOG = LoggerFactory.getLogger(SiteMapTester.class);
|
||||
private static SiteMapParser parser = new SiteMapParser(false);
|
||||
private static SiteMapParser saxParser = new SiteMapParserSAX(false, true);
|
||||
private static SiteMapParser saxParser = new SiteMapParser(false, true);
|
||||
|
||||
public static void main(String[] args) throws IOException, UnknownFormatException {
|
||||
if (args.length < 1) {
|
||||
|
@ -43,7 +42,6 @@ public class SiteMapTester {
|
|||
LOG.error(" MIME_TYPE force processing sitemap as MIME type,");
|
||||
LOG.error(" bypass automatic MIME type detection");
|
||||
LOG.error("Java properties:");
|
||||
LOG.error(" sitemap.useSax if true use SAX parser to process sitemaps");
|
||||
LOG.error(" sitemap.strictNamespace");
|
||||
LOG.error(" if true sitemaps are required to use the standard namespace URI");
|
||||
} else {
|
||||
|
@ -61,23 +59,17 @@ public class SiteMapTester {
|
|||
private static void parse(URL url, String mt) throws IOException, UnknownFormatException {
|
||||
byte[] content = IOUtils.toByteArray(url);
|
||||
|
||||
boolean useSaxParser = new Boolean(System.getProperty("sitemap.useSax"));
|
||||
LOG.info("Parsing {} {}", url, ((mt != null && !mt.isEmpty()) ? "as MIME type " + mt : ""));
|
||||
|
||||
boolean strictNamespace = new Boolean(System.getProperty("sitemap.strictNamespace"));
|
||||
|
||||
LOG.info("Parsing {} {} using {} parser", url, ((mt != null && !mt.isEmpty()) ? "as MIME type " + mt : ""), (useSaxParser ? "SAX" : "DOM"));
|
||||
|
||||
SiteMapParser p = parser;
|
||||
if (useSaxParser) {
|
||||
p = saxParser;
|
||||
}
|
||||
p.setStrictNamespace(strictNamespace);
|
||||
saxParser.setStrictNamespace(strictNamespace);
|
||||
|
||||
AbstractSiteMap sm = null;
|
||||
// guesses the mimetype
|
||||
if (mt == null || mt.equals("")) {
|
||||
sm = p.parseSiteMap(content, url);
|
||||
sm = saxParser.parseSiteMap(content, url);
|
||||
} else {
|
||||
sm = p.parseSiteMap(mt, content, url);
|
||||
sm = saxParser.parseSiteMap(mt, content, url);
|
||||
}
|
||||
|
||||
if (sm.isIndex()) {
|
||||
|
|
|
@ -177,7 +177,8 @@ public class EffectiveTldFinderTest {
|
|||
assertEquals("xn--80abbembcyvesfij3at4loa4ff.xn--p1ai", ad);
|
||||
// rare but possible mixed use of UTF-8 and Punycode
|
||||
ad = EffectiveTldFinder.getAssignedDomain("xn--90a1af.бесплатныеобъявления.рф");
|
||||
// assertEquals("xn--80abbembcyvesfij3at4loa4ff.xn--p1ai", ad); // TODO #179
|
||||
// TODO #179
|
||||
// assertEquals("xn--80abbembcyvesfij3at4loa4ff.xn--p1ai", ad);
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
|
@ -1,534 +0,0 @@
|
|||
/**
|
||||
* Copyright 2016 Crawler-Commons
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package crawlercommons.sitemaps;
|
||||
|
||||
import static java.nio.charset.StandardCharsets.UTF_8;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.net.URL;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.Date;
|
||||
import java.util.Iterator;
|
||||
import java.util.Locale;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.junit.After;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
import org.junit.runner.RunWith;
|
||||
import org.junit.runners.JUnit4;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import crawlercommons.sitemaps.AbstractSiteMap.SitemapType;
|
||||
|
||||
import static org.junit.Assert.*;
|
||||
|
||||
@RunWith(JUnit4.class)
|
||||
public class SiteMapParserSAXTest {
|
||||
|
||||
private static final Logger LOG = LoggerFactory.getLogger(SiteMapParserSAXTest.class);
|
||||
|
||||
@Before
|
||||
public void setUp() throws Exception {
|
||||
}
|
||||
|
||||
@After
|
||||
public void tearDown() throws Exception {
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSitemapIndex() throws UnknownFormatException, IOException {
|
||||
SiteMapParser parser = new SiteMapParserSAX();
|
||||
String contentType = "text/xml";
|
||||
StringBuilder scontent = new StringBuilder(1024);
|
||||
scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n").append("<sitemapindex xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n").append(" <sitemap>\n")
|
||||
.append(" <loc>http://www.example.com/sitemap1.xml.gz</loc>\n").append(" <lastmod><![CDATA[2004-10-01T18:23:17+00:00]]></lastmod>\n").append(" </sitemap>\n")
|
||||
.append("<sitemap>\n").append(" <loc>http://www.example.com/sitemap2.xml.gz</loc>\n").append(" <lastmod>2005-01-01</lastmod>\n").append(" </sitemap>\n")
|
||||
.append("<sitemap>\n").append(" <loc>http://www.example.com/dynsitemap?date=now&all=true</loc>\n").append(" </sitemap>\n").append("<sitemap>\n")
|
||||
.append(" <loc>http://www.example.com/dynsitemap<![CDATA[?date=lastyear&all=false]]></loc>\n").append(" </sitemap>\n").append("</sitemapindex>");
|
||||
byte[] content = scontent.toString().getBytes(UTF_8);
|
||||
URL url = new URL("http://www.example.com/sitemapindex.xml");
|
||||
|
||||
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
|
||||
assertEquals(true, asm.isIndex());
|
||||
assertEquals(true, asm instanceof SiteMapIndex);
|
||||
|
||||
SiteMapIndex smi = (SiteMapIndex) asm;
|
||||
assertEquals(4, smi.getSitemaps().size());
|
||||
|
||||
AbstractSiteMap currentSiteMap = smi.getSitemap(new URL("http://www.example.com/sitemap1.xml.gz"));
|
||||
assertNotNull(currentSiteMap);
|
||||
assertEquals("http://www.example.com/sitemap1.xml.gz", currentSiteMap.getUrl().toString());
|
||||
assertEquals(SiteMap.convertToDate("2004-10-01T18:23:17+00:00"), currentSiteMap.getLastModified());
|
||||
|
||||
assertTrue(currentSiteMap.toString().contains("T18:23"));
|
||||
|
||||
currentSiteMap = smi.getSitemap(new URL("http://www.example.com/sitemap2.xml.gz"));
|
||||
assertNotNull(currentSiteMap);
|
||||
assertEquals("http://www.example.com/sitemap2.xml.gz", currentSiteMap.getUrl().toString());
|
||||
assertEquals(SiteMap.convertToDate("2005-01-01"), currentSiteMap.getLastModified());
|
||||
|
||||
currentSiteMap = smi.getSitemap(new URL("http://www.example.com/dynsitemap?date=now&all=true"));
|
||||
assertNotNull("<loc> with entities not found", currentSiteMap);
|
||||
assertEquals("http://www.example.com/dynsitemap?date=now&all=true", currentSiteMap.getUrl().toString());
|
||||
|
||||
currentSiteMap = smi.getSitemap(new URL("http://www.example.com/dynsitemap?date=lastyear&all=false"));
|
||||
assertNotNull("<loc> with CDATA not found", currentSiteMap);
|
||||
assertEquals("http://www.example.com/dynsitemap?date=lastyear&all=false", currentSiteMap.getUrl().toString());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSitemapWithNamespace() throws UnknownFormatException, IOException {
|
||||
SiteMapParser parser = new SiteMapParserSAX();
|
||||
parser.setStrictNamespace(true);
|
||||
byte[] content = getResourceAsBytes("src/test/resources/sitemaps/sitemap.ns.xml");
|
||||
|
||||
URL url = new URL("http://www.example.com/sitemap.ns.xml");
|
||||
AbstractSiteMap asm = parser.parseSiteMap(content, url);
|
||||
assertEquals(SitemapType.XML, asm.getType());
|
||||
assertEquals(true, asm instanceof SiteMap);
|
||||
assertEquals(true, asm.isProcessed());
|
||||
SiteMap sm = (SiteMap) asm;
|
||||
|
||||
assertEquals(2, sm.getSiteMapUrls().size());
|
||||
assertEquals(SiteMapURL.ChangeFrequency.DAILY, sm.getSiteMapUrls().iterator().next().getChangeFrequency());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSitemapWithWrongNamespace() throws UnknownFormatException, IOException {
|
||||
SiteMapParser parser = new SiteMapParserSAX();
|
||||
parser.setStrictNamespace(true);
|
||||
|
||||
byte[] content = getResourceAsBytes("src/test/resources/sitemaps/sitemap.badns.xml");
|
||||
|
||||
URL url = new URL("http://www.example.com/sitemap.badns.xml");
|
||||
AbstractSiteMap asm;
|
||||
try {
|
||||
asm = parser.parseSiteMap(content, url);
|
||||
fail("Expected an UnknownFormatException because of wrong namespace");
|
||||
} catch (UnknownFormatException e) {
|
||||
assertTrue(e.getMessage().contains("does not match standard namespace"));
|
||||
}
|
||||
|
||||
// try again in lenient mode
|
||||
parser.setStrictNamespace(false);
|
||||
asm = parser.parseSiteMap(content, url);
|
||||
assertEquals(SitemapType.XML, asm.getType());
|
||||
assertEquals(true, asm instanceof SiteMap);
|
||||
assertEquals(true, asm.isProcessed());
|
||||
SiteMap sm = (SiteMap) asm;
|
||||
|
||||
assertEquals(2, sm.getSiteMapUrls().size());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFullDateFormat() {
|
||||
SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm+hh:00", Locale.ROOT);
|
||||
Date date = new Date();
|
||||
LOG.info(format.format(date));
|
||||
LOG.info(SiteMap.getFullDateFormat().format(date));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSitemapTXT() throws UnknownFormatException, IOException {
|
||||
SiteMapParser parser = new SiteMapParserSAX();
|
||||
String contentType = "text/plain";
|
||||
String scontent = "http://www.example.com/catalog?item=1\nhttp://www.example.com/catalog?item=11";
|
||||
byte[] content = scontent.getBytes(UTF_8);
|
||||
URL url = new URL("http://www.example.com/sitemap.txt");
|
||||
|
||||
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
|
||||
assertEquals(false, asm.isIndex());
|
||||
assertEquals(true, asm instanceof SiteMap);
|
||||
|
||||
SiteMap sm = (SiteMap) asm;
|
||||
assertEquals(2, sm.getSiteMapUrls().size());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSitemapTXTWithXMLExt() throws UnknownFormatException, IOException {
|
||||
SiteMapParser parser = new SiteMapParserSAX();
|
||||
String scontent = "http://www.example.com/catalog?item=1\nhttp://www.example.com/catalog?item=11";
|
||||
byte[] content = scontent.getBytes(UTF_8);
|
||||
URL url = new URL("http://www.example.com/sitemap.xml");
|
||||
String contentType = "text/plain";
|
||||
|
||||
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
|
||||
assertEquals(false, asm.isIndex());
|
||||
assertEquals(true, asm instanceof SiteMap);
|
||||
|
||||
SiteMap sm = (SiteMap) asm;
|
||||
assertEquals(2, sm.getSiteMapUrls().size());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSitemapXML() throws UnknownFormatException, IOException {
|
||||
SiteMapParser parser = new SiteMapParserSAX();
|
||||
String contentType = "text/xml";
|
||||
byte[] content = getXMLSitemapAsBytes();
|
||||
URL url = new URL("http://www.example.com/sitemap.xml");
|
||||
|
||||
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
|
||||
assertEquals(false, asm.isIndex());
|
||||
assertEquals(true, asm instanceof SiteMap);
|
||||
|
||||
SiteMap sm = (SiteMap) asm;
|
||||
assertEquals(5, sm.getSiteMapUrls().size());
|
||||
|
||||
SiteMapURL[] found = sm.getSiteMapUrls().toArray(new SiteMapURL[5]);
|
||||
for (int i = 0; i < found.length; i++) {
|
||||
assertEquals(sitemapURLs[i].replaceAll("&", "&"), found[i].getUrl().toExternalForm());
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSitemapXMLMediaTypes() throws UnknownFormatException, IOException {
|
||||
SiteMapParser parser = new SiteMapParserSAX();
|
||||
byte[] content = getXMLSitemapAsBytes();
|
||||
URL url = new URL("http://www.example.com/sitemap.nonXmlExt");
|
||||
|
||||
final String[] XML_CONTENT_TYPES = new String[] { "text/xml", "application/x-xml", "application/xml", "application/atom+xml", "application/rss+xml" };
|
||||
for (String contentType : XML_CONTENT_TYPES) {
|
||||
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
|
||||
assertEquals(false, asm.isIndex());
|
||||
assertEquals(true, asm instanceof SiteMap);
|
||||
SiteMap sm = (SiteMap) asm;
|
||||
assertEquals(5, sm.getSiteMapUrls().size());
|
||||
SiteMapURL[] found = sm.getSiteMapUrls().toArray(new SiteMapURL[5]);
|
||||
for (int i = 0; i < found.length; i++) {
|
||||
assertEquals(sitemapURLs[i].replaceAll("&", "&"), found[i].getUrl().toExternalForm());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* This Sitemap contains badly formatted XML and can't be read
|
||||
*/
|
||||
@Test(expected = UnknownFormatException.class)
|
||||
public void testSitemapParserBrokenXml() throws IOException, UnknownFormatException {
|
||||
SiteMapParser parser = new SiteMapParserSAX();
|
||||
String contentType = "text/xml";
|
||||
StringBuilder scontent = new StringBuilder(1024);
|
||||
scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>").append("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">")
|
||||
.append("<url><!-- This file is not a valid XML file --></url>").append("<url><loc> http://cs.harding.edu/fmccown/sitemaps/something.html</loc>")
|
||||
.append("</url><!-- missing opening url tag --></url></urlset>");
|
||||
byte[] content = scontent.toString().getBytes(UTF_8);
|
||||
URL url = new URL("http://www.example.com/sitemapindex.xml");
|
||||
|
||||
parser.parseSiteMap(contentType, content, url); // This Sitemap contains
|
||||
// badly formatted XML
|
||||
// and can't be read
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMissingLocSitemapIndexFile() throws UnknownFormatException, IOException {
|
||||
SiteMapParser parser = new SiteMapParserSAX();
|
||||
byte[] content = getResourceAsBytes("src/test/resources/sitemaps/sitemap.index.xml");
|
||||
|
||||
URL url = new URL("http://www.example.com/sitemap.index.xml");
|
||||
AbstractSiteMap asm = parser.parseSiteMap(content, url);
|
||||
assertEquals(true, asm.isIndex());
|
||||
assertEquals(true, asm instanceof SiteMapIndex);
|
||||
SiteMapIndex sm = (SiteMapIndex) asm;
|
||||
assertEquals(15, sm.getSitemaps().size());
|
||||
String sitemap = "https://example.com/sitemap.jss?portalCode=10260&lang=en";
|
||||
assertNotNull("Sitemap " + sitemap + " not found in sitemap index", sm.getSitemap(new URL(sitemap)));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSitemapGZ() throws UnknownFormatException, IOException {
|
||||
SiteMapParser parser = new SiteMapParserSAX();
|
||||
String contentType = "application/gzip";
|
||||
byte[] content = getResourceAsBytes("src/test/resources/sitemaps/xmlSitemap.gz");
|
||||
|
||||
URL url = new URL("http://www.example.com/sitemap.xml.gz");
|
||||
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
|
||||
assertEquals(false, asm.isIndex());
|
||||
assertEquals(true, asm instanceof SiteMap);
|
||||
SiteMap sm = (SiteMap) asm;
|
||||
assertEquals(5, sm.getSiteMapUrls().size());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSitemapTextGZ() throws UnknownFormatException, IOException {
|
||||
SiteMapParser parser = new SiteMapParserSAX();
|
||||
String contentType = "application/gzip";
|
||||
byte[] content = this.getResourceAsBytes("src/test/resources/sitemaps/sitemap.txt.gz");
|
||||
|
||||
URL url = new URL("http://www.example.com/sitemap.txt.gz");
|
||||
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
|
||||
assertEquals(false, asm.isIndex());
|
||||
assertEquals(true, asm instanceof SiteMap);
|
||||
SiteMap sm = (SiteMap) asm;
|
||||
assertEquals(5, sm.getSiteMapUrls().size());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSitemapGZMediaTypes() throws UnknownFormatException, IOException {
|
||||
SiteMapParser parser = new SiteMapParserSAX();
|
||||
byte[] content = getResourceAsBytes("src/test/resources/sitemaps/xmlSitemap.gz");
|
||||
|
||||
final String[] GZ_CONTENT_TYPES = new String[] { "application/gzip", "application/x-gzip", "application/x-gunzip", "application/gzipped", "application/gzip-compressed", "gzip/document" };
|
||||
for (String contentType : GZ_CONTENT_TYPES) {
|
||||
URL url = new URL("http://www.example.com/sitemap");
|
||||
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
|
||||
assertEquals(false, asm.isIndex());
|
||||
assertEquals(true, asm instanceof SiteMap);
|
||||
SiteMap sm = (SiteMap) asm;
|
||||
assertEquals(5, sm.getSiteMapUrls().size());
|
||||
}
|
||||
}
|
||||
|
||||
@Test(expected = UnknownFormatException.class)
|
||||
public void testSitemapWithOctetMediaType() throws UnknownFormatException, IOException {
|
||||
SiteMapParser parser = new SiteMapParserSAX();
|
||||
String contentType = "application/octet-stream";
|
||||
byte[] content = getXMLSitemapAsBytes();
|
||||
URL url = new URL("http://www.example.com/sitemap");
|
||||
|
||||
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
|
||||
assertEquals(false, asm.isIndex());
|
||||
assertEquals(true, asm instanceof SiteMap);
|
||||
|
||||
SiteMap sm = (SiteMap) asm;
|
||||
assertEquals(5, sm.getSiteMapUrls().size());
|
||||
|
||||
SiteMapURL[] found = sm.getSiteMapUrls().toArray(new SiteMapURL[5]);
|
||||
for (int i = 0; i < found.length; i++) {
|
||||
assertEquals(sitemapURLs[i], found[i].getUrl().toExternalForm());
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLenientParser() throws UnknownFormatException, IOException {
|
||||
SiteMapParser parser = new SiteMapParserSAX();
|
||||
String contentType = "text/xml";
|
||||
StringBuilder scontent = new StringBuilder(1024);
|
||||
scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>").append("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">").append("<url>")
|
||||
.append("<loc>http://www.example.com/</loc>").append("</url>").append("</urlset>");
|
||||
byte[] content = scontent.toString().getBytes(UTF_8);
|
||||
|
||||
URL url = new URL("http://www.example.com/subsection/sitemap.xml");
|
||||
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
|
||||
assertEquals(false, asm.isIndex());
|
||||
assertEquals(true, asm instanceof SiteMap);
|
||||
|
||||
SiteMap sm = (SiteMap) asm;
|
||||
assertEquals(0, sm.getSiteMapUrls().size());
|
||||
|
||||
// Now try again with lenient parsing. We should get one invalid URL
|
||||
parser = new SiteMapParserSAX(false);
|
||||
asm = parser.parseSiteMap(contentType, content, url);
|
||||
assertEquals(false, asm.isIndex());
|
||||
assertEquals(true, asm instanceof SiteMap);
|
||||
|
||||
sm = (SiteMap) asm;
|
||||
assertEquals(1, sm.getSiteMapUrls().size());
|
||||
assertFalse(sm.getSiteMapUrls().iterator().next().isValid());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAtomFormat() throws UnknownFormatException, IOException {
|
||||
SiteMapParser parser = new SiteMapParserSAX();
|
||||
byte[] content = getResourceAsBytes("src/test/resources/sitemaps/atom.xml");
|
||||
URL url = new URL("http://example.org/atom.xml");
|
||||
|
||||
SiteMap sm = (SiteMap) parser.parseSiteMap(content, url);
|
||||
assertEquals(1, sm.getSiteMapUrls().size());
|
||||
assertEquals(new URL("http://example.org/2003/12/13/atom03"), sm.getSiteMapUrls().iterator().next().getUrl());
|
||||
}
|
||||
|
||||
/**
|
||||
* Test processing RSS 1.0 sitemaps, which don't have an <rss> tag. E.g.
|
||||
* http://rss.slashdot.org/slashdot/slashdotMain?format=xml
|
||||
*
|
||||
* See https://github.com/crawler-commons/crawler-commons/issues/87
|
||||
*
|
||||
* @throws IOException
|
||||
* @throws UnknownFormatException
|
||||
*/
|
||||
@Test
|
||||
public void testRSS10SyndicationFormat() throws UnknownFormatException, IOException {
|
||||
SiteMapParser parser = new SiteMapParserSAX();
|
||||
|
||||
String contentType = "text/xml";
|
||||
URL url = new URL("http://www.example.com/sitemapindex.xml");
|
||||
StringBuilder scontent = new StringBuilder(1024);
|
||||
scontent.append("<?xml version=\"1.0\"?>")
|
||||
.append("<rdf:RDF xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\" xmlns=\"http://purl.org/rss/1.0/\">")
|
||||
.append("<channel rdf:about=\"http://www.xml.com/xml/news.rss\">")
|
||||
.append("<title>XML.com</title>")
|
||||
.append("<link>http://www.example.com/pub</link>")
|
||||
.append("<description>example.com</description>")
|
||||
.append("<image rdf:resource=\"http://www.example.com/universal/images/xml_tiny.gif\" />")
|
||||
.append("<items><rdf:Seq><rdf:li resource=\"http://www.example.com/pub/2000/08/09/xslt/xslt.html\" />")
|
||||
.append("<rdf:li resource=\"http://www.example.com/pub/2000/08/09/rdfdb/index.html\" /></rdf:Seq></items></channel>")
|
||||
.append("<image rdf:about=\"http://www.example.com/universal/images/xml_tiny.gif\"><title>XML.com</title><link>http://www.xml.com</link>")
|
||||
.append("<url>http://www.example.com/universal/images/xml_tiny.gif</url></image>")
|
||||
.append("<item rdf:about=\"http://www.example.com/pub/2000/08/09/xslt/xslt.html\"><title>Processing Inclusions with XSLT</title>")
|
||||
.append("<link>http://www.example.com/pub/2000/08/09/xslt/xslt.html</link>")
|
||||
.append("<description>Processing document inclusions with general XML tools can be problematic. This article proposes a way of preserving inclusion"
|
||||
+ "information through SAX-based processing. </description> </item> </rdf:RDF>");
|
||||
byte[] content = scontent.toString().getBytes(UTF_8);
|
||||
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
|
||||
assertEquals(false, asm.isIndex());
|
||||
assertEquals(true, asm instanceof SiteMap);
|
||||
|
||||
SiteMap sm = (SiteMap) asm;
|
||||
assertEquals(1, sm.getSiteMapUrls().size());
|
||||
assertEquals("http://www.example.com/pub/2000/08/09/xslt/xslt.html", sm.getSiteMapUrls().iterator().next().getUrl().toString());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRSSPubDate() throws UnknownFormatException, IOException {
|
||||
SiteMapParser parser = new SiteMapParserSAX();
|
||||
String contentType = "text/xml";
|
||||
byte[] content = getResourceAsBytes("src/test/resources/rss/xmlRss_pubDate.xml");
|
||||
URL url = new URL("http://www.example.com/rss.xml");
|
||||
|
||||
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
|
||||
assertSame("Not an RSS", SitemapType.RSS, asm.getType());
|
||||
assertNotNull("GMT timestamp not parsed", asm.getLastModified());
|
||||
long pubDate = 1483619690000L; // Thu, 05 Jan 17 12:34:50 GMT
|
||||
assertEquals("GMT timestamp", pubDate, asm.getLastModified().getTime());
|
||||
SiteMap rss = (SiteMap) asm;
|
||||
assertEquals("Incorrect items count", 7, rss.getSiteMapUrls().size());
|
||||
Iterator<SiteMapURL> it = rss.getSiteMapUrls().iterator();
|
||||
assertPubDate("Local differental offset", "article_1", pubDate + 1000, it);
|
||||
assertPubDate("Short year", "article_2", pubDate + 2000, it);
|
||||
assertPubDate("No weekday", "article_3", pubDate + 3000, it);
|
||||
assertPubDate("No weekday and short year", "article_4", pubDate + 4000, it);
|
||||
assertPubDate("No time zone(incorrect)", "article_5", null, it);
|
||||
assertPubDate("Empty field", "article_6", null, it);
|
||||
assertPubDate("Missed field", "article_7", null, it);
|
||||
}
|
||||
|
||||
private static void assertPubDate(String message, String path, Long pubDate, Iterator<SiteMapURL> it) {
|
||||
assertTrue(message + " item missed", it.hasNext());
|
||||
SiteMapURL url = it.next();
|
||||
assertEquals(message + " link", "http://www.example.com/" + path, url.getUrl().toString());
|
||||
if (pubDate == null) {
|
||||
assertNull(message + " pubDate not NULL", url.getLastModified());
|
||||
} else {
|
||||
assertNotNull(message + " pubDate is missing", url.getLastModified());
|
||||
assertEquals(message + " pub date", pubDate.longValue(), url.getLastModified().getTime());
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPartialSitemapsAllowed() throws UnknownFormatException, IOException {
|
||||
|
||||
SiteMapParser parser = new SiteMapParserSAX(false, true);
|
||||
String contentType = "text/xml";
|
||||
StringBuilder scontent = new StringBuilder(1024);
|
||||
scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>").append("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">").append("<url>")
|
||||
.append("<loc>http://www.example.com/</lo");
|
||||
|
||||
byte[] content = scontent.toString().getBytes(UTF_8);
|
||||
|
||||
URL url = new URL("http://www.example.com/subsection/sitemap.xml");
|
||||
|
||||
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
|
||||
assertEquals(false, asm.isIndex());
|
||||
assertEquals(true, asm instanceof SiteMap);
|
||||
|
||||
SiteMap sm = (SiteMap) asm;
|
||||
assertEquals(1, sm.getSiteMapUrls().size());
|
||||
assertFalse(sm.getSiteMapUrls().iterator().next().isValid());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUrlLocUrl() throws UnknownFormatException, IOException {
|
||||
SiteMapParser parser = new SiteMapParserSAX(false);
|
||||
String contentType = "text/xml";
|
||||
StringBuilder scontent = new StringBuilder(1024);
|
||||
scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>").append("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">").append("<url>").append("<loc>").append("<url>")
|
||||
.append("<![CDATA[").append("http://jobs.optistaffing.com/EXPERIENCED-DISPATCHER-NEEDED-NOW----Jobs-in-Vancouver-WA/2333221").append("]]>").append("</url>").append("</loc>")
|
||||
.append("<lastmod>2015-04-28</lastmod>").append("<changefreq>daily</changefreq>").append("</url>").append("</urlset>");
|
||||
|
||||
byte[] content = scontent.toString().getBytes(UTF_8);
|
||||
|
||||
URL url = new URL("http://www.example.com/subsection/sitemap.xml");
|
||||
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
|
||||
assertEquals(false, asm.isIndex());
|
||||
assertEquals(true, asm instanceof SiteMap);
|
||||
|
||||
SiteMap sm = (SiteMap) asm;
|
||||
assertEquals(1, sm.getSiteMapUrls().size());
|
||||
assertFalse(sm.getSiteMapUrls().iterator().next().isValid());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPartialSitemapIndicesAllowed() throws UnknownFormatException, IOException {
|
||||
|
||||
SiteMapParser parser = new SiteMapParserSAX(false, true);
|
||||
String contentType = "text/xml";
|
||||
StringBuilder scontent = new StringBuilder(1024);
|
||||
scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>").append("<sitemapindex xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">")
|
||||
.append("<sitemap><loc>http://www.example.com/sitemap1.xml.gz</loc><las");
|
||||
byte[] content = scontent.toString().getBytes(UTF_8);
|
||||
|
||||
URL url = new URL("http://www.example.com/subsection/sitemap.xml");
|
||||
|
||||
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
|
||||
assertEquals(true, asm.isIndex());
|
||||
assertEquals(true, asm instanceof SiteMapIndex);
|
||||
|
||||
SiteMapIndex smi = (SiteMapIndex) asm;
|
||||
assertEquals(1, smi.getSitemaps().size());
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a good simple default XML sitemap as a byte array
|
||||
*/
|
||||
private byte[] getXMLSitemapAsBytes() {
|
||||
StringBuilder scontent = new StringBuilder(1024);
|
||||
scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>").append("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">");
|
||||
scontent.append("<url> <loc>").append(sitemapURLs[0]).append("</loc> <lastmod>2005-01-01</lastmod>").append(" <changefreq>monthly</changefreq>").append(" <priority>0.8</priority>")
|
||||
.append("</url>");
|
||||
scontent.append("<url> <loc>").append(sitemapURLs[1]).append("</loc> <changefreq>weekly</changefreq>").append("</url>");
|
||||
scontent.append("<url> <loc>").append(sitemapURLs[2]).append("</loc> <lastmod>2004-12-23</lastmod>").append(" <changefreq>weekly</changefreq>").append("</url>");
|
||||
scontent.append("<url> <loc>").append(sitemapURLs[3]).append("</loc> <lastmod>2004-12-23T18:00:15+00:00</lastmod>").append(" <priority>0.3</priority>").append("</url>");
|
||||
scontent.append("<url> <loc><url><![CDATA[").append(sitemapURLs[4]).append("]]></url></loc> <lastmod>2004-11-23</lastmod>").append("</url>");
|
||||
scontent.append("</urlset>");
|
||||
|
||||
return scontent.toString().getBytes(UTF_8);
|
||||
}
|
||||
|
||||
/**
|
||||
* Read a test resource file and return its content as byte array.
|
||||
*
|
||||
* @param resourceName
|
||||
* path to the resource file
|
||||
* @return byte content of the file
|
||||
* @throws IOException
|
||||
*/
|
||||
private byte[] getResourceAsBytes(String resourceName) throws IOException {
|
||||
File file = new File(resourceName);
|
||||
InputStream is = new FileInputStream(file);
|
||||
return IOUtils.toByteArray(is);
|
||||
}
|
||||
|
||||
private static String[] sitemapURLs = new String[] { "http://www.example.com/", "http://www.example.com/catalog?item=12&desc=vacation_hawaii",
|
||||
"http://www.example.com/catalog?item=73&desc=vacation_new_zealand", "http://www.example.com/catalog?item=74&desc=vacation_newfoundland",
|
||||
"http://www.example.com/catalog?item=83&desc=vacation_usa" };
|
||||
|
||||
}
|
|
@ -17,6 +17,13 @@
|
|||
package crawlercommons.sitemaps;
|
||||
|
||||
import static java.nio.charset.StandardCharsets.UTF_8;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertFalse;
|
||||
import static org.junit.Assert.assertNotNull;
|
||||
import static org.junit.Assert.assertNull;
|
||||
import static org.junit.Assert.assertSame;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
import static org.junit.Assert.fail;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
|
@ -31,7 +38,6 @@ import java.util.Locale;
|
|||
import org.apache.commons.io.IOUtils;
|
||||
import org.junit.After;
|
||||
import org.junit.Before;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
import org.junit.runner.RunWith;
|
||||
import org.junit.runners.JUnit4;
|
||||
|
@ -40,8 +46,6 @@ import org.slf4j.LoggerFactory;
|
|||
|
||||
import crawlercommons.sitemaps.AbstractSiteMap.SitemapType;
|
||||
|
||||
import static org.junit.Assert.*;
|
||||
|
||||
@RunWith(JUnit4.class)
|
||||
public class SiteMapParserTest {
|
||||
|
||||
|
@ -99,6 +103,7 @@ public class SiteMapParserTest {
|
|||
@Test
|
||||
public void testSitemapWithNamespace() throws UnknownFormatException, IOException {
|
||||
SiteMapParser parser = new SiteMapParser();
|
||||
parser.setStrictNamespace(true);
|
||||
byte[] content = getResourceAsBytes("src/test/resources/sitemaps/sitemap.ns.xml");
|
||||
|
||||
URL url = new URL("http://www.example.com/sitemap.ns.xml");
|
||||
|
@ -120,13 +125,13 @@ public class SiteMapParserTest {
|
|||
byte[] content = getResourceAsBytes("src/test/resources/sitemaps/sitemap.badns.xml");
|
||||
|
||||
URL url = new URL("http://www.example.com/sitemap.badns.xml");
|
||||
AbstractSiteMap asm = parser.parseSiteMap(content, url);
|
||||
assertEquals(SitemapType.XML, asm.getType());
|
||||
assertEquals(true, asm instanceof SiteMap);
|
||||
assertEquals(true, asm.isProcessed());
|
||||
SiteMap sm = (SiteMap) asm;
|
||||
|
||||
assertEquals(0, sm.getSiteMapUrls().size());
|
||||
AbstractSiteMap asm;
|
||||
try {
|
||||
asm = parser.parseSiteMap(content, url);
|
||||
fail("Expected an UnknownFormatException because of wrong namespace");
|
||||
} catch (UnknownFormatException e) {
|
||||
assertTrue(e.getMessage().contains("does not match standard namespace"));
|
||||
}
|
||||
|
||||
// try again in lenient mode
|
||||
parser.setStrictNamespace(false);
|
||||
|
@ -134,7 +139,7 @@ public class SiteMapParserTest {
|
|||
assertEquals(SitemapType.XML, asm.getType());
|
||||
assertEquals(true, asm instanceof SiteMap);
|
||||
assertEquals(true, asm.isProcessed());
|
||||
sm = (SiteMap) asm;
|
||||
SiteMap sm = (SiteMap) asm;
|
||||
|
||||
assertEquals(2, sm.getSiteMapUrls().size());
|
||||
}
|
||||
|
@ -407,20 +412,15 @@ public class SiteMapParserTest {
|
|||
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
|
||||
assertSame("Not an RSS", SitemapType.RSS, asm.getType());
|
||||
assertNotNull("GMT timestamp not parsed", asm.getLastModified());
|
||||
assertEquals("GMT timestamp", 1483619690000L, asm.getLastModified().getTime()); // Thu,
|
||||
// 05
|
||||
// Jan
|
||||
// 17
|
||||
// 12:34:50
|
||||
// GMT
|
||||
|
||||
long pubDate = 1483619690000L; // Thu, 05 Jan 17 12:34:50 GMT
|
||||
assertEquals("GMT timestamp", pubDate, asm.getLastModified().getTime());
|
||||
SiteMap rss = (SiteMap) asm;
|
||||
assertEquals("Incorrect items count", 7, rss.getSiteMapUrls().size());
|
||||
Iterator<SiteMapURL> it = rss.getSiteMapUrls().iterator();
|
||||
assertPubDate("Local differental offset", "article_1", 1483619691000L, it);
|
||||
assertPubDate("Short year", "article_2", 1483619692000L, it);
|
||||
assertPubDate("No weekday", "article_3", 1483619693000L, it);
|
||||
assertPubDate("No weekday and short year", "article_4", 1483619694000L, it);
|
||||
assertPubDate("Local differental offset", "article_1", pubDate + 1000, it);
|
||||
assertPubDate("Short year", "article_2", pubDate + 2000, it);
|
||||
assertPubDate("No weekday", "article_3", pubDate + 3000, it);
|
||||
assertPubDate("No weekday and short year", "article_4", pubDate + 4000, it);
|
||||
assertPubDate("No time zone(incorrect)", "article_5", null, it);
|
||||
assertPubDate("Empty field", "article_6", null, it);
|
||||
assertPubDate("Missed field", "article_7", null, it);
|
||||
|
@ -438,11 +438,10 @@ public class SiteMapParserTest {
|
|||
}
|
||||
}
|
||||
|
||||
@Ignore("fails for DOM-based parser")
|
||||
@Test
|
||||
public void testPartialSitemapsAllowed() throws UnknownFormatException, IOException {
|
||||
|
||||
SiteMapParser parser = new SiteMapParser();
|
||||
SiteMapParser parser = new SiteMapParser(false, true);
|
||||
String contentType = "text/xml";
|
||||
StringBuilder scontent = new StringBuilder(1024);
|
||||
scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>").append("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">").append("<url>")
|
||||
|
@ -482,11 +481,10 @@ public class SiteMapParserTest {
|
|||
assertFalse(sm.getSiteMapUrls().iterator().next().isValid());
|
||||
}
|
||||
|
||||
@Ignore("fails for DOM-based parser")
|
||||
@Test
|
||||
public void testPartialSitemapIndicesAllowed() throws UnknownFormatException, IOException {
|
||||
|
||||
SiteMapParser parser = new SiteMapParser();
|
||||
SiteMapParser parser = new SiteMapParser(false, true);
|
||||
String contentType = "text/xml";
|
||||
StringBuilder scontent = new StringBuilder(1024);
|
||||
scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>").append("<sitemapindex xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">")
|
||||
|
|
Loading…
Reference in New Issue