mirror of
https://github.com/crawler-commons/crawler-commons
synced 2024-05-20 18:36:03 +02:00
Add namespace aware DOM/SAX parsing for XML Sitemaps (#176)
* Add namespace aware DOM/SAX parsing for XML Sitemaps. RSS and Atom parsing is also namespace aware, but finding elements is left "relaxed" by only matching on the element "localName".
* Lenient namespacing in non strict mode + applied formatting
* Introduced separate field strictNamespace to sitemapparsers + added test to saxparser
* Fixes Javadoc
* Fixes the fix for the Javadoc
* Allow to set strictNamespace in SiteMapTester
- Fix strict namespace handling in SitemapParserSAX:
  - pass strictNamespace from DelegatorHandler to delegates
  - ignore text if inside an element of invalid namespace
  - use SAX parser in unit test
  - set exception and pass it to calling DelegatorHandler if namespace does not match
This commit is contained in:
parent
5e60792a0b
commit
6adb771b72
|
@ -79,10 +79,9 @@ public class EffectiveTldFinder {
|
|||
domains = new HashMap<>();
|
||||
try {
|
||||
if (null == effectiveTldDataStream && null != this.getClass().getResource(ETLD_DATA)) {
|
||||
effectiveTldDataStream = this.getClass().getResourceAsStream(ETLD_DATA);
|
||||
effectiveTldDataStream = this.getClass().getResourceAsStream(ETLD_DATA);
|
||||
}
|
||||
BufferedReader input = new BufferedReader(new InputStreamReader(
|
||||
effectiveTldDataStream, StandardCharsets.UTF_8));
|
||||
BufferedReader input = new BufferedReader(new InputStreamReader(effectiveTldDataStream, StandardCharsets.UTF_8));
|
||||
String line = null;
|
||||
while (null != (line = input.readLine())) {
|
||||
if (line.length() == 0 || (line.length() > 1 && line.startsWith(COMMENT))) {
|
||||
|
|
|
@ -67,12 +67,8 @@ public abstract class AbstractSiteMap {
|
|||
private static final ThreadLocal<DateFormat[]> RSS_DATE_FORMATS = new ThreadLocal<DateFormat[]>() {
|
||||
@Override
|
||||
protected DateFormat[] initialValue() {
|
||||
return new DateFormat[] {
|
||||
new SimpleDateFormat("EEE, dd MMM yy HH:mm:ss Z", Locale.ROOT),
|
||||
new SimpleDateFormat("dd MMM yy HH:mm:ss Z", Locale.ROOT),
|
||||
new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss Z", Locale.ROOT),
|
||||
new SimpleDateFormat("dd MMM yyyy HH:mm:ss Z", Locale.ROOT)
|
||||
};
|
||||
return new DateFormat[] { new SimpleDateFormat("EEE, dd MMM yy HH:mm:ss Z", Locale.ROOT), new SimpleDateFormat("dd MMM yy HH:mm:ss Z", Locale.ROOT),
|
||||
new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss Z", Locale.ROOT), new SimpleDateFormat("dd MMM yyyy HH:mm:ss Z", Locale.ROOT) };
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -205,13 +201,13 @@ public abstract class AbstractSiteMap {
|
|||
}
|
||||
|
||||
/**
|
||||
* Converts pubDate of RSS to the string representation which could be parsed
|
||||
* in {@link #convertToDate(String)} method.
|
||||
*
|
||||
* Converts pubDate of RSS to the string representation which could be
|
||||
* parsed in {@link #convertToDate(String)} method.
|
||||
*
|
||||
* @param pubDate
|
||||
* - date time of pubDate in RFC822
|
||||
* @return converted to "yyyy-MM-dd'T'HH:mm:ssZ" format or original value if it doesn't
|
||||
* follow the RFC822
|
||||
* @return converted to "yyyy-MM-dd'T'HH:mm:ssZ" format or
|
||||
* original value if it doesn't follow the RFC822
|
||||
*/
|
||||
public static String normalizeRSSTimestamp(String pubDate) {
|
||||
if (pubDate == null) {
|
||||
|
|
|
@ -0,0 +1,38 @@
|
|||
/**
|
||||
* Copyright 2016 Crawler-Commons
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package crawlercommons.sitemaps;
|
||||
|
||||
/**
|
||||
* Namespace URIs of the supported sitemap formats, see
|
||||
* https://www.sitemaps.org/protocol.html#otherformats
|
||||
*/
|
||||
public class Namespace {
|
||||
|
||||
public static final String SITEMAP = "http://www.sitemaps.org/schemas/sitemap/0.9";
|
||||
|
||||
/**
|
||||
* RSS and Atom sitemap formats do not have a strict definition. But if we do
|
||||
* not parse as namespace aware, then RSS/Atom files that choose to use
|
||||
* namespaces will break. The relaxed compromise for RSS/Atom is to always
|
||||
* parse as "namespace aware", but we will only match elements by the
|
||||
* localName, accepting any element namespace.
* NOTE(review): the RSS_2_0 value is the RDF syntax namespace URI (the one
* declared by RDF-based RSS 1.0 feeds) - confirm the constant name is intended.
|
||||
*/
|
||||
public static final String RSS_2_0 = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
|
||||
public static final String ATOM_0_3 = "http://purl.org/atom/ns#";
|
||||
public static final String ATOM_1_0 = "http://www.w3.org/2005/Atom";
|
||||
|
||||
}
|
|
@ -87,12 +87,19 @@ public class SiteMapParser {
|
|||
/**
|
||||
* True (by default) meaning that invalid URLs should be rejected, as the
|
||||
* official docs allow the siteMapURLs to be only under the base url:
|
||||
* http://www.sitemaps.org/protocol.html#location
|
||||
* http://www.sitemaps.org/protocol.html#location Also checks that the
|
||||
* correct namespace is used.
|
||||
*/
|
||||
protected boolean strict = true;
|
||||
|
||||
/**
|
||||
* If true, sitemap elements are matched only in the namespace from the
|
||||
* specification; if false (the default), they are matched in any namespace.
|
||||
**/
|
||||
protected boolean strictNamespace = false;
|
||||
|
||||
public SiteMapParser() {
|
||||
//default constructor
|
||||
// default constructor
|
||||
}
|
||||
|
||||
public SiteMapParser(boolean strict) {
|
||||
|
@ -107,6 +114,22 @@ public class SiteMapParser {
|
|||
return strict;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return true if the parser only accepts the namespace from the
|
||||
* specification, false if it allows any namespace
|
||||
*/
|
||||
public boolean isStrictNamespace() {
|
||||
return strictNamespace;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets whether the parser only accepts the namespace from the
|
||||
* specification (s = true) or allows any namespace (s = false).
|
||||
*/
|
||||
public void setStrictNamespace(boolean s) {
|
||||
strictNamespace = s;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a SiteMap or SiteMapIndex given an online sitemap URL
|
||||
*
|
||||
|
@ -234,7 +257,8 @@ public class SiteMapParser {
|
|||
}
|
||||
throw new UnknownFormatException("Can't parse a gzipped sitemap with the embedded MediaType of: " + embeddedType + " (at: " + url + ")");
|
||||
}
|
||||
mediaType = MEDIA_TYPE_REGISTRY.getSupertype(mediaType); // Check parent
|
||||
mediaType = MEDIA_TYPE_REGISTRY.getSupertype(mediaType); // Check
|
||||
// parent
|
||||
}
|
||||
|
||||
throw new UnknownFormatException("Can't parse a sitemap with the MediaType of: " + contentType + " (at: " + url + ")");
|
||||
|
@ -352,16 +376,22 @@ public class SiteMapParser {
|
|||
* {@link org.xml.sax.InputSource}
|
||||
*/
|
||||
protected AbstractSiteMap processXml(URL sitemapUrl, InputSource is) throws UnknownFormatException {
|
||||
|
||||
Document doc = null;
|
||||
|
||||
try {
|
||||
DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
|
||||
// disable validation and avoid that remote DTDs, schemas, etc. are fetched
|
||||
|
||||
// disable validation and avoid that remote DTDs, schemas, etc. are
|
||||
// fetched
|
||||
dbf.setValidating(false);
|
||||
|
||||
// support an explicitly named namespace.
|
||||
dbf.setNamespaceAware(true);
|
||||
|
||||
dbf.setXIncludeAware(false);
|
||||
dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
|
||||
DocumentBuilder db = dbf.newDocumentBuilder();
|
||||
|
||||
db.setEntityResolver(new EntityResolver() {
|
||||
// noop entity resolver, does not fetch remote content
|
||||
@Override
|
||||
|
@ -369,14 +399,17 @@ public class SiteMapParser {
|
|||
return new InputSource(new StringReader(""));
|
||||
}
|
||||
});
|
||||
|
||||
db.setErrorHandler(new ErrorHandler() {
|
||||
public void warning(SAXParseException e) throws SAXException {
|
||||
LOG.warn("Warning parsing XML: {}", e.toString());
|
||||
}
|
||||
|
||||
public void fatalError(SAXParseException e) throws SAXException {
|
||||
LOG.error("Fatal error parsing XML: {}", e.toString());
|
||||
throw e;
|
||||
}
|
||||
|
||||
public void error(SAXParseException e) throws SAXException {
|
||||
LOG.error("Error parsing XML: {}", e.toString());
|
||||
throw e;
|
||||
|
@ -389,14 +422,14 @@ public class SiteMapParser {
|
|||
}
|
||||
|
||||
// See if this is a sitemap index
|
||||
NodeList nodeList = doc.getElementsByTagName("sitemapindex");
|
||||
NodeList nodeList = doc.getElementsByTagNameNS("*", "sitemapindex");
|
||||
if (nodeList.getLength() > 0) {
|
||||
nodeList = doc.getElementsByTagName("sitemap");
|
||||
nodeList = doc.getElementsByTagNameNS("*", "sitemap");
|
||||
return parseSitemapIndex(sitemapUrl, nodeList);
|
||||
} else if (doc.getElementsByTagName("urlset").getLength() > 0) {
|
||||
} else if (doc.getElementsByTagNameNS("*", "urlset").getLength() > 0) {
|
||||
// This is a regular Sitemap
|
||||
return parseXmlSitemap(sitemapUrl, doc);
|
||||
} else if (doc.getElementsByTagName("link").getLength() > 0) {
|
||||
} else if (doc.getElementsByTagNameNS("*", "link").getLength() > 0) {
|
||||
// Could be RSS or Atom
|
||||
return parseSyndicationFormat(sitemapUrl, doc);
|
||||
}
|
||||
|
@ -436,7 +469,12 @@ public class SiteMapParser {
|
|||
SiteMap sitemap = new SiteMap(sitemapUrl);
|
||||
sitemap.setType(SitemapType.XML);
|
||||
|
||||
NodeList list = doc.getElementsByTagName("url");
|
||||
String namespace = Namespace.SITEMAP;
|
||||
if (!strictNamespace) {
|
||||
namespace = "*";
|
||||
}
|
||||
|
||||
NodeList list = doc.getElementsByTagNameNS(namespace, "url");
|
||||
|
||||
// Loop through the <url>s
|
||||
for (int i = 0; i < list.getLength(); i++) {
|
||||
|
@ -444,10 +482,10 @@ public class SiteMapParser {
|
|||
Node n = list.item(i);
|
||||
if (n.getNodeType() == Node.ELEMENT_NODE) {
|
||||
Element elem = (Element) n;
|
||||
String lastMod = getElementValue(elem, "lastmod");
|
||||
String changeFreq = getElementValue(elem, "changefreq");
|
||||
String priority = getElementValue(elem, "priority");
|
||||
String loc = getElementValue(elem, "loc");
|
||||
String lastMod = getElementValue(namespace, elem, "lastmod");
|
||||
String changeFreq = getElementValue(namespace, elem, "changefreq");
|
||||
String priority = getElementValue(namespace, elem, "priority");
|
||||
String loc = getElementValue(namespace, elem, "loc");
|
||||
|
||||
addUrlIntoSitemap(loc, sitemap, lastMod, changeFreq, priority, i);
|
||||
}
|
||||
|
@ -496,7 +534,12 @@ public class SiteMapParser {
|
|||
|
||||
if (firstNode.getNodeType() == Node.ELEMENT_NODE) {
|
||||
Element elem = (Element) firstNode;
|
||||
String loc = getElementValue(elem, "loc");
|
||||
String loc = null;
|
||||
String namespace = Namespace.SITEMAP;
|
||||
if (!strictNamespace) {
|
||||
namespace = "*";
|
||||
}
|
||||
loc = getElementValue(namespace, elem, "loc");
|
||||
|
||||
// try the text content when no loc element
|
||||
// has been specified
|
||||
|
@ -506,7 +549,7 @@ public class SiteMapParser {
|
|||
|
||||
try {
|
||||
URL sitemapUrl = new URL(loc);
|
||||
String lastmod = getElementValue(elem, "lastmod");
|
||||
String lastmod = getElementValue(namespace, elem, "lastmod");
|
||||
Date lastModified = SiteMap.convertToDate(lastmod);
|
||||
|
||||
// Right now we are not worried about sitemapUrls that point
|
||||
|
@ -543,7 +586,7 @@ public class SiteMapParser {
|
|||
SiteMap sitemap = new SiteMap(sitemapUrl);
|
||||
|
||||
// See if this is an Atom feed by looking for "feed" element
|
||||
NodeList list = doc.getElementsByTagName("feed");
|
||||
NodeList list = doc.getElementsByTagNameNS("*", "feed");
|
||||
if (list.getLength() > 0) {
|
||||
parseAtom(sitemap, (Element) list.item(0), doc);
|
||||
sitemap.setProcessed(true);
|
||||
|
@ -557,7 +600,7 @@ public class SiteMapParser {
|
|||
// See https://github.com/crawler-commons/crawler-commons/issues/87
|
||||
// and also RSS 1.0 specification
|
||||
// http://web.resource.org/rss/1.0/spec
|
||||
list = doc.getElementsByTagName("channel");
|
||||
list = doc.getElementsByTagNameNS("*", "channel");
|
||||
if (list.getLength() > 0) {
|
||||
parseRSS(sitemap, doc);
|
||||
sitemap.setProcessed(true);
|
||||
|
@ -620,7 +663,7 @@ public class SiteMapParser {
|
|||
String lastMod = getElementValue(elem, "modified");
|
||||
LOG.debug("lastMod = {}", lastMod);
|
||||
|
||||
NodeList list = doc.getElementsByTagName("entry");
|
||||
NodeList list = doc.getElementsByTagNameNS("*", "entry");
|
||||
|
||||
// Loop through the <entry>s
|
||||
for (int i = 0; i < list.getLength() && i < MAX_URLS; i++) {
|
||||
|
@ -691,7 +734,7 @@ public class SiteMapParser {
|
|||
|
||||
LOG.debug("Parsing RSS doc");
|
||||
sitemap.setType(SitemapType.RSS);
|
||||
NodeList list = doc.getElementsByTagName("channel");
|
||||
NodeList list = doc.getElementsByTagNameNS("*", "channel");
|
||||
Element elem = (Element) list.item(0);
|
||||
|
||||
// Treat publication date as last mod (Tue, 10 Jun 2003 04:00:00 GMT)
|
||||
|
@ -699,7 +742,7 @@ public class SiteMapParser {
|
|||
LOG.debug("channel's lastMod = {}", channelLastMod);
|
||||
sitemap.setLastModified(channelLastMod);
|
||||
|
||||
list = doc.getElementsByTagName("item");
|
||||
list = doc.getElementsByTagNameNS("*", "item");
|
||||
// Loop through the <item>s
|
||||
for (int i = 0; i < list.getLength() && i < MAX_URLS; i++) {
|
||||
|
||||
|
@ -715,15 +758,17 @@ public class SiteMapParser {
|
|||
}
|
||||
|
||||
/**
|
||||
* Get the element's textual content.
|
||||
* Get the element's textual content. Find element under parent element,
|
||||
* with namespaceURI and element local-name "elementName".
|
||||
*
|
||||
* @param namespaceURI
|
||||
* @param elem
|
||||
* @param elementName
|
||||
* @return The element value
|
||||
*/
|
||||
protected String getElementValue(Element elem, String elementName) {
|
||||
protected String getElementValue(String namespaceURI, Element elem, String elementName) {
|
||||
|
||||
NodeList list = elem.getElementsByTagName(elementName);
|
||||
NodeList list = elem.getElementsByTagNameNS(namespaceURI, elementName);
|
||||
if (list == null)
|
||||
return null;
|
||||
Element e = (Element) list.item(0);
|
||||
|
@ -733,6 +778,21 @@ public class SiteMapParser {
|
|||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the element's textual content, matching the element in any namespace
|
||||
* (elementName is the localName); delegates with the wildcard namespace "*".
|
||||
*
|
||||
* @param elem
|
||||
* The element under which the named element is looked up
|
||||
* @param elementName
|
||||
* The local name of the element to look up
|
||||
* @return The element value (may be null)
|
||||
*/
|
||||
protected String getElementValue(Element elem, String elementName) {
|
||||
|
||||
return getElementValue("*", elem, elementName);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the element's attribute value.
|
||||
*
|
||||
|
@ -743,7 +803,7 @@ public class SiteMapParser {
|
|||
*/
|
||||
protected String getElementAttributeValue(Element elem, String elementName, String attributeName) {
|
||||
|
||||
NodeList list = elem.getElementsByTagName(elementName);
|
||||
NodeList list = elem.getElementsByTagNameNS("*", elementName);
|
||||
Element e = (Element) list.item(0);
|
||||
if (e != null) {
|
||||
return e.getAttribute(attributeName);
|
||||
|
|
|
@ -88,6 +88,12 @@ public class SiteMapParserSAX extends SiteMapParser {
|
|||
|
||||
private boolean allowPartial = false;
|
||||
|
||||
/**
|
||||
* If true, sitemap elements are matched only in the namespace from the
|
||||
* specification; if false (the default), they are matched in any namespace.
|
||||
**/
|
||||
protected boolean strictNamespace = false;
|
||||
|
||||
public SiteMapParserSAX() {
|
||||
this(true, false);
|
||||
}
|
||||
|
@ -109,6 +115,22 @@ public class SiteMapParserSAX extends SiteMapParser {
|
|||
return strict;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return true if the parser only accepts the namespace from the
|
||||
* specification, false if it allows any namespace
|
||||
*/
|
||||
public boolean isStrictNamespace() {
|
||||
return strictNamespace;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets whether the parser only accepts the namespace from the
|
||||
* specification (s = true) or allows any namespace (s = false).
|
||||
*/
|
||||
public void setStrictNamespace(boolean s) {
|
||||
strictNamespace = s;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a SiteMap or SiteMapIndex given an online sitemap URL
|
||||
*
|
||||
|
@ -236,7 +258,8 @@ public class SiteMapParserSAX extends SiteMapParser {
|
|||
}
|
||||
throw new UnknownFormatException("Can't parse a gzipped sitemap with the embedded MediaType of: " + embeddedType + " (at: " + url + ")");
|
||||
}
|
||||
mediaType = MEDIA_TYPE_REGISTRY.getSupertype(mediaType); // Check parent
|
||||
mediaType = MEDIA_TYPE_REGISTRY.getSupertype(mediaType); // Check
|
||||
// parent
|
||||
}
|
||||
|
||||
throw new UnknownFormatException("Can't parse a sitemap with the MediaType of: " + contentType + " (at: " + url + ")");
|
||||
|
@ -356,9 +379,15 @@ public class SiteMapParserSAX extends SiteMapParser {
|
|||
protected AbstractSiteMap processXml(URL sitemapUrl, InputSource is) throws UnknownFormatException {
|
||||
|
||||
SAXParserFactory factory = SAXParserFactory.newInstance();
|
||||
// disable validation and avoid that remote DTDs, schemas, etc. are fetched
|
||||
|
||||
// disable validation and avoid that remote DTDs, schemas, etc. are
|
||||
// fetched
|
||||
factory.setValidating(false);
|
||||
factory.setXIncludeAware(false);
|
||||
|
||||
// support the use of an explicit namespace.
|
||||
factory.setNamespaceAware(true);
|
||||
|
||||
try {
|
||||
factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
|
||||
} catch (Exception e) {
|
||||
|
@ -374,9 +403,14 @@ public class SiteMapParserSAX extends SiteMapParser {
|
|||
return new InputSource(new StringReader(""));
|
||||
}
|
||||
});
|
||||
handler.setStrictNamespace(isStrictNamespace());
|
||||
saxParser.parse(is, handler);
|
||||
AbstractSiteMap sitemap = handler.getSiteMap();
|
||||
if (sitemap == null) {
|
||||
UnknownFormatException ex = handler.getException();
|
||||
if (ex != null) {
|
||||
throw ex;
|
||||
}
|
||||
throw new UnknownFormatException("Unknown XML format for: " + sitemapUrl);
|
||||
}
|
||||
return sitemap;
|
||||
|
|
|
@ -44,6 +44,8 @@ public class SiteMapTester {
|
|||
LOG.error(" bypass automatic MIME type detection");
|
||||
LOG.error("Java properties:");
|
||||
LOG.error(" sitemap.useSax if true use SAX parser to process sitemaps");
|
||||
LOG.error(" sitemap.strictNamespace");
|
||||
LOG.error(" if true sitemaps are required to use the standard namespace URI");
|
||||
} else {
|
||||
URL url = new URL(args[0]);
|
||||
String mt = (args.length > 1) ? args[1] : null;
|
||||
|
@ -60,6 +62,7 @@ public class SiteMapTester {
|
|||
byte[] content = IOUtils.toByteArray(url);
|
||||
|
||||
boolean useSaxParser = new Boolean(System.getProperty("sitemap.useSax"));
|
||||
boolean strictNamespace = new Boolean(System.getProperty("sitemap.strictNamespace"));
|
||||
|
||||
LOG.info("Parsing {} {} using {} parser", url, ((mt != null && !mt.isEmpty()) ? "as MIME type " + mt : ""), (useSaxParser ? "SAX" : "DOM"));
|
||||
|
||||
|
@ -67,6 +70,7 @@ public class SiteMapTester {
|
|||
if (useSaxParser) {
|
||||
p = saxParser;
|
||||
}
|
||||
p.setStrictNamespace(strictNamespace);
|
||||
|
||||
AbstractSiteMap sm = null;
|
||||
// guesses the mimetype
|
||||
|
|
|
@ -78,11 +78,11 @@ class AtomHandler extends DelegatorHandler {
|
|||
}
|
||||
|
||||
public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
|
||||
if ("entry".equals(qName)) {
|
||||
if ("entry".equals(localName)) {
|
||||
loc = null;
|
||||
lastMod = null;
|
||||
rel = null;
|
||||
} else if ("link".equals(qName)) {
|
||||
} else if ("link".equals(localName)) {
|
||||
String href = attributes.getValue("href");
|
||||
if (href == null)
|
||||
return;
|
||||
|
@ -91,8 +91,10 @@ class AtomHandler extends DelegatorHandler {
|
|||
String r = attributes.getValue("rel");
|
||||
if (loc == null || (!valid && v) || (rel != null && r == null)) {
|
||||
// - first link, or in case of multiple links:
|
||||
// - (for a strict parser only) this link is valid and the first one is not valid
|
||||
// - has no rel attribute while the first one does (e.g., rel="edit", rel="alternate")
|
||||
// - (for a strict parser only) this link is valid and the first
|
||||
// one is not valid
|
||||
// - has no rel attribute while the first one does (e.g.,
|
||||
// rel="edit", rel="alternate")
|
||||
try {
|
||||
loc = new URL(href);
|
||||
rel = r;
|
||||
|
@ -114,9 +116,9 @@ class AtomHandler extends DelegatorHandler {
|
|||
}
|
||||
|
||||
public void characters(char[] ch, int start, int length) throws SAXException {
|
||||
String qName = super.currentElement();
|
||||
String localName = super.currentElement();
|
||||
String value = String.valueOf(ch, start, length);
|
||||
if ("updated".equals(qName)) {
|
||||
if ("updated".equals(localName)) {
|
||||
lastMod = value;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -25,6 +25,7 @@ import org.xml.sax.SAXParseException;
|
|||
import org.xml.sax.helpers.DefaultHandler;
|
||||
|
||||
import crawlercommons.sitemaps.AbstractSiteMap;
|
||||
import crawlercommons.sitemaps.Namespace;
|
||||
import crawlercommons.sitemaps.UnknownFormatException;
|
||||
|
||||
/**
|
||||
|
@ -37,6 +38,7 @@ public class DelegatorHandler extends DefaultHandler {
|
|||
private DelegatorHandler delegate;
|
||||
private URL url;
|
||||
private boolean strict;
|
||||
private boolean strictNamespace;
|
||||
private UnknownFormatException exception;
|
||||
|
||||
protected DelegatorHandler(LinkedList<String> elementStack, boolean strict) {
|
||||
|
@ -58,11 +60,27 @@ public class DelegatorHandler extends DefaultHandler {
|
|||
return strict;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return true if the handler only accepts the namespace from the
|
||||
* specification, false if it allows any namespace
|
||||
*/
|
||||
public boolean isStrictNamespace() {
|
||||
return strictNamespace;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets whether the handler only accepts the namespace from the
|
||||
* specification (s = true) or allows any namespace (s = false).
|
||||
*/
|
||||
public void setStrictNamespace(boolean s) {
|
||||
strictNamespace = s;
|
||||
}
|
||||
|
||||
protected void setException(UnknownFormatException exception) {
|
||||
this.exception = exception;
|
||||
}
|
||||
|
||||
protected UnknownFormatException getException() {
|
||||
public UnknownFormatException getException() {
|
||||
return exception;
|
||||
}
|
||||
|
||||
|
@ -70,7 +88,7 @@ public class DelegatorHandler extends DefaultHandler {
|
|||
if (elementStack.isEmpty() || delegate == null) {
|
||||
startRootElement(uri, localName, qName, attributes);
|
||||
} else {
|
||||
elementStack.push(qName);
|
||||
elementStack.push(localName);
|
||||
}
|
||||
if (delegate != null) {
|
||||
delegate.startElement(uri, localName, qName, attributes);
|
||||
|
@ -78,23 +96,32 @@ public class DelegatorHandler extends DefaultHandler {
|
|||
}
|
||||
|
||||
private void startRootElement(String uri, String localName, String qName, Attributes attributes) {
|
||||
elementStack.push(qName);
|
||||
if ("sitemapindex".equals(qName)) {
|
||||
delegate = new XMLIndexHandler(url, elementStack, strict);
|
||||
} else if ("urlset".equals(qName)) {
|
||||
delegate = new XMLHandler(url, elementStack, strict);
|
||||
} else if ("feed".equals(qName)) {
|
||||
elementStack.push(localName);
|
||||
|
||||
if ("feed".equals(localName)) {
|
||||
delegate = new AtomHandler(url, elementStack, strict);
|
||||
}
|
||||
// See if it is a RSS feed by looking for a "channel" element. This
|
||||
// avoids the issue
|
||||
// See if it is a RSS feed by looking for the localName "channel"
|
||||
// element.
|
||||
// This avoids the issue
|
||||
// of having the outer tag named <rdf:RDF> that was causing this code to
|
||||
// fail. Inside of
|
||||
// the <rss> or <rdf> tag is a <channel> tag, so we can use that.
|
||||
// See https://github.com/crawler-commons/crawler-commons/issues/87
|
||||
// and also RSS 1.0 specification http://web.resource.org/rss/1.0/spec
|
||||
else if ("channel".equals(qName)) {
|
||||
else if ("channel".equals(localName)) {
|
||||
delegate = new RSSHandler(url, elementStack, strict);
|
||||
} else if (isStrictNamespace() && !Namespace.SITEMAP.equals(uri)) {
|
||||
setException(new UnknownFormatException("Namespace " + uri + " does not match standard namespace " + Namespace.SITEMAP));
|
||||
return;
|
||||
} else if ("sitemapindex".equals(localName)) {
|
||||
delegate = new XMLIndexHandler(url, elementStack, strict);
|
||||
} else if ("urlset".equals(localName)) {
|
||||
delegate = new XMLHandler(url, elementStack, strict);
|
||||
}
|
||||
if (delegate != null) {
|
||||
// configure delegate
|
||||
delegate.setStrictNamespace(isStrictNamespace());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -29,6 +29,7 @@ import org.xml.sax.SAXParseException;
|
|||
|
||||
import crawlercommons.sitemaps.AbstractSiteMap;
|
||||
import crawlercommons.sitemaps.AbstractSiteMap.SitemapType;
|
||||
import crawlercommons.sitemaps.Namespace;
|
||||
import crawlercommons.sitemaps.SiteMap;
|
||||
import crawlercommons.sitemaps.SiteMapURL;
|
||||
|
||||
|
@ -61,6 +62,7 @@ class XMLHandler extends DelegatorHandler {
|
|||
private String changeFreq;
|
||||
private String priority;
|
||||
private int i = 0;
|
||||
private boolean currentElementNamespaceIsValid;
|
||||
|
||||
XMLHandler(URL url, LinkedList<String> elementStack, boolean strict) {
|
||||
super(elementStack, strict);
|
||||
|
@ -70,8 +72,14 @@ class XMLHandler extends DelegatorHandler {
|
|||
}
|
||||
|
||||
public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
|
||||
if (isStrictNamespace() && !Namespace.SITEMAP.equals(uri)) {
|
||||
currentElementNamespaceIsValid = false;
|
||||
return;
|
||||
}
|
||||
currentElementNamespaceIsValid = true;
|
||||
|
||||
// flush any unclosed or missing URL element
|
||||
if (loc.length() > 0 && ("loc".equals(qName) || "url".equals(qName))) {
|
||||
if (loc.length() > 0 && ("loc".equals(localName) || "url".equals(localName))) {
|
||||
// check whether loc isn't white space only
|
||||
for (int i = 0; i < loc.length(); i++) {
|
||||
if (!Character.isWhitespace(loc.charAt(i))) {
|
||||
|
@ -80,7 +88,7 @@ class XMLHandler extends DelegatorHandler {
|
|||
}
|
||||
}
|
||||
loc = new StringBuilder();
|
||||
if ("url".equals(qName)) {
|
||||
if ("url".equals(localName)) {
|
||||
// reset also attributes
|
||||
lastMod = null;
|
||||
changeFreq = null;
|
||||
|
@ -90,23 +98,29 @@ class XMLHandler extends DelegatorHandler {
|
|||
}
|
||||
|
||||
public void endElement(String uri, String localName, String qName) throws SAXException {
|
||||
if ("url".equals(qName) && "urlset".equals(currentElementParent())) {
|
||||
if (isStrictNamespace() && !Namespace.SITEMAP.equals(uri)) {
|
||||
return;
|
||||
}
|
||||
if ("url".equals(localName) && "urlset".equals(currentElementParent())) {
|
||||
maybeAddSiteMapUrl();
|
||||
} else if ("urlset".equals(qName)) {
|
||||
} else if ("urlset".equals(localName)) {
|
||||
sitemap.setProcessed(true);
|
||||
}
|
||||
}
|
||||
|
||||
public void characters(char[] ch, int start, int length) throws SAXException {
|
||||
String qName = super.currentElement();
|
||||
if (isStrictNamespace() && !currentElementNamespaceIsValid) {
|
||||
return;
|
||||
}
|
||||
String localName = super.currentElement();
|
||||
String value = String.valueOf(ch, start, length);
|
||||
if ("loc".equals(qName) || "url".equals(qName)) {
|
||||
if ("loc".equals(localName) || "url".equals(localName)) {
|
||||
loc.append(value);
|
||||
} else if ("changefreq".equals(qName)) {
|
||||
} else if ("changefreq".equals(localName)) {
|
||||
changeFreq = value;
|
||||
} else if ("lastmod".equals(qName)) {
|
||||
} else if ("lastmod".equals(localName)) {
|
||||
lastMod = value;
|
||||
} else if ("priority".equals(qName)) {
|
||||
} else if ("priority".equals(localName)) {
|
||||
priority = value;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -28,6 +28,7 @@ import org.xml.sax.SAXException;
|
|||
import org.xml.sax.SAXParseException;
|
||||
|
||||
import crawlercommons.sitemaps.AbstractSiteMap;
|
||||
import crawlercommons.sitemaps.Namespace;
|
||||
import crawlercommons.sitemaps.SiteMap;
|
||||
import crawlercommons.sitemaps.SiteMapIndex;
|
||||
import crawlercommons.sitemaps.AbstractSiteMap.SitemapType;
|
||||
|
@ -70,6 +71,9 @@ class XMLIndexHandler extends DelegatorHandler {
|
|||
}
|
||||
|
||||
public void endElement(String uri, String localName, String qName) throws SAXException {
|
||||
if (isStrictNamespace() && !Namespace.SITEMAP.equals(uri)) {
|
||||
return;
|
||||
}
|
||||
if ("sitemap".equals(currentElement())) {
|
||||
maybeAddSiteMap();
|
||||
} else if ("sitemapindex".equals(currentElement())) {
|
||||
|
|
|
@ -111,8 +111,7 @@ public class BasicURLNormalizerTest {
|
|||
normalizeTest("http://foo.com:81/", "http://foo.com:81/");
|
||||
// check that empty port is removed
|
||||
normalizeTest("http://example.com:/", "http://example.com/");
|
||||
normalizeTest("https://example.com:/foobar.html",
|
||||
"https://example.com/foobar.html");
|
||||
normalizeTest("https://example.com:/foobar.html", "https://example.com/foobar.html");
|
||||
|
||||
// check that null path is normalized
|
||||
normalizeTest("http://foo.com", "http://foo.com/");
|
||||
|
|
|
@ -36,6 +36,8 @@ import org.junit.runners.JUnit4;
|
|||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import crawlercommons.sitemaps.AbstractSiteMap.SitemapType;
|
||||
|
||||
import static org.junit.Assert.*;
|
||||
|
||||
@RunWith(JUnit4.class)
|
||||
|
@ -92,6 +94,50 @@ public class SiteMapParserSAXTest {
|
|||
assertEquals("http://www.example.com/dynsitemap?date=lastyear&all=false", currentSiteMap.getUrl().toString());
|
||||
}
|
||||
|
||||
// Parses sitemap.ns.xml with the SAX parser in strict-namespace mode and
// expects a processed XML sitemap with 2 URLs; the first URL returned by
// the iterator must have change frequency DAILY.
@Test
|
||||
public void testSitemapWithNamespace() throws UnknownFormatException, IOException {
|
||||
SiteMapParser parser = new SiteMapParserSAX();
|
||||
parser.setStrictNamespace(true);
|
||||
byte[] content = getResourceAsBytes("src/test/resources/sitemaps/sitemap.ns.xml");
|
||||
|
||||
URL url = new URL("http://www.example.com/sitemap.ns.xml");
|
||||
AbstractSiteMap asm = parser.parseSiteMap(content, url);
|
||||
assertEquals(SitemapType.XML, asm.getType());
|
||||
assertEquals(true, asm instanceof SiteMap);
|
||||
assertEquals(true, asm.isProcessed());
|
||||
SiteMap sm = (SiteMap) asm;
|
||||
|
||||
assertEquals(2, sm.getSiteMapUrls().size());
|
||||
assertEquals(SiteMapURL.ChangeFrequency.DAILY, sm.getSiteMapUrls().iterator().next().getChangeFrequency());
|
||||
}
|
||||
|
||||
// Parses sitemap.badns.xml with the SAX parser: in strict-namespace mode an
// UnknownFormatException mentioning the namespace mismatch is expected; in
// lenient mode the same content must parse into a sitemap with 2 URLs.
@Test
|
||||
public void testSitemapWithWrongNamespace() throws UnknownFormatException, IOException {
|
||||
SiteMapParser parser = new SiteMapParserSAX();
|
||||
parser.setStrictNamespace(true);
|
||||
|
||||
byte[] content = getResourceAsBytes("src/test/resources/sitemaps/sitemap.badns.xml");
|
||||
|
||||
URL url = new URL("http://www.example.com/sitemap.badns.xml");
|
||||
AbstractSiteMap asm;
|
||||
try {
|
||||
asm = parser.parseSiteMap(content, url);
|
||||
fail("Expected an UnknownFormatException because of wrong namespace");
|
||||
} catch (UnknownFormatException e) {
|
||||
assertTrue(e.getMessage().contains("does not match standard namespace"));
|
||||
}
|
||||
|
||||
// try again in lenient mode
|
||||
parser.setStrictNamespace(false);
|
||||
asm = parser.parseSiteMap(content, url);
|
||||
assertEquals(SitemapType.XML, asm.getType());
|
||||
assertEquals(true, asm instanceof SiteMap);
|
||||
assertEquals(true, asm.isProcessed());
|
||||
SiteMap sm = (SiteMap) asm;
|
||||
|
||||
assertEquals(2, sm.getSiteMapUrls().size());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFullDateFormat() {
|
||||
SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm+hh:00", Locale.ROOT);
|
||||
|
|
|
@@ -96,6 +96,49 @@ public class SiteMapParserTest {
|
|||
assertEquals("http://www.example.com/dynsitemap?date=lastyear&all=false", currentSiteMap.getUrl().toString());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSitemapWithNamespace() throws UnknownFormatException, IOException {
|
||||
SiteMapParser parser = new SiteMapParser();
|
||||
byte[] content = getResourceAsBytes("src/test/resources/sitemaps/sitemap.ns.xml");
|
||||
|
||||
URL url = new URL("http://www.example.com/sitemap.ns.xml");
|
||||
AbstractSiteMap asm = parser.parseSiteMap(content, url);
|
||||
assertEquals(SitemapType.XML, asm.getType());
|
||||
assertEquals(true, asm instanceof SiteMap);
|
||||
assertEquals(true, asm.isProcessed());
|
||||
SiteMap sm = (SiteMap) asm;
|
||||
|
||||
assertEquals(2, sm.getSiteMapUrls().size());
|
||||
assertEquals(SiteMapURL.ChangeFrequency.DAILY, sm.getSiteMapUrls().iterator().next().getChangeFrequency());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSitemapWithWrongNamespace() throws UnknownFormatException, IOException {
|
||||
SiteMapParser parser = new SiteMapParser();
|
||||
parser.setStrictNamespace(true);
|
||||
|
||||
byte[] content = getResourceAsBytes("src/test/resources/sitemaps/sitemap.badns.xml");
|
||||
|
||||
URL url = new URL("http://www.example.com/sitemap.badns.xml");
|
||||
AbstractSiteMap asm = parser.parseSiteMap(content, url);
|
||||
assertEquals(SitemapType.XML, asm.getType());
|
||||
assertEquals(true, asm instanceof SiteMap);
|
||||
assertEquals(true, asm.isProcessed());
|
||||
SiteMap sm = (SiteMap) asm;
|
||||
|
||||
assertEquals(0, sm.getSiteMapUrls().size());
|
||||
|
||||
// try again in lenient mode
|
||||
parser.setStrictNamespace(false);
|
||||
asm = parser.parseSiteMap(content, url);
|
||||
assertEquals(SitemapType.XML, asm.getType());
|
||||
assertEquals(true, asm instanceof SiteMap);
|
||||
assertEquals(true, asm.isProcessed());
|
||||
sm = (SiteMap) asm;
|
||||
|
||||
assertEquals(2, sm.getSiteMapUrls().size());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFullDateFormat() {
|
||||
SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm+hh:00", Locale.ROOT);
|
||||
|
@@ -364,7 +407,12 @@ public class SiteMapParserTest {
|
|||
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
|
||||
assertSame("Not an RSS", SitemapType.RSS, asm.getType());
|
||||
assertNotNull("GMT timestamp not parsed", asm.getLastModified());
|
||||
assertEquals("GMT timestamp", 1483619690000L, asm.getLastModified().getTime()); // Thu, 05 Jan 17 12:34:50 GMT
|
||||
assertEquals("GMT timestamp", 1483619690000L, asm.getLastModified().getTime()); // Thu,
|
||||
// 05
|
||||
// Jan
|
||||
// 17
|
||||
// 12:34:50
|
||||
// GMT
|
||||
|
||||
SiteMap rss = (SiteMap) asm;
|
||||
assertEquals("Incorrect items count", 7, rss.getSiteMapUrls().size());
|
||||
|
@@ -474,7 +522,7 @@ public class SiteMapParserTest {
|
|||
|
||||
/**
|
||||
* Read a test resource file and return its content as byte array.
|
||||
*
|
||||
*
|
||||
* @param resourceName
|
||||
* path to the resource file
|
||||
* @return byte content of the file
|
||||
|
|
|
@@ -0,0 +1,16 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<urlset xmlns="http://www.google.com/schemas/sitemap/0.9"
|
||||
xmlns:xhtml="http://www.w3.org/1999/xhtml">
|
||||
<url>
|
||||
<loc>http://www.example.com/1</loc>
|
||||
<changefreq>daily</changefreq>
|
||||
</url>
|
||||
<url>
|
||||
<loc>
|
||||
http://www.example.com/2
|
||||
</loc>
|
||||
<changefreq>
|
||||
daily
|
||||
</changefreq>
|
||||
</url>
|
||||
</urlset>
|
|
@@ -0,0 +1,11 @@
|
|||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<sit:urlset xmlns:image="http://www.google.com/schemas/sitemap-image/1.1" xmlns:video="http://www.google.com/schemas/sitemap-video/1.1" xmlns:sit="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xhtml="http://www.w3.org/1999/xhtml" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.w3.org/1999/xhtml link.xsd http://www.google.com/schemas/sitemap-video/1.1 video.xsd http://www.sitemaps.org/schemas/sitemap/0.9 sitemap.xsd http://www.google.com/schemas/sitemap-image/1.1 image.xsd">
|
||||
<sit:url>
|
||||
<sit:loc>http://www.example.com/1</sit:loc>
|
||||
<sit:changefreq>daily</sit:changefreq>
|
||||
</sit:url>
|
||||
<sit:url>
|
||||
<sit:loc>http://www.example.com/2</sit:loc>
|
||||
<sit:changefreq>daily</sit:changefreq>
|
||||
</sit:url>
|
||||
</sit:urlset>
|
Loading…
Reference in New Issue