1
0
Fork 0
mirror of https://github.com/crawler-commons/crawler-commons synced 2024-05-20 18:36:03 +02:00

Add namespace aware DOM/SAX parsing for XML Sitemaps (#176)

* Add namespace aware DOM/SAX parsing for XML Sitemaps.  RSS and Atom parsing is also namespace aware, but finding elements is left "relaxed" by only matching on the element "localName".

* Lenient namespacing in non strict mode + applied formatting

* Introduced a separate field strictNamespace in the sitemap parsers + added a test for the SAX parser

* Fixes Javadoc

* Fixes the fix for the Javadoc

* Allow to set strictNamespace in SiteMapTester

- Fix strict namespace handling in SiteMapParserSAX:
- pass strictNamespace from DelegatorHandler to delegates
- ignore text if inside an element of invalid namespace
- use SAX parser in unit test
- set exception and pass it to calling DelegatorHandler if namespace
  does not match
This commit is contained in:
Julien Nioche 2017-10-17 10:47:17 +01:00 committed by GitHub
parent 5e60792a0b
commit 6adb771b72
15 changed files with 369 additions and 71 deletions

View File

@ -79,10 +79,9 @@ public class EffectiveTldFinder {
domains = new HashMap<>();
try {
if (null == effectiveTldDataStream && null != this.getClass().getResource(ETLD_DATA)) {
effectiveTldDataStream = this.getClass().getResourceAsStream(ETLD_DATA);
effectiveTldDataStream = this.getClass().getResourceAsStream(ETLD_DATA);
}
BufferedReader input = new BufferedReader(new InputStreamReader(
effectiveTldDataStream, StandardCharsets.UTF_8));
BufferedReader input = new BufferedReader(new InputStreamReader(effectiveTldDataStream, StandardCharsets.UTF_8));
String line = null;
while (null != (line = input.readLine())) {
if (line.length() == 0 || (line.length() > 1 && line.startsWith(COMMENT))) {

View File

@ -67,12 +67,8 @@ public abstract class AbstractSiteMap {
private static final ThreadLocal<DateFormat[]> RSS_DATE_FORMATS = new ThreadLocal<DateFormat[]>() {
@Override
protected DateFormat[] initialValue() {
return new DateFormat[] {
new SimpleDateFormat("EEE, dd MMM yy HH:mm:ss Z", Locale.ROOT),
new SimpleDateFormat("dd MMM yy HH:mm:ss Z", Locale.ROOT),
new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss Z", Locale.ROOT),
new SimpleDateFormat("dd MMM yyyy HH:mm:ss Z", Locale.ROOT)
};
return new DateFormat[] { new SimpleDateFormat("EEE, dd MMM yy HH:mm:ss Z", Locale.ROOT), new SimpleDateFormat("dd MMM yy HH:mm:ss Z", Locale.ROOT),
new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss Z", Locale.ROOT), new SimpleDateFormat("dd MMM yyyy HH:mm:ss Z", Locale.ROOT) };
}
};
@ -205,13 +201,13 @@ public abstract class AbstractSiteMap {
}
/**
* Converts pubDate of RSS to the string representation which could be parsed
* in {@link #convertToDate(String)} method.
*
* Converts pubDate of RSS to the string representation which could be
* parsed in {@link #convertToDate(String)} method.
*
* @param pubDate
* - date time of pubDate in RFC822
* @return converted to &quot;yyyy-MM-dd'T'HH:mm:ssZ&quot; format or original value if it doesn't
* follow the RFC822
* @return converted to &quot;yyyy-MM-dd'T'HH:mm:ssZ&quot; format or
* original value if it doesn't follow the RFC822
*/
public static String normalizeRSSTimestamp(String pubDate) {
if (pubDate == null) {

View File

@ -0,0 +1,38 @@
/**
* Copyright 2016 Crawler-Commons
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package crawlercommons.sitemaps;
/**
 * Namespace URIs of the sitemap formats handled by the parsers. The list of
 * supported formats is given at
 * https://www.sitemaps.org/protocol.html#otherformats
 */
public class Namespace {

    /** Namespace URI of XML sitemaps and sitemap indexes. */
    public static final String SITEMAP = "http://www.sitemaps.org/schemas/sitemap/0.9";

    /*
     * The RSS and Atom sitemap formats have no strict definition. Parsing
     * them without namespace awareness would break RSS/Atom files that choose
     * to use namespaces. The relaxed compromise for RSS/Atom is therefore to
     * always parse "namespace aware" but to match elements by their localName
     * only, accepting any element namespace.
     */

    /** Namespace URI of Atom 1.0 feeds. */
    public static final String ATOM_1_0 = "http://www.w3.org/2005/Atom";

    /** Namespace URI of Atom 0.3 feeds. */
    public static final String ATOM_0_3 = "http://purl.org/atom/ns#";

    /**
     * Namespace URI associated with RSS feeds.
     *
     * NOTE(review): the value is the RDF syntax namespace URI, which looks
     * like the namespace of an RDF-based feed's root element rather than of
     * RSS 2.0 itself; confirm that the constant name matches the intent.
     */
    public static final String RSS_2_0 = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
}

View File

@ -87,12 +87,19 @@ public class SiteMapParser {
/**
* True (by default) meaning that invalid URLs should be rejected, as the
* official docs allow the siteMapURLs to be only under the base url:
* http://www.sitemaps.org/protocol.html#location
* http://www.sitemaps.org/protocol.html#location. Namespace checking is
* controlled separately by the strictNamespace field.
*/
protected boolean strict = true;
/**
* Indicates whether the parser should work with the namespace from the
* specifications or any namespace. Defaults to false.
**/
protected boolean strictNamespace = false;
public SiteMapParser() {
//default constructor
// default constructor
}
public SiteMapParser(boolean strict) {
@ -107,6 +114,22 @@ public class SiteMapParser {
return strict;
}
/**
 * Reports how this parser treats the namespace of sitemap elements.
 *
 * @return {@code true} if only the namespace from the specification is
 *         accepted, {@code false} if any namespace is allowed
 */
public boolean isStrictNamespace() {
    return this.strictNamespace;
}
/**
 * Chooses between strict and lenient namespace handling.
 *
 * @param s
 *            {@code true} to accept only the namespace from the
 *            specification, {@code false} to allow any namespace
 */
public void setStrictNamespace(boolean s) {
    this.strictNamespace = s;
}
/**
* Returns a SiteMap or SiteMapIndex given an online sitemap URL
*
@ -234,7 +257,8 @@ public class SiteMapParser {
}
throw new UnknownFormatException("Can't parse a gzipped sitemap with the embedded MediaType of: " + embeddedType + " (at: " + url + ")");
}
mediaType = MEDIA_TYPE_REGISTRY.getSupertype(mediaType); // Check parent
mediaType = MEDIA_TYPE_REGISTRY.getSupertype(mediaType); // Check
// parent
}
throw new UnknownFormatException("Can't parse a sitemap with the MediaType of: " + contentType + " (at: " + url + ")");
@ -352,16 +376,22 @@ public class SiteMapParser {
* {@link org.xml.sax.InputSource}
*/
protected AbstractSiteMap processXml(URL sitemapUrl, InputSource is) throws UnknownFormatException {
Document doc = null;
try {
DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
// disable validation and avoid that remote DTDs, schemas, etc. are fetched
// disable validation and avoid that remote DTDs, schemas, etc. are
// fetched
dbf.setValidating(false);
// support an explicitly named namespace.
dbf.setNamespaceAware(true);
dbf.setXIncludeAware(false);
dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
DocumentBuilder db = dbf.newDocumentBuilder();
db.setEntityResolver(new EntityResolver() {
// noop entity resolver, does not fetch remote content
@Override
@ -369,14 +399,17 @@ public class SiteMapParser {
return new InputSource(new StringReader(""));
}
});
db.setErrorHandler(new ErrorHandler() {
public void warning(SAXParseException e) throws SAXException {
LOG.warn("Warning parsing XML: {}", e.toString());
}
public void fatalError(SAXParseException e) throws SAXException {
LOG.error("Fatal error parsing XML: {}", e.toString());
throw e;
}
public void error(SAXParseException e) throws SAXException {
LOG.error("Error parsing XML: {}", e.toString());
throw e;
@ -389,14 +422,14 @@ public class SiteMapParser {
}
// See if this is a sitemap index
NodeList nodeList = doc.getElementsByTagName("sitemapindex");
NodeList nodeList = doc.getElementsByTagNameNS("*", "sitemapindex");
if (nodeList.getLength() > 0) {
nodeList = doc.getElementsByTagName("sitemap");
nodeList = doc.getElementsByTagNameNS("*", "sitemap");
return parseSitemapIndex(sitemapUrl, nodeList);
} else if (doc.getElementsByTagName("urlset").getLength() > 0) {
} else if (doc.getElementsByTagNameNS("*", "urlset").getLength() > 0) {
// This is a regular Sitemap
return parseXmlSitemap(sitemapUrl, doc);
} else if (doc.getElementsByTagName("link").getLength() > 0) {
} else if (doc.getElementsByTagNameNS("*", "link").getLength() > 0) {
// Could be RSS or Atom
return parseSyndicationFormat(sitemapUrl, doc);
}
@ -436,7 +469,12 @@ public class SiteMapParser {
SiteMap sitemap = new SiteMap(sitemapUrl);
sitemap.setType(SitemapType.XML);
NodeList list = doc.getElementsByTagName("url");
String namespace = Namespace.SITEMAP;
if (!strictNamespace) {
namespace = "*";
}
NodeList list = doc.getElementsByTagNameNS(namespace, "url");
// Loop through the <url>s
for (int i = 0; i < list.getLength(); i++) {
@ -444,10 +482,10 @@ public class SiteMapParser {
Node n = list.item(i);
if (n.getNodeType() == Node.ELEMENT_NODE) {
Element elem = (Element) n;
String lastMod = getElementValue(elem, "lastmod");
String changeFreq = getElementValue(elem, "changefreq");
String priority = getElementValue(elem, "priority");
String loc = getElementValue(elem, "loc");
String lastMod = getElementValue(namespace, elem, "lastmod");
String changeFreq = getElementValue(namespace, elem, "changefreq");
String priority = getElementValue(namespace, elem, "priority");
String loc = getElementValue(namespace, elem, "loc");
addUrlIntoSitemap(loc, sitemap, lastMod, changeFreq, priority, i);
}
@ -496,7 +534,12 @@ public class SiteMapParser {
if (firstNode.getNodeType() == Node.ELEMENT_NODE) {
Element elem = (Element) firstNode;
String loc = getElementValue(elem, "loc");
String loc = null;
String namespace = Namespace.SITEMAP;
if (!strictNamespace) {
namespace = "*";
}
loc = getElementValue(namespace, elem, "loc");
// try the text content when no loc element
// has been specified
@ -506,7 +549,7 @@ public class SiteMapParser {
try {
URL sitemapUrl = new URL(loc);
String lastmod = getElementValue(elem, "lastmod");
String lastmod = getElementValue(namespace, elem, "lastmod");
Date lastModified = SiteMap.convertToDate(lastmod);
// Right now we are not worried about sitemapUrls that point
@ -543,7 +586,7 @@ public class SiteMapParser {
SiteMap sitemap = new SiteMap(sitemapUrl);
// See if this is an Atom feed by looking for "feed" element
NodeList list = doc.getElementsByTagName("feed");
NodeList list = doc.getElementsByTagNameNS("*", "feed");
if (list.getLength() > 0) {
parseAtom(sitemap, (Element) list.item(0), doc);
sitemap.setProcessed(true);
@ -557,7 +600,7 @@ public class SiteMapParser {
// See https://github.com/crawler-commons/crawler-commons/issues/87
// and also RSS 1.0 specification
// http://web.resource.org/rss/1.0/spec
list = doc.getElementsByTagName("channel");
list = doc.getElementsByTagNameNS("*", "channel");
if (list.getLength() > 0) {
parseRSS(sitemap, doc);
sitemap.setProcessed(true);
@ -620,7 +663,7 @@ public class SiteMapParser {
String lastMod = getElementValue(elem, "modified");
LOG.debug("lastMod = {}", lastMod);
NodeList list = doc.getElementsByTagName("entry");
NodeList list = doc.getElementsByTagNameNS("*", "entry");
// Loop through the <entry>s
for (int i = 0; i < list.getLength() && i < MAX_URLS; i++) {
@ -691,7 +734,7 @@ public class SiteMapParser {
LOG.debug("Parsing RSS doc");
sitemap.setType(SitemapType.RSS);
NodeList list = doc.getElementsByTagName("channel");
NodeList list = doc.getElementsByTagNameNS("*", "channel");
Element elem = (Element) list.item(0);
// Treat publication date as last mod (Tue, 10 Jun 2003 04:00:00 GMT)
@ -699,7 +742,7 @@ public class SiteMapParser {
LOG.debug("channel's lastMod = {}", channelLastMod);
sitemap.setLastModified(channelLastMod);
list = doc.getElementsByTagName("item");
list = doc.getElementsByTagNameNS("*", "item");
// Loop through the <item>s
for (int i = 0; i < list.getLength() && i < MAX_URLS; i++) {
@ -715,15 +758,17 @@ public class SiteMapParser {
}
/**
* Get the element's textual content.
* Get the element's textual content. Find element under parent element,
* with namespaceURI and element local-name "elementName".
*
* @param namespaceURI
* @param elem
* @param elementName
* @return The element value
*/
protected String getElementValue(Element elem, String elementName) {
protected String getElementValue(String namespaceURI, Element elem, String elementName) {
NodeList list = elem.getElementsByTagName(elementName);
NodeList list = elem.getElementsByTagNameNS(namespaceURI, elementName);
if (list == null)
return null;
Element e = (Element) list.item(0);
@ -733,6 +778,21 @@ public class SiteMapParser {
return null;
}
/**
 * Namespace-agnostic variant of the element lookup: delegates to the
 * three-argument overload with the wildcard namespace, so that
 * "elementName" is compared against the local name only.
 *
 * @param elem
 *            the element below which the lookup takes place
 * @param elementName
 *            local name of the element whose value is wanted
 * @return The element value
 */
protected String getElementValue(Element elem, String elementName) {
    // "*" lets the lookup match elements of every namespace
    final String anyNamespace = "*";
    return getElementValue(anyNamespace, elem, elementName);
}
/**
* Get the element's attribute value.
*
@ -743,7 +803,7 @@ public class SiteMapParser {
*/
protected String getElementAttributeValue(Element elem, String elementName, String attributeName) {
NodeList list = elem.getElementsByTagName(elementName);
NodeList list = elem.getElementsByTagNameNS("*", elementName);
Element e = (Element) list.item(0);
if (e != null) {
return e.getAttribute(attributeName);

View File

@ -88,6 +88,12 @@ public class SiteMapParserSAX extends SiteMapParser {
private boolean allowPartial = false;
/**
* Indicates whether the parser should work with the namespace from the
* specifications or any namespace. Defaults to false.
**/
protected boolean strictNamespace = false;
public SiteMapParserSAX() {
this(true, false);
}
@ -109,6 +115,22 @@ public class SiteMapParserSAX extends SiteMapParser {
return strict;
}
/**
 * Reports how this SAX-based parser treats the namespace of sitemap
 * elements. Overrides the method of the same name in the parent class.
 *
 * @return {@code true} if only the namespace from the specification is
 *         accepted, {@code false} if any namespace is allowed
 */
@Override
public boolean isStrictNamespace() {
    return strictNamespace;
}
/**
 * Chooses between strict and lenient namespace handling for this SAX-based
 * parser. Overrides the method of the same name in the parent class.
 *
 * @param s
 *            {@code true} to accept only the namespace from the
 *            specification, {@code false} to allow any namespace
 */
@Override
public void setStrictNamespace(boolean s) {
    strictNamespace = s;
}
/**
* Returns a SiteMap or SiteMapIndex given an online sitemap URL
*
@ -236,7 +258,8 @@ public class SiteMapParserSAX extends SiteMapParser {
}
throw new UnknownFormatException("Can't parse a gzipped sitemap with the embedded MediaType of: " + embeddedType + " (at: " + url + ")");
}
mediaType = MEDIA_TYPE_REGISTRY.getSupertype(mediaType); // Check parent
mediaType = MEDIA_TYPE_REGISTRY.getSupertype(mediaType); // Check
// parent
}
throw new UnknownFormatException("Can't parse a sitemap with the MediaType of: " + contentType + " (at: " + url + ")");
@ -356,9 +379,15 @@ public class SiteMapParserSAX extends SiteMapParser {
protected AbstractSiteMap processXml(URL sitemapUrl, InputSource is) throws UnknownFormatException {
SAXParserFactory factory = SAXParserFactory.newInstance();
// disable validation and avoid that remote DTDs, schemas, etc. are fetched
// disable validation and avoid that remote DTDs, schemas, etc. are
// fetched
factory.setValidating(false);
factory.setXIncludeAware(false);
// support the use of an explicit namespace.
factory.setNamespaceAware(true);
try {
factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
} catch (Exception e) {
@ -374,9 +403,14 @@ public class SiteMapParserSAX extends SiteMapParser {
return new InputSource(new StringReader(""));
}
});
handler.setStrictNamespace(isStrictNamespace());
saxParser.parse(is, handler);
AbstractSiteMap sitemap = handler.getSiteMap();
if (sitemap == null) {
UnknownFormatException ex = handler.getException();
if (ex != null) {
throw ex;
}
throw new UnknownFormatException("Unknown XML format for: " + sitemapUrl);
}
return sitemap;

View File

@ -44,6 +44,8 @@ public class SiteMapTester {
LOG.error(" bypass automatic MIME type detection");
LOG.error("Java properties:");
LOG.error(" sitemap.useSax if true use SAX parser to process sitemaps");
LOG.error(" sitemap.strictNamespace");
LOG.error(" if true sitemaps are required to use the standard namespace URI");
} else {
URL url = new URL(args[0]);
String mt = (args.length > 1) ? args[1] : null;
@ -60,6 +62,7 @@ public class SiteMapTester {
byte[] content = IOUtils.toByteArray(url);
boolean useSaxParser = new Boolean(System.getProperty("sitemap.useSax"));
boolean strictNamespace = new Boolean(System.getProperty("sitemap.strictNamespace"));
LOG.info("Parsing {} {} using {} parser", url, ((mt != null && !mt.isEmpty()) ? "as MIME type " + mt : ""), (useSaxParser ? "SAX" : "DOM"));
@ -67,6 +70,7 @@ public class SiteMapTester {
if (useSaxParser) {
p = saxParser;
}
p.setStrictNamespace(strictNamespace);
AbstractSiteMap sm = null;
// guesses the mimetype

View File

@ -78,11 +78,11 @@ class AtomHandler extends DelegatorHandler {
}
public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
if ("entry".equals(qName)) {
if ("entry".equals(localName)) {
loc = null;
lastMod = null;
rel = null;
} else if ("link".equals(qName)) {
} else if ("link".equals(localName)) {
String href = attributes.getValue("href");
if (href == null)
return;
@ -91,8 +91,10 @@ class AtomHandler extends DelegatorHandler {
String r = attributes.getValue("rel");
if (loc == null || (!valid && v) || (rel != null && r == null)) {
// - first link, or in case of multiple links:
// - (for a strict parser only) this link is valid and the first one is not valid
// - has no rel attribute while the first one does (e.g., rel="edit", rel="alternate")
// - (for a strict parser only) this link is valid and the first
// one is not valid
// - has no rel attribute while the first one does (e.g.,
// rel="edit", rel="alternate")
try {
loc = new URL(href);
rel = r;
@ -114,9 +116,9 @@ class AtomHandler extends DelegatorHandler {
}
public void characters(char[] ch, int start, int length) throws SAXException {
String qName = super.currentElement();
String localName = super.currentElement();
String value = String.valueOf(ch, start, length);
if ("updated".equals(qName)) {
if ("updated".equals(localName)) {
lastMod = value;
}
}

View File

@ -25,6 +25,7 @@ import org.xml.sax.SAXParseException;
import org.xml.sax.helpers.DefaultHandler;
import crawlercommons.sitemaps.AbstractSiteMap;
import crawlercommons.sitemaps.Namespace;
import crawlercommons.sitemaps.UnknownFormatException;
/**
@ -37,6 +38,7 @@ public class DelegatorHandler extends DefaultHandler {
private DelegatorHandler delegate;
private URL url;
private boolean strict;
private boolean strictNamespace;
private UnknownFormatException exception;
protected DelegatorHandler(LinkedList<String> elementStack, boolean strict) {
@ -58,11 +60,27 @@ public class DelegatorHandler extends DefaultHandler {
return strict;
}
/**
 * Reports how this handler treats the namespace of sitemap elements.
 *
 * @return {@code true} if only the namespace from the specification is
 *         accepted, {@code false} if any namespace is allowed
 */
public boolean isStrictNamespace() {
    return this.strictNamespace;
}
/**
 * Chooses between strict and lenient namespace handling for this handler.
 *
 * @param s
 *            {@code true} to accept only the namespace from the
 *            specification, {@code false} to allow any namespace
 */
public void setStrictNamespace(boolean s) {
    this.strictNamespace = s;
}
/**
 * Stores the given exception in this handler so that it can be read back
 * later through {@link #getException()}.
 *
 * @param exception
 *            the format exception to remember
 */
protected void setException(UnknownFormatException exception) {
this.exception = exception;
}
protected UnknownFormatException getException() {
public UnknownFormatException getException() {
return exception;
}
@ -70,7 +88,7 @@ public class DelegatorHandler extends DefaultHandler {
if (elementStack.isEmpty() || delegate == null) {
startRootElement(uri, localName, qName, attributes);
} else {
elementStack.push(qName);
elementStack.push(localName);
}
if (delegate != null) {
delegate.startElement(uri, localName, qName, attributes);
@ -78,23 +96,32 @@ public class DelegatorHandler extends DefaultHandler {
}
private void startRootElement(String uri, String localName, String qName, Attributes attributes) {
elementStack.push(qName);
if ("sitemapindex".equals(qName)) {
delegate = new XMLIndexHandler(url, elementStack, strict);
} else if ("urlset".equals(qName)) {
delegate = new XMLHandler(url, elementStack, strict);
} else if ("feed".equals(qName)) {
elementStack.push(localName);
if ("feed".equals(localName)) {
delegate = new AtomHandler(url, elementStack, strict);
}
// See if it is a RSS feed by looking for a "channel" element. This
// avoids the issue
// See if it is a RSS feed by looking for the localName "channel"
// element .
// This avoids the issue
// of having the outer tag named <rdf:RDF> that was causing this code to
// fail. Inside of
// the <rss> or <rdf> tag is a <channel> tag, so we can use that.
// See https://github.com/crawler-commons/crawler-commons/issues/87
// and also RSS 1.0 specification http://web.resource.org/rss/1.0/spec
else if ("channel".equals(qName)) {
else if ("channel".equals(localName)) {
delegate = new RSSHandler(url, elementStack, strict);
} else if (isStrictNamespace() && !Namespace.SITEMAP.equals(uri)) {
setException(new UnknownFormatException("Namespace " + uri + " does not match standard namespace " + Namespace.SITEMAP));
return;
} else if ("sitemapindex".equals(localName)) {
delegate = new XMLIndexHandler(url, elementStack, strict);
} else if ("urlset".equals(localName)) {
delegate = new XMLHandler(url, elementStack, strict);
}
if (delegate != null) {
// configure delegate
delegate.setStrictNamespace(isStrictNamespace());
}
}

View File

@ -29,6 +29,7 @@ import org.xml.sax.SAXParseException;
import crawlercommons.sitemaps.AbstractSiteMap;
import crawlercommons.sitemaps.AbstractSiteMap.SitemapType;
import crawlercommons.sitemaps.Namespace;
import crawlercommons.sitemaps.SiteMap;
import crawlercommons.sitemaps.SiteMapURL;
@ -61,6 +62,7 @@ class XMLHandler extends DelegatorHandler {
private String changeFreq;
private String priority;
private int i = 0;
private boolean currentElementNamespaceIsValid;
XMLHandler(URL url, LinkedList<String> elementStack, boolean strict) {
super(elementStack, strict);
@ -70,8 +72,14 @@ class XMLHandler extends DelegatorHandler {
}
public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
if (isStrictNamespace() && !Namespace.SITEMAP.equals(uri)) {
currentElementNamespaceIsValid = false;
return;
}
currentElementNamespaceIsValid = true;
// flush any unclosed or missing URL element
if (loc.length() > 0 && ("loc".equals(qName) || "url".equals(qName))) {
if (loc.length() > 0 && ("loc".equals(localName) || "url".equals(localName))) {
// check whether loc isn't white space only
for (int i = 0; i < loc.length(); i++) {
if (!Character.isWhitespace(loc.charAt(i))) {
@ -80,7 +88,7 @@ class XMLHandler extends DelegatorHandler {
}
}
loc = new StringBuilder();
if ("url".equals(qName)) {
if ("url".equals(localName)) {
// reset also attributes
lastMod = null;
changeFreq = null;
@ -90,23 +98,29 @@ class XMLHandler extends DelegatorHandler {
}
public void endElement(String uri, String localName, String qName) throws SAXException {
if ("url".equals(qName) && "urlset".equals(currentElementParent())) {
if (isStrictNamespace() && !Namespace.SITEMAP.equals(uri)) {
return;
}
if ("url".equals(localName) && "urlset".equals(currentElementParent())) {
maybeAddSiteMapUrl();
} else if ("urlset".equals(qName)) {
} else if ("urlset".equals(localName)) {
sitemap.setProcessed(true);
}
}
public void characters(char[] ch, int start, int length) throws SAXException {
String qName = super.currentElement();
if (isStrictNamespace() && !currentElementNamespaceIsValid) {
return;
}
String localName = super.currentElement();
String value = String.valueOf(ch, start, length);
if ("loc".equals(qName) || "url".equals(qName)) {
if ("loc".equals(localName) || "url".equals(localName)) {
loc.append(value);
} else if ("changefreq".equals(qName)) {
} else if ("changefreq".equals(localName)) {
changeFreq = value;
} else if ("lastmod".equals(qName)) {
} else if ("lastmod".equals(localName)) {
lastMod = value;
} else if ("priority".equals(qName)) {
} else if ("priority".equals(localName)) {
priority = value;
}
}

View File

@ -28,6 +28,7 @@ import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import crawlercommons.sitemaps.AbstractSiteMap;
import crawlercommons.sitemaps.Namespace;
import crawlercommons.sitemaps.SiteMap;
import crawlercommons.sitemaps.SiteMapIndex;
import crawlercommons.sitemaps.AbstractSiteMap.SitemapType;
@ -70,6 +71,9 @@ class XMLIndexHandler extends DelegatorHandler {
}
public void endElement(String uri, String localName, String qName) throws SAXException {
if (isStrictNamespace() && !Namespace.SITEMAP.equals(uri)) {
return;
}
if ("sitemap".equals(currentElement())) {
maybeAddSiteMap();
} else if ("sitemapindex".equals(currentElement())) {

View File

@ -111,8 +111,7 @@ public class BasicURLNormalizerTest {
normalizeTest("http://foo.com:81/", "http://foo.com:81/");
// check that empty port is removed
normalizeTest("http://example.com:/", "http://example.com/");
normalizeTest("https://example.com:/foobar.html",
"https://example.com/foobar.html");
normalizeTest("https://example.com:/foobar.html", "https://example.com/foobar.html");
// check that null path is normalized
normalizeTest("http://foo.com", "http://foo.com/");

View File

@ -36,6 +36,8 @@ import org.junit.runners.JUnit4;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import crawlercommons.sitemaps.AbstractSiteMap.SitemapType;
import static org.junit.Assert.*;
@RunWith(JUnit4.class)
@ -92,6 +94,50 @@ public class SiteMapParserSAXTest {
assertEquals("http://www.example.com/dynsitemap?date=lastyear&all=false", currentSiteMap.getUrl().toString());
}
/**
 * With strict namespace checking enabled, the SAX parser must accept a
 * sitemap whose elements are bound to the standard sitemap namespace and
 * return both of its URLs.
 */
@Test
public void testSitemapWithNamespace() throws UnknownFormatException, IOException {
    SiteMapParser strictParser = new SiteMapParserSAX();
    strictParser.setStrictNamespace(true);

    byte[] sitemapBytes = getResourceAsBytes("src/test/resources/sitemaps/sitemap.ns.xml");
    URL sitemapUrl = new URL("http://www.example.com/sitemap.ns.xml");
    AbstractSiteMap parsed = strictParser.parseSiteMap(sitemapBytes, sitemapUrl);

    // the result must be a fully processed XML sitemap
    assertEquals(SitemapType.XML, parsed.getType());
    assertTrue(parsed instanceof SiteMap);
    assertTrue(parsed.isProcessed());

    SiteMap siteMap = (SiteMap) parsed;
    assertEquals(2, siteMap.getSiteMapUrls().size());
    assertEquals(SiteMapURL.ChangeFrequency.DAILY, siteMap.getSiteMapUrls().iterator().next().getChangeFrequency());
}
/**
 * A sitemap using a non-standard namespace must be rejected by the SAX
 * parser with an UnknownFormatException in strict namespace mode, and must
 * be parsed normally once strict namespace checking is switched off.
 */
@Test
public void testSitemapWithWrongNamespace() throws UnknownFormatException, IOException {
    SiteMapParser saxBasedParser = new SiteMapParserSAX();
    byte[] sitemapBytes = getResourceAsBytes("src/test/resources/sitemaps/sitemap.badns.xml");
    URL sitemapUrl = new URL("http://www.example.com/sitemap.badns.xml");

    // strict mode: parsing has to fail
    saxBasedParser.setStrictNamespace(true);
    try {
        saxBasedParser.parseSiteMap(sitemapBytes, sitemapUrl);
        fail("Expected an UnknownFormatException because of wrong namespace");
    } catch (UnknownFormatException expected) {
        assertTrue(expected.getMessage().contains("does not match standard namespace"));
    }

    // lenient mode: the very same content is accepted
    saxBasedParser.setStrictNamespace(false);
    AbstractSiteMap parsed = saxBasedParser.parseSiteMap(sitemapBytes, sitemapUrl);
    assertEquals(SitemapType.XML, parsed.getType());
    assertTrue(parsed instanceof SiteMap);
    assertTrue(parsed.isProcessed());
    assertEquals(2, ((SiteMap) parsed).getSiteMapUrls().size());
}
@Test
public void testFullDateFormat() {
SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm+hh:00", Locale.ROOT);

View File

@ -96,6 +96,49 @@ public class SiteMapParserTest {
assertEquals("http://www.example.com/dynsitemap?date=lastyear&all=false", currentSiteMap.getUrl().toString());
}
/**
 * Parses a sitemap whose elements are bound to the standard sitemap
 * namespace, once with the parser's default namespace handling and once
 * with strict namespace checking enabled. Both runs must yield a processed
 * XML sitemap with two URLs, so the strict code path of the DOM parser is
 * covered in the same way as in the corresponding SAX parser test.
 */
@Test
public void testSitemapWithNamespace() throws UnknownFormatException, IOException {
    byte[] content = getResourceAsBytes("src/test/resources/sitemaps/sitemap.ns.xml");
    URL url = new URL("http://www.example.com/sitemap.ns.xml");

    for (boolean strictNamespace : new boolean[] { false, true }) {
        // fresh parser per run so that no state leaks between the two modes
        SiteMapParser parser = new SiteMapParser();
        parser.setStrictNamespace(strictNamespace);

        AbstractSiteMap asm = parser.parseSiteMap(content, url);
        assertEquals(SitemapType.XML, asm.getType());
        assertEquals(true, asm instanceof SiteMap);
        assertEquals(true, asm.isProcessed());
        SiteMap sm = (SiteMap) asm;
        assertEquals(2, sm.getSiteMapUrls().size());
        assertEquals(SiteMapURL.ChangeFrequency.DAILY, sm.getSiteMapUrls().iterator().next().getChangeFrequency());
    }
}
/**
 * Feeds a sitemap with a non-standard namespace to the DOM parser. In
 * strict namespace mode the result is expected to be a processed XML
 * sitemap without any URLs; in lenient mode both URLs are expected.
 */
@Test
public void testSitemapWithWrongNamespace() throws UnknownFormatException, IOException {
    SiteMapParser domParser = new SiteMapParser();
    byte[] sitemapBytes = getResourceAsBytes("src/test/resources/sitemaps/sitemap.badns.xml");
    URL sitemapUrl = new URL("http://www.example.com/sitemap.badns.xml");

    // first run strict, second run lenient, on the same parser instance
    boolean[] strictModes = { true, false };
    int[] expectedUrlCounts = { 0, 2 };
    for (int run = 0; run < strictModes.length; run++) {
        domParser.setStrictNamespace(strictModes[run]);
        AbstractSiteMap parsed = domParser.parseSiteMap(sitemapBytes, sitemapUrl);
        assertEquals(SitemapType.XML, parsed.getType());
        assertEquals(true, parsed instanceof SiteMap);
        assertEquals(true, parsed.isProcessed());
        assertEquals(expectedUrlCounts[run], ((SiteMap) parsed).getSiteMapUrls().size());
    }
}
@Test
public void testFullDateFormat() {
SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm+hh:00", Locale.ROOT);
@ -364,7 +407,12 @@ public class SiteMapParserTest {
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
assertSame("Not an RSS", SitemapType.RSS, asm.getType());
assertNotNull("GMT timestamp not parsed", asm.getLastModified());
assertEquals("GMT timestamp", 1483619690000L, asm.getLastModified().getTime()); // Thu, 05 Jan 17 12:34:50 GMT
assertEquals("GMT timestamp", 1483619690000L, asm.getLastModified().getTime()); // Thu,
// 05
// Jan
// 17
// 12:34:50
// GMT
SiteMap rss = (SiteMap) asm;
assertEquals("Incorrect items count", 7, rss.getSiteMapUrls().size());
@ -474,7 +522,7 @@ public class SiteMapParserTest {
/**
* Read a test resource file and return its content as byte array.
*
*
* @param resourceName
* path to the resource file
* @return byte content of the file

View File

@ -0,0 +1,16 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Test fixture: the default namespace declared on the root element is NOT
     the standard sitemap namespace (http://www.sitemaps.org/schemas/sitemap/0.9).
     It is used by the "wrong namespace" parser tests. The second url entry
     wraps its values in surrounding whitespace. -->
<urlset xmlns="http://www.google.com/schemas/sitemap/0.9"
xmlns:xhtml="http://www.w3.org/1999/xhtml">
<url>
<loc>http://www.example.com/1</loc>
<changefreq>daily</changefreq>
</url>
<url>
<loc>
http://www.example.com/2
</loc>
<changefreq>
daily
</changefreq>
</url>
</urlset>

View File

@ -0,0 +1,11 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Test fixture: every sitemap element carries the explicit prefix "sit",
     which is bound to the standard sitemap namespace. The image, video and
     xhtml prefixes are declared on the root element but not used by any
     element in this file. -->
<sit:urlset xmlns:image="http://www.google.com/schemas/sitemap-image/1.1" xmlns:video="http://www.google.com/schemas/sitemap-video/1.1" xmlns:sit="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xhtml="http://www.w3.org/1999/xhtml" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.w3.org/1999/xhtml link.xsd http://www.google.com/schemas/sitemap-video/1.1 video.xsd http://www.sitemaps.org/schemas/sitemap/0.9 sitemap.xsd http://www.google.com/schemas/sitemap-image/1.1 image.xsd">
<sit:url>
<sit:loc>http://www.example.com/1</sit:loc>
<sit:changefreq>daily</sit:changefreq>
</sit:url>
<sit:url>
<sit:loc>http://www.example.com/2</sit:loc>
<sit:changefreq>daily</sit:changefreq>
</sit:url>
</sit:urlset>