1
0
Fork 0
mirror of https://github.com/crawler-commons/crawler-commons synced 2024-05-08 15:36:04 +02:00

SAX parser: stop the URL at the closing </loc> tag (fixes #153)

- on an opening <loc> or <url> tag, assume a forgotten closing </url> tag
  only if the buffer contains more than white space
This commit is contained in:
Sebastian Nagel 2017-06-06 21:04:28 +02:00
parent 90c2800869
commit 4dd61ded84

View File

@ -72,7 +72,20 @@ class XMLHandler extends DelegatorHandler {
public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
    // Called for every opening tag. Only qName is inspected here; uri,
    // localName and attributes are not used by this method.
    //
    // If a new <loc> or <url> opens while the location buffer still holds
    // text, the previous entry was presumably never closed. That is only
    // treated as a forgotten closing tag when the buffer contains more than
    // white space; white space alone is discarded without flushing.
    if (loc.length() > 0 && ("loc".equals(qName) || "url".equals(qName))) {
        // check whether loc isn't white space only
        for (int i = 0; i < loc.length(); i++) {
            if (!Character.isWhitespace(loc.charAt(i))) {
                // real content is pending: flush it and stop here
                maybeAddSiteMapUrl();
                return;
            }
        }
        // buffer held white space only: drop it instead of flushing
        loc = new StringBuilder();
        if ("url".equals(qName)) {
            // reset also attributes
            lastMod = null;
            changeFreq = null;
            priority = null;
        }
    }
}