mirror of
https://github.com/crawler-commons/crawler-commons
synced 2024-05-08 15:36:04 +02:00
SAX parser to stop URL at closing </loc> fixes #153
- on an opening <loc> or <url> tag, assume a forgotten closing </url> tag only if the buffer contains more than whitespace
This commit is contained in:
parent
90c2800869
commit
4dd61ded84
|
@ -72,7 +72,20 @@ class XMLHandler extends DelegatorHandler {
|
|||
public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
|
||||
// flush any unclosed or missing URL element
|
||||
if (loc.length() > 0 && ("loc".equals(qName) || "url".equals(qName))) {
|
||||
maybeAddSiteMapUrl();
|
||||
// check whether loc isn't white space only
|
||||
for (int i = 0; i < loc.length(); i++) {
|
||||
if (!Character.isWhitespace(loc.charAt(i))) {
|
||||
maybeAddSiteMapUrl();
|
||||
return;
|
||||
}
|
||||
}
|
||||
loc = new StringBuilder();
|
||||
if ("url".equals(qName)) {
|
||||
// reset also attributes
|
||||
lastMod = null;
|
||||
changeFreq = null;
|
||||
priority = null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue