1
0
mirror of https://github.com/crawler-commons/crawler-commons synced 2024-09-20 08:42:15 +02:00

[Sitemaps] Trim Unicode whitespace around URLs, fixes #224

This commit is contained in:
Sebastian Nagel 2018-12-13 16:30:23 +01:00
parent 6d3bbd2512
commit 67db8bf1be
6 changed files with 63 additions and 5 deletions

View File

@ -1,6 +1,7 @@
Crawler-Commons Change Log
Current Development 0.11-SNAPSHOT (yyyy-mm-dd)
- [Sitemaps] Trim Unicode whitespace around URLs (sebastian-nagel, kkrugler) #224
- [Sitemaps] Sitemap index: stop URL at closing </loc> (sebastian-nagel, kkrugler) #213
- [Sitemaps] Allow empty price in video sitemaps (sebastian-nagel) #221
- [Sitemaps] In case of the use of a different locale, price tag can be formatted with ',' instead of '.' leading to a NPE (goldenlink) #220

View File

@ -209,13 +209,42 @@ public class DelegatorHandler extends DefaultHandler {
}
}
/**
* Return true if character sequence contains only white space including
* Unicode whitespace, cf. {@link #isWhitespace(char)}. The characters are
* checked one by one from the start of the sequence. An empty sequence is
* reported as blank because the loop finds no character failing the check.
*
* @param charSeq
*            character sequence to inspect, its length is read
*            unconditionally so it must not be null
* @return false as soon as a non-whitespace character is found, true
*         otherwise
*/
public static boolean isAllBlank(CharSequence charSeq) {
for (int i = 0; i < charSeq.length(); i++) {
// NOTE(review): two consecutive checks are listed here, one using
// Character.isWhitespace and one using isWhitespace(char). Presumably
// the first is the line replaced by this change and only the second
// remains in the resulting file -- confirm against the actual file.
if (!Character.isWhitespace(charSeq.charAt(i))) {
if (!isWhitespace(charSeq.charAt(i))) {
return false;
}
}
return true;
}
/**
 * Tell whether a character counts as whitespace in the broad Unicode sense:
 * everything accepted by {@link Character#isWhitespace(char)} plus the
 * three no-break space characters that method leaves out (U+00A0, U+2007
 * and U+202F).
 *
 * @param c
 *            character to classify
 * @return true if <code>c</code> is one of the three no-break spaces or is
 *         accepted by {@link Character#isWhitespace(char)}
 */
public static boolean isWhitespace(char c) {
    switch (c) {
    case '\u00a0':
    case '\u2007':
    case '\u202f':
        // no-break spaces, rejected by Character.isWhitespace
        return true;
    default:
        return Character.isWhitespace(c);
    }
}
/**
 * Trim all whitespace including Unicode whitespace, cf.
 * {@link #isWhitespace(char)}, from both ends of a character sequence.
 * Whitespace inside the remaining text is kept.
 *
 * @param charSeq
 *            character sequence to trim, must not be null
 * @return the content of <code>charSeq</code> without leading and trailing
 *         whitespace; the empty string if <code>charSeq</code> is empty or
 *         consists of whitespace only
 */
public static String stripAllBlank(CharSequence charSeq) {
    // [start, end) is the half-open range of characters to keep
    int start = 0;
    int end = charSeq.length();
    // skip leading whitespace; for an all-blank sequence this runs up to
    // end so that the result is empty instead of one leftover blank
    while (start < end && isWhitespace(charSeq.charAt(start))) {
        start++;
    }
    // skip trailing whitespace, never stepping back beyond start
    while (end > start && isWhitespace(charSeq.charAt(end - 1))) {
        end--;
    }
    return charSeq.subSequence(start, end).toString();
}
}

View File

@ -150,7 +150,7 @@ class RSSHandler extends DelegatorHandler {
}
private void setLocURL() {
String value = loc.toString().trim();
String value = stripAllBlank(loc);
if (value.isEmpty()) {
return;
}

View File

@ -149,7 +149,7 @@ class XMLHandler extends DelegatorHandler {
}
private void maybeAddSiteMapUrl() {
String value = loc.toString().trim();
String value = stripAllBlank(loc);
try {
// check that the value is a valid URL
URL locURL = new URL(value);

View File

@ -103,7 +103,7 @@ class XMLIndexHandler extends DelegatorHandler {
} else if ("lastmod".equals(localName)) {
lastMod = SiteMap.convertToDate(value);
} else {
value = value.trim();
value = stripAllBlank(value);
if (!value.isEmpty() && !locClosed) {
// try non-whitespace text content as loc
// when no loc element has been specified
@ -117,7 +117,7 @@ class XMLIndexHandler extends DelegatorHandler {
}
private void maybeAddSiteMap() {
String value = loc.toString().trim();
String value = stripAllBlank(loc);
try {
// check that the value is a valid URL
URL locURL = new URL(value);

View File

@ -321,6 +321,34 @@ public class SiteMapParserTest {
assertNull("Sitemap " + urlSecondSitemap + " without modification date", secondSitemap.getLastModified());
}
/**
 * A sitemap index whose loc values are wrapped in CDATA sections and
 * surrounded by whitespace, including U+00A0 and U+2000, must still yield
 * both sitemaps.
 */
@Test
public void testStripUnicodeWhiteSpace() throws UnknownFormatException, IOException {
    String indexXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" //
                    + "<sitemapindex>\n" //
                    + " <sitemap>\n" //
                    + " <loc>\n" //
                    + " <![CDATA[ https://www.example.com/sitemap1.xml ]]>\n" //
                    + " </loc>\n" //
                    + " </sitemap>\n" //
                    + " <sitemap>\n" //
                    + " <loc>\n" //
                    + " <![CDATA[\u00a0https://www.example.com/sitemap2.xml ]]> \u2000\n" //
                    + " </loc>\n" //
                    + " </sitemap>\n" //
                    + "</sitemapindex>";
    byte[] xmlBytes = indexXml.getBytes(UTF_8);
    URL indexUrl = new URL("https://www.example.com/sitemapindex.xml");

    SiteMapParser siteMapParser = new SiteMapParser();
    AbstractSiteMap parsed = siteMapParser.parseSiteMap(xmlBytes, indexUrl);

    // the document must be recognized as a sitemap index
    assertEquals(true, parsed.isIndex());
    assertEquals(true, parsed instanceof SiteMapIndex);

    // both entries are expected, the second one only if the Unicode
    // whitespace around its URL has been stripped
    SiteMapIndex index = (SiteMapIndex) parsed;
    assertEquals(2, index.getSitemaps().size());
    String expectedLoc = "https://www.example.com/sitemap2.xml";
    assertNotNull("Sitemap " + expectedLoc + " not found in sitemap index", index.getSitemap(new URL(expectedLoc)));
}
@Test
public void testSitemapGZ() throws UnknownFormatException, IOException {
SiteMapParser parser = new SiteMapParser();