mirror of
https://github.com/crawler-commons/crawler-commons
synced 2024-09-20 08:42:15 +02:00
[Sitemaps] Trim Unicode whitespace around URLs, fixes #224
This commit is contained in:
parent
6d3bbd2512
commit
67db8bf1be
@ -1,6 +1,7 @@
|
||||
Crawler-Commons Change Log
|
||||
|
||||
Current Development 0.11-SNAPSHOT (yyyy-mm-dd)
|
||||
- [Sitemaps] Trim Unicode whitespace around URLs (sebastian-nagel, kkrugler) #224
|
||||
- [Sitemaps] Sitemap index: stop URL at closing </loc> (sebastian-nagel, kkrugler) #213
|
||||
- [Sitemaps] Allow empty price in video sitemaps (sebastian-nagel) #221
|
||||
- [Sitemaps] When a different locale is in use, the price tag can be formatted with ',' instead of '.', leading to an NPE (goldenlink) #220
|
||||
|
@ -209,13 +209,42 @@ public class DelegatorHandler extends DefaultHandler {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Return true if character sequence contains only white space including
|
||||
* Unicode whitespace, cf. {@link #isWhitespace(char)}
|
||||
*/
|
||||
public static boolean isAllBlank(CharSequence charSeq) {
|
||||
for (int i = 0; i < charSeq.length(); i++) {
|
||||
if (!Character.isWhitespace(charSeq.charAt(i))) {
|
||||
if (!isWhitespace(charSeq.charAt(i))) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Check whether character is any Unicode whitespace, including the space
|
||||
* characters not covered by {@link Character#isWhitespace(char)}
|
||||
*/
|
||||
public static boolean isWhitespace(char c) {
|
||||
return Character.isWhitespace(c) || c == '\u00a0' || c == '\u2007' || c == '\u202f';
|
||||
}
|
||||
|
||||
/** Trim all whitespace including Unicode whitespace */
|
||||
public static String stripAllBlank(CharSequence charSeq) {
|
||||
if (charSeq.length() == 0) {
|
||||
return charSeq.toString();
|
||||
}
|
||||
int start = 0;
|
||||
int end = charSeq.length() - 1;
|
||||
while (isWhitespace(charSeq.charAt(start)) && start < end) {
|
||||
start++;
|
||||
}
|
||||
if (start < end) {
|
||||
while (isWhitespace(charSeq.charAt(end))) {
|
||||
end--;
|
||||
}
|
||||
}
|
||||
return charSeq.subSequence(start, end + 1).toString();
|
||||
}
|
||||
}
|
||||
|
@ -150,7 +150,7 @@ class RSSHandler extends DelegatorHandler {
|
||||
}
|
||||
|
||||
private void setLocURL() {
|
||||
String value = loc.toString().trim();
|
||||
String value = stripAllBlank(loc);
|
||||
if (value.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
|
@ -149,7 +149,7 @@ class XMLHandler extends DelegatorHandler {
|
||||
}
|
||||
|
||||
private void maybeAddSiteMapUrl() {
|
||||
String value = loc.toString().trim();
|
||||
String value = stripAllBlank(loc);
|
||||
try {
|
||||
// check that the value is a valid URL
|
||||
URL locURL = new URL(value);
|
||||
|
@ -103,7 +103,7 @@ class XMLIndexHandler extends DelegatorHandler {
|
||||
} else if ("lastmod".equals(localName)) {
|
||||
lastMod = SiteMap.convertToDate(value);
|
||||
} else {
|
||||
value = value.trim();
|
||||
value = stripAllBlank(value);
|
||||
if (!value.isEmpty() && !locClosed) {
|
||||
// try non-whitespace text content as loc
|
||||
// when no loc element has been specified
|
||||
@ -117,7 +117,7 @@ class XMLIndexHandler extends DelegatorHandler {
|
||||
}
|
||||
|
||||
private void maybeAddSiteMap() {
|
||||
String value = loc.toString().trim();
|
||||
String value = stripAllBlank(loc);
|
||||
try {
|
||||
// check that the value is a valid URL
|
||||
URL locURL = new URL(value);
|
||||
|
@ -321,6 +321,34 @@ public class SiteMapParserTest {
|
||||
assertNull("Sitemap " + urlSecondSitemap + " without modification date", secondSitemap.getLastModified());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testStripUnicodeWhiteSpace() throws UnknownFormatException, IOException {
|
||||
SiteMapParser parser = new SiteMapParser();
|
||||
StringBuilder scontent = new StringBuilder();
|
||||
scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n") //
|
||||
.append("<sitemapindex>\n") //
|
||||
.append(" <sitemap>\n") //
|
||||
.append(" <loc>\n") //
|
||||
.append(" <![CDATA[ https://www.example.com/sitemap1.xml ]]>\n") //
|
||||
.append(" </loc>\n") //
|
||||
.append(" </sitemap>\n") //
|
||||
.append(" <sitemap>\n") //
|
||||
.append(" <loc>\n") //
|
||||
.append(" <![CDATA[\u00a0https://www.example.com/sitemap2.xml ]]> \u2000\n") //
|
||||
.append(" </loc>\n") //
|
||||
.append(" </sitemap>\n") //
|
||||
.append("</sitemapindex>");
|
||||
byte[] content = scontent.toString().getBytes(UTF_8);
|
||||
URL url = new URL("https://www.example.com/sitemapindex.xml");
|
||||
AbstractSiteMap asm = parser.parseSiteMap(content, url);
|
||||
assertEquals(true, asm.isIndex());
|
||||
assertEquals(true, asm instanceof SiteMapIndex);
|
||||
SiteMapIndex sm = (SiteMapIndex) asm;
|
||||
assertEquals(2, sm.getSitemaps().size());
|
||||
String sitemap = "https://www.example.com/sitemap2.xml";
|
||||
assertNotNull("Sitemap " + sitemap + " not found in sitemap index", sm.getSitemap(new URL(sitemap)));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSitemapGZ() throws UnknownFormatException, IOException {
|
||||
SiteMapParser parser = new SiteMapParser();
|
||||
|
Loading…
Reference in New Issue
Block a user