1
0
mirror of https://github.com/crawler-commons/crawler-commons synced 2024-09-23 17:33:23 +02:00

Merge pull request #307 from sebastian-nagel/cc-305-sitemaps-normalize-urls

Allow to normalize URLs in sitemaps, resolves #305
This commit is contained in:
Sebastian Nagel 2021-08-14 13:45:21 +02:00 committed by GitHub
commit 7a8bbb6ba3
Signed by: GitHub
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 137 additions and 9 deletions

View File

@ -34,6 +34,7 @@ import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.function.Consumer;
import java.util.function.Function;
import java.util.zip.GZIPInputStream;
import javax.xml.parsers.ParserConfigurationException;
@ -48,6 +49,7 @@ import org.xml.sax.EntityResolver;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import crawlercommons.filters.URLFilter;
import crawlercommons.mimetypes.MimeTypeDetector;
import crawlercommons.sitemaps.AbstractSiteMap.SitemapType;
import crawlercommons.sitemaps.extension.Extension;
@ -100,6 +102,9 @@ public class SiteMapParser {
private MimeTypeDetector mimeTypeDetector;
/* Function to normalize or filter URLs. Does nothing by default. */
private Function<String, String> urlFilter = (String url) -> url;
/**
* SiteMapParser with strict location validation ({@link #isStrict()}) and not
* allowing partially parsed content.
@ -217,6 +222,26 @@ public class SiteMapParser {
}
}
/**
 * Set URL filter function to normalize URLs found in sitemaps or filter
 * URLs away if the function returns null.
 *
 * @param filter
 *            function mapping a URL string to its normalized form, or to
 *            null to drop the URL. Passing a null function restores the
 *            default behavior of leaving URLs unchanged.
 */
public void setURLFilter(Function<String, String> filter) {
    if (filter == null) {
        // A null function would otherwise only surface as a
        // NullPointerException once the filter is applied during parsing.
        urlFilter = (String url) -> url;
    } else {
        urlFilter = filter;
    }
}
/**
 * Install a {@link URLFilter} as URL filter, e.g. to have all URLs found in
 * sitemaps normalized by
 * {@link crawlercommons.filters.basic.BasicURLNormalizer}:
 *
 * <pre>
 * sitemapParser.setURLFilter(new BasicURLNormalizer());
 * </pre>
 *
 * @param filter
 *            filter whose {@code filter} method is applied to every URL
 */
public void setURLFilter(URLFilter filter) {
    Function<String, String> adapter = filter::filter;
    this.urlFilter = adapter;
}
/**
* Returns a SiteMap or SiteMapIndex given an online sitemap URL
*
@ -487,8 +512,13 @@ public class SiteMapParser {
if (line.isEmpty()) {
continue;
}
String urlFiltered = urlFilter.apply(line);
if (urlFiltered == null) {
LOG.debug("Filtered url: [{}]", line.substring(0, Math.min(1024, line.length())));
continue;
}
try {
URL url = new URL(line);
URL url = new URL(urlFiltered);
boolean valid = urlIsValid(textSiteMap.getBaseUrl(), url.toString());
if (valid || !strict) {
SiteMapURL sUrl = new SiteMapURL(url, valid);
@ -571,6 +601,7 @@ public class SiteMapParser {
handler.setAcceptedNamespaces(acceptedNamespaces);
}
handler.setExtensionNamespaces(extensionNamespaces);
handler.setURLFilter(urlFilter);
try {
SAXParser saxParser = factory.newSAXParser();

View File

@ -24,6 +24,8 @@ import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import crawlercommons.filters.basic.BasicURLNormalizer;
/**
* Sitemap Tool for recursively fetching all URLs from a sitemap (and all of
* its children)
@ -46,6 +48,9 @@ public class SiteMapTester {
LOG.error(" if true sitemaps are required to use the standard namespace URI");
LOG.error(" sitemap.extensions");
LOG.error(" if true enable sitemap extension parsing");
LOG.error(" sitemap.filter.urls");
LOG.error(" if true filter and normalize all URLs found in the sitemap");
LOG.error(" using crawlercommons.filters.basic.BasicURLNormalizer");
} else {
URL url = new URL(args[0]);
String mt = (args.length > 1) ? args[1] : null;
@ -63,14 +68,19 @@ public class SiteMapTester {
LOG.info("Parsing {} {}", url, ((mt != null && !mt.isEmpty()) ? "as MIME type " + mt : ""));
boolean strictNamespace = new Boolean(System.getProperty("sitemap.strictNamespace"));
boolean strictNamespace = Boolean.getBoolean("sitemap.strictNamespace");
saxParser.setStrictNamespace(strictNamespace);
boolean enableExtensions = new Boolean(System.getProperty("sitemap.extensions"));
boolean enableExtensions = Boolean.getBoolean("sitemap.extensions");
if (enableExtensions) {
saxParser.enableExtensions();
}
boolean enableURLFilter = Boolean.getBoolean("sitemap.filter.urls");
if (enableURLFilter) {
saxParser.setURLFilter(new BasicURLNormalizer());
}
AbstractSiteMap sm = null;
// guesses the mimetype
if (mt == null || mt.equals("")) {

View File

@ -137,7 +137,12 @@ class AtomHandler extends DelegatorHandler {
LOG.debug("Missing url");
LOG.trace("Can't create an entry with a missing URL");
} else {
SiteMapURL sUrl = new SiteMapURL(loc.toString(), lastMod, null, null, valid);
String urlFiltered = urlFilter.apply(loc.toString());
if (urlFiltered == null) {
LOG.debug("Filtered URL {}", loc.toString());
return;
}
SiteMapURL sUrl = new SiteMapURL(urlFiltered, lastMod, null, null, valid);
sitemap.addSiteMapUrl(sUrl);
LOG.debug(" {}. {}", (++i), sUrl);
}

View File

@ -23,6 +23,7 @@ import java.net.URL;
import java.util.LinkedList;
import java.util.Map;
import java.util.Set;
import java.util.function.Function;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
@ -49,6 +50,7 @@ public class DelegatorHandler extends DefaultHandler {
private Set<String> acceptedNamespaces;
protected Map<String, Extension> extensionNamespaces;
private StringBuilder characterBuffer = new StringBuilder();
protected Function<String, String> urlFilter = (String url) -> url;
protected DelegatorHandler(LinkedList<String> elementStack, boolean strict) {
this.elementStack = elementStack;
@ -96,6 +98,10 @@ public class DelegatorHandler extends DefaultHandler {
return extensionNamespaces.containsKey(uri);
}
/**
 * Set the function applied to URL strings by this handler.
 *
 * @param urlFilter
 *            function returning the (possibly rewritten) URL, or null to
 *            have the URL skipped. Passing a null function restores the
 *            default of leaving URLs unchanged.
 */
public void setURLFilter(Function<String, String> urlFilter) {
    if (urlFilter == null) {
        // Guard against a NullPointerException when the filter is applied.
        this.urlFilter = (String url) -> url;
    } else {
        this.urlFilter = urlFilter;
    }
}
/**
 * Stores the given exception in this handler's {@code exception} field.
 *
 * @param exception
 *            the exception to record
 */
protected void setException(UnknownFormatException exception) {
this.exception = exception;
}
@ -170,6 +176,7 @@ public class DelegatorHandler extends DefaultHandler {
}
}
delegate.setExtensionNamespaces(extensionNamespaces);
delegate.setURLFilter(urlFilter);
}
@Override
@ -273,4 +280,5 @@ public class DelegatorHandler extends DefaultHandler {
}
return charSeq.subSequence(start, end + 1).toString();
}
}

View File

@ -161,6 +161,12 @@ class RSSHandler extends DelegatorHandler {
try {
// check that the value is a valid URL
locURL = new URL(sitemap.getUrl(), value);
String urlFiltered = urlFilter.apply(locURL.toString());
if (urlFiltered == null) {
LOG.debug("Filtered URL {}", value);
return;
}
locURL = new URL(urlFiltered);
} catch (MalformedURLException e) {
LOG.debug("Bad url: [{}]", value);
LOG.trace("Can't create an entry with a bad URL", e);

View File

@ -168,10 +168,15 @@ class XMLHandler extends DelegatorHandler {
if (value == null || isAllBlank(value)) {
return;
}
String urlFiltered = urlFilter.apply(value);
if (urlFiltered == null) {
LOG.debug("Filtered URL {}", value);
return;
}
try {
// check that the value is a valid URL
URL locURL = new URL(value);
boolean valid = urlIsValid(sitemap.getBaseUrl(), value);
URL locURL = new URL(urlFiltered);
boolean valid = urlIsValid(sitemap.getBaseUrl(), locURL.toString());
if (valid || !isStrict()) {
SiteMapURL sUrl = new SiteMapURL(locURL, valid);
sUrl.setLastModified(lastMod);

View File

@ -132,9 +132,14 @@ class XMLIndexHandler extends DelegatorHandler {
return;
}
String value = stripAllBlank(loc);
String urlFiltered = urlFilter.apply(value);
if (urlFiltered == null) {
LOG.debug("Filtered URL {}", value);
return;
}
try {
// check that the value is a valid URL
URL locURL = new URL(value);
URL locURL = new URL(urlFiltered);
SiteMap s = new SiteMap(locURL, lastMod);
sitemap.addSitemap(s);
LOG.debug(" {}. {}", (i + 1), s);

View File

@ -16,7 +16,9 @@
package crawlercommons.sitemaps;
import crawlercommons.filters.basic.BasicURLNormalizer;
import crawlercommons.sitemaps.AbstractSiteMap.SitemapType;
import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Assertions;
@ -32,9 +34,11 @@ import java.nio.charset.StandardCharsets;
import java.time.ZonedDateTime;
import java.time.temporal.ChronoField;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.stream.Collectors;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.jupiter.api.Assertions.*;
@ -201,6 +205,27 @@ public class SiteMapParserTest {
assertEquals(2, sm.getSiteMapUrls().size());
}
/**
 * Parses a plain-text sitemap with a {@link BasicURLNormalizer} installed as
 * URL filter and checks that both listed URLs show up in the expected
 * normalized form.
 */
@Test
public void testSitemapTXTfilterUrls() throws UnknownFormatException, IOException {
    SiteMapParser parser = new SiteMapParser();
    parser.setURLFilter(new BasicURLNormalizer());

    // first line lacks the protocol, second line carries a fragment
    byte[] content = "www.example.com/catalog?item=1\nhttp://www.example.com/catalog?item=11#anchor".getBytes(UTF_8);
    URL sitemapUrl = new URL("http://www.example.com/sitemap.txt");

    AbstractSiteMap asm = parser.parseSiteMap("text/plain", content, sitemapUrl);
    AbstractSiteMapTest.testSerializable(asm);
    assertFalse(asm.isIndex());
    assertTrue(asm instanceof SiteMap);

    SiteMap sm = (SiteMap) asm;
    assertEquals(2, sm.getSiteMapUrls().size());

    List<String> expected = Arrays.asList("http://www.example.com/catalog?item=1", "http://www.example.com/catalog?item=11");
    List<String> actual = sm.getSiteMapUrls().stream() //
                    .map(u -> u.getUrl().toString()) //
                    .collect(Collectors.toList());
    assertTrue(actual.containsAll(expected));
}
@Test
public void testSitemapXML() throws UnknownFormatException, IOException {
SiteMapParser parser = new SiteMapParser();
@ -257,6 +282,39 @@ public class SiteMapParserTest {
}
}
/**
 * Parses an XML sitemap with a {@link BasicURLNormalizer} installed as URL
 * filter. The first URL is stripped of its "http://" prefix and the second
 * one gets a "#anchor" fragment appended before parsing; all five URLs must
 * still pass the regular per-URL validation.
 */
@Test
public void testSitemapXMLfilterUrls() throws UnknownFormatException, IOException {
    SiteMapParser parser = new SiteMapParser();
    parser.setURLFilter(new BasicURLNormalizer());

    StringBuilder xml = new StringBuilder(1024);
    getXMLSitemapAsBytes(xml);

    // drop the leading "http://" (7 characters) of the first URL
    int firstUrlStart = xml.indexOf(SITEMAP_URLS[0][0]);
    xml.delete(firstUrlStart, firstUrlStart + 7);

    // insert "#anchor" right behind the second URL
    int secondUrlEnd = xml.indexOf(SITEMAP_URLS[1][0]) + SITEMAP_URLS[1][0].length();
    xml.insert(secondUrlEnd, "#anchor");

    byte[] content = xml.toString().getBytes(UTF_8);
    URL sitemapUrl = new URL("http://www.example.com/sitemap.xml");

    AbstractSiteMap asm = parser.parseSiteMap("text/xml", content, sitemapUrl);
    AbstractSiteMapTest.testSerializable(asm);
    assertFalse(asm.isIndex());
    assertTrue(asm instanceof SiteMap);

    SiteMap sm = (SiteMap) asm;
    assertEquals(5, sm.getSiteMapUrls().size());

    // validate every URL against the expectation for its position
    int index = 0;
    for (SiteMapURL found : sm.getSiteMapUrls()) {
        validateSiteMapUrl(index++, found);
    }
}
/** This Sitemap contains badly formatted XML and can't be read */
@Test
public void testSitemapParserBrokenXml() {
@ -696,8 +754,8 @@ public class SiteMapParserTest {
* UTF-8 encoded bytes
*/
private byte[] getXMLSitemapAsBytes(StringBuilder scontent) {
scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>") //
.append("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">");
scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
scontent.append("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n");
for (String[] surl : SITEMAP_URLS) {
scontent.append(" <url>\n <loc>").append(surl[0]).append("</loc>\n");
if (surl[1] != null) {