mirror of
https://github.com/crawler-commons/crawler-commons
synced 2024-09-23 17:33:23 +02:00
Merge pull request #307 from sebastian-nagel/cc-305-sitemaps-normalize-urls
Allow to normalize URLs in sitemaps, resolves #305
This commit is contained in:
commit
7a8bbb6ba3
@ -34,6 +34,7 @@ import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.function.Consumer;
|
||||
import java.util.function.Function;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
@ -48,6 +49,7 @@ import org.xml.sax.EntityResolver;
|
||||
import org.xml.sax.InputSource;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
import crawlercommons.filters.URLFilter;
|
||||
import crawlercommons.mimetypes.MimeTypeDetector;
|
||||
import crawlercommons.sitemaps.AbstractSiteMap.SitemapType;
|
||||
import crawlercommons.sitemaps.extension.Extension;
|
||||
@ -100,6 +102,9 @@ public class SiteMapParser {
|
||||
|
||||
private MimeTypeDetector mimeTypeDetector;
|
||||
|
||||
/* Function to normalize or filter URLs. Does nothing by default. */
|
||||
private Function<String, String> urlFilter = (String url) -> url;
|
||||
|
||||
/**
|
||||
* SiteMapParser with strict location validation ({@link #isStrict()}) and not
|
||||
* allowing partially parsed content.
|
||||
@ -217,6 +222,26 @@ public class SiteMapParser {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Set URL filter function to normalize URLs found in sitemaps or filter
|
||||
* URLs away if the function returns null.
|
||||
*/
|
||||
public void setURLFilter(Function<String, String> filter) {
|
||||
urlFilter = filter;
|
||||
}
|
||||
|
||||
/**
|
||||
* Use {@link URLFilter} to filter URLs, e.g. to configure that URLs found in
|
||||
* sitemaps are normalized by
|
||||
* {@link crawlercommons.filters.basic.BasicURLNormalizer}:
|
||||
*
|
||||
* <pre>
|
||||
* sitemapParser.setURLFilter(new BasicURLNormalizer());
|
||||
* </pre>
|
||||
*/
|
||||
public void setURLFilter(URLFilter filter) {
|
||||
urlFilter = filter::filter;
|
||||
}
|
||||
/**
|
||||
* Returns a SiteMap or SiteMapIndex given an online sitemap URL
|
||||
*
|
||||
@ -487,8 +512,13 @@ public class SiteMapParser {
|
||||
if (line.isEmpty()) {
|
||||
continue;
|
||||
}
|
||||
String urlFiltered = urlFilter.apply(line);
|
||||
if (urlFiltered == null) {
|
||||
LOG.debug("Filtered url: [{}]", line.substring(0, Math.min(1024, line.length())));
|
||||
continue;
|
||||
}
|
||||
try {
|
||||
URL url = new URL(line);
|
||||
URL url = new URL(urlFiltered);
|
||||
boolean valid = urlIsValid(textSiteMap.getBaseUrl(), url.toString());
|
||||
if (valid || !strict) {
|
||||
SiteMapURL sUrl = new SiteMapURL(url, valid);
|
||||
@ -571,6 +601,7 @@ public class SiteMapParser {
|
||||
handler.setAcceptedNamespaces(acceptedNamespaces);
|
||||
}
|
||||
handler.setExtensionNamespaces(extensionNamespaces);
|
||||
handler.setURLFilter(urlFilter);
|
||||
|
||||
try {
|
||||
SAXParser saxParser = factory.newSAXParser();
|
||||
|
@ -24,6 +24,8 @@ import org.apache.commons.io.IOUtils;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import crawlercommons.filters.basic.BasicURLNormalizer;
|
||||
|
||||
/**
|
||||
* Sitemap Tool for recursively fetching all URLs from a sitemap (and all of
|
||||
* its children)
|
||||
@ -46,6 +48,9 @@ public class SiteMapTester {
|
||||
LOG.error(" if true sitemaps are required to use the standard namespace URI");
|
||||
LOG.error(" sitemap.extensions");
|
||||
LOG.error(" if true enable sitemap extension parsing");
|
||||
LOG.error(" sitemap.filter.urls");
|
||||
LOG.error(" if true filter and normalize all URLs found in the sitemap");
|
||||
LOG.error(" using crawlercommons.filters.basic.BasicURLNormalizer");
|
||||
} else {
|
||||
URL url = new URL(args[0]);
|
||||
String mt = (args.length > 1) ? args[1] : null;
|
||||
@ -63,14 +68,19 @@ public class SiteMapTester {
|
||||
|
||||
LOG.info("Parsing {} {}", url, ((mt != null && !mt.isEmpty()) ? "as MIME type " + mt : ""));
|
||||
|
||||
boolean strictNamespace = new Boolean(System.getProperty("sitemap.strictNamespace"));
|
||||
boolean strictNamespace = Boolean.getBoolean("sitemap.strictNamespace");
|
||||
saxParser.setStrictNamespace(strictNamespace);
|
||||
|
||||
boolean enableExtensions = new Boolean(System.getProperty("sitemap.extensions"));
|
||||
boolean enableExtensions = Boolean.getBoolean("sitemap.extensions");
|
||||
if (enableExtensions) {
|
||||
saxParser.enableExtensions();
|
||||
}
|
||||
|
||||
boolean enableURLFilter = Boolean.getBoolean("sitemap.filter.urls");
|
||||
if (enableURLFilter) {
|
||||
saxParser.setURLFilter(new BasicURLNormalizer());
|
||||
}
|
||||
|
||||
AbstractSiteMap sm = null;
|
||||
// guesses the mimetype
|
||||
if (mt == null || mt.equals("")) {
|
||||
|
@ -137,7 +137,12 @@ class AtomHandler extends DelegatorHandler {
|
||||
LOG.debug("Missing url");
|
||||
LOG.trace("Can't create an entry with a missing URL");
|
||||
} else {
|
||||
SiteMapURL sUrl = new SiteMapURL(loc.toString(), lastMod, null, null, valid);
|
||||
String urlFiltered = urlFilter.apply(loc.toString());
|
||||
if (urlFiltered == null) {
|
||||
LOG.debug("Filtered URL {}", loc.toString());
|
||||
return;
|
||||
}
|
||||
SiteMapURL sUrl = new SiteMapURL(urlFiltered, lastMod, null, null, valid);
|
||||
sitemap.addSiteMapUrl(sUrl);
|
||||
LOG.debug(" {}. {}", (++i), sUrl);
|
||||
}
|
||||
|
@ -23,6 +23,7 @@ import java.net.URL;
|
||||
import java.util.LinkedList;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.function.Function;
|
||||
|
||||
import org.xml.sax.Attributes;
|
||||
import org.xml.sax.SAXException;
|
||||
@ -49,6 +50,7 @@ public class DelegatorHandler extends DefaultHandler {
|
||||
private Set<String> acceptedNamespaces;
|
||||
protected Map<String, Extension> extensionNamespaces;
|
||||
private StringBuilder characterBuffer = new StringBuilder();
|
||||
protected Function<String, String> urlFilter = (String url) -> url;
|
||||
|
||||
protected DelegatorHandler(LinkedList<String> elementStack, boolean strict) {
|
||||
this.elementStack = elementStack;
|
||||
@ -96,6 +98,10 @@ public class DelegatorHandler extends DefaultHandler {
|
||||
return extensionNamespaces.containsKey(uri);
|
||||
}
|
||||
|
||||
public void setURLFilter(Function<String, String> urlFilter) {
|
||||
this.urlFilter = urlFilter;
|
||||
}
|
||||
|
||||
protected void setException(UnknownFormatException exception) {
|
||||
this.exception = exception;
|
||||
}
|
||||
@ -170,6 +176,7 @@ public class DelegatorHandler extends DefaultHandler {
|
||||
}
|
||||
}
|
||||
delegate.setExtensionNamespaces(extensionNamespaces);
|
||||
delegate.setURLFilter(urlFilter);
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -273,4 +280,5 @@ public class DelegatorHandler extends DefaultHandler {
|
||||
}
|
||||
return charSeq.subSequence(start, end + 1).toString();
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -161,6 +161,12 @@ class RSSHandler extends DelegatorHandler {
|
||||
try {
|
||||
// check that the value is a valid URL
|
||||
locURL = new URL(sitemap.getUrl(), value);
|
||||
String urlFiltered = urlFilter.apply(locURL.toString());
|
||||
if (urlFiltered == null) {
|
||||
LOG.debug("Filtered URL {}", value);
|
||||
return;
|
||||
}
|
||||
locURL = new URL(urlFiltered);
|
||||
} catch (MalformedURLException e) {
|
||||
LOG.debug("Bad url: [{}]", value);
|
||||
LOG.trace("Can't create an entry with a bad URL", e);
|
||||
|
@ -168,10 +168,15 @@ class XMLHandler extends DelegatorHandler {
|
||||
if (value == null || isAllBlank(value)) {
|
||||
return;
|
||||
}
|
||||
String urlFiltered = urlFilter.apply(value);
|
||||
if (urlFiltered == null) {
|
||||
LOG.debug("Filtered URL {}", value);
|
||||
return;
|
||||
}
|
||||
try {
|
||||
// check that the value is a valid URL
|
||||
URL locURL = new URL(value);
|
||||
boolean valid = urlIsValid(sitemap.getBaseUrl(), value);
|
||||
URL locURL = new URL(urlFiltered);
|
||||
boolean valid = urlIsValid(sitemap.getBaseUrl(), locURL.toString());
|
||||
if (valid || !isStrict()) {
|
||||
SiteMapURL sUrl = new SiteMapURL(locURL, valid);
|
||||
sUrl.setLastModified(lastMod);
|
||||
|
@ -132,9 +132,14 @@ class XMLIndexHandler extends DelegatorHandler {
|
||||
return;
|
||||
}
|
||||
String value = stripAllBlank(loc);
|
||||
String urlFiltered = urlFilter.apply(value);
|
||||
if (urlFiltered == null) {
|
||||
LOG.debug("Filtered URL {}", value);
|
||||
return;
|
||||
}
|
||||
try {
|
||||
// check that the value is a valid URL
|
||||
URL locURL = new URL(value);
|
||||
URL locURL = new URL(urlFiltered);
|
||||
SiteMap s = new SiteMap(locURL, lastMod);
|
||||
sitemap.addSitemap(s);
|
||||
LOG.debug(" {}. {}", (i + 1), s);
|
||||
|
@ -16,7 +16,9 @@
|
||||
|
||||
package crawlercommons.sitemaps;
|
||||
|
||||
import crawlercommons.filters.basic.BasicURLNormalizer;
|
||||
import crawlercommons.sitemaps.AbstractSiteMap.SitemapType;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
@ -32,9 +34,11 @@ import java.nio.charset.StandardCharsets;
|
||||
import java.time.ZonedDateTime;
|
||||
import java.time.temporal.ChronoField;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static java.nio.charset.StandardCharsets.UTF_8;
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
@ -201,6 +205,27 @@ public class SiteMapParserTest {
|
||||
assertEquals(2, sm.getSiteMapUrls().size());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSitemapTXTfilterUrls() throws UnknownFormatException, IOException {
|
||||
SiteMapParser parser = new SiteMapParser();
|
||||
parser.setURLFilter(new BasicURLNormalizer());
|
||||
String contentType = "text/plain";
|
||||
String scontent = "www.example.com/catalog?item=1\nhttp://www.example.com/catalog?item=11#anchor";
|
||||
byte[] content = scontent.getBytes(UTF_8);
|
||||
URL url = new URL("http://www.example.com/sitemap.txt");
|
||||
|
||||
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
|
||||
AbstractSiteMapTest.testSerializable(asm);
|
||||
assertEquals(false, asm.isIndex());
|
||||
assertEquals(true, asm instanceof SiteMap);
|
||||
|
||||
SiteMap sm = (SiteMap) asm;
|
||||
assertEquals(2, sm.getSiteMapUrls().size());
|
||||
String[] normUrls = { "http://www.example.com/catalog?item=1", "http://www.example.com/catalog?item=11" };
|
||||
List<String> urls = sm.getSiteMapUrls().stream().map(u -> u.getUrl().toString()).collect(Collectors.toList());
|
||||
assertTrue(urls.containsAll(Arrays.asList(normUrls)));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSitemapXML() throws UnknownFormatException, IOException {
|
||||
SiteMapParser parser = new SiteMapParser();
|
||||
@ -257,6 +282,39 @@ public class SiteMapParserTest {
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSitemapXMLfilterUrls() throws UnknownFormatException, IOException {
|
||||
SiteMapParser parser = new SiteMapParser();
|
||||
parser.setURLFilter(new BasicURLNormalizer());
|
||||
String contentType = "text/xml";
|
||||
StringBuilder scontent = new StringBuilder(1024);
|
||||
getXMLSitemapAsBytes(scontent);
|
||||
|
||||
// remove "http://" for first URL
|
||||
int pos = scontent.indexOf(SITEMAP_URLS[0][0]);
|
||||
scontent.replace(pos, pos + 7, "");
|
||||
|
||||
// append "#anchor" to second URL
|
||||
pos = scontent.indexOf(SITEMAP_URLS[1][0]) + SITEMAP_URLS[1][0].length();
|
||||
scontent.replace(pos, pos, "#anchor"); // empty range: inserts "#anchor", removes nothing
|
||||
|
||||
byte[] content = scontent.toString().getBytes(UTF_8);
|
||||
URL url = new URL("http://www.example.com/sitemap.xml");
|
||||
|
||||
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
|
||||
AbstractSiteMapTest.testSerializable(asm);
|
||||
assertEquals(false, asm.isIndex());
|
||||
assertEquals(true, asm instanceof SiteMap);
|
||||
|
||||
SiteMap sm = (SiteMap) asm;
|
||||
assertEquals(5, sm.getSiteMapUrls().size());
|
||||
|
||||
SiteMapURL[] found = sm.getSiteMapUrls().toArray(new SiteMapURL[5]);
|
||||
for (int i = 0; i < found.length; i++) {
|
||||
validateSiteMapUrl(i, found[i]);
|
||||
}
|
||||
}
|
||||
|
||||
/** This Sitemap contains badly formatted XML and can't be read */
|
||||
@Test
|
||||
public void testSitemapParserBrokenXml() {
|
||||
@ -696,8 +754,8 @@ public class SiteMapParserTest {
|
||||
* UTF-8 encoded bytes
|
||||
*/
|
||||
private byte[] getXMLSitemapAsBytes(StringBuilder scontent) {
|
||||
scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>") //
|
||||
.append("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">");
|
||||
scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
|
||||
scontent.append("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n");
|
||||
for (String[] surl : SITEMAP_URLS) {
|
||||
scontent.append(" <url>\n <loc>").append(surl[0]).append("</loc>\n");
|
||||
if (surl[1] != null) {
|
||||
|
Loading…
Reference in New Issue
Block a user