diff --git a/src/main/java/crawlercommons/sitemaps/SiteMapParser.java b/src/main/java/crawlercommons/sitemaps/SiteMapParser.java index 988ee33..aea86df 100644 --- a/src/main/java/crawlercommons/sitemaps/SiteMapParser.java +++ b/src/main/java/crawlercommons/sitemaps/SiteMapParser.java @@ -34,6 +34,7 @@ import java.util.Locale; import java.util.Map; import java.util.Set; import java.util.function.Consumer; +import java.util.function.Function; import java.util.zip.GZIPInputStream; import javax.xml.parsers.ParserConfigurationException; @@ -48,6 +49,7 @@ import org.xml.sax.EntityResolver; import org.xml.sax.InputSource; import org.xml.sax.SAXException; +import crawlercommons.filters.URLFilter; import crawlercommons.mimetypes.MimeTypeDetector; import crawlercommons.sitemaps.AbstractSiteMap.SitemapType; import crawlercommons.sitemaps.extension.Extension; @@ -100,6 +102,9 @@ public class SiteMapParser { private MimeTypeDetector mimeTypeDetector; + /* Function to normalize or filter URLs. Does nothing by default. */ + private Function<String, String> urlFilter = (String url) -> url; + /** * SiteMapParser with strict location validation ({@link #isStrict()}) and not * allowing partially parsed content. */ @@ -217,6 +222,26 @@ public class SiteMapParser { } } + /** + * Set URL filter function to normalize URLs found in sitemaps or filter + * URLs away if the function returns null. + */ + public void setURLFilter(Function<String, String> filter) { + urlFilter = filter; + } + + /** + * Use {@link URLFilter} to filter URLs, e.g. to configure that URLs found in + * sitemaps are normalized by + * {@link crawlercommons.filters.basic.BasicURLNormalizer}: + * + * <pre>
+     * sitemapParser.setURLFilter(new BasicURLNormalizer());
+     * </pre>
+ */ + public void setURLFilter(URLFilter filter) { + urlFilter = filter::filter; + } /** * Returns a SiteMap or SiteMapIndex given an online sitemap URL * @@ -487,8 +512,13 @@ public class SiteMapParser { if (line.isEmpty()) { continue; } + String urlFiltered = urlFilter.apply(line); + if (urlFiltered == null) { + LOG.debug("Filtered url: [{}]", line.substring(0, Math.min(1024, line.length()))); + continue; + } try { - URL url = new URL(line); + URL url = new URL(urlFiltered); boolean valid = urlIsValid(textSiteMap.getBaseUrl(), url.toString()); if (valid || !strict) { SiteMapURL sUrl = new SiteMapURL(url, valid); @@ -571,6 +601,7 @@ public class SiteMapParser { handler.setAcceptedNamespaces(acceptedNamespaces); } handler.setExtensionNamespaces(extensionNamespaces); + handler.setURLFilter(urlFilter); try { SAXParser saxParser = factory.newSAXParser(); diff --git a/src/main/java/crawlercommons/sitemaps/SiteMapTester.java b/src/main/java/crawlercommons/sitemaps/SiteMapTester.java index a3f8794..fa475da 100644 --- a/src/main/java/crawlercommons/sitemaps/SiteMapTester.java +++ b/src/main/java/crawlercommons/sitemaps/SiteMapTester.java @@ -24,6 +24,8 @@ import org.apache.commons.io.IOUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import crawlercommons.filters.basic.BasicURLNormalizer; + /** * Sitemap Tool for recursively fetching all URL's from a sitemap (and all of * it's children) @@ -46,6 +48,9 @@ public class SiteMapTester { LOG.error(" if true sitemaps are required to use the standard namespace URI"); LOG.error(" sitemap.extensions"); LOG.error(" if true enable sitemap extension parsing"); + LOG.error(" sitemap.filter.urls"); + LOG.error(" if true filter and normalize all URLs found in the sitemap"); + LOG.error(" using crawlercommons.filters.basic.BasicURLNormalizer"); } else { URL url = new URL(args[0]); String mt = (args.length > 1) ? 
args[1] : null; @@ -63,14 +68,19 @@ public class SiteMapTester { LOG.info("Parsing {} {}", url, ((mt != null && !mt.isEmpty()) ? "as MIME type " + mt : "")); - boolean strictNamespace = new Boolean(System.getProperty("sitemap.strictNamespace")); + boolean strictNamespace = Boolean.getBoolean("sitemap.strictNamespace"); saxParser.setStrictNamespace(strictNamespace); - boolean enableExtensions = new Boolean(System.getProperty("sitemap.extensions")); + boolean enableExtensions = Boolean.getBoolean("sitemap.extensions"); if (enableExtensions) { saxParser.enableExtensions(); } + boolean enableURLFilter = Boolean.getBoolean("sitemap.filter.urls"); + if (enableURLFilter) { + saxParser.setURLFilter(new BasicURLNormalizer()); + } + AbstractSiteMap sm = null; // guesses the mimetype if (mt == null || mt.equals("")) { diff --git a/src/main/java/crawlercommons/sitemaps/sax/AtomHandler.java b/src/main/java/crawlercommons/sitemaps/sax/AtomHandler.java index ddaaab7..a48c54e 100644 --- a/src/main/java/crawlercommons/sitemaps/sax/AtomHandler.java +++ b/src/main/java/crawlercommons/sitemaps/sax/AtomHandler.java @@ -137,7 +137,12 @@ class AtomHandler extends DelegatorHandler { LOG.debug("Missing url"); LOG.trace("Can't create an entry with a missing URL"); } else { - SiteMapURL sUrl = new SiteMapURL(loc.toString(), lastMod, null, null, valid); + String urlFiltered = urlFilter.apply(loc.toString()); + if (urlFiltered == null) { + LOG.debug("Filtered URL {}", loc.toString()); + return; + } + SiteMapURL sUrl = new SiteMapURL(urlFiltered, lastMod, null, null, valid); sitemap.addSiteMapUrl(sUrl); LOG.debug(" {}. 
{}", (++i), sUrl); } diff --git a/src/main/java/crawlercommons/sitemaps/sax/DelegatorHandler.java b/src/main/java/crawlercommons/sitemaps/sax/DelegatorHandler.java index fb454e6..360dd01 100644 --- a/src/main/java/crawlercommons/sitemaps/sax/DelegatorHandler.java +++ b/src/main/java/crawlercommons/sitemaps/sax/DelegatorHandler.java @@ -23,6 +23,7 @@ import java.net.URL; import java.util.LinkedList; import java.util.Map; import java.util.Set; +import java.util.function.Function; import org.xml.sax.Attributes; import org.xml.sax.SAXException; @@ -49,6 +50,7 @@ public class DelegatorHandler extends DefaultHandler { private Set<String> acceptedNamespaces; protected Map<String, Extension> extensionNamespaces; private StringBuilder characterBuffer = new StringBuilder(); + protected Function<String, String> urlFilter = (String url) -> url; protected DelegatorHandler(LinkedList<String> elementStack, boolean strict) { this.elementStack = elementStack; @@ -96,6 +98,10 @@ public class DelegatorHandler extends DefaultHandler { return extensionNamespaces.containsKey(uri); } + public void setURLFilter(Function<String, String> urlFilter) { + this.urlFilter = urlFilter; + } + protected void setException(UnknownFormatException exception) { this.exception = exception; } @@ -170,6 +176,7 @@ public class DelegatorHandler extends DefaultHandler { } } delegate.setExtensionNamespaces(extensionNamespaces); + delegate.setURLFilter(urlFilter); } @Override @@ -273,4 +280,5 @@ public class DelegatorHandler extends DefaultHandler { } return charSeq.subSequence(start, end + 1).toString(); } + } diff --git a/src/main/java/crawlercommons/sitemaps/sax/RSSHandler.java b/src/main/java/crawlercommons/sitemaps/sax/RSSHandler.java index 9ca768f..b1e96ac 100644 --- a/src/main/java/crawlercommons/sitemaps/sax/RSSHandler.java +++ b/src/main/java/crawlercommons/sitemaps/sax/RSSHandler.java @@ -161,6 +161,12 @@ class RSSHandler extends DelegatorHandler { try { // check that the value is a valid URL locURL = new URL(sitemap.getUrl(), value); + String urlFiltered = 
urlFilter.apply(locURL.toString()); + if (urlFiltered == null) { + LOG.debug("Filtered URL {}", value); + return; + } + locURL = new URL(urlFiltered); } catch (MalformedURLException e) { LOG.debug("Bad url: [{}]", value); LOG.trace("Can't create an entry with a bad URL", e); diff --git a/src/main/java/crawlercommons/sitemaps/sax/XMLHandler.java b/src/main/java/crawlercommons/sitemaps/sax/XMLHandler.java index 70e3228..cadc21f 100644 --- a/src/main/java/crawlercommons/sitemaps/sax/XMLHandler.java +++ b/src/main/java/crawlercommons/sitemaps/sax/XMLHandler.java @@ -168,10 +168,15 @@ class XMLHandler extends DelegatorHandler { if (value == null || isAllBlank(value)) { return; } + String urlFiltered = urlFilter.apply(value); + if (urlFiltered == null) { + LOG.debug("Filtered URL {}", value); + return; + } try { // check that the value is a valid URL - URL locURL = new URL(value); - boolean valid = urlIsValid(sitemap.getBaseUrl(), value); + URL locURL = new URL(urlFiltered); + boolean valid = urlIsValid(sitemap.getBaseUrl(), locURL.toString()); if (valid || !isStrict()) { SiteMapURL sUrl = new SiteMapURL(locURL, valid); sUrl.setLastModified(lastMod); diff --git a/src/main/java/crawlercommons/sitemaps/sax/XMLIndexHandler.java b/src/main/java/crawlercommons/sitemaps/sax/XMLIndexHandler.java index 8de44bf..9f3ee12 100644 --- a/src/main/java/crawlercommons/sitemaps/sax/XMLIndexHandler.java +++ b/src/main/java/crawlercommons/sitemaps/sax/XMLIndexHandler.java @@ -132,9 +132,14 @@ class XMLIndexHandler extends DelegatorHandler { return; } String value = stripAllBlank(loc); + String urlFiltered = urlFilter.apply(value); + if (urlFiltered == null) { + LOG.debug("Filtered URL {}", value); + return; + } try { // check that the value is a valid URL - URL locURL = new URL(value); + URL locURL = new URL(urlFiltered); SiteMap s = new SiteMap(locURL, lastMod); sitemap.addSitemap(s); LOG.debug(" {}. 
{}", (i + 1), s); } diff --git a/src/test/java/crawlercommons/sitemaps/SiteMapParserTest.java b/src/test/java/crawlercommons/sitemaps/SiteMapParserTest.java index 660df70..b726a5f 100644 --- a/src/test/java/crawlercommons/sitemaps/SiteMapParserTest.java +++ b/src/test/java/crawlercommons/sitemaps/SiteMapParserTest.java @@ -16,7 +16,9 @@ package crawlercommons.sitemaps; +import crawlercommons.filters.basic.BasicURLNormalizer; import crawlercommons.sitemaps.AbstractSiteMap.SitemapType; + import org.apache.commons.io.IOUtils; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Assertions; @@ -32,9 +34,11 @@ import java.nio.charset.StandardCharsets; import java.time.ZonedDateTime; import java.time.temporal.ChronoField; import java.util.ArrayList; +import java.util.Arrays; import java.util.Iterator; import java.util.List; import java.util.Locale; +import java.util.stream.Collectors; import static java.nio.charset.StandardCharsets.UTF_8; import static org.junit.jupiter.api.Assertions.*; @@ -201,6 +205,27 @@ public class SiteMapParserTest { assertEquals(2, sm.getSiteMapUrls().size()); } + @Test + public void testSitemapTXTfilterUrls() throws UnknownFormatException, IOException { + SiteMapParser parser = new SiteMapParser(); + parser.setURLFilter(new BasicURLNormalizer()); + String contentType = "text/plain"; + String scontent = "www.example.com/catalog?item=1\nhttp://www.example.com/catalog?item=11#anchor"; + byte[] content = scontent.getBytes(UTF_8); + URL url = new URL("http://www.example.com/sitemap.txt"); + + AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url); + AbstractSiteMapTest.testSerializable(asm); + assertEquals(false, asm.isIndex()); + assertEquals(true, asm instanceof SiteMap); + + SiteMap sm = (SiteMap) asm; + assertEquals(2, sm.getSiteMapUrls().size()); + String[] normUrls = { "http://www.example.com/catalog?item=1", "http://www.example.com/catalog?item=11" }; + List<String> urls = sm.getSiteMapUrls().stream().map(u -> 
u.getUrl().toString()).collect(Collectors.toList()); + assertTrue(urls.containsAll(Arrays.asList(normUrls))); + } + @Test public void testSitemapXML() throws UnknownFormatException, IOException { SiteMapParser parser = new SiteMapParser(); @@ -257,6 +282,39 @@ } } + @Test + public void testSitemapXMLfilterUrls() throws UnknownFormatException, IOException { + SiteMapParser parser = new SiteMapParser(); + parser.setURLFilter(new BasicURLNormalizer()); + String contentType = "text/xml"; + StringBuilder scontent = new StringBuilder(1024); + getXMLSitemapAsBytes(scontent); + + // remove "http://" for first URL + int pos = scontent.indexOf(SITEMAP_URLS[0][0]); + scontent.replace(pos, pos + 7, ""); + + // append "#anchor" to second URL + pos = scontent.indexOf(SITEMAP_URLS[1][0]) + SITEMAP_URLS[1][0].length(); + scontent.replace(pos, pos, "#anchor"); // insert "#anchor" + byte[] content = scontent.toString().getBytes(UTF_8); + URL url = new URL("http://www.example.com/sitemap.xml"); + + AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url); + AbstractSiteMapTest.testSerializable(asm); + assertEquals(false, asm.isIndex()); + assertEquals(true, asm instanceof SiteMap); + + SiteMap sm = (SiteMap) asm; + assertEquals(5, sm.getSiteMapUrls().size()); + + SiteMapURL[] found = sm.getSiteMapUrls().toArray(new SiteMapURL[5]); + for (int i = 0; i < found.length; i++) { + validateSiteMapUrl(i, found[i]); + } + } + /** This Sitemap contains badly formatted XML and can't be read */ @Test public void testSitemapParserBrokenXml() { @@ -696,8 +754,8 @@ * UTF-8 encoded bytes */ private byte[] getXMLSitemapAsBytes(StringBuilder scontent) { - scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>") // - .append("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">"); + scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"); + scontent.append("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n"); for (String[] surl : SITEMAP_URLS) { scontent.append("  <url>\n    <loc>").append(surl[0]).append("</loc>\n"); if (surl[1] != null) {