From b449fdf0248b813958837b2003e05afc31e12c4f Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Tue, 26 Mar 2019 15:02:43 +0100
Subject: [PATCH] Robots parser to always handle absolute sitemap URL even
 without valid base URL, fixes #240

---
 CHANGES.txt                                   |  1 +
 .../robots/BaseRobotsParser.java              |  5 ++++-
 .../robots/SimpleRobotRulesParser.java        | 17 ++++++++++++++---
 .../robots/SimpleRobotRulesParserTest.java    | 19 ++++++++++++++++---
 4 files changed, 35 insertions(+), 7 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 77e9a91..cdcbd61 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,6 +1,7 @@
 Crawler-Commons Change Log
 
 Current Development 1.1-SNAPSHOT (yyyy-mm-dd)
+- [Robots] Robots parser to always handle absolute sitemap URL even without valid base URL (pr3mar, kkrugler, sebastian-nagel) #240
 
 Release 1.0 (2019-03-19)
 - [Sitemaps] Unit tests depend on system timezone (kkrugler, sebastian-nagel) #238
diff --git a/src/main/java/crawlercommons/robots/BaseRobotsParser.java b/src/main/java/crawlercommons/robots/BaseRobotsParser.java
index d5f6377..1e26691 100644
--- a/src/main/java/crawlercommons/robots/BaseRobotsParser.java
+++ b/src/main/java/crawlercommons/robots/BaseRobotsParser.java
@@ -38,7 +38,10 @@ public abstract class BaseRobotsParser implements Serializable {
      * lower-casing, and the prefix match rule.
      * 
      * @param url
-     *            URL that content was fetched from (for reporting purposes)
+     *            URL that robots.txt content was fetched from. A complete and
+     *            valid URL (e.g., https://example.com/robots.txt) is expected.
+     *            Used to resolve relative sitemap URLs and for
+     *            logging/reporting purposes.
      * @param content
      *            raw bytes from the site's robots.txt file
      * @param contentType
diff --git a/src/main/java/crawlercommons/robots/SimpleRobotRulesParser.java b/src/main/java/crawlercommons/robots/SimpleRobotRulesParser.java
index 4038caf..2375ddc 100644
--- a/src/main/java/crawlercommons/robots/SimpleRobotRulesParser.java
+++ b/src/main/java/crawlercommons/robots/SimpleRobotRulesParser.java
@@ -737,10 +737,21 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
 
         String sitemap = token.getData();
         try {
-            URL sitemap_url = new URL(new URL(state.getUrl()), sitemap);
-            String hostname = sitemap_url.getHost();
+            URL sitemapUrl;
+            URL base = null;
+            try {
+                base = new URL(state.getUrl());
+            } catch (MalformedURLException e) {
+                // must try without base URL
+            }
+            if (base != null) {
+                sitemapUrl = new URL(base, sitemap);
+            } else {
+                sitemapUrl = new URL(sitemap);
+            }
+            String hostname = sitemapUrl.getHost();
             if ((hostname != null) && (hostname.length() > 0)) {
-                state.addSitemap(sitemap_url.toExternalForm());
+                state.addSitemap(sitemapUrl.toExternalForm());
             }
         } catch (Exception e) {
             reportWarning("Invalid URL with sitemap directive: " + sitemap, state.getUrl());
diff --git a/src/test/java/crawlercommons/robots/SimpleRobotRulesParserTest.java b/src/test/java/crawlercommons/robots/SimpleRobotRulesParserTest.java
index ecf290d..e08f023 100644
--- a/src/test/java/crawlercommons/robots/SimpleRobotRulesParserTest.java
+++ b/src/test/java/crawlercommons/robots/SimpleRobotRulesParserTest.java
@@ -29,8 +29,6 @@ import java.util.Locale;
 
 import javax.servlet.http.HttpServletResponse;
 
-import junit.framework.Assert;
-
 import org.junit.Test;
 
 public class SimpleRobotRulesParserTest {
@@ -154,7 +152,7 @@ public class SimpleRobotRulesParserTest {
                         + "#disallow: /test" + LF + LF + "#user-agent: someAgent" + LF + LF + "#disallow: /index.html" + LF + "#disallow: /test" + LF + LF;
 
         BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes(UTF_8));
-        Assert.assertTrue(rules.isAllowed("http://www.domain.com/anypage.html"));
+        assertTrue(rules.isAllowed("http://www.domain.com/anypage.html"));
     }
 
     @Test
@@ -727,6 +725,21 @@ public class SimpleRobotRulesParserTest {
         assertEquals("Found sitemap", 1, rules.getSitemaps().size());
     }
 
+    @Test
+    public void testSitemapInvalidBaseUrl() throws Exception {
+        // test https://github.com/crawler-commons/crawler-commons/issues/240
+        // - should handle absolute sitemap URL even if base URL isn't valid
+
+        final String simpleRobotsTxt = "Sitemap: https://www.example.com/sitemap.xml";
+
+        SimpleRobotRulesParser robotParser = new SimpleRobotRulesParser();
+        BaseRobotRules rules = robotParser.parseContent("example.com", simpleRobotsTxt.getBytes(UTF_8), "text/plain", "a");
+
+        assertEquals(1, rules.getSitemaps().size());
+        assertEquals("https://www.example.com/sitemap.xml", rules.getSitemaps().get(0));
+        assertEquals("Found sitemap", 1, rules.getSitemaps().size());
+    }
+
     @Test
     public void testManyUserAgents() throws Exception {
         BaseRobotRules rules = createRobotRules("wget", readFile("/robots/many-user-agents.txt"));
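
For reference, a minimal standalone sketch of the fall-back resolution that the SimpleRobotRulesParser hunk above introduces, using only java.net.URL. The class and method names here are illustrative, not part of the patch: it shows how an absolute sitemap URL is still accepted when the robots.txt URL (e.g. the schemeless "example.com" used in the new test) cannot be parsed as a base URL, while a relative sitemap URL is still resolved against a valid base.

import java.net.MalformedURLException;
import java.net.URL;

public class SitemapResolutionSketch {

    // Mirrors the patched sitemap handling: resolve against the robots.txt
    // URL when it is a valid base, otherwise try the sitemap directive on
    // its own (works for absolute sitemap URLs).
    static String resolveSitemap(String robotsTxtUrl, String sitemap) throws MalformedURLException {
        URL base = null;
        try {
            base = new URL(robotsTxtUrl);
        } catch (MalformedURLException e) {
            // invalid base URL, e.g. "example.com" without a scheme
        }
        URL sitemapUrl = (base != null) ? new URL(base, sitemap) : new URL(sitemap);
        return sitemapUrl.toExternalForm();
    }

    public static void main(String[] args) throws MalformedURLException {
        // relative sitemap, valid base: resolved against the robots.txt URL
        System.out.println(resolveSitemap("https://example.com/robots.txt", "/sitemap.xml"));
        // absolute sitemap, invalid base: still handled (the #240 fix)
        System.out.println(resolveSitemap("example.com", "https://www.example.com/sitemap.xml"));
    }
}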