
Robots parser to always handle absolute sitemap URL even without valid base URL,

fixes #240
Sebastian Nagel 2019-03-26 15:02:43 +01:00
parent ec2ad7c790
commit b449fdf024
4 changed files with 35 additions and 7 deletions
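For illustration, a minimal sketch of the behavior this commit changes (not part of the commit; the robots.txt body, the fetch URL "example.com", the bot name "mybot", and the class name are made-up values, while parseContent and getSitemaps are the parser API used in the tests below): when the URL passed to the parser is not a valid absolute URL, an absolute Sitemap directive is now still reported instead of being dropped with a warning.

    import java.nio.charset.StandardCharsets;

    import crawlercommons.robots.BaseRobotRules;
    import crawlercommons.robots.SimpleRobotRulesParser;

    public class SitemapFromInvalidBaseUrl {
        public static void main(String[] args) {
            String robotsTxt = "Sitemap: https://www.example.com/sitemap.xml";
            SimpleRobotRulesParser parser = new SimpleRobotRulesParser();
            // "example.com" has no protocol and is not a valid java.net.URL;
            // before this commit the sitemap directive was skipped with a warning,
            // afterwards the absolute sitemap URL is still recorded.
            BaseRobotRules rules = parser.parseContent("example.com",
                    robotsTxt.getBytes(StandardCharsets.UTF_8), "text/plain", "mybot");
            System.out.println(rules.getSitemaps()); // [https://www.example.com/sitemap.xml]
        }
    }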

CHANGES.txt

@@ -1,6 +1,7 @@
Crawler-Commons Change Log
Current Development 1.1-SNAPSHOT (yyyy-mm-dd)
+- [Robots] Robots parser to always handle absolute sitemap URL even without valid base URL (pr3mar, kkrugler, sebastian-nagel) #240
Release 1.0 (2019-03-19)
- [Sitemaps] Unit tests depend on system timezone (kkrugler, sebastian-nagel) #238

BaseRobotsParser.java

@@ -38,7 +38,10 @@ public abstract class BaseRobotsParser implements Serializable {
* lower-casing, and the prefix match rule.
*
* @param url
- *            URL that content was fetched from (for reporting purposes)
+ *            URL that robots.txt content was fetched from. A complete and
+ *            valid URL (e.g., https://example.com/robots.txt) is expected.
+ *            Used to resolve relative sitemap URLs and for
+ *            logging/reporting purposes.
* @param content
* raw bytes from the site's robots.txt file
* @param contentType
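As an aside (not part of the diff): a complete, valid URL matters because relative Sitemap directives are resolved against the robots.txt location. A minimal sketch with made-up values, using only java.net.URL:

    import java.net.URL;

    class ResolveRelativeSitemap {
        public static void main(String[] args) throws Exception {
            // A relative directive such as "Sitemap: /sitemap_index.xml" can only
            // be resolved against the URL the robots.txt file was fetched from.
            URL robotsTxtUrl = new URL("https://example.com/robots.txt");
            System.out.println(new URL(robotsTxtUrl, "/sitemap_index.xml"));
            // prints: https://example.com/sitemap_index.xml
        }
    }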

SimpleRobotRulesParser.java

@@ -737,10 +737,21 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
String sitemap = token.getData();
try {
-    URL sitemap_url = new URL(new URL(state.getUrl()), sitemap);
-    String hostname = sitemap_url.getHost();
+    URL sitemapUrl;
+    URL base = null;
+    try {
+        base = new URL(state.getUrl());
+    } catch (MalformedURLException e) {
+        // must try without base URL
+    }
+    if (base != null) {
+        sitemapUrl = new URL(base, sitemap);
+    } else {
+        sitemapUrl = new URL(sitemap);
+    }
+    String hostname = sitemapUrl.getHost();
    if ((hostname != null) && (hostname.length() > 0)) {
-        state.addSitemap(sitemap_url.toExternalForm());
+        state.addSitemap(sitemapUrl.toExternalForm());
    }
} catch (Exception e) {
    reportWarning("Invalid URL with sitemap directive: " + sitemap, state.getUrl());
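A short sketch of the java.net.URL behavior behind this change (values are illustrative): new URL(base, spec) ignores the base when spec is already absolute, but the old one-liner failed earlier, because constructing the base URL from an invalid state.getUrl() value threw before the sitemap URL was ever looked at.

    import java.net.MalformedURLException;
    import java.net.URL;

    class SitemapUrlResolution {
        public static void main(String[] args) throws Exception {
            // An absolute sitemap URL does not need the base at all:
            URL base = new URL("https://example.com/robots.txt");
            System.out.println(new URL(base, "https://www.example.com/sitemap.xml"));
            // prints: https://www.example.com/sitemap.xml

            // With an invalid base (e.g., a bare host name) the old code threw here,
            // before the absolute sitemap URL was considered:
            try {
                new URL(new URL("example.com"), "https://www.example.com/sitemap.xml");
            } catch (MalformedURLException e) {
                System.out.println(e.getMessage()); // no protocol: example.com
            }
        }
    }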

SimpleRobotRulesParserTest.java

@@ -29,8 +29,6 @@ import java.util.Locale;
import javax.servlet.http.HttpServletResponse;
-import junit.framework.Assert;
import org.junit.Test;
public class SimpleRobotRulesParserTest {
@@ -154,7 +152,7 @@ public class SimpleRobotRulesParserTest {
        + "#disallow: /test" + LF + LF + "#user-agent: someAgent" + LF + LF + "#disallow: /index.html" + LF + "#disallow: /test" + LF + LF;
BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes(UTF_8));
-Assert.assertTrue(rules.isAllowed("http://www.domain.com/anypage.html"));
+assertTrue(rules.isAllowed("http://www.domain.com/anypage.html"));
}
@Test
@@ -727,6 +725,21 @@ public class SimpleRobotRulesParserTest {
    assertEquals("Found sitemap", 1, rules.getSitemaps().size());
}

+@Test
+public void testSitemapInvalidBaseUrl() throws Exception {
+    // test https://github.com/crawler-commons/crawler-commons/issues/240
+    // - should handle absolute sitemap URL even if base URL isn't valid
+    final String simpleRobotsTxt = "Sitemap: https://www.example.com/sitemap.xml";
+    SimpleRobotRulesParser robotParser = new SimpleRobotRulesParser();
+    BaseRobotRules rules = robotParser.parseContent("example.com", simpleRobotsTxt.getBytes(UTF_8), "text/plain", "a");
+    assertEquals(1, rules.getSitemaps().size());
+    assertEquals("https://www.example.com/sitemap.xml", rules.getSitemaps().get(0));
assertEquals("Found sitemap", 1, rules.getSitemaps().size());
+}
@Test
public void testManyUserAgents() throws Exception {
BaseRobotRules rules = createRobotRules("wget", readFile("/robots/many-user-agents.txt"));