
Merge pull request #401 from sebastian-nagel/cc-389-allow-disallow-unicode-paths

[Robots.txt] Handle allow/disallow directives containing unescaped Unicode characters
Sebastian Nagel 2023-05-11 16:19:23 +02:00 committed by GitHub
commit 79bef97d40
Signed by: GitHub
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 66 additions and 16 deletions
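
In short: RFC 9309 path matching operates on the percent-encoded form of the URL path, so a directive written with raw Unicode characters (Disallow: /bücher/) must match the same requests as its escaped spelling (Disallow: /b%C3%BCcher/). As a rough illustration of that normalization, here is a minimal standalone sketch in plain Java (not the parser's actual code; toASCIIString() percent-encodes non-ASCII characters as UTF-8):

import java.net.URI;
import java.net.URISyntaxException;

public class PathEscapeDemo {
    // Percent-encode non-ASCII characters in a URL path so that the
    // escaped and unescaped spellings compare equal.
    static String escapePath(String path) throws URISyntaxException {
        // The multi-argument URI constructor accepts raw Unicode in the
        // path; toASCIIString() then emits UTF-8 percent-escapes.
        return new URI(null, null, path, null).toASCIIString();
    }

    public static void main(String[] args) throws URISyntaxException {
        System.out.println(escapePath("/bücher/"));  // -> /b%C3%BCcher/
        System.out.println(escapePath("/könyvek/")); // -> /k%C3%B6nyvek/
    }
}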

src/main/java/crawlercommons/robots/SimpleRobotRulesParser.java

@@ -44,7 +44,9 @@ import crawlercommons.robots.SimpleRobotRules.RobotRulesMode;
* <p>
* This implementation of {@link BaseRobotsParser} retrieves a set of
* {@link SimpleRobotRules rules} for an agent with the given name from the
- * <code>robots.txt</code> file of a given domain.
+ * <code>robots.txt</code> file of a given domain. The implementation follows
+ * <a href="https://www.rfc-editor.org/rfc/rfc9309.html#name-access-method">RFC
+ * 9309</a>.
* </p>
*
* <p>
@@ -492,7 +494,17 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
int bytesLen = content.length;
int offset = 0;
-        Charset encoding = StandardCharsets.US_ASCII;
+        /*
+         * RFC 9309 requires that the robots.txt file is "UTF-8 encoded" (<a href=
+         * "https://www.rfc-editor.org/rfc/rfc9309.html#name-access-method"> RFC
+         * 9309, section 2.3 Access Method</a>), but
+         * "Implementors MAY bridge encoding mismatches if they detect that the robots.txt file is not UTF-8 encoded."
+         * (<a href=
+         * "https://www.rfc-editor.org/rfc/rfc9309.html#name-the-allow-and-disallow-line"
+         * > RFC 9309, section 2.2.2. The "Allow" and "Disallow" Lines</a>)
+         */
+        Charset encoding = StandardCharsets.UTF_8;
// Check for a UTF-8 BOM at the beginning (EF BB BF)
if ((bytesLen >= 3) && (content[0] == (byte) 0xEF) && (content[1] == (byte) 0xBB) && (content[2] == (byte) 0xBF)) {
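
A note on the error handling implied here: decoding must not abort on bytes that are not valid UTF-8, otherwise one bad character could invalidate the whole file. A minimal sketch of such a tolerant decoder (an assumed standalone helper, not this class's actual code) replaces malformed input with U+FFFD, so only the rules containing it fail to match, which is exactly what the new unit test below verifies:

import java.nio.ByteBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;

public class TolerantDecoding {
    static String decodeRobotsTxt(byte[] content) {
        int offset = 0;
        // Skip a UTF-8 byte order mark (EF BB BF) if present.
        if (content.length >= 3 && content[0] == (byte) 0xEF
                && content[1] == (byte) 0xBB && content[2] == (byte) 0xBF) {
            offset = 3;
        }
        // REPLACE substitutes U+FFFD instead of throwing on bad input.
        CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder()
                .onMalformedInput(CodingErrorAction.REPLACE)
                .onUnmappableCharacter(CodingErrorAction.REPLACE);
        try {
            return decoder.decode(
                    ByteBuffer.wrap(content, offset, content.length - offset)).toString();
        } catch (CharacterCodingException e) {
            return ""; // unreachable with REPLACE, but decode() declares it
        }
    }
}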
@@ -519,11 +531,11 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
// Decide if we need to do special HTML processing.
boolean isHtmlType = ((contentType != null) && contentType.toLowerCase(Locale.ROOT).startsWith("text/html"));
-        // If it looks like it contains HTML, but doesn't have a user agent
-        // field, then
-        // assume somebody messed up and returned back to us a random HTML page
-        // instead
-        // of a robots.txt file.
+        /*
+         * If it looks like it contains HTML, but doesn't have a user agent
+         * field, then assume somebody messed up and returned back to us a
+         * random HTML page instead of a robots.txt file.
+         */
boolean hasHTML = false;
if (isHtmlType || SIMPLE_HTML_PATTERN.matcher(contentAsStr).find()) {
if (!USER_AGENT_PATTERN.matcher(contentAsStr).find()) {
@@ -550,12 +562,12 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
while (lineParser.hasMoreTokens()) {
String line = lineParser.nextToken();
-            // Get rid of HTML markup, in case some brain-dead webmaster has
-            // created an HTML
-            // page for robots.txt. We could do more sophisticated processing
-            // here to better
-            // handle bad HTML, but that's a very tiny percentage of all
-            // robots.txt files.
+            /*
+             * Get rid of HTML markup, in case some brain-dead webmaster has
+             * created an HTML page for robots.txt. We could do more
+             * sophisticated processing here to better handle bad HTML, but
+             * that's a very tiny percentage of all robots.txt files.
+             */
if (hasHTML) {
line = line.replaceAll("<[^>]+>", "");
}
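
As a quick illustration of that regex on a hypothetical input line: applying replaceAll("<[^>]+>", "") to <p>Disallow: /private/</p> strips the markup and leaves Disallow: /private/ for the directive parser. Tags that are never closed do not match the pattern and are left alone, the accepted trade-off described in the comment above.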
@@ -855,9 +867,8 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
double delayValue = Double.parseDouble(delayString) * 1000.0;
state.setCrawlDelay(Math.round(delayValue));
} else {
-                long delayValue = Integer.parseInt(delayString) * 1000L; // sec
-                // to
-                // millisec
+                // seconds to milliseconds
+                long delayValue = Integer.parseInt(delayString) * 1000L;
state.setCrawlDelay(delayValue);
}
} catch (Exception e) {
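
Worked examples of the two branches: Crawl-delay: 2.5 contains a dot, is parsed as a double, and Math.round(2.5 * 1000.0) stores 2500 ms; Crawl-delay: 3 is parsed as an int, and the long multiplication (3 * 1000L) stores 3000 ms while avoiding int overflow for very large delay values.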

src/test/java/crawlercommons/robots/SimpleRobotRulesParserTest.java

@@ -22,6 +22,7 @@ import org.junit.jupiter.params.provider.CsvSource;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
import java.util.Collection;
import java.util.List;
@@ -226,6 +227,44 @@ public class SimpleRobotRulesParserTest {
assertTrue(rules.isAllowed("http://www.domain.com/anypage.html"));
}
+    @Test
+    void testUnicodeUnescapedPaths() {
+        final String simpleRobotsTxt = "User-agent: *" + CRLF //
+                + "Disallow: /bücher/" + CRLF //
+                + "Disallow: /k%C3%B6nyvek/" + CRLF //
+                + CRLF //
+                + "User-agent: GoodBot" + CRLF //
+                + "Allow: /";
+        BaseRobotRules rules = createRobotRules("mybot", simpleRobotsTxt);
+        assertTrue(rules.isAllowed("https://www.example.com/"));
+        // test using escaped and unescaped URLs
+        assertFalse(rules.isAllowed("https://www.example.com/b%C3%BCcher/book1.html"));
+        assertFalse(rules.isAllowed("https://www.example.com/bücher/book2.html"));
+        // (for completeness) check also escaped path in robots.txt
+        assertFalse(rules.isAllowed("https://www.example.com/k%C3%B6nyvek/book1.html"));
+        assertFalse(rules.isAllowed("https://www.example.com/könyvek/book2.html"));
+        // test invalid encoding: invalid encoded characters should not break
+        // parsing of rules below
+        rules = createRobotRules("goodbot", simpleRobotsTxt.getBytes(StandardCharsets.ISO_8859_1));
+        assertTrue(rules.isAllowed("https://www.example.com/"));
+        assertTrue(rules.isAllowed("https://www.example.com/b%C3%BCcher/book1.html"));
+        // test invalid encoding: only rules with invalid characters should be
+        // ignored
+        rules = createRobotRules("mybot", simpleRobotsTxt.getBytes(StandardCharsets.ISO_8859_1));
+        assertTrue(rules.isAllowed("https://www.example.com/"));
+        assertFalse(rules.isAllowed("https://www.example.com/k%C3%B6nyvek/book1.html"));
+        assertFalse(rules.isAllowed("https://www.example.com/könyvek/book2.html"));
+        // if URL paths in disallow rules are not properly encoded, these two
+        // URLs are not matched:
+        // assertFalse(rules.isAllowed("https://www.example.com/b%C3%BCcher/book2.html"));
+        // assertFalse(rules.isAllowed("https://www.example.com/bücher/book1.html"));
+    }
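
For readers new to the API: createRobotRules is a local helper of this test class. Outside the tests, the equivalent call presumably goes through SimpleRobotRulesParser directly, along these lines (URL and agent name invented for the example):

import java.nio.charset.StandardCharsets;

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

public class RobotsUsageDemo {
    public static void main(String[] args) {
        byte[] robotsTxt = "User-agent: *\r\nDisallow: /bücher/\r\n"
                .getBytes(StandardCharsets.UTF_8);
        BaseRobotRules rules = new SimpleRobotRulesParser().parseContent(
                "https://www.example.com/robots.txt", // where the file was fetched from
                robotsTxt,                            // raw response body
                "text/plain",                         // Content-Type reported by the server
                "mybot");                             // our crawler's user agent token
        // With this change, both spellings of the path are disallowed:
        System.out.println(rules.isAllowed("https://www.example.com/b%C3%BCcher/a.html"));
        System.out.println(rules.isAllowed("https://www.example.com/bücher/b.html"));
    }
}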
@Test
void testSimplestAllowAll() {
final String simpleRobotsTxt = "User-agent: *" + CRLF //