Merge pull request #408 from sebastian-nagel/cc-195-robotstxt-url-decode

[Robots.txt] Path analyse bug with url-decode if allow/disallow path contains escaped wild-card characters
2024-05-04 14:36:04 +02:00 · 2023-05-23 15:17:43 +02:00 · 2023-05-23 15:17:43 +02:00 · 962787f4fd
parent 8bb1694669 5d036a1963
commit 962787f4fd
5 changed files with 108 additions and 14 deletions
--- a/src/main/java/crawlercommons/filters/basic/BasicURLNormalizer.java
+++ b/src/main/java/crawlercommons/filters/basic/BasicURLNormalizer.java
@ -540,7 +540,11 @@ public class BasicURLNormalizer extends URLFilter {
     * characters which should be escaped according to <a
     * href="https://tools.ietf.org/html/rfc3986#section-2.2">RFC3986</a>.
     */
-    private static String escapePath(String path) {
+    public static String escapePath(String path) {
+        return escapePath(path, null);
+    }
+
+    public static String escapePath(String path, boolean[] extraEscapedBytes) {
        StringBuilder sb = new StringBuilder(path.length());

        // Traverse over all bytes in this URL
@ -548,7 +552,7 @@ public class BasicURLNormalizer extends URLFilter {
        for (int i = 0; i < bytes.length; i++) {
            byte b = bytes[i];
            // Is this a control character?
-            if (b < 0 || escapedCharacters[b]) {
+            if (b < 0 || escapedCharacters[b] || (extraEscapedBytes != null && extraEscapedBytes[b])) {
                // Start escape sequence
                sb.append('%');

--- a/src/main/java/crawlercommons/robots/SimpleRobotRules.java
+++ b/src/main/java/crawlercommons/robots/SimpleRobotRules.java
@ -18,11 +18,12 @@ package crawlercommons.robots;

 import java.io.Serializable;
 import java.net.URL;
-import java.net.URLDecoder;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;

+import crawlercommons.filters.basic.BasicURLNormalizer;
+
 /**
 * Result from parsing a single robots.txt file - which means we get a set of
 * rules, and an optional crawl-delay, and an optional sitemap URL. Note that we
@ -121,6 +122,13 @@ public class SimpleRobotRules extends BaseRobotRules {
    protected ArrayList<RobotRule> _rules;
    protected RobotRulesMode _mode;

+    /** Special characters which require percent-encoding for path matching */
+    protected final static boolean[] specialCharactersPathMatching = new boolean[128];
+    static {
+        specialCharactersPathMatching['*'] = true;
+        specialCharactersPathMatching['$'] = true;
+    }
+
    public SimpleRobotRules() {
        this(RobotRulesMode.ALLOW_SOME);
    }
@ -200,6 +208,24 @@ public class SimpleRobotRules extends BaseRobotRules {
        }
    }

+    /**
+     * Encode/decode (using percent-encoding) all characters where necessary:
+     * encode Unicode/non-ASCII characters and decode printable ASCII
+     * characters without special semantics.
+     * 
+     * @param urlPathQuery
+     *            path and query component of the URL
+     * @param additionalEncodedBytes
+     *            boolean array to request bytes (ASCII characters) to be
+     *            percent-encoded in addition to other characters requiring
+     *            encoding (Unicode/non-ASCII and characters not allowed in
+     *            URLs).
+     * @return properly percent-encoded URL path and query
+     */
+    public static String escapePath(String urlPathQuery, boolean[] additionalEncodedBytes) {
+        return BasicURLNormalizer.escapePath(BasicURLNormalizer.unescapePath(urlPathQuery), additionalEncodedBytes);
+    }
+
    private String getPath(String url, boolean getWithQuery) {

        try {
@ -214,9 +240,17 @@ public class SimpleRobotRules extends BaseRobotRules {
                path += "?" + query;
            }

-            // We used to lower-case the path, but Google says we need to do
-            // case-sensitive matching.
-            return URLDecoder.decode(path, "UTF-8");
+            /*
+             * We used to lower-case the path, but Google says we need to do
+             * case-sensitive matching.
+             * 
+             * However, we need to properly decode percent-encoded characters,
+             * but preserve those escaped characters which have special
+             * semantics in path matching, e.g. slash `/`. In addition, the
+             * implementation of the path matching requires that asterisk `*`
+             * and dollar `$` are exceptionally percent-encoded.
+             */
+            return escapePath(path, specialCharactersPathMatching);
        } catch (Exception e) {
            // If the URL is invalid, we don't really care since the fetch
            // will fail, so return the root.
--- a/src/main/java/crawlercommons/robots/SimpleRobotRulesParser.java
+++ b/src/main/java/crawlercommons/robots/SimpleRobotRulesParser.java
@ -21,7 +21,6 @@ import java.net.HttpURLConnection;
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.net.URLConnection;
-import java.net.URLDecoder;
 import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 import java.util.Arrays;
@ -778,13 +777,23 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {

    /**
     * Add any uniform rules to clean up path directives
+     * <ul>
+     * <li>trim leading and trailing white space and control characters not
+     * handled by the tokenizer</li>
+     * <li>properly percent-encode all characters where necessary</li>
+     * <li>but make sure that characters with special semantics for path
+     * matching (asterisk <code>*</code>, slash <code>/</code>, dollar
+     * <code>$</code>, etc.) are left as is (do not decode if percent-encoded).</li>
+     * </ul>
+     * 
+     * This method uses {@link SimpleRobotRules#escapePath(String, boolean[])}
+     * to normalize the directive path; URL paths are normalized by the same method before matching.
+     * 
     * @param path
-     * @return clean path
+     * @return clean and encoded path
     */
    private String normalizePathDirective(String path) {
-        path = path.trim();
-
-        return path;
+        return SimpleRobotRules.escapePath(path.trim(), null);
    }

    /**
@ -803,7 +812,6 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
        String path = token.getData();

        try {
-            path = URLDecoder.decode(path, "UTF-8");
            path = normalizePathDirective(path);
            if (path.length() == 0) {
                // Disallow: <nothing> => allow all.
@ -832,7 +840,6 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
        String path = token.getData();

        try {
-            path = URLDecoder.decode(path, "UTF-8");
            path = normalizePathDirective(path);
        } catch (Exception e) {
            reportWarning(state, "Error parsing robots rules - can't decode path: {}", path);
--- a/src/test/java/crawlercommons/robots/SimpleRobotRulesParserTest.java
+++ b/src/test/java/crawlercommons/robots/SimpleRobotRulesParserTest.java
@ -265,6 +265,49 @@ public class SimpleRobotRulesParserTest {
        // assertFalse(rules.isAllowed("https://www.example.com/bücher/book1.html"));
    }

+    @ParameterizedTest
+    @CsvSource({ // Tests for percent-encoded characters with special semantics
+                 // in allow/disallow statements:
+                 // (a) must not trim percent-encoded white space
+                    "True, /*%20, https://www.example.com/", //
+                    "False, /*%20, https://www.example.com/foobar%20/", //
+                    "True, /*%20, https://www.example.com/foobar/", //
+                    // (b) match literal %2F in URL path, but do not match a
+                    // slash
+                    "True, /*%2F*, https://www.example.com/path/index.html", //
+                    "False, /*%2F*, https://www.example.com/topic/9%2F11/index.html", //
+                    "False, /topic/9%2F11/, https://www.example.com/topic/9%2F11/index.html", //
+                    "False, /topic/9%2F11/, https://www.example.com/topic/9%2f11/index.html", //
+                    "False, /q?*mime=application%2Fpdf, https://www.example.com/q?mime=application%2Fpdf", //
+                    // (c) percent-encoded dollar and asterisk (*)
+                    "False, /$, https://www.example.com/", //
+                    "True, /$, https://www.example.com/foobar", //
+                    "True, /%24, https://www.example.com/", //
+                    "False, /%24, https://www.example.com/%24100", //
+                    "False, /%24, https://www.example.com/$100", //
+                    "True, /search/%2A/, https://www.example.com/search/foobar/", //
+                    "False, /search/%2A/, https://www.example.com/search/%2A/", //
+                    "False, /search/%2A/, https://www.example.com/search/%2a/", //
+                    "False, /search/%2a/, https://www.example.com/search/%2a/", //
+                    "False, /search/%2a/, https://www.example.com/search/*/", //
+                    "False, /search/*/, https://www.example.com/search/foobar/", //
+                    // examples from RFC 9309
+                    "False, /path/file-with-a-%2A.html, https://www.example.com/path/file-with-a-*.html", //
+                    "True, /path/file-with-a-%2A.html, https://www.example.com/path/file-with-a-foo.html", //
+                    "False, /path/file-with-a-%2A.html, https://www.example.com/path/file-with-a-%2A.html", //
+                    "False, /path/foo-%24, https://www.example.com/path/foo-$", //
+                    "True, /path/foo-%24, https://www.example.com/path/foo-", //
+                    "False, /path/foo-%24, https://www.example.com/path/foo-%24", //
+    })
+    void testEscapedPaths(boolean isAllowed, String disallowPath, String urlStr) {
+        final String simpleRobotsTxt = "User-agent: *" + CRLF //
+                        + "Disallow: " + disallowPath + CRLF //
+                        + "Allow: /";
+        BaseRobotRules rules = createRobotRules("mybot", simpleRobotsTxt);
+        String msg = urlStr + " should " + (isAllowed ? "not" : "") + " be disallowed by rule Disallow: " + disallowPath;
+        assertEquals(isAllowed, rules.isAllowed(urlStr), msg);
+    }
+
    @Test
    void testSimplestAllowAll() {
        final String simpleRobotsTxt = "User-agent: *" + CRLF //
--- a/src/test/resources/normalizer/weirdToNormalizedUrls.csv
+++ b/src/test/resources/normalizer/weirdToNormalizedUrls.csv
@ -18,6 +18,8 @@ http://foo.com/%66oo.htm%1A, http://foo.com/foo.htm%1A

 # check that % decoder converts to upper case letters
 http://foo.com/%66oo.htm%c0, http://foo.com/foo.htm%C0
+https://www.example.com/search/%2a/, https://www.example.com/search/%2A/
+https://www.example.com/topic/9%2f11/, https://www.example.com/topic/9%2F11/

 # check that % decoder leaves encoded spaces alone
 http://foo.com/you%20too.html, http://foo.com/you%20too.html
@ -210,4 +212,8 @@ http://example.com/?, http://example.com/

 # Should not decode URL query data
 https://foo.com/?one/valid_query/without_%2F_params, https://foo.com/?one/valid_query/without_%2F_params
-http://foo.com/asdf/page.php?article%2F1234, http://foo.com/asdf/page.php?article%2F1234
+http://foo.com/asdf/page.php?article%2F1234, http://foo.com/asdf/page.php?article%2F1234
+
+# examples from the robots.txt RFC 9309 - * and $ should be unchanged
+https://www.example.com/path/file-with-a-*.html, https://www.example.com/path/file-with-a-*.html
+https://www.example.com/path/foo-$, https://www.example.com/path/foo-$