
Improve robots check draft rfc compliance

This commit is contained in:
Eduardo Jimenez 2021-12-14 11:47:42 -08:00 committed by Sebastian Nagel
parent d3ccb553df
commit 1f0e79b72a
5 changed files with 157 additions and 83 deletions
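The behavioral core of this change is in SimpleRobotRules.isAllowed(): instead of returning the result of the first matching rule, the code now picks the longest matching rule (per the robots.txt draft RFC), with ties between Allow and Disallow resolved in favor of Allow. Below is a minimal sketch of how a caller sees this, assuming the existing crawlercommons.robots API; the robots.txt content, host, and agent name are made up for illustration.

import java.nio.charset.StandardCharsets;

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

public class LongestMatchDemo {
    public static void main(String[] args) {
        // Hypothetical robots.txt: the Allow rule is longer (more specific)
        // than the Disallow rule, so it wins under longest-match precedence.
        String robotsTxt = "User-agent: mybot\n"
                + "Disallow: /shop\n"
                + "Allow: /shop/public\n";

        BaseRobotRules rules = new SimpleRobotRulesParser().parseContent(
                "http://example.com/robots.txt",
                robotsTxt.getBytes(StandardCharsets.UTF_8),
                "text/plain", "mybot");

        // Only "Disallow: /shop" matches here, so the URL is blocked.
        System.out.println(rules.isAllowed("http://example.com/shop/cart"));        // false
        // "Allow: /shop/public" is the longest match, so the URL is allowed.
        System.out.println(rules.isAllowed("http://example.com/shop/public/item")); // true
    }
}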

@ -163,13 +163,36 @@ public class SimpleRobotRules extends BaseRobotRules {
return true;
}
boolean isAllowed = true;
int longestRuleMatch = Integer.MIN_VALUE;
for (RobotRule rule : _rules) {
if (ruleMatches(pathWithQuery, rule._prefix)) {
return rule._allow;
int matchLength = ruleMatches(pathWithQuery, rule._prefix);
if (matchLength == -1) {
// See the precedence-of-rules test case for an example.
// Some webmasters expect behavior close to Google's; this block is equivalent to:
// https://github.com/google/robotstxt/blob/02bc6cdfa32db50d42563180c42aeb47042b4f0c/robots.cc#L605-L618
// There are robots.txt files in the wild that benefit from this,
// e.g. https://venmo.com/robots.txt as of 2/7/2022.
if (rule._prefix.endsWith("index.htm") || rule._prefix.endsWith("index.html")) {
matchLength = ruleMatches(pathWithQuery, rule._prefix.substring(0, rule._prefix.indexOf("index.htm")) + "$");
if (matchLength == -1) {
continue;
}
} else {
continue;
}
}
if (longestRuleMatch < matchLength) {
longestRuleMatch = matchLength;
isAllowed = rule.isAllow();
} else if (longestRuleMatch == matchLength) {
isAllowed |= rule.isAllow();
}
// else we've already got a more specific rule, and this match doesn't matter
}
return true;
return isAllowed;
}
}
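The index.htm/index.html branch above mirrors Google's parser: a rule whose path ends in index.htm(l) is also tried as the enclosing directory path with '$' appended. A hedged sketch of the effect, in the spirit of the venmo.com example; the robots.txt content and URLs are illustrative, not from the commit.

import java.nio.charset.StandardCharsets;

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

public class IndexHtmlDemo {
    public static void main(String[] args) {
        // Hypothetical robots.txt: everything is disallowed except the index
        // page. With the Google-style equivalence, "Allow: /index.html" is
        // also tried as "Allow: /$", so the site root stays crawlable.
        String robotsTxt = "User-agent: *\n"
                + "Disallow: /\n"
                + "Allow: /index.html\n";

        BaseRobotRules rules = new SimpleRobotRulesParser().parseContent(
                "http://example.com/robots.txt",
                robotsTxt.getBytes(StandardCharsets.UTF_8),
                "text/plain", "mybot");

        System.out.println(rules.isAllowed("http://example.com/"));            // true
        System.out.println(rules.isAllowed("http://example.com/index.html"));  // true
        System.out.println(rules.isAllowed("http://example.com/about"));       // false
    }
}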
@ -197,7 +220,7 @@ public class SimpleRobotRules extends BaseRobotRules {
}
}
private boolean ruleMatches(String text, String pattern) {
private int ruleMatches(String text, String pattern) {
int patternPos = 0;
int textPos = 0;
@ -223,7 +246,7 @@ public class SimpleRobotRules extends BaseRobotRules {
patternPos += 1;
if (patternPos >= patternEnd) {
// Pattern ends with '*', we're all good.
return true;
return pattern.length();
}
// TODO - don't worry about having two '*' in a row?
@ -255,14 +278,14 @@ public class SimpleRobotRules extends BaseRobotRules {
// If we matched, we're all set, otherwise we failed
if (!matched) {
return false;
return -1;
}
} else {
// See if the pattern from patternPos to wildcardPos matches the
// text starting at textPos
while ((patternPos < wildcardPos) && (textPos < textEnd)) {
if (text.charAt(textPos++) != pattern.charAt(patternPos++)) {
return false;
return -1;
}
}
}
@ -277,7 +300,11 @@ public class SimpleRobotRules extends BaseRobotRules {
// We're at the end, so we have a match if the pattern was completely
// consumed, and either we consumed all the text or we didn't have to
// match it all (no '$' at end of the pattern)
return (patternPos == patternEnd) && ((textPos == textEnd) || !containsEndChar);
if ((patternPos == patternEnd) && ((textPos == textEnd) || !containsEndChar)) {
return pattern.length();
} else {
return -1;
}
}
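Because ruleMatches() now returns a match length (-1 for no match) rather than a boolean, wildcard and '$'-anchored patterns take part in the longest-match comparison through their full pattern length. A small hedged sketch of the resulting behavior; the agent name and paths are made up.

import java.nio.charset.StandardCharsets;

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

public class WildcardDemo {
    public static void main(String[] args) {
        // Hypothetical robots.txt mixing a plain Allow with a wildcard/$ Disallow.
        String robotsTxt = "User-agent: mybot\n"
                + "Allow: /\n"
                + "Disallow: /*.php$\n";

        BaseRobotRules rules = new SimpleRobotRulesParser().parseContent(
                "http://example.com/robots.txt",
                robotsTxt.getBytes(StandardCharsets.UTF_8),
                "text/plain", "mybot");

        // "/*.php$" matches and is longer than "/", so .php URLs are blocked ...
        System.out.println(rules.isAllowed("http://example.com/info.php"));      // false
        // ... but the '$' anchor means a trailing query string breaks the match,
        System.out.println(rules.isAllowed("http://example.com/info.php?x=1"));  // true
        // and everything else falls back to the short "Allow: /".
        System.out.println(rules.isAllowed("http://example.com/page.html"));     // true
    }
}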
/**

@ -167,11 +167,6 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
*/
private boolean _finishedAgentFields;
// True if we're done adding rules for a matched (not wildcard) agent
// name. When this is true, we only consider sitemap directives, so we
// skip all remaining user agent blocks.
private boolean _skipAgents;
/*
* Counter of warnings reporting invalid rules/lines in the robots.txt
* file. The counter is used to limit the number of logged warnings.
@ -225,14 +220,6 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
_finishedAgentFields = finishedAgentFields;
}
public boolean isSkipAgents() {
return _skipAgents;
}
public void setSkipAgents(boolean skipAgents) {
_skipAgents = skipAgents;
}
public void clearRules() {
_curRules.clearRules();
}
@ -513,22 +500,27 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
break;
case DISALLOW:
parseState.setFinishedAgentFields(true);
handleDisallow(parseState, token);
break;
case ALLOW:
parseState.setFinishedAgentFields(true);
handleAllow(parseState, token);
break;
case CRAWL_DELAY:
parseState.setFinishedAgentFields(true);
handleCrawlDelay(parseState, token);
break;
case SITEMAP:
parseState.setFinishedAgentFields(true);
handleSitemap(parseState, token);
break;
case HTTP:
parseState.setFinishedAgentFields(true);
handleHttp(parseState, token);
break;
@ -593,21 +585,16 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
* data for directive
*/
private void handleUserAgent(ParseState state, RobotToken token) {
if (state.isMatchedRealName()) {
if (state.isFinishedAgentFields()) {
state.setSkipAgents(true);
}
return;
}
if (state.isFinishedAgentFields()) {
// We've got a user agent field, so we haven't yet seen anything
// that tells us we're done with this set of agent names.
state.setFinishedAgentFields(false);
// If we are adding rules and had already finished reading user-agent
// fields, then this is a new agent section, so stop adding rules.
if (state.isAddingRules() && state.isFinishedAgentFields()) {
state.setAddingRules(false);
}
// We've encountered a new user-agent directive, so resume collecting
// agent names until this group's rule fields are finished.
state.setFinishedAgentFields(false);
// Handle the case where multiple target names are passed.
// We assume target names should be compared case-insensitively.
String[] targetNames = state.getTargetName().toLowerCase(Locale.ROOT).split(",");
@ -621,21 +608,24 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
// TODO KKr - catch case of multiple names, log as non-standard.
String[] agentNames = token.getData().split("[ \t,]");
for (String agentName : agentNames) {
// TODO should we do case-insensitive matching? Probably yes.
agentName = agentName.trim().toLowerCase(Locale.ROOT);
if (agentName.isEmpty()) {
// Ignore empty names
} else if (agentName.equals("*") && !state.isMatchedWildcard()) {
} else if (agentName.equals("*") && !state.isMatchedRealName()) {
state.setMatchedWildcard(true);
state.setAddingRules(true);
} else {
// TODO use longest match as winner
for (String targetName : targetNameSplits) {
if (targetName.startsWith(agentName)) {
// We may have previously matched a wildcard group, but we could also
// have matched our own agent name before; only clear the collected
// rules if the wildcard was matched earlier.
if (state.isMatchedWildcard()) {
state.clearRules();
}
state.setMatchedRealName(true);
state.setAddingRules(true);
// In case we previously hit a wildcard rule match
state.clearRules();
state.setMatchedWildcard(false);
break;
}
}
@ -644,6 +634,17 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
}
}
/**
* Apply uniform clean-up rules to the path of an allow/disallow directive
*
* @param path
* path data from the directive
* @return the cleaned path
*/
private String normalizePathDirective(String path) {
path = path.trim();
return path;
}
/**
* Handle the disallow: directive
*
@ -653,12 +654,6 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
* data for directive
*/
private void handleDisallow(ParseState state, RobotToken token) {
if (state.isSkipAgents()) {
return;
}
state.setFinishedAgentFields(true);
if (!state.isAddingRules()) {
return;
}
@ -667,7 +662,7 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
try {
path = URLDecoder.decode(path, "UTF-8");
path = normalizePathDirective(path);
if (path.length() == 0) {
// Disallow: <nothing> => allow all.
state.clearRules();
@ -688,12 +683,6 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
* data for directive
*/
private void handleAllow(ParseState state, RobotToken token) {
if (state.isSkipAgents()) {
return;
}
state.setFinishedAgentFields(true);
if (!state.isAddingRules()) {
return;
}
@ -702,6 +691,7 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
try {
path = URLDecoder.decode(path, "UTF-8");
path = normalizePathDirective(path);
} catch (Exception e) {
reportWarning(state, "Error parsing robots rules - can't decode path: {}", path);
}
@ -723,12 +713,6 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
* data for directive
*/
private void handleCrawlDelay(ParseState state, RobotToken token) {
if (state.isSkipAgents()) {
return;
}
state.setFinishedAgentFields(true);
if (!state.isAddingRules()) {
return;
}
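Taken together, the handleUserAgent() changes mean rules are now collected from every group whose agent list matches the crawler, rather than only the first such group, and wildcard rules are discarded once a specific agent name has matched. A hedged sketch of that behavior follows; the robots.txt content and names are illustrative, not from the commit.

import java.nio.charset.StandardCharsets;

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

public class MergeGroupsDemo {
    public static void main(String[] args) {
        // Hypothetical robots.txt: 'mybot' appears in two separate groups,
        // with a wildcard group in between.
        String robotsTxt = "User-agent: mybot\n"
                + "Disallow: /private\n"
                + "\n"
                + "User-agent: *\n"
                + "Disallow: /tmp\n"
                + "\n"
                + "User-agent: mybot\n"
                + "Disallow: /archive\n";

        BaseRobotRules rules = new SimpleRobotRulesParser().parseContent(
                "http://example.com/robots.txt",
                robotsTxt.getBytes(StandardCharsets.UTF_8),
                "text/plain", "mybot");

        // Both 'mybot' groups are merged; the wildcard group is ignored
        // because a real agent name has matched.
        System.out.println(rules.isAllowed("http://example.com/private/a"));  // false
        System.out.println(rules.isAllowed("http://example.com/archive/a"));  // false
        System.out.println(rules.isAllowed("http://example.com/tmp/a"));      // true
    }
}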

@ -287,27 +287,28 @@ public class SimpleRobotRulesParserTest {
}
@ParameterizedTest
@CsvSource({ "False, http://www.domain.com/a",//
"False, http://www.domain.com/a/",//
"False, http://www.domain.com/a/bloh/foo.html",//
"True, http://www.domain.com/b",//
"False, http://www.domain.com/b/a",//
"False, http://www.domain.com/b/a/index.html",//
"True, http://www.domain.com/b/b/foo.html",//
"True, http://www.domain.com/c",//
"True, http://www.domain.com/c/a",//
"True, http://www.domain.com/c/a/index.html",//
"True, http://www.domain.com/c/b/foo.html",//
"True, http://www.domain.com/d",//
"True, http://www.domain.com/d/a",//
"True, http://www.domain.com/e/a/index.html",//
"True, http://www.domain.com/e/d",//
"True, http://www.domain.com/e/d/foo.html",//
"True, http://www.domain.com/e/doh.html",//
"True, http://www.domain.com/f/index.html",//
"True, http://www.domain.com/foo/bar/baz.html",//
"True, http://www.domain.com/f/" })
void testNutchCases(boolean isAllowed, String urlStr) {
@CsvSource({
"False, False, http://www.domain.com/a",//
"False, False, http://www.domain.com/a/",//
"False, False, http://www.domain.com/a/bloh/foo.html",//
"True, True, http://www.domain.com/b",//
"False, False, http://www.domain.com/b/a",//
"False, False, http://www.domain.com/b/a/index.html",//
"True, True, http://www.domain.com/b/b/foo.html",//
"True, True, http://www.domain.com/c",//
"True, True, http://www.domain.com/c/a",//
"True, True, http://www.domain.com/c/a/index.html",//
"True, True, http://www.domain.com/c/b/foo.html",//
"True, False, http://www.domain.com/d",//
"True, False, http://www.domain.com/d/a",//
"True, True, http://www.domain.com/e/a/index.html",//
"True, True, http://www.domain.com/e/d",//
"True, False, http://www.domain.com/e/d/foo.html",//
"True, True, http://www.domain.com/e/doh.html",//
"True, True, http://www.domain.com/f/index.html",//
"True, False, http://www.domain.com/foo/bar/baz.html",//
"True, True, http://www.domain.com/f/" })
void testNutchCases(boolean isAllowed, boolean isMergeAllowed, String urlStr) {
// Run through the Nutch test cases.
final String nutchRobotsTxt = "User-Agent: Agent1 #foo" + CR + "Disallow: /a" + CR + "Disallow: /b/a" + CR + "#Disallow: /c" + CR + "" + CR + "" + CR + "User-Agent: Agent2 Agent3#foo" + CR
+ "User-Agent: Agent4" + CR + "Disallow: /d" + CR + "Disallow: /e/d/" + CR + "" + CR + "User-Agent: *" + CR + "Disallow: /foo/bar/" + CR;
@ -317,11 +318,11 @@ public class SimpleRobotRulesParserTest {
rules = createRobotRules("Agent1", nutchRobotsTxt);
assertEquals(isAllowed, rules.isAllowed(urlStr));
// Note that the SimpleRobotRulesParser only parses the rule set of the
// first matching agent name. For the following example, the parser
// returns only the rules matching 'Agent1'.
// Note that SimpleRobotRulesParser now merges all matching user agent rules.
rules = createRobotRules("Agent5,Agent2,Agent1,Agent3,*", nutchRobotsTxt);
assertEquals(isAllowed, rules.isAllowed(urlStr));
assertEquals(isMergeAllowed, rules.isAllowed(urlStr));
}
@ParameterizedTest
@ -477,7 +478,13 @@ public class SimpleRobotRulesParserTest {
@Test
void testMultiWildcard() {
// Make sure we only take the first wildcard entry.
final String simpleRobotsTxt = "User-agent: *" + CRLF + "Disallow: /index.html" + CRLF + "Allow: /" + CRLF + CRLF + "User-agent: *" + CRLF + "Disallow: /";
final String simpleRobotsTxt =
"User-agent: *" + CRLF +
"Disallow: /index.html" + CRLF +
"Allow: /" + CRLF +
CRLF +
"User-agent: *" + CRLF +
"Disallow: /";
BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt);
assertFalse(rules.isAllowed("http://www.domain.com/index.html"));
@ -713,6 +720,31 @@ public class SimpleRobotRulesParserTest {
assertFalse(rules.isAllowed("http://domain.com/bot-trap/"), "many-user-agents");
}
@Test
void testManyUserAgentsMergeRules() throws Exception {
BaseRobotRules rules = createRobotRules("wget", readFile("/robots/merge-rules.txt"));
assertTrue(rules.isAllowed("http://domain.com/foo/bar/bar.txt"), "merge-rules");
assertFalse(rules.isAllowed("http://domain.com/tmp/bar/bar.txt"), "merge-rules");
assertTrue(rules.isAllowed("http://domain.com/chk/bar/bar.txt"), "merge-rules");
}
@Test
void testPrecedenceOfRules() throws Exception {
BaseRobotRules rules = createRobotRules("wget", readFile("/robots/precedence-of-rules.txt"));
assertTrue(rules.isAllowed("http://domain.com/"), "precedence-of-rules");
assertTrue(rules.isAllowed("http://domain.com/foo/bar"), "precedence-of-rules");
assertFalse(rules.isAllowed("http://domain.com/bar/foo"), "precedence-of-rules");
rules = createRobotRules("testy", readFile("/robots/precedence-of-rules.txt"));
assertTrue(rules.isAllowed("http://domain.com/"), "precedence-of-rules");
assertTrue(rules.isAllowed("http://domain.com/foo/bar"), "precedence-of-rules");
assertFalse(rules.isAllowed("http://domain.com/foo/blue"), "precedence-of-rules");
assertFalse(rules.isAllowed("http://domain.com/foo/"), "precedence-of-rules");
assertFalse(rules.isAllowed("http://domain.com/foo/tar"), "precedence-of-rules");
}
@Test
void testMalformedPathInRobotsFile() throws Exception {
BaseRobotRules rules = createRobotRules("bot1", readFile("/robots/malformed-path.txt"));

@ -0,0 +1,16 @@
User-Agent: testy
Disallow: /
User-Agent: curl
Disallow: /
User-Agent: wget
Allow: /foo/bar
User-Agent: *
User-Agent: wget
Disallow: /foo/bar
Disallow: /tmp/bar
Allow: /ckh/bar

@ -0,0 +1,15 @@
User-Agent: testy
Disallow: /foo
User-Agent: wget
Allow: /
Allow: /foo
Allow: /index.html
Disallow: /*
User-Agent: testy
Allow: /foo/bar
Disallow: /foo/b*