crawler-commons/crawler-commons

Merge pull request #430 from sebastian-nagel/cc-390-114-robots-closing-rule-group

[Robots.txt] Close groups of rules as defined in RFC 9309
Sebastian Nagel, 2023-07-12 10:35:48 +02:00 (committed by GitHub)
commit 871e4e61d2
Signed by: GitHub
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 151 additions and 18 deletions
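
In RFC 9309 terms, a group is one or more consecutive User-agent lines followed by the rules that apply to those agents; the next User-agent line after a rule closes the group. This change makes the Crawl-delay directive respect these group boundaries. A minimal sketch of the resulting behavior (not part of the diff; the agent names and URL are made up, and the long-standing String-based parseContent() overload is assumed):

import java.nio.charset.StandardCharsets;

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

public class GroupBoundarySketch {
    public static void main(String[] args) {
        String robotsTxt = "User-agent: mybot\n" // first group (specific agent)
                        + "Crawl-delay: 2\n" //
                        + "\n" //
                        + "User-agent: *\n" // a new User-agent line closes the first group
                        + "Crawl-delay: 1\n";
        BaseRobotRules rules = new SimpleRobotRulesParser().parseContent(
                        "https://www.example.com/robots.txt",
                        robotsTxt.getBytes(StandardCharsets.UTF_8), "text/plain", "mybot");
        // "mybot" keeps the Crawl-delay of its own group:
        System.out.println(rules.getCrawlDelay()); // 2000 (ms), not 1000 from the wildcard group
    }
}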

src/main/java/crawlercommons/robots/BaseRobotRules.java

@@ -47,10 +47,20 @@ public abstract class BaseRobotRules implements Serializable {
_sitemaps = new LinkedHashSet<>();
}
/**
* Get Crawl-delay (in milliseconds)
*
* @return Crawl-delay defined in the robots.txt for the given agent name,
* or {@link UNSET_CRAWL_DELAY} if not defined.
*/
public long getCrawlDelay() {
return _crawlDelay;
}
/**
* @param crawlDelay
* Crawl-Delay in milliseconds
*/
public void setCrawlDelay(long crawlDelay) {
_crawlDelay = crawlDelay;
}
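
A minimal usage sketch for the accessors above (not part of the diff; same assumptions and imports as the sketch after the commit header — the millisecond unit and the UNSET_CRAWL_DELAY sentinel are the points of interest):

byte[] content = "User-agent: mybot\nCrawl-delay: 3\n".getBytes(StandardCharsets.UTF_8);
BaseRobotRules rules = new SimpleRobotRulesParser()
                .parseContent("https://www.example.com/robots.txt", content, "text/plain", "mybot");
long delay = rules.getCrawlDelay(); // 3000: robots.txt counts seconds, the API milliseconds
if (delay == BaseRobotRules.UNSET_CRAWL_DELAY) {
    delay = 5000L; // no Crawl-delay defined; fall back to a crawler-specific default
}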

src/main/java/crawlercommons/robots/SimpleRobotRulesParser.java

@@ -207,12 +207,23 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
* agent.
*/
private boolean _addingRules;
/**
* Flag that is true while it is allowed to add a Crawl-delay for the
* given agent.
*/
private boolean _addingCrawlDelay;
/**
* Flag indicating whether all consecutive agent fields have been seen.
* It is set to false when an agent field is found and back to true
* once anything else has been found.
*/
private boolean _finishedAgentFields;
/**
* Flag indicating whether the Crawl-delay was set for the given agent
* name (false means either not set at all or set for the wildcard
* agent).
*/
private boolean _crawlDelaySetRealName;
/*
* Counter of warnings reporting invalid rules/lines in the robots.txt
@@ -259,6 +270,14 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
_addingRules = addingRules;
}
public boolean isAddingCrawlDelay() {
return _addingCrawlDelay;
}
public void setAddingCrawlDelay(boolean addingCrawlDelay) {
_addingCrawlDelay = addingCrawlDelay;
}
public boolean isFinishedAgentFields() {
return _finishedAgentFields;
}
@@ -275,8 +294,64 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
_curRules.addRule(prefix, allow);
}
/**
* Set the Crawl-delay given the current parse state.
*
* <p>
* The <a href=
* "https://en.wikipedia.org/wiki/Robots.txt#Crawl-delay_directive">Crawl
* -delay</a> is not part of RFC 9309 treating it (same as the Sitemap
* directive) as <a href=
* "https://www.rfc-editor.org/rfc/rfc9309.html#name-other-records">other
* records</a> and specifies: <cite>Parsing of other records MUST NOT
* interfere with the parsing of explicitly defined records</cite>.
* </p>
*
* This method sets the Crawl-delay in the robots rules
* ({@link BaseRobotRules#setCrawlDelay(long)})
* <ul>
* <li>always if the given agent name was matched in the current
* group</li>
* <li>for wildcard agent groups, only if the crawl-delay wasn't already
* defined for the given agent</li>
* </ul>
*
* <p>
* If the Crawl-delay is defined multiple times, resolve the conflict
* and override the current value only if the new value is shorter than
* the already defined one. But never override a Crawl-delay defined for
* a specific agent with a value defined for the wildcard agent.
* </p>
*/
public void setCrawlDelay(long delay) {
if (_matchedRealName) {
// set crawl-delay for the given agent name
if (_crawlDelaySetRealName && _addingCrawlDelay) {
/*
* Crawl-delay defined twice for the same agent or defined
* for more than one of multiple agent names.
*
* Resolve conflict and select shortest Crawl-delay.
*/
if (_curRules.getCrawlDelay() > delay) {
_curRules.setCrawlDelay(delay);
}
} else {
_curRules.setCrawlDelay(delay);
}
_crawlDelaySetRealName = true;
} else if (_curRules.getCrawlDelay() == BaseRobotRules.UNSET_CRAWL_DELAY) {
// not set yet
_curRules.setCrawlDelay(delay);
} else if (_curRules.getCrawlDelay() > delay) {
// Crawl-delay defined twice for the wildcard agent.
// Resolve conflict and select shortest Crawl-delay.
_curRules.setCrawlDelay(delay);
}
}
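/*
 * Illustration (not part of this diff; the agent name is made up): if
 * the same agent matches two groups with different Crawl-delay values,
 *
 *   User-agent: mybot
 *   Crawl-delay: 5
 *
 *   User-agent: mybot
 *   Crawl-delay: 1
 *
 * the shorter value wins: getRobotRules().getCrawlDelay() == 1000 (ms).
 * A wildcard group's Crawl-delay, in contrast, never replaces a value
 * set by a group that matched the agent name.
 */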
public void clearCrawlDelay() {
_curRules.setCrawlDelay(BaseRobotRules.UNSET_CRAWL_DELAY);
}
public SimpleRobotRules getRobotRules() {
@@ -689,36 +764,28 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
break;
case CRAWL_DELAY:
parseState.setFinishedAgentFields(true);
handleCrawlDelay(parseState, token);
parseState.setAddingCrawlDelay(false);
break;
case SITEMAP:
parseState.setFinishedAgentFields(true);
handleSitemap(parseState, token);
break;
case HTTP:
parseState.setFinishedAgentFields(true);
handleHttp(parseState, token);
break;
case UNKNOWN:
reportWarning(parseState, "Unknown directive in robots.txt file: {}", line);
parseState.setFinishedAgentFields(true);
break;
case MISSING:
reportWarning(parseState, "Unknown line in robots.txt file (size {}): {}", content.length, line);
parseState.setFinishedAgentFields(true);
break;
default:
// All others we just ignore
// TODO KKr - which of these should be setting
// finishedAgentFields to true?
// TODO KKr - handle no-index
// TODO KKr - handle request-rate and visit-time
break;
}
}
@@ -790,9 +857,10 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
*/
private void handleUserAgent(ParseState state, RobotToken token) {
// If we are adding rules, and had already finished reading user agent
// fields, then we are in a new section, hence stop adding rules.
if (state.isAddingRules() && state.isFinishedAgentFields()) {
state.setAddingRules(false);
state.setAddingCrawlDelay(false);
}
// Clearly we've encountered a new user agent directive, hence we need to start
@@ -810,14 +878,17 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
} else if (agentName.equals("*") && !state.isMatchedRealName()) {
state.setMatchedWildcard(true);
state.setAddingRules(true);
state.setAddingCrawlDelay(true);
} else if (targetNames.contains(agentName) || (!isValidUserAgentToObey(agentName) && userAgentProductTokenPartialMatch(agentName, targetNames))) {
if (state.isMatchedWildcard()) {
// Clear rules and Crawl-delay of the wildcard user-agent
// found before the non-wildcard user-agent match.
state.clearRules();
state.clearCrawlDelay();
}
state.setMatchedRealName(true);
state.setAddingRules(true);
state.setAddingCrawlDelay(true);
state.setMatchedWildcard(false);
}
} else {
@@ -834,6 +905,7 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
if (agentNameFull.equals("*") && !state.isMatchedRealName()) {
state.setMatchedWildcard(true);
state.setAddingRules(true);
state.setAddingCrawlDelay(true);
} else if (userAgentProductTokenPartialMatch(agentNameFull, targetNames)) {
// match "butterfly" in the line "User-agent: Butterfly/1.0"
matched = true;
@@ -854,12 +926,14 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
}
if (matched) {
if (state.isMatchedWildcard()) {
// Clear rules and Crawl-delay of the wildcard user-agent
// found before the non-wildcard user-agent match.
state.clearRules();
state.clearCrawlDelay();
}
state.setMatchedRealName(true);
state.setAddingRules(true);
state.setAddingCrawlDelay(true);
state.setMatchedWildcard(false);
}
}
@@ -967,7 +1041,7 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
* data for directive
*/
private void handleCrawlDelay(ParseState state, RobotToken token) {
if (!state.isAddingCrawlDelay()) {
return;
}
@@ -987,6 +1061,8 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
reportWarning(state, "Error parsing robots rules - can't decode crawl delay: {}", delayString);
}
}
state.setAddingCrawlDelay(false);
}
/**

src/test/java/crawlercommons/robots/SimpleRobotRulesParserTest.java

@@ -27,6 +27,8 @@ import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import static java.nio.charset.StandardCharsets.US_ASCII;
@@ -942,10 +944,55 @@ public class SimpleRobotRulesParserTest {
+ "User-agent: qihoobot" + CR //
+ "Disallow: /";
// Note: In compliance with RFC 9309, for Googlebot a specific
// Crawl-delay is set. However, Googlebot is not allowed to crawl any
// page (same as qihoobot).
BaseRobotRules rules = createRobotRules("googlebot/2.1", krugleRobotsTxt, false);
assertEquals(1000L, rules.getCrawlDelay());
assertFalse(rules.isAllowed("http://www.krugle.com/examples/index.html"));
assertFalse(rules.isAllowed("http://www.krugle.com/"));
rules = createRobotRules("googlebot", krugleRobotsTxt, true);
assertEquals(1000L, rules.getCrawlDelay());
assertFalse(rules.isAllowed("http://www.krugle.com/examples/index.html"));
assertFalse(rules.isAllowed("http://www.krugle.com/"));
rules = createRobotRules("qihoobot", krugleRobotsTxt, true);
assertEquals(BaseRobotRules.UNSET_CRAWL_DELAY, rules.getCrawlDelay());
assertFalse(rules.isAllowed("http://www.krugle.com/examples/index.html"));
assertFalse(rules.isAllowed("http://www.krugle.com/"));
rules = createRobotRules("anybot", krugleRobotsTxt, true);
assertEquals(3000L, rules.getCrawlDelay());
assertFalse(rules.isAllowed("http://www.krugle.com/examples/index.html"));
assertTrue(rules.isAllowed("http://www.krugle.com/"));
}
/** Test selection of Crawl-delay directives in robots.txt (see #114) */
@Test
void testSelectCrawlDelayGroup() {
final String robotsTxt = "User-Agent: bingbot" + CRLF //
+ "Crawl-delay: 1" + CRLF + CRLF //
+ "User-Agent: msnbot" + CRLF //
+ "Crawl-delay: 2" + CRLF + CRLF //
+ "User-Agent: pinterestbot" + CRLF //
+ "Crawl-delay: 0.2" + CRLF + CRLF //
+ "User-agent: *" + CRLF //
+ "Disallow: /login" + CRLF //
+ "Sitemap: http://www.example.com/sitemap.xml" + CRLF //
+ "Disallow: /assets/";
// test for specific Crawl-delay values but same set of rules
Map<String, Long> expectedCrawlDelays = Map.of("bingbot", 1000L, "msnbot", 2000L, "anybot", BaseRobotRules.UNSET_CRAWL_DELAY);
List<String> sitemaps = List.of("http://www.example.com/sitemap.xml");
for (Entry<String, Long> e : expectedCrawlDelays.entrySet()) {
BaseRobotRules rules = createRobotRules(e.getKey(), robotsTxt);
assertEquals(e.getValue(), rules.getCrawlDelay(), "Crawl-delay for " + e.getKey() + " = " + e.getValue());
assertFalse(rules.isAllowed("http://www.example.com/login"), "Disallow path /login for all agents");
assertFalse(rules.isAllowed("http://www.example.com/assets/"), "Disallow path /assets for all agents");
assertTrue(rules.isAllowed("http://www.example.com/"), "Implicitly allowed");
assertEquals(sitemaps, rules.getSitemaps());
}
}
@Test