From de7221dafc52fb8a03be60a07133e8e96719f70d Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Tue, 11 Jul 2023 15:47:33 +0200 Subject: [PATCH] [Robots.txt] Empty disallow statement not to clear other rules, fixes #422 (#424) --- .../robots/SimpleRobotRulesParser.java | 23 +++++++++++++--- .../robots/SimpleRobotRulesParserTest.java | 27 +++++++++++++++++++ 2 files changed, 46 insertions(+), 4 deletions(-) diff --git a/src/main/java/crawlercommons/robots/SimpleRobotRulesParser.java b/src/main/java/crawlercommons/robots/SimpleRobotRulesParser.java index f0083d3..f7443f2 100644 --- a/src/main/java/crawlercommons/robots/SimpleRobotRulesParser.java +++ b/src/main/java/crawlercommons/robots/SimpleRobotRulesParser.java @@ -903,8 +903,20 @@ public class SimpleRobotRulesParser extends BaseRobotsParser { try { path = normalizePathDirective(path); if (path.length() == 0) { - // Disallow: => allow all. - state.clearRules(); + /* + * "Disallow: " meant "allow all" in the 1996 REP RFC + * draft because there was no "Allow" directive. + * + * RFC 9309 allows empty patterns in the formal syntax + * (https://www.rfc-editor.org/rfc/rfc9309.html#section-2.2). No + * specific handling of empty patterns is defined, as shortest + * pattern they are matched with lowest priority. + * + * Adding an extra rule with an empty pattern would have no + * effect, because an "allow all" with lowest priority means + * "allow everything else" which is the default anyway. So, we + * ignore the empty disallow statement. + */ } else { state.addRule(path, false); } @@ -935,8 +947,11 @@ public class SimpleRobotRulesParser extends BaseRobotsParser { } if (path.length() == 0) { - // Allow: => allow all. - state.clearRules(); + /* + * Allow: => allow all. + * + * See handleDisallow(...): We ignore the empty allow statement. + */ } else { state.addRule(path, true); } diff --git a/src/test/java/crawlercommons/robots/SimpleRobotRulesParserTest.java b/src/test/java/crawlercommons/robots/SimpleRobotRulesParserTest.java index c79924c..18ef40e 100644 --- a/src/test/java/crawlercommons/robots/SimpleRobotRulesParserTest.java +++ b/src/test/java/crawlercommons/robots/SimpleRobotRulesParserTest.java @@ -735,6 +735,33 @@ public class SimpleRobotRulesParserTest { assertTrue(rules.isAllowed("http://www.domain.com/anypage.html")); } + @Test + void testEmptyDisallowLowestPrecedence() { + String robotsTxt = "User-agent: *" + CRLF // + + "Disallow: /disallowed/" + CRLF // + + "Allow: /allowed/" + CRLF // + + "Disallow: "; + + BaseRobotRules rules = createRobotRules("anybot", robotsTxt); + assertTrue(rules.isAllowed("http://www.example.com/")); + assertTrue(rules.isAllowed("http://www.example.com/anypage.html")); + assertFalse(rules.isAllowed("http://www.example.com/disallowed/index.html")); + assertTrue(rules.isAllowed("http://www.example.com/allowed/index.html")); + + // with merged groups + robotsTxt = "User-agent: *" + CRLF // + + "Disallow: /disallowed/" + CRLF // + + "Allow: /allowed/" + CRLF // + + "User-agent: *" + CRLF // + + "Disallow: "; + + rules = createRobotRules("anybot", robotsTxt); + assertTrue(rules.isAllowed("http://www.example.com/")); + assertTrue(rules.isAllowed("http://www.example.com/anypage.html")); + assertFalse(rules.isAllowed("http://www.example.com/disallowed/index.html")); + assertTrue(rules.isAllowed("http://www.example.com/allowed/index.html")); + } + @Test void testMultiWildcard() { // Make sure we only take the first wildcard entry.