1
0
Fork 0
mirror of https://github.com/crawler-commons/crawler-commons synced 2024-05-03 22:26:15 +02:00

[Robots.txt] Empty disallow statement not to clear other rules, fixes #422 (#424)

This commit is contained in:
Sebastian Nagel 2023-07-11 15:47:33 +02:00 committed by GitHub
parent 7ae8617563
commit de7221dafc
Signed by: GitHub
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 46 additions and 4 deletions

View File

@ -903,8 +903,20 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
try {
path = normalizePathDirective(path);
if (path.length() == 0) {
// Disallow: <nothing> => allow all.
state.clearRules();
/*
* "Disallow: <nothing>" meant "allow all" in the 1996 REP RFC
* draft because there was no "Allow" directive.
*
* RFC 9309 allows empty patterns in the formal syntax
* (https://www.rfc-editor.org/rfc/rfc9309.html#section-2.2). No
specific handling of empty patterns is defined; as the shortest
possible pattern, they are matched with the lowest priority.
*
* Adding an extra rule with an empty pattern would have no
* effect, because an "allow all" with lowest priority means
* "allow everything else" which is the default anyway. So, we
* ignore the empty disallow statement.
*/
} else {
state.addRule(path, false);
}
@ -935,8 +947,11 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
}
if (path.length() == 0) {
// Allow: <nothing> => allow all.
state.clearRules();
/*
* Allow: <nothing> => allow all.
*
* See handleDisallow(...): We ignore the empty allow statement.
*/
} else {
state.addRule(path, true);
}

View File

@ -735,6 +735,33 @@ public class SimpleRobotRulesParserTest {
assertTrue(rules.isAllowed("http://www.domain.com/anypage.html"));
}
@Test
void testEmptyDisallowLowestPrecedence() {
    // A trailing "Disallow:" with an empty pattern must not wipe out the
    // previously collected rules (crawler-commons issue #422): the empty
    // pattern is the shortest possible match and therefore has the lowest
    // priority, which is equivalent to ignoring it.
    String singleGroup = String.join(CRLF, //
                    "User-agent: *", //
                    "Disallow: /disallowed/", //
                    "Allow: /allowed/", //
                    "Disallow: ");
    verifyEmptyDisallowIgnored(createRobotRules("anybot", singleGroup));

    // Same expectation when the directives are split across two groups
    // for the same wildcard agent, which are merged during parsing.
    String mergedGroups = String.join(CRLF, //
                    "User-agent: *", //
                    "Disallow: /disallowed/", //
                    "Allow: /allowed/", //
                    "User-agent: *", //
                    "Disallow: ");
    verifyEmptyDisallowIgnored(createRobotRules("anybot", mergedGroups));
}

// Asserts the rule set kept its explicit allow/disallow entries and that
// everything else stays allowed (the default).
private static void verifyEmptyDisallowIgnored(BaseRobotRules rules) {
    assertTrue(rules.isAllowed("http://www.example.com/"));
    assertTrue(rules.isAllowed("http://www.example.com/anypage.html"));
    assertFalse(rules.isAllowed("http://www.example.com/disallowed/index.html"));
    assertTrue(rules.isAllowed("http://www.example.com/allowed/index.html"));
}
@Test
void testMultiWildcard() {
// Make sure we only take the first wildcard entry.