mirror of
https://github.com/crawler-commons/crawler-commons
synced 2024-05-03 22:26:15 +02:00
This commit is contained in:
parent
7ae8617563
commit
de7221dafc
|
@ -903,8 +903,20 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
|
|||
try {
|
||||
path = normalizePathDirective(path);
|
||||
if (path.length() == 0) {
|
||||
// Disallow: <nothing> => allow all.
|
||||
state.clearRules();
|
||||
/*
|
||||
* "Disallow: <nothing>" meant "allow all" in the 1996 REP RFC
|
||||
* draft because there was no "Allow" directive.
|
||||
*
|
||||
* RFC 9309 allows empty patterns in the formal syntax
|
||||
* (https://www.rfc-editor.org/rfc/rfc9309.html#section-2.2). No
|
||||
* specific handling of empty patterns is defined; as the shortest
|
||||
* possible pattern, an empty pattern is matched with lowest priority.
|
||||
*
|
||||
* Adding an extra rule with an empty pattern would have no
|
||||
* effect, because an "allow all" with lowest priority means
|
||||
* "allow everything else" which is the default anyway. So, we
|
||||
* ignore the empty disallow statement.
|
||||
*/
|
||||
} else {
|
||||
state.addRule(path, false);
|
||||
}
|
||||
|
@ -935,8 +947,11 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
|
|||
}
|
||||
|
||||
if (path.length() == 0) {
|
||||
// Allow: <nothing> => allow all.
|
||||
state.clearRules();
|
||||
/*
|
||||
* Allow: <nothing> => allow all.
|
||||
*
|
||||
* See handleDisallow(...): We ignore the empty allow statement.
|
||||
*/
|
||||
} else {
|
||||
state.addRule(path, true);
|
||||
}
|
||||
|
|
|
@ -735,6 +735,33 @@ public class SimpleRobotRulesParserTest {
|
|||
assertTrue(rules.isAllowed("http://www.domain.com/anypage.html"));
|
||||
}
|
||||
|
||||
/**
 * Checks that an empty "Disallow:" line placed after path-specific rules
 * does not discard or override those rules: the explicitly disallowed path
 * stays disallowed, while the explicitly allowed path and paths matched by
 * no non-empty rule are allowed.
 *
 * The same expectations are asserted twice: once with all directives in a
 * single "User-agent: *" group, and once with the empty Disallow in a second
 * "User-agent: *" group.
 *
 * NOTE(review): createRobotRules(...) and CRLF are defined outside this
 * view; presumably the helper parses the given robots.txt content for the
 * named agent -- confirm against the test class.
 */
@Test
|
||||
void testEmptyDisallowLowestPrecedence() {
|
||||
// Case 1: single group, the empty Disallow is the last directive.
String robotsTxt = "User-agent: *" + CRLF //
|
||||
+ "Disallow: /disallowed/" + CRLF //
|
||||
+ "Allow: /allowed/" + CRLF //
|
||||
+ "Disallow: ";
|
||||
|
||||
BaseRobotRules rules = createRobotRules("anybot", robotsTxt);
|
||||
// URLs matched by no non-empty rule are expected to be allowed.
assertTrue(rules.isAllowed("http://www.example.com/"));
|
||||
assertTrue(rules.isAllowed("http://www.example.com/anypage.html"));
|
||||
// The earlier path-specific Disallow must still apply despite the
// trailing empty Disallow.
assertFalse(rules.isAllowed("http://www.example.com/disallowed/index.html"));
|
||||
assertTrue(rules.isAllowed("http://www.example.com/allowed/index.html"));
|
||||
|
||||
// with merged groups: the empty Disallow is in a second
// "User-agent: *" group; the expected results are the same as above
|
||||
robotsTxt = "User-agent: *" + CRLF //
|
||||
+ "Disallow: /disallowed/" + CRLF //
|
||||
+ "Allow: /allowed/" + CRLF //
|
||||
+ "User-agent: *" + CRLF //
|
||||
+ "Disallow: ";
|
||||
|
||||
rules = createRobotRules("anybot", robotsTxt);
|
||||
assertTrue(rules.isAllowed("http://www.example.com/"));
|
||||
assertTrue(rules.isAllowed("http://www.example.com/anypage.html"));
|
||||
assertFalse(rules.isAllowed("http://www.example.com/disallowed/index.html"));
|
||||
assertTrue(rules.isAllowed("http://www.example.com/allowed/index.html"));
|
||||
}
|
||||
|
||||
@Test
|
||||
void testMultiWildcard() {
|
||||
// Make sure we only take the first wildcard entry.
|
||||
|
|
Loading…
Reference in New Issue