1
0
Fork 0
mirror of https://github.com/crawler-commons/crawler-commons synced 2024-05-04 14:36:04 +02:00

[Robots.txt] Clarify when blocks of multiple user-agents are closed

- fix unit test broken by introducing compliance with RFC 9309
This commit is contained in:
Sebastian Nagel 2023-06-16 16:35:21 +02:00
parent 4524cfb5c0
commit 17e8544980

View File

@ -917,10 +917,28 @@ public class SimpleRobotRulesParserTest {
+ "User-agent: qihoobot" + CR //
+ "Disallow: /";
// Note: In compliance with RFC 9309, for Googlebot a specific
// Crawl-delay is set. However, Googlebot is not allowed to crawl any
// page (same as qihoobot).
BaseRobotRules rules = createRobotRules("googlebot/2.1", krugleRobotsTxt, false);
assertTrue(rules.isAllowed("http://www.krugle.com/examples/index.html"));
assertEquals(1000L, rules.getCrawlDelay());
assertFalse(rules.isAllowed("http://www.krugle.com/examples/index.html"));
assertFalse(rules.isAllowed("http://www.krugle.com/"));
rules = createRobotRules("googlebot", krugleRobotsTxt, true);
assertTrue(rules.isAllowed("http://www.krugle.com/examples/index.html"));
assertEquals(1000L, rules.getCrawlDelay());
assertFalse(rules.isAllowed("http://www.krugle.com/examples/index.html"));
assertFalse(rules.isAllowed("http://www.krugle.com/"));
rules = createRobotRules("qihoobot", krugleRobotsTxt, true);
assertEquals(BaseRobotRules.UNSET_CRAWL_DELAY, rules.getCrawlDelay());
assertFalse(rules.isAllowed("http://www.krugle.com/examples/index.html"));
assertFalse(rules.isAllowed("http://www.krugle.com/"));
rules = createRobotRules("anybot", krugleRobotsTxt, true);
assertEquals(3000L, rules.getCrawlDelay());
assertFalse(rules.isAllowed("http://www.krugle.com/examples/index.html"));
assertTrue(rules.isAllowed("http://www.krugle.com/"));
}
/** Test selection of Crawl-delay directives in robots.txt (see #114) */