mirror of
https://github.com/crawler-commons/crawler-commons
synced 2024-05-03 22:26:15 +02:00
[Robots.txt] Handle robots.txt with missing sections (and implicit master rules)
- add unit test to verify solution of #114
This commit is contained in:
parent
86109c029a
commit
a3900425f3
|
@ -27,6 +27,8 @@ import java.util.Arrays;
|
|||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Map.Entry;
|
||||
import java.util.Set;
|
||||
|
||||
import static java.nio.charset.StandardCharsets.US_ASCII;
|
||||
|
@ -921,6 +923,31 @@ public class SimpleRobotRulesParserTest {
|
|||
assertTrue(rules.isAllowed("http://www.krugle.com/examples/index.html"));
|
||||
}
|
||||
|
||||
/** Test selection of Crawl-delay directives in robots.txt (see #114) */
|
||||
@Test
|
||||
void testSelectCrawlDelayGroup() {
|
||||
final String robotsTxt = "User-Agent: bingbot" + CRLF //
|
||||
+ "Crawl-delay: 1" + CRLF //
|
||||
+ "User-Agent: msnbot" + CRLF //
|
||||
+ "Crawl-delay: 2" + CRLF //
|
||||
+ "User-agent: *" + CRLF //
|
||||
+ "Disallow: /login" + CRLF //
|
||||
+ "Sitemap: http://www.example.com/sitemap.xml" + CRLF //
|
||||
+ "Disallow: /assets/";
|
||||
|
||||
// test for specific Crawl-delay values but same set of rules
|
||||
Map<String, Long> expectedCrawlDelays = Map.of("bingbot", 1000L, "msnbot", 2000L, "anybot", BaseRobotRules.UNSET_CRAWL_DELAY);
|
||||
List<String> sitemaps = List.of("http://www.example.com/sitemap.xml");
|
||||
for (Entry<String, Long> e : expectedCrawlDelays.entrySet()) {
|
||||
BaseRobotRules rules = createRobotRules(e.getKey(), robotsTxt);
|
||||
assertEquals(e.getValue(), rules.getCrawlDelay());
|
||||
assertFalse(rules.isAllowed("http://www.example.com/login"), "Disallow path /login for all agents");
|
||||
assertFalse(rules.isAllowed("http://www.example.com/assets/"), "Disallow path /login for all agents");
|
||||
assertTrue(rules.isAllowed("http://www.example.com/"), "Implicitly allowed");
|
||||
assertEquals(sitemaps, rules.getSitemaps());
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
void testRobotsWithUTF8BOM() throws Exception {
|
||||
BaseRobotRules rules = createRobotRules("foobot", readFile("/robots/robots-with-utf8-bom.txt"));
|
||||
|
|
Loading…
Reference in New Issue