mirror of
https://github.com/crawler-commons/crawler-commons
synced 2024-05-04 14:36:04 +02:00
[Robots.txt] Add unit tests based on examples in RFC 9309
This commit is contained in:
parent
7a95069f0e
commit
6523fd29ed
|
@ -291,7 +291,19 @@ public class SimpleRobotRulesParserTest {
|
|||
"False, /search/%2a/, https://www.example.com/search/%2a/", //
|
||||
"False, /search/%2a/, https://www.example.com/search/*/", //
|
||||
"False, /search/*/, https://www.example.com/search/foobar/", //
|
||||
// examples from RFC 9309
|
||||
// examples from RFC 9309, 2.2.2. The "Allow" and "Disallow" Lines
|
||||
// https://www.rfc-editor.org/rfc/rfc9309.html#name-the-allow-and-disallow-line
|
||||
"False, /foo/bar?baz=quz, https://www.example.com/foo/bar?baz=quz", //
|
||||
// See the comment in https://github.com/google/robotstxt/blob/master/robots_test.cc
|
||||
// "Percent encoding URIs in the rules is unnecessary."
|
||||
// and "/foo/bar?baz=http://foo.bar stays unencoded."
|
||||
"False, /foo/bar?baz=https://foo.bar, https://www.example.com/foo/bar?baz=https://foo.bar", //
|
||||
"False, /foo/bar?baz=https%3A%2F%2Ffoo.bar, https://www.example.com/foo/bar?baz=https%3A%2F%2Ffoo.bar", //
|
||||
"False, /foo/bar/\u30C4, https://www.example.com/foo/bar/%E3%83%84", //
|
||||
"False, /foo/bar/%E3%83%84, https://www.example.com/foo/bar/%E3%83%84", //
|
||||
"False, /foo/bar/%62%61%7A, https://www.example.com/foo/bar/baz", //
|
||||
// examples from RFC 9309, 2.2.3. Special Characters
|
||||
// https://www.rfc-editor.org/rfc/rfc9309.html#name-special-characters
|
||||
"False, /path/file-with-a-%2A.html, https://www.example.com/path/file-with-a-*.html", //
|
||||
"True, /path/file-with-a-%2A.html, https://www.example.com/path/file-with-a-foo.html", //
|
||||
"False, /path/file-with-a-%2A.html, https://www.example.com/path/file-with-a-%2A.html", //
|
||||
|
@ -1189,6 +1201,69 @@ public class SimpleRobotRulesParserTest {
|
|||
assertFalse(rules.isAllowed("https://example.org/other/page.html"));
|
||||
}
|
||||
|
||||
@Test
|
||||
void testExamplesRobotsTxtRFC9309() throws Exception {
|
||||
byte[] robotstxt = readFile("/robots/rfc9309-example-simple-robots.txt");
|
||||
|
||||
BaseRobotRules rules = createRobotRules("foobot", robotstxt);
|
||||
assertEquals(3, ((SimpleRobotRules) rules).getRobotRules().size());
|
||||
assertTrue(rules.isAllowed("https://example.org/example/page.html"));
|
||||
assertTrue(rules.isAllowed("https://example.org/example/allowed.gif"));
|
||||
assertFalse(rules.isAllowed("https://example.org/"));
|
||||
assertFalse(rules.isAllowed("https://example.org/path/index.html"));
|
||||
|
||||
rules = createRobotRules("barbot", robotstxt);
|
||||
assertEquals(1, ((SimpleRobotRules) rules).getRobotRules().size());
|
||||
assertTrue(rules.isAllowed("https://example.org/"));
|
||||
assertFalse(rules.isAllowed("https://example.org/example/page.html"));
|
||||
rules = createRobotRules("bazbot", robotstxt);
|
||||
assertEquals(1, ((SimpleRobotRules) rules).getRobotRules().size());
|
||||
assertTrue(rules.isAllowed("https://example.org/"));
|
||||
assertFalse(rules.isAllowed("https://example.org/example/page.html"));
|
||||
|
||||
rules = createRobotRules("quxbot", robotstxt);
|
||||
assertEquals(0, ((SimpleRobotRules) rules).getRobotRules().size());
|
||||
assertTrue(rules.isAllowed("https://example.org/"));
|
||||
assertTrue(rules.isAllowed("https://example.org/example/page.html"));
|
||||
|
||||
rules = createRobotRules("anyotherbot", robotstxt);
|
||||
assertEquals(3, ((SimpleRobotRules) rules).getRobotRules().size());
|
||||
assertTrue(rules.isAllowed("https://example.org/publications/doc1.html"));
|
||||
assertFalse(rules.isAllowed("https://example.org/example/page.html"));
|
||||
assertFalse(rules.isAllowed("https://example.org/example.gif"));
|
||||
assertTrue(rules.isAllowed("https://example.org/")); // implicitly allowed
|
||||
|
||||
robotstxt = readFile("/robots/rfc9309-example-longest-match-robots.txt");
|
||||
rules = createRobotRules("foobot", robotstxt);
|
||||
assertTrue(rules.isAllowed("https://example.org/example/page/"));
|
||||
assertTrue(rules.isAllowed("https://example.org/example/page/index.html"));
|
||||
assertFalse(rules.isAllowed("https://example.org/example/page/disallowed.gif"));
|
||||
assertTrue(rules.isAllowed("https://example.org/")); // implicitly allowed
|
||||
|
||||
robotstxt = readFile("/robots/rfc9309-example-rule-group-merging.txt");
|
||||
rules = createRobotRules("examplebot", robotstxt);
|
||||
assertEquals(3, ((SimpleRobotRules) rules).getRobotRules().size());
|
||||
assertFalse(rules.isAllowed("https://example.org/foo"));
|
||||
assertFalse(rules.isAllowed("https://example.org/bar"));
|
||||
assertFalse(rules.isAllowed("https://example.org/baz"));
|
||||
assertTrue(rules.isAllowed("https://example.org/")); // implicitly allowed
|
||||
|
||||
rules = createRobotRules("anyotherbot", robotstxt);
|
||||
assertEquals(2, ((SimpleRobotRules) rules).getRobotRules().size());
|
||||
assertFalse(rules.isAllowed("https://example.org/foo"));
|
||||
assertFalse(rules.isAllowed("https://example.org/bar"));
|
||||
assertTrue(rules.isAllowed("https://example.org/baz"));
|
||||
assertTrue(rules.isAllowed("https://example.org/")); // implicitly allowed
|
||||
|
||||
rules = createRobotRules("bazbot", robotstxt);
|
||||
assertEquals(1, ((SimpleRobotRules) rules).getRobotRules().size());
|
||||
assertTrue(rules.isAllowed("https://example.org/foo"));
|
||||
assertTrue(rules.isAllowed("https://example.org/bar"));
|
||||
assertFalse(rules.isAllowed("https://example.org/baz"));
|
||||
assertTrue(rules.isAllowed("https://example.org/")); // implicitly allowed
|
||||
}
|
||||
|
||||
|
||||
private byte[] readFile(String filename) throws Exception {
|
||||
byte[] bigBuffer = new byte[100000];
|
||||
InputStream is = SimpleRobotRulesParserTest.class.getResourceAsStream(filename);
|
||||
|
|
|
@ -0,0 +1,6 @@
|
|||
# RFC 9309, section 5.2. Longest Match
|
||||
# https://www.rfc-editor.org/rfc/rfc9309.html#section-5.2
|
||||
|
||||
User-Agent: foobot
|
||||
Allow: /example/page/
|
||||
Disallow: /example/page/disallowed.gif
|
|
@ -0,0 +1,16 @@
|
|||
# RFC 9309, section 2.2.1. The User-Agent Line
|
||||
# https://www.rfc-editor.org/rfc/rfc9309.html#section-2.2.1
|
||||
|
||||
user-agent: ExampleBot
|
||||
disallow: /foo
|
||||
disallow: /bar
|
||||
|
||||
user-agent: ExampleBot
|
||||
disallow: /baz
|
||||
|
||||
user-agent: *
|
||||
disallow: /foo
|
||||
disallow: /bar
|
||||
|
||||
user-agent: BazBot
|
||||
disallow: /baz
|
|
@ -0,0 +1,18 @@
|
|||
# RFC 9309, section 5.1. Simple Example
|
||||
# https://www.rfc-editor.org/rfc/rfc9309.html#section-5.1
|
||||
|
||||
User-Agent: *
|
||||
Disallow: *.gif$
|
||||
Disallow: /example/
|
||||
Allow: /publications/
|
||||
|
||||
User-Agent: foobot
|
||||
Disallow:/
|
||||
Allow:/example/page.html
|
||||
Allow:/example/allowed.gif
|
||||
|
||||
User-Agent: barbot
|
||||
User-Agent: bazbot
|
||||
Disallow: /example/page.html
|
||||
|
||||
User-Agent: quxbot
|
Loading…
Reference in New Issue