[Robots.txt] Pass empty collection of agent names to select rules for

any robot (wildcard user-agent name) - in SimpleRobotRulesParser main() - add unit test to verify that wildcard user-agent rules are selected if empty collection of agent names is passed
2024-05-03 22:26:15 +02:00 · 2023-06-15 11:17:50 +02:00 · 2023-06-15 11:17:50 +02:00 · 99289f7835
parent a5bd9645fa
commit 99289f7835
2 changed files with 49 additions and 12 deletions
--- a/src/main/java/crawlercommons/robots/SimpleRobotRulesParser.java
+++ b/src/main/java/crawlercommons/robots/SimpleRobotRulesParser.java
@ -1107,14 +1107,17 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
            System.err.println("              \ta single 'product token' as per RFC 9309.");
            System.err.println("              \tIf not defined check with '*'");
            System.err.println("  <URL>       \tcheck URL whether allowed or forbidden.");
-            System.err.println("              \tIf no URL is given show robots.txt rules");
+            System.err.println("              \tIf no URL is given show the robots.txt rules.");
            System.exit(1);
        }

        String url = args[0];
-        String agentName = "*";
+        // empty collection to select rules for wildcard user-agent (*)
+        Collection<String> agentNames = Set.of();
+        String agentName = "*"; // for logging
        if (args.length >= 2) {
-            agentName = args[1];
+            agentName = args[1].trim().toLowerCase(Locale.ROOT);
+            agentNames = Set.of(agentName);
        }

        SimpleRobotRulesParser parser = new SimpleRobotRulesParser();
@ -1127,7 +1130,7 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
                // sitemap paths for file:/ URLs
                url = "http://example.com/robots.txt";
            }
-            rules = parser.parseContent(url, content, "text/plain", Set.of(agentName));
+            rules = parser.parseContent(url, content, "text/plain", agentNames);
        } catch (IOException e) {
            if (connection instanceof HttpURLConnection) {
                int code = ((HttpURLConnection) connection).getResponseCode();
--- a/src/test/java/crawlercommons/robots/SimpleRobotRulesParserTest.java
+++ b/src/test/java/crawlercommons/robots/SimpleRobotRulesParserTest.java
@ -27,6 +27,7 @@ import java.util.Arrays;
 import java.util.Collection;
 import java.util.List;
 import java.util.Locale;
+import java.util.Set;

 import static java.nio.charset.StandardCharsets.US_ASCII;
 import static java.nio.charset.StandardCharsets.UTF_8;
@ -60,10 +61,18 @@ public class SimpleRobotRulesParserTest {
        return createRobotRules(crawlerNames, content.getBytes(UTF_8), exactUserAgentMatching);
    }

+    private static BaseRobotRules createRobotRules(Collection<String> crawlerNames, String content, boolean exactUserAgentMatching) {
+        return createRobotRules(crawlerNames, content.getBytes(UTF_8), exactUserAgentMatching);
+    }
+
    private static BaseRobotRules createRobotRules(String[] crawlerNames, byte[] contentBytes, boolean exactUserAgentMatching) {
+        return createRobotRules(Arrays.asList(crawlerNames), contentBytes, exactUserAgentMatching);
+    }
+
+    private static BaseRobotRules createRobotRules(Collection<String> crawlerNames, byte[] contentBytes, boolean exactUserAgentMatching) {
        SimpleRobotRulesParser robotParser = new SimpleRobotRulesParser();
        robotParser.setExactUserAgentMatching(exactUserAgentMatching);
-        return robotParser.parseContent(FAKE_ROBOTS_URL, contentBytes, "text/plain", Arrays.asList(crawlerNames));
+        return robotParser.parseContent(FAKE_ROBOTS_URL, contentBytes, "text/plain", crawlerNames);
    }

    @Test
@ -291,10 +300,12 @@ public class SimpleRobotRulesParserTest {
                    "False, /search/%2a/, https://www.example.com/search/%2a/", //
                    "False, /search/%2a/, https://www.example.com/search/*/", //
                    "False, /search/*/, https://www.example.com/search/foobar/", //
-                    // examples from RFC 9309,  2.2.2. The "Allow" and "Disallow" Lines
+                    // examples from RFC 9309, 2.2.2. The "Allow" and "Disallow"
+                    // Lines
                    // https://www.rfc-editor.org/rfc/rfc9309.html#name-the-allow-and-disallow-line
                    "False, /foo/bar?baz=quz, https://www.example.com/foo/bar?baz=quz", //
-                    // See the comment in https://github.com/google/robotstxt/blob/master/robots_test.cc
+                    // See the comment in
+                    // https://github.com/google/robotstxt/blob/master/robots_test.cc
                    // "Percent encoding URIs in the rules is unnecessary."
                    // and "/foo/bar?baz=http://foo.bar stays unencoded."
                    "False, /foo/bar?baz=https://foo.bar, https://www.example.com/foo/bar?baz=https://foo.bar", //
@ -1231,14 +1242,14 @@ public class SimpleRobotRulesParserTest {
        assertTrue(rules.isAllowed("https://example.org/publications/doc1.html"));
        assertFalse(rules.isAllowed("https://example.org/example/page.html"));
        assertFalse(rules.isAllowed("https://example.org/example.gif"));
-        assertTrue(rules.isAllowed("https://example.org/")); // implicitly allowed
+        assertTrue(rules.isAllowed("https://example.org/"), "implicitly allowed");

        robotstxt = readFile("/robots/rfc9309-example-longest-match-robots.txt");
        rules = createRobotRules("foobot", robotstxt);
        assertTrue(rules.isAllowed("https://example.org/example/page/"));
        assertTrue(rules.isAllowed("https://example.org/example/page/index.html"));
        assertFalse(rules.isAllowed("https://example.org/example/page/disallowed.gif"));
-        assertTrue(rules.isAllowed("https://example.org/")); // implicitly allowed
+        assertTrue(rules.isAllowed("https://example.org/"), "implicitly allowed");

        robotstxt = readFile("/robots/rfc9309-example-rule-group-merging.txt");
        rules = createRobotRules("examplebot", robotstxt);
@ -1246,23 +1257,46 @@ public class SimpleRobotRulesParserTest {
        assertFalse(rules.isAllowed("https://example.org/foo"));
        assertFalse(rules.isAllowed("https://example.org/bar"));
        assertFalse(rules.isAllowed("https://example.org/baz"));
-        assertTrue(rules.isAllowed("https://example.org/")); // implicitly allowed
+        assertTrue(rules.isAllowed("https://example.org/"), "implicitly allowed");

        rules = createRobotRules("anyotherbot", robotstxt);
        assertEquals(2, ((SimpleRobotRules) rules).getRobotRules().size());
        assertFalse(rules.isAllowed("https://example.org/foo"));
        assertFalse(rules.isAllowed("https://example.org/bar"));
        assertTrue(rules.isAllowed("https://example.org/baz"));
-        assertTrue(rules.isAllowed("https://example.org/")); // implicitly allowed
+        assertTrue(rules.isAllowed("https://example.org/"), "implicitly allowed");

        rules = createRobotRules("bazbot", robotstxt);
        assertEquals(1, ((SimpleRobotRules) rules).getRobotRules().size());
        assertTrue(rules.isAllowed("https://example.org/foo"));
        assertTrue(rules.isAllowed("https://example.org/bar"));
        assertFalse(rules.isAllowed("https://example.org/baz"));
-        assertTrue(rules.isAllowed("https://example.org/")); // implicitly allowed
+        assertTrue(rules.isAllowed("https://example.org/"), "implicitly allowed");
    }

+    @Test
+    void testAPIemptyUserAgentList() {
+        final String simpleRobotsTxt = "User-agent: *" + CRLF //
+                        + "Allow: /allowed/" + CRLF //
+                        + "Disallow: /" + CRLF //
+                        + "User-agent: allowedbot" + CRLF //
+                        + "Allow: /";
+
+        /*
+         * verify that the wildcard user-agent rules are selected if an empty
+         * list of user-agents is passed
+         */
+        BaseRobotRules rules = createRobotRules(Set.of(), simpleRobotsTxt, true);
+        assertTrue(rules.isAllowed("https://www.example.com/allowed/page.html"));
+        assertFalse(rules.isAllowed("https://www.example.com/"));
+
+        rules = createRobotRules(Set.of("anybot"), simpleRobotsTxt, true);
+        assertTrue(rules.isAllowed("https://www.example.com/allowed/page.html"));
+        assertFalse(rules.isAllowed("https://www.example.com/"));
+
+        rules = createRobotRules(Set.of("allowedbot"), simpleRobotsTxt, true);
+        assertTrue(rules.isAllowed("https://www.example.com/"));
+    }

    private byte[] readFile(String filename) throws Exception {
        byte[] bigBuffer = new byte[100000];