diff --git a/src/test/java/crawlercommons/robots/SimpleRobotRulesParserTest.java b/src/test/java/crawlercommons/robots/SimpleRobotRulesParserTest.java index 63ff431..58516eb 100644 --- a/src/test/java/crawlercommons/robots/SimpleRobotRulesParserTest.java +++ b/src/test/java/crawlercommons/robots/SimpleRobotRulesParserTest.java @@ -17,6 +17,8 @@ package crawlercommons.robots; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.CsvSource; import javax.servlet.http.HttpServletResponse; import java.io.InputStream; @@ -27,204 +29,222 @@ import static java.nio.charset.StandardCharsets.US_ASCII; import static java.nio.charset.StandardCharsets.UTF_8; import static org.junit.jupiter.api.Assertions.*; -public class SimpleRobotRulesParserTest { +class SimpleRobotRulesParserTest { private static final String LF = "\n"; private static final String CR = "\r"; private static final String CRLF = "\r\n"; private static final String FAKE_ROBOTS_URL = "http://domain.com"; - private static BaseRobotRules createRobotRules(String crawlerName, byte[] content) { + private static BaseRobotRules createRobotRules(String crawlerName, String content) { + return createRobotRules(crawlerName, content.getBytes(UTF_8)); + } + + private static BaseRobotRules createRobotRules(String crawlerName, byte[] contentBytes) { SimpleRobotRulesParser robotParser = new SimpleRobotRulesParser(); - return robotParser.parseContent(FAKE_ROBOTS_URL, content, "text/plain", crawlerName); + return robotParser.parseContent(FAKE_ROBOTS_URL, contentBytes, "text/plain", crawlerName); } @Test - public void testEmptyRules() { - BaseRobotRules rules = createRobotRules("Any-darn-crawler", "".getBytes(UTF_8)); + void testEmptyRules() { + BaseRobotRules rules = createRobotRules("Any-darn-crawler", ""); assertTrue(rules.isAllowed("http://www.domain.com/anypage.html")); } @Test - public void testQueryParamInDisallow() { + void testQueryParamInDisallow() 
{ final String simpleRobotsTxt = "User-agent: *" + CRLF + "Disallow: /index.cfm?fuseaction=sitesearch.results*"; - BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes(UTF_8)); + BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt); assertFalse(rules.isAllowed("http://searchservice.domain.com/index.cfm?fuseaction=sitesearch.results&type=People&qry=california&pg=2")); } - @Test - public void testGooglePatternMatching() { - + @ParameterizedTest + @CsvSource({ "False, http://www.fict.com/fish",// + "False, http://www.fict.com/fish.html",// + "False, http://www.fict.com/fish/salmon.html",// + "False, http://www.fict.com/fishheads",// + "False, http://www.fict.com/fishheads/yummy.html",// + "False, http://www.fict.com/fish.php?id=anything",// + "True, http://www.fict.com/Fish.asp",// + "True, http://www.fict.com/catfish",// + "True, http://www.fict.com/?id=fish",// + "True, http://www.fict.com/fis" }) + void testGooglePatternMatching1(boolean isAllowed, String urlStr) { // Test for /fish final String simpleRobotsTxt1 = "User-agent: *" + CRLF + "Disallow: /fish" + CRLF; - - BaseRobotRules rule1 = createRobotRules("Any-darn-crawler", simpleRobotsTxt1.getBytes(UTF_8)); - assertFalse(rule1.isAllowed("http://www.fict.com/fish")); - assertFalse(rule1.isAllowed("http://www.fict.com/fish.html")); - assertFalse(rule1.isAllowed("http://www.fict.com/fish/salmon.html")); - assertFalse(rule1.isAllowed("http://www.fict.com/fishheads")); - assertFalse(rule1.isAllowed("http://www.fict.com/fishheads/yummy.html")); - assertFalse(rule1.isAllowed("http://www.fict.com/fish.php?id=anything")); - - assertTrue(rule1.isAllowed("http://www.fict.com/Fish.asp")); - assertTrue(rule1.isAllowed("http://www.fict.com/catfish")); - assertTrue(rule1.isAllowed("http://www.fict.com/?id=fish")); - assertTrue(rule1.isAllowed("http://www.fict.com/fis")); + BaseRobotRules rule1 = createRobotRules("Any-darn-crawler", simpleRobotsTxt1); + 
assertEquals(isAllowed, rule1.isAllowed(urlStr)); // Test for /fish* final String simpleRobotsTxt2 = "User-agent: *" + CRLF + "Disallow: /fish*" + CRLF; + BaseRobotRules rule2 = createRobotRules("Any-darn-crawler", simpleRobotsTxt2); + assertEquals(isAllowed, rule2.isAllowed(urlStr)); + } - BaseRobotRules rule2 = createRobotRules("Any-darn-crawler", simpleRobotsTxt2.getBytes(UTF_8)); - assertFalse(rule2.isAllowed("http://www.fict.com/fish")); - assertFalse(rule2.isAllowed("http://www.fict.com/fish.html")); - assertFalse(rule2.isAllowed("http://www.fict.com/fish/salmon.html")); - assertFalse(rule2.isAllowed("http://www.fict.com/fishheads")); - assertFalse(rule2.isAllowed("http://www.fict.com/fishheads/yummy.html")); - assertFalse(rule2.isAllowed("http://www.fict.com/fish.php?id=anything")); - - assertTrue(rule2.isAllowed("http://www.fict.com/Fish.asp")); - assertTrue(rule2.isAllowed("http://www.fict.com/catfish")); - assertTrue(rule2.isAllowed("http://www.fict.com/?id=fish")); - assertTrue(rule2.isAllowed("http://www.fict.com/fis")); - - // Test for /fish/ - final String simpleRobotsTxt3 = "User-agent: *" + CRLF + "Disallow: /fish/" + CRLF; - - BaseRobotRules rule3 = createRobotRules("Any-darn-crawler", simpleRobotsTxt3.getBytes(UTF_8)); - assertFalse(rule3.isAllowed("http://www.fict.com/fish/")); - assertFalse(rule3.isAllowed("http://www.fict.com/fish/?id=anything")); - assertFalse(rule3.isAllowed("http://www.fict.com/fish/salmon.htm")); - - assertTrue(rule3.isAllowed("http://www.fict.com/fish")); - assertTrue(rule3.isAllowed("http://www.fict.com/fish.html")); - assertTrue(rule3.isAllowed("http://www.fict.com/Fish/Salmon.asp")); + @ParameterizedTest + @CsvSource({ "False, http://www.fict.com/fish/",// + "False, http://www.fict.com/fish/?id=anything",// + "False, http://www.fict.com/fish/salmon.htm",// + "True, http://www.fict.com/fish",// + "True, http://www.fict.com/fish.html",// + "True, http://www.fict.com/Fish/Salmon.asp" }) + void 
testGooglePatternMatching2(boolean isAllowed, String urlStr) { + // Test for /fish/ + final String simpleRobotsTxt = "User-agent: *" + CRLF + "Disallow: /fish/" + CRLF; + BaseRobotRules rule = createRobotRules("Any-darn-crawler", simpleRobotsTxt); + assertEquals(isAllowed, rule.isAllowed(urlStr)); + } + @ParameterizedTest + @CsvSource({ "False, http://www.fict.com/filename.php",// + "False, http://www.fict.com/folder/filename.php",// + "False, http://www.fict.com/folder/filename.php?parameters",// + "False, http://www.fict.com/folder/any.php.file.html",// + "False, http://www.fict.com/filename.php/",// + "True, http://www.fict.com/",// + "True, http://www.fict.com/windows.PHP" }) + void testGooglePatternMatching3(boolean isAllowed, String urlStr) { // Test for /*.php - final String simpleRobotsTxt4 = "User-agent: *" + CRLF + "Disallow: /*.php" + CRLF; - - BaseRobotRules rule4 = createRobotRules("Any-darn-crawler", simpleRobotsTxt4.getBytes(UTF_8)); - assertFalse(rule4.isAllowed("http://www.fict.com/filename.php")); - assertFalse(rule4.isAllowed("http://www.fict.com/folder/filename.php")); - assertFalse(rule4.isAllowed("http://www.fict.com/folder/filename.php?parameters")); - assertFalse(rule4.isAllowed("http://www.fict.com/folder/any.php.file.html")); - assertFalse(rule4.isAllowed("http://www.fict.com/filename.php/")); - - assertTrue(rule4.isAllowed("http://www.fict.com/")); - assertTrue(rule4.isAllowed("http://www.fict.com/windows.PHP")); + final String simpleRobotsTxt = "User-agent: *" + CRLF + "Disallow: /*.php" + CRLF; + BaseRobotRules rule = createRobotRules("Any-darn-crawler", simpleRobotsTxt); + assertEquals(isAllowed, rule.isAllowed(urlStr)); + } + @ParameterizedTest + @CsvSource({ "False, http://www.fict.com/filename.php",// + "False, http://www.fict.com/folder/filename.php",// + "True, http://www.fict.com/filename.php?parameters",// + "True, http://www.fict.com/filename.php/",// + "True, http://www.fict.com/filename.php5",// + "True, 
http://www.fict.com/windows.PHP" }) + void testGooglePatternMatching4(boolean isAllowed, String urlStr) { // Test for /*.php$ - final String simpleRobotsTxt5 = "User-agent: *" + CRLF + "Disallow: /*.php$" + CRLF; - - BaseRobotRules rule5 = createRobotRules("Any-darn-crawler", simpleRobotsTxt5.getBytes(UTF_8)); - assertFalse(rule5.isAllowed("http://www.fict.com/filename.php")); - assertFalse(rule5.isAllowed("http://www.fict.com/folder/filename.php")); - - assertTrue(rule5.isAllowed("http://www.fict.com/filename.php?parameters")); - assertTrue(rule5.isAllowed("http://www.fict.com/filename.php/")); - assertTrue(rule5.isAllowed("http://www.fict.com/filename.php5")); - assertTrue(rule5.isAllowed("http://www.fict.com/windows.PHP")); + final String simpleRobotsTxt = "User-agent: *" + CRLF + "Disallow: /*.php$" + CRLF; + BaseRobotRules rule = createRobotRules("Any-darn-crawler", simpleRobotsTxt); + assertEquals(isAllowed, rule.isAllowed(urlStr)); + } + @ParameterizedTest + @CsvSource({ "False, http://www.fict.com/fish.php",// + "False, http://www.fict.com/fishheads/catfish.php?parameters",// + "True, http://www.fict.com/Fish.PHP" }) + void testGooglePatternMatching5(boolean isAllowed, String urlStr) { // Test for /fish*.php - final String simpleRobotsTxt6 = "User-agent: *" + CRLF + "Disallow: /fish*.php" + CRLF; - - BaseRobotRules rule6 = createRobotRules("Any-darn-crawler", simpleRobotsTxt6.getBytes(UTF_8)); - assertFalse(rule6.isAllowed("http://www.fict.com/fish.php")); - assertFalse(rule6.isAllowed("http://www.fict.com/fishheads/catfish.php?parameters")); - - assertTrue(rule6.isAllowed("http://www.fict.com/Fish.PHP")); + final String simpleRobotsTxt = "User-agent: *" + CRLF + "Disallow: /fish*.php" + CRLF; + BaseRobotRules rule = createRobotRules("Any-darn-crawler", simpleRobotsTxt); + assertEquals(isAllowed, rule.isAllowed(urlStr)); + } + @ParameterizedTest + @CsvSource({ "False, http://www.fict.com/fish.php",// + "False, 
http://www.fict.com/superfishheads/catfish.php?parameters",// + "True, http://www.fict.com/fishheads/catfish.htm" }) + void testGooglePatternMatching6(boolean isAllowed, String urlStr) { // Test rule with multiple '*' characters - final String simpleRobotsTxt7 = "User-agent: *" + CRLF + "Disallow: /*fish*.php" + CRLF; - - BaseRobotRules rule7 = createRobotRules("Any-darn-crawler", simpleRobotsTxt7.getBytes(UTF_8)); - assertFalse(rule7.isAllowed("http://www.fict.com/fish.php")); - assertFalse(rule7.isAllowed("http://www.fict.com/superfishheads/catfish.php?parameters")); - assertTrue(rule7.isAllowed("http://www.fict.com/fishheads/catfish.htm")); + final String simpleRobotsTxt = "User-agent: *" + CRLF + "Disallow: /*fish*.php" + CRLF; + BaseRobotRules rule = createRobotRules("Any-darn-crawler", simpleRobotsTxt); + assertEquals(isAllowed, rule.isAllowed(urlStr)); } @Test - public void testCommentedOutLines() { + void testCommentedOutLines() { final String simpleRobotsTxt = "#user-agent: testAgent" + LF + LF + "#allow: /index.html" + LF + "#allow: /test" + LF + LF + "#user-agent: test" + LF + LF + "#allow: /index.html" + LF + "#disallow: /test" + LF + LF + "#user-agent: someAgent" + LF + LF + "#disallow: /index.html" + LF + "#disallow: /test" + LF + LF; - BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes(UTF_8)); + BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt); assertTrue(rules.isAllowed("http://www.domain.com/anypage.html")); } @Test - public void testRobotsTxtAlwaysAllowed() { + void testRobotsTxtAlwaysAllowed() { final String simpleRobotsTxt = "User-agent: *" + CRLF + "Disallow: /"; - BaseRobotRules rules = createRobotRules("any-darn-crawler", simpleRobotsTxt.getBytes(UTF_8)); + BaseRobotRules rules = createRobotRules("any-darn-crawler", simpleRobotsTxt); assertTrue(rules.isAllowed("http://www.domain.com/robots.txt")); } @Test - public void testAgentNotListed() { + void testAgentNotListed() { // Access 
is assumed to be allowed, if no rules match an agent. final String simpleRobotsTxt = "User-agent: crawler1" + CRLF + "Disallow: /index.html" + CRLF + "Allow: /" + CRLF + CRLF + "User-agent: crawler2" + CRLF + "Disallow: /"; - BaseRobotRules rules = createRobotRules("crawler3", simpleRobotsTxt.getBytes(UTF_8)); + BaseRobotRules rules = createRobotRules("crawler3", simpleRobotsTxt); assertTrue(rules.isAllowed("http://www.domain.com/anypage.html")); assertTrue(rules.isAllowed("http://www.domain.com/index.html")); } @Test - public void testNonAsciiEncoding() { + void testNonAsciiEncoding() { final String simpleRobotsTxt = "User-agent: *" + " # \u00A2 \u20B5" + CRLF + "Disallow:"; - BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes(UTF_8)); + BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt); assertTrue(rules.isAllowed("http://www.domain.com/anypage.html")); } @Test - public void testSimplestAllowAll() { + void testSimplestAllowAll() { final String simpleRobotsTxt = "User-agent: *" + CRLF + "Disallow:"; - BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes(UTF_8)); + BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt); assertTrue(rules.isAllowed("http://www.domain.com/anypage.html")); } // https://github.com/crawler-commons/crawler-commons/issues/215 @Test - public void testDisallowWithQueryOnly() { + void testDisallowWithQueryOnly() { final String simpleRobotsTxt = "User-agent: *" + CRLF + "Disallow: /"; - BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes(UTF_8)); + BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt); assertFalse(rules.isAllowed("http://www.example.com")); assertFalse(rules.isAllowed("http://www.example.com?q=a")); } - @Test - public void testMixedEndings() { + @ParameterizedTest + @CsvSource({ "True, http://www.fict.org/",// + "True, http://www.fict.org/index.html" }) + 
void testMixedEndings1(boolean isAllowed, String urlStr) { final String mixedEndingsRobotsTxt = "# /robots.txt for http://www.fict.org/" + CRLF + "# comments to webmaster@fict.org" + CR + LF + "User-agent: unhipbot" + LF + "Disallow: /" + CR + "" + CRLF + "User-agent: webcrawler" + LF + "User-agent: excite" + CR + "Disallow: " + "\u0085" + CR + "User-agent: *" + CRLF + "Disallow: /org/plans.html" + LF + "Allow: /org/" + CR + "Allow: /serv" + CRLF + "Allow: /~mak" + LF + "Disallow: /" + CRLF; - BaseRobotRules rules; - - rules = createRobotRules("WebCrawler/3.0", mixedEndingsRobotsTxt.getBytes(UTF_8)); - assertTrue(rules.isAllowed("http://www.fict.org/")); - assertTrue(rules.isAllowed("http://www.fict.org/index.html")); - - rules = createRobotRules("Unknown/1.0", mixedEndingsRobotsTxt.getBytes(UTF_8)); - assertFalse(rules.isAllowed("http://www.fict.org/")); - assertFalse(rules.isAllowed("http://www.fict.org/index.html")); - assertTrue(rules.isAllowed("http://www.fict.org/robots.txt")); - assertTrue(rules.isAllowed("http://www.fict.org/server.html")); - assertTrue(rules.isAllowed("http://www.fict.org/services/fast.html")); - assertTrue(rules.isAllowed("http://www.fict.org/services/slow.html")); - assertFalse(rules.isAllowed("http://www.fict.org/orgo.gif")); - assertTrue(rules.isAllowed("http://www.fict.org/org/about.html")); - assertFalse(rules.isAllowed("http://www.fict.org/org/plans.html")); - assertFalse(rules.isAllowed("http://www.fict.org/%7Ejim/jim.html")); - assertTrue(rules.isAllowed("http://www.fict.org/%7Emak/mak.html")); - + BaseRobotRules rules = createRobotRules("WebCrawler/3.0", mixedEndingsRobotsTxt); + assertEquals(isAllowed, rules.isAllowed(urlStr)); } - @Test - public void testRfpCases() { + @ParameterizedTest + @CsvSource({ "False, http://www.fict.org/",// + "False, http://www.fict.org/index.html",// + "True, http://www.fict.org/robots.txt",// + "True, http://www.fict.org/server.html",// + "True, http://www.fict.org/services/fast.html",// + "True, 
http://www.fict.org/services/slow.html",// + "False, http://www.fict.org/orgo.gif",// + "True, http://www.fict.org/org/about.html",// + "False, http://www.fict.org/org/plans.html",// + "False, http://www.fict.org/%7Ejim/jim.html",// + "True, http://www.fict.org/%7Emak/mak.html" }) + void testMixedEndings2(boolean isAllowed, String urlStr) { + final String mixedEndingsRobotsTxt = "# /robots.txt for http://www.fict.org/" + CRLF + "# comments to webmaster@fict.org" + CR + LF + "User-agent: unhipbot" + LF + "Disallow: /" + CR + "" + + CRLF + "User-agent: webcrawler" + LF + "User-agent: excite" + CR + "Disallow: " + "\u0085" + CR + "User-agent: *" + CRLF + "Disallow: /org/plans.html" + LF + "Allow: /org/" + + CR + "Allow: /serv" + CRLF + "Allow: /~mak" + LF + "Disallow: /" + CRLF; + + BaseRobotRules rules = createRobotRules("Unknown/1.0", mixedEndingsRobotsTxt); + assertEquals(isAllowed, rules.isAllowed(urlStr)); + } + + @ParameterizedTest + @CsvSource({ "False, http://www.fict.org/",// + "False, http://www.fict.org/index.html",// + "True, http://www.fict.org/robots.txt",// + "False, http://www.fict.org/server.html",// + "False, http://www.fict.org/services/fast.html",// + "False, http://www.fict.org/services/slow.html",// + "False, http://www.fict.org/orgo.gif",// + "False, http://www.fict.org/org/about.html",// + "False, http://www.fict.org/org/plans.html",// + "False, http://www.fict.org/%7Ejim/jim.html",// + "False, http://www.fict.org/%7Emak/mak.html" }) + void testRfpCases(boolean isAllowed, String urlStr) { // Run through all of the tests that are part of the robots.txt RFP // http://www.robotstxt.org/norobots-rfc.txt final String rfpExampleRobotsTxt = "# /robots.txt for http://www.fict.org/" + CRLF + "# comments to webmaster@fict.org" + CRLF + CRLF + "User-agent: unhipbot" + CRLF + "Disallow: /" + CRLF @@ -233,223 +253,164 @@ public class SimpleRobotRulesParserTest { BaseRobotRules rules; - rules = createRobotRules("UnhipBot/0.1", 
rfpExampleRobotsTxt.getBytes(UTF_8)); - assertFalse(rules.isAllowed("http://www.fict.org/")); - assertFalse(rules.isAllowed("http://www.fict.org/index.html")); - assertTrue(rules.isAllowed("http://www.fict.org/robots.txt")); - assertFalse(rules.isAllowed("http://www.fict.org/server.html")); - assertFalse(rules.isAllowed("http://www.fict.org/services/fast.html")); - assertFalse(rules.isAllowed("http://www.fict.org/services/slow.html")); - assertFalse(rules.isAllowed("http://www.fict.org/orgo.gif")); - assertFalse(rules.isAllowed("http://www.fict.org/org/about.html")); - assertFalse(rules.isAllowed("http://www.fict.org/org/plans.html")); - assertFalse(rules.isAllowed("http://www.fict.org/%7Ejim/jim.html")); - assertFalse(rules.isAllowed("http://www.fict.org/%7Emak/mak.html")); + rules = createRobotRules("UnhipBot/0.1", rfpExampleRobotsTxt); + assertEquals(isAllowed, rules.isAllowed(urlStr)); - rules = createRobotRules("WebCrawler/3.0", rfpExampleRobotsTxt.getBytes(UTF_8)); - assertTrue(rules.isAllowed("http://www.fict.org/")); - assertTrue(rules.isAllowed("http://www.fict.org/index.html")); - assertTrue(rules.isAllowed("http://www.fict.org/robots.txt")); - assertTrue(rules.isAllowed("http://www.fict.org/server.html")); - assertTrue(rules.isAllowed("http://www.fict.org/services/fast.html")); - assertTrue(rules.isAllowed("http://www.fict.org/services/slow.html")); - assertTrue(rules.isAllowed("http://www.fict.org/orgo.gif")); - assertTrue(rules.isAllowed("http://www.fict.org/org/about.html")); - assertTrue(rules.isAllowed("http://www.fict.org/org/plans.html")); - assertTrue(rules.isAllowed("http://www.fict.org/%7Ejim/jim.html")); - assertTrue(rules.isAllowed("http://www.fict.org/%7Emak/mak.html")); + rules = createRobotRules("WebCrawler/3.0", rfpExampleRobotsTxt); + assertTrue(rules.isAllowed(urlStr)); - rules = createRobotRules("Excite/1.0", rfpExampleRobotsTxt.getBytes(UTF_8)); - assertTrue(rules.isAllowed("http://www.fict.org/")); - 
assertTrue(rules.isAllowed("http://www.fict.org/index.html")); - assertTrue(rules.isAllowed("http://www.fict.org/robots.txt")); - assertTrue(rules.isAllowed("http://www.fict.org/server.html")); - assertTrue(rules.isAllowed("http://www.fict.org/services/fast.html")); - assertTrue(rules.isAllowed("http://www.fict.org/services/slow.html")); - assertTrue(rules.isAllowed("http://www.fict.org/orgo.gif")); - assertTrue(rules.isAllowed("http://www.fict.org/org/about.html")); - assertTrue(rules.isAllowed("http://www.fict.org/org/plans.html")); - assertTrue(rules.isAllowed("http://www.fict.org/%7Ejim/jim.html")); - assertTrue(rules.isAllowed("http://www.fict.org/%7Emak/mak.html")); - - rules = createRobotRules("Unknown/1.0", rfpExampleRobotsTxt.getBytes(UTF_8)); - assertFalse(rules.isAllowed("http://www.fict.org/")); - assertFalse(rules.isAllowed("http://www.fict.org/index.html")); - assertTrue(rules.isAllowed("http://www.fict.org/robots.txt")); - assertTrue(rules.isAllowed("http://www.fict.org/server.html")); - assertTrue(rules.isAllowed("http://www.fict.org/services/fast.html")); - assertTrue(rules.isAllowed("http://www.fict.org/services/slow.html")); - assertFalse(rules.isAllowed("http://www.fict.org/orgo.gif")); - assertTrue(rules.isAllowed("http://www.fict.org/org/about.html")); - assertFalse(rules.isAllowed("http://www.fict.org/org/plans.html")); - assertFalse(rules.isAllowed("http://www.fict.org/%7Ejim/jim.html")); - assertTrue(rules.isAllowed("http://www.fict.org/%7Emak/mak.html")); + rules = createRobotRules("Excite/1.0", rfpExampleRobotsTxt); + assertTrue(rules.isAllowed(urlStr)); } - @Test - public void testNutchCases() { - // Run through the Nutch test cases. 
+ @ParameterizedTest + @CsvSource({ "False, http://www.fict.org/",// + "False, http://www.fict.org/index.html",// + "True, http://www.fict.org/robots.txt",// + "True, http://www.fict.org/server.html",// + "True, http://www.fict.org/services/fast.html",// + "True, http://www.fict.org/services/slow.html",// + "False, http://www.fict.org/orgo.gif",// + "True, http://www.fict.org/org/about.html",// + "False, http://www.fict.org/org/plans.html",// + "False, http://www.fict.org/%7Ejim/jim.html",// + "True, http://www.fict.org/%7Emak/mak.html" }) + void testRfpCases2(boolean isAllowed, String urlStr) { + // Run through all of the tests that are part of the robots.txt RFP + // http://www.robotstxt.org/norobots-rfc.txt + final String rfpExampleRobotsTxt = "# /robots.txt for http://www.fict.org/" + CRLF + "# comments to webmaster@fict.org" + CRLF + CRLF + "User-agent: unhipbot" + CRLF + "Disallow: /" + CRLF + + "" + CRLF + "User-agent: webcrawler" + CRLF + "User-agent: excite" + CRLF + "Disallow: " + CRLF + CRLF + "User-agent: *" + CRLF + "Disallow: /org/plans.html" + CRLF + + "Allow: /org/" + CRLF + "Allow: /serv" + CRLF + "Allow: /~mak" + CRLF + "Disallow: /" + CRLF; + BaseRobotRules rules = createRobotRules("Unknown/1.0", rfpExampleRobotsTxt); + assertEquals(isAllowed, rules.isAllowed(urlStr)); + } + + @ParameterizedTest + @CsvSource({ "False, http://www.domain.com/a",// + "False, http://www.domain.com/a/",// + "False, http://www.domain.com/a/bloh/foo.html",// + "True, http://www.domain.com/b",// + "False, http://www.domain.com/b/a",// + "False, http://www.domain.com/b/a/index.html",// + "True, http://www.domain.com/b/b/foo.html",// + "True, http://www.domain.com/c",// + "True, http://www.domain.com/c/a",// + "True, http://www.domain.com/c/a/index.html",// + "True, http://www.domain.com/c/b/foo.html",// + "True, http://www.domain.com/d",// + "True, http://www.domain.com/d/a",// + "True, http://www.domain.com/e/a/index.html",// + "True, http://www.domain.com/e/d",// + 
"True, http://www.domain.com/e/d/foo.html",// + "True, http://www.domain.com/e/doh.html",// + "True, http://www.domain.com/f/index.html",// + "True, http://www.domain.com/foo/bar/baz.html",// + "True, http://www.domain.com/f/" }) + void testNutchCases(boolean isAllowed, String urlStr) { + // Run through the Nutch test cases. final String nutchRobotsTxt = "User-Agent: Agent1 #foo" + CR + "Disallow: /a" + CR + "Disallow: /b/a" + CR + "#Disallow: /c" + CR + "" + CR + "" + CR + "User-Agent: Agent2 Agent3#foo" + CR + "User-Agent: Agent4" + CR + "Disallow: /d" + CR + "Disallow: /e/d/" + CR + "" + CR + "User-Agent: *" + CR + "Disallow: /foo/bar/" + CR; BaseRobotRules rules; - rules = createRobotRules("Agent1", nutchRobotsTxt.getBytes(UTF_8)); - assertFalse(rules.isAllowed("http://www.domain.com/a")); - assertFalse(rules.isAllowed("http://www.domain.com/a/")); - assertFalse(rules.isAllowed("http://www.domain.com/a/bloh/foo.html")); - assertTrue(rules.isAllowed("http://www.domain.com/b")); - assertFalse(rules.isAllowed("http://www.domain.com/b/a")); - assertFalse(rules.isAllowed("http://www.domain.com/b/a/index.html")); - assertTrue(rules.isAllowed("http://www.domain.com/b/b/foo.html")); - assertTrue(rules.isAllowed("http://www.domain.com/c")); - assertTrue(rules.isAllowed("http://www.domain.com/c/a")); - assertTrue(rules.isAllowed("http://www.domain.com/c/a/index.html")); - assertTrue(rules.isAllowed("http://www.domain.com/c/b/foo.html")); - assertTrue(rules.isAllowed("http://www.domain.com/d")); - assertTrue(rules.isAllowed("http://www.domain.com/d/a")); - assertTrue(rules.isAllowed("http://www.domain.com/e/a/index.html")); - assertTrue(rules.isAllowed("http://www.domain.com/e/d")); - assertTrue(rules.isAllowed("http://www.domain.com/e/d/foo.html")); - assertTrue(rules.isAllowed("http://www.domain.com/e/doh.html")); - assertTrue(rules.isAllowed("http://www.domain.com/f/index.html")); - assertTrue(rules.isAllowed("http://www.domain.com/foo/bar/baz.html")); - 
assertTrue(rules.isAllowed("http://www.domain.com/f/")); - - rules = createRobotRules("Agent2", nutchRobotsTxt.getBytes(UTF_8)); - assertTrue(rules.isAllowed("http://www.domain.com/a")); - assertTrue(rules.isAllowed("http://www.domain.com/a/")); - assertTrue(rules.isAllowed("http://www.domain.com/a/bloh/foo.html")); - assertTrue(rules.isAllowed("http://www.domain.com/b")); - assertTrue(rules.isAllowed("http://www.domain.com/b/a")); - assertTrue(rules.isAllowed("http://www.domain.com/b/a/index.html")); - assertTrue(rules.isAllowed("http://www.domain.com/b/b/foo.html")); - assertTrue(rules.isAllowed("http://www.domain.com/c")); - assertTrue(rules.isAllowed("http://www.domain.com/c/a")); - assertTrue(rules.isAllowed("http://www.domain.com/c/a/index.html")); - assertTrue(rules.isAllowed("http://www.domain.com/c/b/foo.html")); - assertFalse(rules.isAllowed("http://www.domain.com/d")); - assertFalse(rules.isAllowed("http://www.domain.com/d/a")); - assertTrue(rules.isAllowed("http://www.domain.com/e/a/index.html")); - assertTrue(rules.isAllowed("http://www.domain.com/e/d")); - assertFalse(rules.isAllowed("http://www.domain.com/e/d/foo.html")); - assertTrue(rules.isAllowed("http://www.domain.com/e/doh.html")); - assertTrue(rules.isAllowed("http://www.domain.com/f/index.html")); - assertTrue(rules.isAllowed("http://www.domain.com/foo/bar/baz.html")); - assertTrue(rules.isAllowed("http://www.domain.com/f/")); - - rules = createRobotRules("Agent3", nutchRobotsTxt.getBytes(UTF_8)); - assertTrue(rules.isAllowed("http://www.domain.com/a")); - assertTrue(rules.isAllowed("http://www.domain.com/a/")); - assertTrue(rules.isAllowed("http://www.domain.com/a/bloh/foo.html")); - assertTrue(rules.isAllowed("http://www.domain.com/b")); - assertTrue(rules.isAllowed("http://www.domain.com/b/a")); - assertTrue(rules.isAllowed("http://www.domain.com/b/a/index.html")); - assertTrue(rules.isAllowed("http://www.domain.com/b/b/foo.html")); - assertTrue(rules.isAllowed("http://www.domain.com/c")); 
- assertTrue(rules.isAllowed("http://www.domain.com/c/a")); - assertTrue(rules.isAllowed("http://www.domain.com/c/a/index.html")); - assertTrue(rules.isAllowed("http://www.domain.com/c/b/foo.html")); - assertFalse(rules.isAllowed("http://www.domain.com/d")); - assertFalse(rules.isAllowed("http://www.domain.com/d/a")); - assertTrue(rules.isAllowed("http://www.domain.com/e/a/index.html")); - assertTrue(rules.isAllowed("http://www.domain.com/e/d")); - assertFalse(rules.isAllowed("http://www.domain.com/e/d/foo.html")); - assertTrue(rules.isAllowed("http://www.domain.com/e/doh.html")); - assertTrue(rules.isAllowed("http://www.domain.com/f/index.html")); - assertTrue(rules.isAllowed("http://www.domain.com/foo/bar/baz.html")); - assertTrue(rules.isAllowed("http://www.domain.com/f/")); - - rules = createRobotRules("Agent4", nutchRobotsTxt.getBytes(UTF_8)); - assertTrue(rules.isAllowed("http://www.domain.com/a")); - assertTrue(rules.isAllowed("http://www.domain.com/a/")); - assertTrue(rules.isAllowed("http://www.domain.com/a/bloh/foo.html")); - assertTrue(rules.isAllowed("http://www.domain.com/b")); - assertTrue(rules.isAllowed("http://www.domain.com/b/a")); - assertTrue(rules.isAllowed("http://www.domain.com/b/a/index.html")); - assertTrue(rules.isAllowed("http://www.domain.com/b/b/foo.html")); - assertTrue(rules.isAllowed("http://www.domain.com/c")); - assertTrue(rules.isAllowed("http://www.domain.com/c/a")); - assertTrue(rules.isAllowed("http://www.domain.com/c/a/index.html")); - assertTrue(rules.isAllowed("http://www.domain.com/c/b/foo.html")); - assertFalse(rules.isAllowed("http://www.domain.com/d")); - assertFalse(rules.isAllowed("http://www.domain.com/d/a")); - assertTrue(rules.isAllowed("http://www.domain.com/e/a/index.html")); - assertTrue(rules.isAllowed("http://www.domain.com/e/d")); - assertFalse(rules.isAllowed("http://www.domain.com/e/d/foo.html")); - assertTrue(rules.isAllowed("http://www.domain.com/e/doh.html")); - 
assertTrue(rules.isAllowed("http://www.domain.com/f/index.html")); - assertTrue(rules.isAllowed("http://www.domain.com/foo/bar/baz.html")); - assertTrue(rules.isAllowed("http://www.domain.com/f/")); - - rules = createRobotRules("Agent5", nutchRobotsTxt.getBytes(UTF_8)); - assertTrue(rules.isAllowed("http://www.domain.com/a")); - assertTrue(rules.isAllowed("http://www.domain.com/a/")); - assertTrue(rules.isAllowed("http://www.domain.com/a/bloh/foo.html")); - assertTrue(rules.isAllowed("http://www.domain.com/b")); - assertTrue(rules.isAllowed("http://www.domain.com/b/a")); - assertTrue(rules.isAllowed("http://www.domain.com/b/a/index.html")); - assertTrue(rules.isAllowed("http://www.domain.com/b/b/foo.html")); - assertTrue(rules.isAllowed("http://www.domain.com/c")); - assertTrue(rules.isAllowed("http://www.domain.com/c/a")); - assertTrue(rules.isAllowed("http://www.domain.com/c/a/index.html")); - assertTrue(rules.isAllowed("http://www.domain.com/c/b/foo.html")); - assertTrue(rules.isAllowed("http://www.domain.com/d")); - assertTrue(rules.isAllowed("http://www.domain.com/d/a")); - assertTrue(rules.isAllowed("http://www.domain.com/e/a/index.html")); - assertTrue(rules.isAllowed("http://www.domain.com/e/d")); - assertTrue(rules.isAllowed("http://www.domain.com/e/d/foo.html")); - assertTrue(rules.isAllowed("http://www.domain.com/e/doh.html")); - assertTrue(rules.isAllowed("http://www.domain.com/f/index.html")); - assertFalse(rules.isAllowed("http://www.domain.com/foo/bar/baz.html")); - assertTrue(rules.isAllowed("http://www.domain.com/f/")); + rules = createRobotRules("Agent1", nutchRobotsTxt); + assertEquals(isAllowed, rules.isAllowed(urlStr)); // Note that the SimpleRobotRulesParser only parses the rule set of the // first matching agent name. For the following example, the parser // returns only the rules matching 'Agent1'. 
- rules = createRobotRules("Agent5,Agent2,Agent1,Agent3,*", nutchRobotsTxt.getBytes(UTF_8)); - assertFalse(rules.isAllowed("http://www.domain.com/a")); - assertFalse(rules.isAllowed("http://www.domain.com/a/")); - assertFalse(rules.isAllowed("http://www.domain.com/a/bloh/foo.html")); - assertTrue(rules.isAllowed("http://www.domain.com/b")); - assertFalse(rules.isAllowed("http://www.domain.com/b/a")); - assertFalse(rules.isAllowed("http://www.domain.com/b/a/index.html")); - assertTrue(rules.isAllowed("http://www.domain.com/b/b/foo.html")); - assertTrue(rules.isAllowed("http://www.domain.com/c")); - assertTrue(rules.isAllowed("http://www.domain.com/c/a")); - assertTrue(rules.isAllowed("http://www.domain.com/c/a/index.html")); - assertTrue(rules.isAllowed("http://www.domain.com/c/b/foo.html")); - assertTrue(rules.isAllowed("http://www.domain.com/d")); - assertTrue(rules.isAllowed("http://www.domain.com/d/a")); - assertTrue(rules.isAllowed("http://www.domain.com/e/a/index.html")); - assertTrue(rules.isAllowed("http://www.domain.com/e/d")); - assertTrue(rules.isAllowed("http://www.domain.com/e/d/foo.html")); - assertTrue(rules.isAllowed("http://www.domain.com/e/doh.html")); - assertTrue(rules.isAllowed("http://www.domain.com/f/index.html")); - assertTrue(rules.isAllowed("http://www.domain.com/foo/bar/baz.html")); - assertTrue(rules.isAllowed("http://www.domain.com/f/")); + rules = createRobotRules("Agent5,Agent2,Agent1,Agent3,*", nutchRobotsTxt); + assertEquals(isAllowed, rules.isAllowed(urlStr)); + } + + @ParameterizedTest + @CsvSource({ "True, http://www.domain.com/a",// + "True, http://www.domain.com/a/",// + "True, http://www.domain.com/a/bloh/foo.html",// + "True, http://www.domain.com/b",// + "True, http://www.domain.com/b/a",// + "True, http://www.domain.com/b/a/index.html",// + "True, http://www.domain.com/b/b/foo.html",// + "True, http://www.domain.com/c",// + "True, http://www.domain.com/c/a",// + "True, http://www.domain.com/c/a/index.html",// + "True, 
http://www.domain.com/c/b/foo.html",// + "False, http://www.domain.com/d",// + "False, http://www.domain.com/d/a",// + "True, http://www.domain.com/e/a/index.html",// + "True, http://www.domain.com/e/d",// + "False, http://www.domain.com/e/d/foo.html",// + "True, http://www.domain.com/e/doh.html",// + "True, http://www.domain.com/f/index.html",// + "True, http://www.domain.com/foo/bar/baz.html",// + "True, http://www.domain.com/f/" }) + void testNutchCases2(boolean isAllowed, String urlStr) { + // Run through the Nutch test cases. + final String nutchRobotsTxt = "User-Agent: Agent1 #foo" + CR + "Disallow: /a" + CR + "Disallow: /b/a" + CR + "#Disallow: /c" + CR + "" + CR + "" + CR + "User-Agent: Agent2 Agent3#foo" + CR + + "User-Agent: Agent4" + CR + "Disallow: /d" + CR + "Disallow: /e/d/" + CR + "" + CR + "User-Agent: *" + CR + "Disallow: /foo/bar/" + CR; + + BaseRobotRules rules; + + rules = createRobotRules("Agent2", nutchRobotsTxt); + assertEquals(isAllowed, rules.isAllowed(urlStr)); + + rules = createRobotRules("Agent3", nutchRobotsTxt); + assertEquals(isAllowed, rules.isAllowed(urlStr)); + + rules = createRobotRules("Agent4", nutchRobotsTxt); + assertEquals(isAllowed, rules.isAllowed(urlStr)); + } + + @ParameterizedTest + @CsvSource({ "True, http://www.domain.com/a",// + "True, http://www.domain.com/a/",// + "True, http://www.domain.com/a/bloh/foo.html",// + "True, http://www.domain.com/b",// + "True, http://www.domain.com/b/a",// + "True, http://www.domain.com/b/a/index.html",// + "True, http://www.domain.com/b/b/foo.html",// + "True, http://www.domain.com/c",// + "True, http://www.domain.com/c/a",// + "True, http://www.domain.com/c/a/index.html",// + "True, http://www.domain.com/c/b/foo.html",// + "True, http://www.domain.com/d",// + "True, http://www.domain.com/d/a",// + "True, http://www.domain.com/e/a/index.html",// + "True, http://www.domain.com/e/d",// + "True, http://www.domain.com/e/d/foo.html",// + "True, http://www.domain.com/e/doh.html",// + "True, 
http://www.domain.com/f/index.html",// + "False, http://www.domain.com/foo/bar/baz.html",// + "True, http://www.domain.com/f/" }) + void testNutchCases3(boolean isAllowed, String urlStr) { + // Run through the Nutch test cases. + final String nutchRobotsTxt = "User-Agent: Agent1 #foo" + CR + "Disallow: /a" + CR + "Disallow: /b/a" + CR + "#Disallow: /c" + CR + "" + CR + "" + CR + "User-Agent: Agent2 Agent3#foo" + CR + + "User-Agent: Agent4" + CR + "Disallow: /d" + CR + "Disallow: /e/d/" + CR + "" + CR + "User-Agent: *" + CR + "Disallow: /foo/bar/" + CR; + + BaseRobotRules rules = createRobotRules("Agent5", nutchRobotsTxt); + assertEquals(isAllowed, rules.isAllowed(urlStr)); } @Test - public void testHtmlMarkupInRobotsTxt() { + void testHtmlMarkupInRobotsTxt() { final String htmlRobotsTxt = "\n" + "
\n" + "