From 1c74fc6ca057b02401d41319ba10f90df9c96f0b Mon Sep 17 00:00:00 2001 From: Avi Hayun Date: Tue, 23 Jul 2019 15:13:45 +0300 Subject: [PATCH] The robots.txt test now supports the junit5 parameterizedTest, gaining us the ability to have each robot rule be tested separately, so if one rule fails, the others can still show success in their own unit tests. (#256) IMHO it also makes the tests more readable but that is in the eyes of the beholder This files is smaller by 40 lines, and has 172 unit tests instead of 46 tests --- .../robots/SimpleRobotRulesParserTest.java | 680 +++++++++--------- 1 file changed, 320 insertions(+), 360 deletions(-) diff --git a/src/test/java/crawlercommons/robots/SimpleRobotRulesParserTest.java b/src/test/java/crawlercommons/robots/SimpleRobotRulesParserTest.java index 63ff431..58516eb 100644 --- a/src/test/java/crawlercommons/robots/SimpleRobotRulesParserTest.java +++ b/src/test/java/crawlercommons/robots/SimpleRobotRulesParserTest.java @@ -17,6 +17,8 @@ package crawlercommons.robots; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.CsvSource; import javax.servlet.http.HttpServletResponse; import java.io.InputStream; @@ -27,204 +29,222 @@ import static java.nio.charset.StandardCharsets.US_ASCII; import static java.nio.charset.StandardCharsets.UTF_8; import static org.junit.jupiter.api.Assertions.*; -public class SimpleRobotRulesParserTest { +class SimpleRobotRulesParserTest { private static final String LF = "\n"; private static final String CR = "\r"; private static final String CRLF = "\r\n"; private static final String FAKE_ROBOTS_URL = "http://domain.com"; - private static BaseRobotRules createRobotRules(String crawlerName, byte[] content) { + private static BaseRobotRules createRobotRules(String crawlerName, String content) { + return createRobotRules(crawlerName, content.getBytes(UTF_8)); + } + + private static BaseRobotRules createRobotRules(String crawlerName, byte[] contentBytes) { SimpleRobotRulesParser robotParser = new SimpleRobotRulesParser(); - return robotParser.parseContent(FAKE_ROBOTS_URL, content, "text/plain", crawlerName); + return robotParser.parseContent(FAKE_ROBOTS_URL, contentBytes, "text/plain", crawlerName); } @Test - public void testEmptyRules() { - BaseRobotRules rules = createRobotRules("Any-darn-crawler", "".getBytes(UTF_8)); + void testEmptyRules() { + BaseRobotRules rules = createRobotRules("Any-darn-crawler", ""); assertTrue(rules.isAllowed("http://www.domain.com/anypage.html")); } @Test - public void testQueryParamInDisallow() { + void testQueryParamInDisallow() { final String simpleRobotsTxt = "User-agent: *" + CRLF + "Disallow: /index.cfm?fuseaction=sitesearch.results*"; - BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes(UTF_8)); + BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt); assertFalse(rules.isAllowed("http://searchservice.domain.com/index.cfm?fuseaction=sitesearch.results&type=People&qry=california&pg=2")); } - @Test - public void testGooglePatternMatching() { - + @ParameterizedTest + @CsvSource({ "False, http://www.fict.com/fish",// + "False, http://www.fict.com/fish.html",// + "False, http://www.fict.com/fish/salmon.html",// + "False, http://www.fict.com/fishheads",// + "False, http://www.fict.com/fishheads/yummy.html",// + "False, http://www.fict.com/fish.php?id=anything",// + "True, http://www.fict.com/Fish.asp",// + "True, http://www.fict.com/catfish",// + "True, 
http://www.fict.com/?id=fish",// + "True, http://www.fict.com/fis" }) + void testGooglePatternMatching1(boolean isAllowed, String urlStr) { // Test for /fish final String simpleRobotsTxt1 = "User-agent: *" + CRLF + "Disallow: /fish" + CRLF; - - BaseRobotRules rule1 = createRobotRules("Any-darn-crawler", simpleRobotsTxt1.getBytes(UTF_8)); - assertFalse(rule1.isAllowed("http://www.fict.com/fish")); - assertFalse(rule1.isAllowed("http://www.fict.com/fish.html")); - assertFalse(rule1.isAllowed("http://www.fict.com/fish/salmon.html")); - assertFalse(rule1.isAllowed("http://www.fict.com/fishheads")); - assertFalse(rule1.isAllowed("http://www.fict.com/fishheads/yummy.html")); - assertFalse(rule1.isAllowed("http://www.fict.com/fish.php?id=anything")); - - assertTrue(rule1.isAllowed("http://www.fict.com/Fish.asp")); - assertTrue(rule1.isAllowed("http://www.fict.com/catfish")); - assertTrue(rule1.isAllowed("http://www.fict.com/?id=fish")); - assertTrue(rule1.isAllowed("http://www.fict.com/fis")); + BaseRobotRules rule1 = createRobotRules("Any-darn-crawler", simpleRobotsTxt1); + assertEquals(isAllowed, rule1.isAllowed(urlStr)); // Test for /fish* final String simpleRobotsTxt2 = "User-agent: *" + CRLF + "Disallow: /fish*" + CRLF; + BaseRobotRules rule2 = createRobotRules("Any-darn-crawler", simpleRobotsTxt2); + assertEquals(isAllowed, rule2.isAllowed(urlStr)); + } - BaseRobotRules rule2 = createRobotRules("Any-darn-crawler", simpleRobotsTxt2.getBytes(UTF_8)); - assertFalse(rule2.isAllowed("http://www.fict.com/fish")); - assertFalse(rule2.isAllowed("http://www.fict.com/fish.html")); - assertFalse(rule2.isAllowed("http://www.fict.com/fish/salmon.html")); - assertFalse(rule2.isAllowed("http://www.fict.com/fishheads")); - assertFalse(rule2.isAllowed("http://www.fict.com/fishheads/yummy.html")); - assertFalse(rule2.isAllowed("http://www.fict.com/fish.php?id=anything")); - - assertTrue(rule2.isAllowed("http://www.fict.com/Fish.asp")); - assertTrue(rule2.isAllowed("http://www.fict.com/catfish")); - assertTrue(rule2.isAllowed("http://www.fict.com/?id=fish")); - assertTrue(rule2.isAllowed("http://www.fict.com/fis")); - - // Test for /fish/ - final String simpleRobotsTxt3 = "User-agent: *" + CRLF + "Disallow: /fish/" + CRLF; - - BaseRobotRules rule3 = createRobotRules("Any-darn-crawler", simpleRobotsTxt3.getBytes(UTF_8)); - assertFalse(rule3.isAllowed("http://www.fict.com/fish/")); - assertFalse(rule3.isAllowed("http://www.fict.com/fish/?id=anything")); - assertFalse(rule3.isAllowed("http://www.fict.com/fish/salmon.htm")); - - assertTrue(rule3.isAllowed("http://www.fict.com/fish")); - assertTrue(rule3.isAllowed("http://www.fict.com/fish.html")); - assertTrue(rule3.isAllowed("http://www.fict.com/Fish/Salmon.asp")); + @ParameterizedTest + @CsvSource({ "False, http://www.fict.com/fish/",// + "False, http://www.fict.com/fish/?id=anything",// + "False, http://www.fict.com/fish/salmon.htm",// + "True, http://www.fict.com/fish",// + "True, http://www.fict.com/fish.html",// + "True, http://www.fict.com/Fish/Salmon.asp" }) + void testGooglePatternMatching2(boolean isAllowed, String urlStr) { + // Test for /fish + final String simpleRobotsTxt = "User-agent: *" + CRLF + "Disallow: /fish/" + CRLF; + BaseRobotRules rule = createRobotRules("Any-darn-crawler", simpleRobotsTxt); + assertEquals(isAllowed, rule.isAllowed(urlStr)); + } + @ParameterizedTest + @CsvSource({ "False, http://www.fict.com/filename.php",// + "False, http://www.fict.com/folder/filename.php",// + "False, 
http://www.fict.com/folder/filename.php?parameters",// + "False, http://www.fict.com/folder/any.php.file.html",// + "False, http://www.fict.com/filename.php/",// + "True, http://www.fict.com/",// + "True, http://www.fict.com/windows.PHP" }) + void testGooglePatternMatching3(boolean isAllowed, String urlStr) { // Test for /*.php - final String simpleRobotsTxt4 = "User-agent: *" + CRLF + "Disallow: /*.php" + CRLF; - - BaseRobotRules rule4 = createRobotRules("Any-darn-crawler", simpleRobotsTxt4.getBytes(UTF_8)); - assertFalse(rule4.isAllowed("http://www.fict.com/filename.php")); - assertFalse(rule4.isAllowed("http://www.fict.com/folder/filename.php")); - assertFalse(rule4.isAllowed("http://www.fict.com/folder/filename.php?parameters")); - assertFalse(rule4.isAllowed("http://www.fict.com/folder/any.php.file.html")); - assertFalse(rule4.isAllowed("http://www.fict.com/filename.php/")); - - assertTrue(rule4.isAllowed("http://www.fict.com/")); - assertTrue(rule4.isAllowed("http://www.fict.com/windows.PHP")); + final String simpleRobotsTxt = "User-agent: *" + CRLF + "Disallow: /*.php" + CRLF; + BaseRobotRules rule = createRobotRules("Any-darn-crawler", simpleRobotsTxt); + assertEquals(isAllowed, rule.isAllowed(urlStr)); + } + @ParameterizedTest + @CsvSource({ "False, http://www.fict.com/filename.php",// + "False, http://www.fict.com/folder/filename.php",// + "True, http://www.fict.com/filename.php?parameters",// + "True, http://www.fict.com/filename.php/",// + "True, http://www.fict.com/filename.php5",// + "True, http://www.fict.com/windows.PHP" }) + void testGooglePatternMatching4(boolean isAllowed, String urlStr) { // Test for /*.php$ - final String simpleRobotsTxt5 = "User-agent: *" + CRLF + "Disallow: /*.php$" + CRLF; - - BaseRobotRules rule5 = createRobotRules("Any-darn-crawler", simpleRobotsTxt5.getBytes(UTF_8)); - assertFalse(rule5.isAllowed("http://www.fict.com/filename.php")); - assertFalse(rule5.isAllowed("http://www.fict.com/folder/filename.php")); - - assertTrue(rule5.isAllowed("http://www.fict.com/filename.php?parameters")); - assertTrue(rule5.isAllowed("http://www.fict.com/filename.php/")); - assertTrue(rule5.isAllowed("http://www.fict.com/filename.php5")); - assertTrue(rule5.isAllowed("http://www.fict.com/windows.PHP")); + final String simpleRobotsTxt = "User-agent: *" + CRLF + "Disallow: /*.php$" + CRLF; + BaseRobotRules rule = createRobotRules("Any-darn-crawler", simpleRobotsTxt); + assertEquals(isAllowed, rule.isAllowed(urlStr)); + } + @ParameterizedTest + @CsvSource({ "False, http://www.fict.com/fish.php",// + "False, http://www.fict.com/fishheads/catfish.php?parameters",// + "True, http://www.fict.com/Fish.PHP" }) + void testGooglePatternMatching5(boolean isAllowed, String urlStr) { // Test for /fish*.php - final String simpleRobotsTxt6 = "User-agent: *" + CRLF + "Disallow: /fish*.php" + CRLF; - - BaseRobotRules rule6 = createRobotRules("Any-darn-crawler", simpleRobotsTxt6.getBytes(UTF_8)); - assertFalse(rule6.isAllowed("http://www.fict.com/fish.php")); - assertFalse(rule6.isAllowed("http://www.fict.com/fishheads/catfish.php?parameters")); - - assertTrue(rule6.isAllowed("http://www.fict.com/Fish.PHP")); + final String simpleRobotsTxt = "User-agent: *" + CRLF + "Disallow: /fish*.php" + CRLF; + BaseRobotRules rule = createRobotRules("Any-darn-crawler", simpleRobotsTxt); + assertEquals(isAllowed, rule.isAllowed(urlStr)); + } + @ParameterizedTest + @CsvSource({ "False, http://www.fict.com/fish.php",// + "False, http://www.fict.com/superfishheads/catfish.php?parameters",// + "True, 
http://www.fict.com/fishheads/catfish.htm" }) + void testGooglePatternMatching6(boolean isAllowed, String urlStr) { // Test rule with multiple '*' characters - final String simpleRobotsTxt7 = "User-agent: *" + CRLF + "Disallow: /*fish*.php" + CRLF; - - BaseRobotRules rule7 = createRobotRules("Any-darn-crawler", simpleRobotsTxt7.getBytes(UTF_8)); - assertFalse(rule7.isAllowed("http://www.fict.com/fish.php")); - assertFalse(rule7.isAllowed("http://www.fict.com/superfishheads/catfish.php?parameters")); - assertTrue(rule7.isAllowed("http://www.fict.com/fishheads/catfish.htm")); + final String simpleRobotsTxt = "User-agent: *" + CRLF + "Disallow: /*fish*.php" + CRLF; + BaseRobotRules rule = createRobotRules("Any-darn-crawler", simpleRobotsTxt); + assertEquals(isAllowed, rule.isAllowed(urlStr)); } @Test - public void testCommentedOutLines() { + void testCommentedOutLines() { final String simpleRobotsTxt = "#user-agent: testAgent" + LF + LF + "#allow: /index.html" + LF + "#allow: /test" + LF + LF + "#user-agent: test" + LF + LF + "#allow: /index.html" + LF + "#disallow: /test" + LF + LF + "#user-agent: someAgent" + LF + LF + "#disallow: /index.html" + LF + "#disallow: /test" + LF + LF; - BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes(UTF_8)); + BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt); assertTrue(rules.isAllowed("http://www.domain.com/anypage.html")); } @Test - public void testRobotsTxtAlwaysAllowed() { + void testRobotsTxtAlwaysAllowed() { final String simpleRobotsTxt = "User-agent: *" + CRLF + "Disallow: /"; - BaseRobotRules rules = createRobotRules("any-darn-crawler", simpleRobotsTxt.getBytes(UTF_8)); + BaseRobotRules rules = createRobotRules("any-darn-crawler", simpleRobotsTxt); assertTrue(rules.isAllowed("http://www.domain.com/robots.txt")); } @Test - public void testAgentNotListed() { + void testAgentNotListed() { // Access is assumed to be allowed, if no rules match an agent. 
final String simpleRobotsTxt = "User-agent: crawler1" + CRLF + "Disallow: /index.html" + CRLF + "Allow: /" + CRLF + CRLF + "User-agent: crawler2" + CRLF + "Disallow: /"; - BaseRobotRules rules = createRobotRules("crawler3", simpleRobotsTxt.getBytes(UTF_8)); + BaseRobotRules rules = createRobotRules("crawler3", simpleRobotsTxt); assertTrue(rules.isAllowed("http://www.domain.com/anypage.html")); assertTrue(rules.isAllowed("http://www.domain.com/index.html")); } @Test - public void testNonAsciiEncoding() { + void testNonAsciiEncoding() { final String simpleRobotsTxt = "User-agent: *" + " # \u00A2 \u20B5" + CRLF + "Disallow:"; - BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes(UTF_8)); + BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt); assertTrue(rules.isAllowed("http://www.domain.com/anypage.html")); } @Test - public void testSimplestAllowAll() { + void testSimplestAllowAll() { final String simpleRobotsTxt = "User-agent: *" + CRLF + "Disallow:"; - BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes(UTF_8)); + BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt); assertTrue(rules.isAllowed("http://www.domain.com/anypage.html")); } // https://github.com/crawler-commons/crawler-commons/issues/215 - @Test - public void testDisallowWithQueryOnly() { + void testDisallowWithQueryOnly() { final String simpleRobotsTxt = "User-agent: *" + CRLF + "Disallow: /"; - BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes(UTF_8)); + BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt); assertFalse(rules.isAllowed("http://www.example.com")); assertFalse(rules.isAllowed("http://www.example.com?q=a")); } - @Test - public void testMixedEndings() { + @ParameterizedTest + @CsvSource({ "True, http://www.fict.org/",// + "True, http://www.fict.org/index.html" }) + void testMixedEndings1(boolean isAllowed, String urlStr) { final String mixedEndingsRobotsTxt = "# /robots.txt for http://www.fict.org/" + CRLF + "# comments to webmaster@fict.org" + CR + LF + "User-agent: unhipbot" + LF + "Disallow: /" + CR + "" + CRLF + "User-agent: webcrawler" + LF + "User-agent: excite" + CR + "Disallow: " + "\u0085" + CR + "User-agent: *" + CRLF + "Disallow: /org/plans.html" + LF + "Allow: /org/" + CR + "Allow: /serv" + CRLF + "Allow: /~mak" + LF + "Disallow: /" + CRLF; - BaseRobotRules rules; - - rules = createRobotRules("WebCrawler/3.0", mixedEndingsRobotsTxt.getBytes(UTF_8)); - assertTrue(rules.isAllowed("http://www.fict.org/")); - assertTrue(rules.isAllowed("http://www.fict.org/index.html")); - - rules = createRobotRules("Unknown/1.0", mixedEndingsRobotsTxt.getBytes(UTF_8)); - assertFalse(rules.isAllowed("http://www.fict.org/")); - assertFalse(rules.isAllowed("http://www.fict.org/index.html")); - assertTrue(rules.isAllowed("http://www.fict.org/robots.txt")); - assertTrue(rules.isAllowed("http://www.fict.org/server.html")); - assertTrue(rules.isAllowed("http://www.fict.org/services/fast.html")); - assertTrue(rules.isAllowed("http://www.fict.org/services/slow.html")); - assertFalse(rules.isAllowed("http://www.fict.org/orgo.gif")); - assertTrue(rules.isAllowed("http://www.fict.org/org/about.html")); - assertFalse(rules.isAllowed("http://www.fict.org/org/plans.html")); - assertFalse(rules.isAllowed("http://www.fict.org/%7Ejim/jim.html")); - assertTrue(rules.isAllowed("http://www.fict.org/%7Emak/mak.html")); - + BaseRobotRules rules = 
createRobotRules("WebCrawler/3.0", mixedEndingsRobotsTxt); + assertEquals(isAllowed, rules.isAllowed(urlStr)); } - @Test - public void testRfpCases() { + @ParameterizedTest + @CsvSource({ "False, http://www.fict.org/",// + "False, http://www.fict.org/index.html",// + "True, http://www.fict.org/robots.txt",// + "True, http://www.fict.org/server.html",// + "True, http://www.fict.org/services/fast.html",// + "True, http://www.fict.org/services/slow.html",// + "False, http://www.fict.org/orgo.gif",// + "True, http://www.fict.org/org/about.html",// + "False, http://www.fict.org/org/plans.html",// + "False, http://www.fict.org/%7Ejim/jim.html",// + "True, http://www.fict.org/%7Emak/mak.html" }) + void testMixedEndings2(boolean isAllowed, String urlStr) { + final String mixedEndingsRobotsTxt = "# /robots.txt for http://www.fict.org/" + CRLF + "# comments to webmaster@fict.org" + CR + LF + "User-agent: unhipbot" + LF + "Disallow: /" + CR + "" + + CRLF + "User-agent: webcrawler" + LF + "User-agent: excite" + CR + "Disallow: " + "\u0085" + CR + "User-agent: *" + CRLF + "Disallow: /org/plans.html" + LF + "Allow: /org/" + + CR + "Allow: /serv" + CRLF + "Allow: /~mak" + LF + "Disallow: /" + CRLF; + + BaseRobotRules rules = createRobotRules("Unknown/1.0", mixedEndingsRobotsTxt); + assertEquals(isAllowed, rules.isAllowed(urlStr)); + } + + @ParameterizedTest + @CsvSource({ "False, http://www.fict.org/",// + "False, http://www.fict.org/index.html",// + "True, http://www.fict.org/robots.txt",// + "False, http://www.fict.org/server.html",// + "False, http://www.fict.org/services/fast.html",// + "False, http://www.fict.org/services/slow.html",// + "False, http://www.fict.org/orgo.gif",// + "False, http://www.fict.org/org/about.html",// + "False, http://www.fict.org/org/plans.html",// + "False, http://www.fict.org/%7Ejim/jim.html",// + "False, http://www.fict.org/%7Emak/mak.html" }) + void testRfpCases(boolean isAllowed, String urlStr) { // Run through all of the tests that are part of the robots.txt RFP // http://www.robotstxt.org/norobots-rfc.txt final String rfpExampleRobotsTxt = "# /robots.txt for http://www.fict.org/" + CRLF + "# comments to webmaster@fict.org" + CRLF + CRLF + "User-agent: unhipbot" + CRLF + "Disallow: /" + CRLF @@ -233,223 +253,164 @@ public class SimpleRobotRulesParserTest { BaseRobotRules rules; - rules = createRobotRules("UnhipBot/0.1", rfpExampleRobotsTxt.getBytes(UTF_8)); - assertFalse(rules.isAllowed("http://www.fict.org/")); - assertFalse(rules.isAllowed("http://www.fict.org/index.html")); - assertTrue(rules.isAllowed("http://www.fict.org/robots.txt")); - assertFalse(rules.isAllowed("http://www.fict.org/server.html")); - assertFalse(rules.isAllowed("http://www.fict.org/services/fast.html")); - assertFalse(rules.isAllowed("http://www.fict.org/services/slow.html")); - assertFalse(rules.isAllowed("http://www.fict.org/orgo.gif")); - assertFalse(rules.isAllowed("http://www.fict.org/org/about.html")); - assertFalse(rules.isAllowed("http://www.fict.org/org/plans.html")); - assertFalse(rules.isAllowed("http://www.fict.org/%7Ejim/jim.html")); - assertFalse(rules.isAllowed("http://www.fict.org/%7Emak/mak.html")); + rules = createRobotRules("UnhipBot/0.1", rfpExampleRobotsTxt); + assertEquals(isAllowed, rules.isAllowed(urlStr)); - rules = createRobotRules("WebCrawler/3.0", rfpExampleRobotsTxt.getBytes(UTF_8)); - assertTrue(rules.isAllowed("http://www.fict.org/")); - assertTrue(rules.isAllowed("http://www.fict.org/index.html")); - assertTrue(rules.isAllowed("http://www.fict.org/robots.txt")); 
- assertTrue(rules.isAllowed("http://www.fict.org/server.html")); - assertTrue(rules.isAllowed("http://www.fict.org/services/fast.html")); - assertTrue(rules.isAllowed("http://www.fict.org/services/slow.html")); - assertTrue(rules.isAllowed("http://www.fict.org/orgo.gif")); - assertTrue(rules.isAllowed("http://www.fict.org/org/about.html")); - assertTrue(rules.isAllowed("http://www.fict.org/org/plans.html")); - assertTrue(rules.isAllowed("http://www.fict.org/%7Ejim/jim.html")); - assertTrue(rules.isAllowed("http://www.fict.org/%7Emak/mak.html")); + rules = createRobotRules("WebCrawler/3.0", rfpExampleRobotsTxt); + assertTrue(rules.isAllowed(urlStr)); - rules = createRobotRules("Excite/1.0", rfpExampleRobotsTxt.getBytes(UTF_8)); - assertTrue(rules.isAllowed("http://www.fict.org/")); - assertTrue(rules.isAllowed("http://www.fict.org/index.html")); - assertTrue(rules.isAllowed("http://www.fict.org/robots.txt")); - assertTrue(rules.isAllowed("http://www.fict.org/server.html")); - assertTrue(rules.isAllowed("http://www.fict.org/services/fast.html")); - assertTrue(rules.isAllowed("http://www.fict.org/services/slow.html")); - assertTrue(rules.isAllowed("http://www.fict.org/orgo.gif")); - assertTrue(rules.isAllowed("http://www.fict.org/org/about.html")); - assertTrue(rules.isAllowed("http://www.fict.org/org/plans.html")); - assertTrue(rules.isAllowed("http://www.fict.org/%7Ejim/jim.html")); - assertTrue(rules.isAllowed("http://www.fict.org/%7Emak/mak.html")); - - rules = createRobotRules("Unknown/1.0", rfpExampleRobotsTxt.getBytes(UTF_8)); - assertFalse(rules.isAllowed("http://www.fict.org/")); - assertFalse(rules.isAllowed("http://www.fict.org/index.html")); - assertTrue(rules.isAllowed("http://www.fict.org/robots.txt")); - assertTrue(rules.isAllowed("http://www.fict.org/server.html")); - assertTrue(rules.isAllowed("http://www.fict.org/services/fast.html")); - assertTrue(rules.isAllowed("http://www.fict.org/services/slow.html")); - assertFalse(rules.isAllowed("http://www.fict.org/orgo.gif")); - assertTrue(rules.isAllowed("http://www.fict.org/org/about.html")); - assertFalse(rules.isAllowed("http://www.fict.org/org/plans.html")); - assertFalse(rules.isAllowed("http://www.fict.org/%7Ejim/jim.html")); - assertTrue(rules.isAllowed("http://www.fict.org/%7Emak/mak.html")); + rules = createRobotRules("Excite/1.0", rfpExampleRobotsTxt); + assertTrue(rules.isAllowed(urlStr)); } - @Test - public void testNutchCases() { - // Run through the Nutch test cases. 
+ @ParameterizedTest + @CsvSource({ "False, http://www.fict.org/",// + "False, http://www.fict.org/index.html",// + "True, http://www.fict.org/robots.txt",// + "True, http://www.fict.org/server.html",// + "True, http://www.fict.org/services/fast.html",// + "True, http://www.fict.org/services/slow.html",// + "False, http://www.fict.org/orgo.gif",// + "True, http://www.fict.org/org/about.html",// + "False, http://www.fict.org/org/plans.html",// + "False, http://www.fict.org/%7Ejim/jim.html",// + "True, http://www.fict.org/%7Emak/mak.html" }) + void testRfpCases2(boolean isAllowed, String urlStr) { + // Run through all of the tests that are part of the robots.txt RFP + // http://www.robotstxt.org/norobots-rfc.txt + final String rfpExampleRobotsTxt = "# /robots.txt for http://www.fict.org/" + CRLF + "# comments to webmaster@fict.org" + CRLF + CRLF + "User-agent: unhipbot" + CRLF + "Disallow: /" + CRLF + + "" + CRLF + "User-agent: webcrawler" + CRLF + "User-agent: excite" + CRLF + "Disallow: " + CRLF + CRLF + "User-agent: *" + CRLF + "Disallow: /org/plans.html" + CRLF + + "Allow: /org/" + CRLF + "Allow: /serv" + CRLF + "Allow: /~mak" + CRLF + "Disallow: /" + CRLF; + BaseRobotRules rules = createRobotRules("Unknown/1.0", rfpExampleRobotsTxt); + assertEquals(isAllowed, rules.isAllowed(urlStr)); + } + + @ParameterizedTest + @CsvSource({ "False, http://www.domain.com/a",// + "False, http://www.domain.com/a/",// + "False, http://www.domain.com/a/bloh/foo.html",// + "True, http://www.domain.com/b",// + "False, http://www.domain.com/b/a",// + "False, http://www.domain.com/b/a/index.html",// + "True, http://www.domain.com/b/b/foo.html",// + "True, http://www.domain.com/c",// + "True, http://www.domain.com/c/a",// + "True, http://www.domain.com/c/a/index.html",// + "True, http://www.domain.com/c/b/foo.html",// + "True, http://www.domain.com/d",// + "True, http://www.domain.com/d/a",// + "True, http://www.domain.com/e/a/index.html",// + "True, http://www.domain.com/e/d",// + "True, http://www.domain.com/e/d/foo.html",// + "True, http://www.domain.com/e/doh.html",// + "True, http://www.domain.com/f/index.html",// + "True, http://www.domain.com/foo/bar/baz.html",// + "True, http://www.domain.com/f/" }) + void testNutchCases(boolean isAllowed, String urlStr) { + // Run through the Nutch test cases. 
final String nutchRobotsTxt = "User-Agent: Agent1 #foo" + CR + "Disallow: /a" + CR + "Disallow: /b/a" + CR + "#Disallow: /c" + CR + "" + CR + "" + CR + "User-Agent: Agent2 Agent3#foo" + CR + "User-Agent: Agent4" + CR + "Disallow: /d" + CR + "Disallow: /e/d/" + CR + "" + CR + "User-Agent: *" + CR + "Disallow: /foo/bar/" + CR; BaseRobotRules rules; - rules = createRobotRules("Agent1", nutchRobotsTxt.getBytes(UTF_8)); - assertFalse(rules.isAllowed("http://www.domain.com/a")); - assertFalse(rules.isAllowed("http://www.domain.com/a/")); - assertFalse(rules.isAllowed("http://www.domain.com/a/bloh/foo.html")); - assertTrue(rules.isAllowed("http://www.domain.com/b")); - assertFalse(rules.isAllowed("http://www.domain.com/b/a")); - assertFalse(rules.isAllowed("http://www.domain.com/b/a/index.html")); - assertTrue(rules.isAllowed("http://www.domain.com/b/b/foo.html")); - assertTrue(rules.isAllowed("http://www.domain.com/c")); - assertTrue(rules.isAllowed("http://www.domain.com/c/a")); - assertTrue(rules.isAllowed("http://www.domain.com/c/a/index.html")); - assertTrue(rules.isAllowed("http://www.domain.com/c/b/foo.html")); - assertTrue(rules.isAllowed("http://www.domain.com/d")); - assertTrue(rules.isAllowed("http://www.domain.com/d/a")); - assertTrue(rules.isAllowed("http://www.domain.com/e/a/index.html")); - assertTrue(rules.isAllowed("http://www.domain.com/e/d")); - assertTrue(rules.isAllowed("http://www.domain.com/e/d/foo.html")); - assertTrue(rules.isAllowed("http://www.domain.com/e/doh.html")); - assertTrue(rules.isAllowed("http://www.domain.com/f/index.html")); - assertTrue(rules.isAllowed("http://www.domain.com/foo/bar/baz.html")); - assertTrue(rules.isAllowed("http://www.domain.com/f/")); - - rules = createRobotRules("Agent2", nutchRobotsTxt.getBytes(UTF_8)); - assertTrue(rules.isAllowed("http://www.domain.com/a")); - assertTrue(rules.isAllowed("http://www.domain.com/a/")); - assertTrue(rules.isAllowed("http://www.domain.com/a/bloh/foo.html")); - assertTrue(rules.isAllowed("http://www.domain.com/b")); - assertTrue(rules.isAllowed("http://www.domain.com/b/a")); - assertTrue(rules.isAllowed("http://www.domain.com/b/a/index.html")); - assertTrue(rules.isAllowed("http://www.domain.com/b/b/foo.html")); - assertTrue(rules.isAllowed("http://www.domain.com/c")); - assertTrue(rules.isAllowed("http://www.domain.com/c/a")); - assertTrue(rules.isAllowed("http://www.domain.com/c/a/index.html")); - assertTrue(rules.isAllowed("http://www.domain.com/c/b/foo.html")); - assertFalse(rules.isAllowed("http://www.domain.com/d")); - assertFalse(rules.isAllowed("http://www.domain.com/d/a")); - assertTrue(rules.isAllowed("http://www.domain.com/e/a/index.html")); - assertTrue(rules.isAllowed("http://www.domain.com/e/d")); - assertFalse(rules.isAllowed("http://www.domain.com/e/d/foo.html")); - assertTrue(rules.isAllowed("http://www.domain.com/e/doh.html")); - assertTrue(rules.isAllowed("http://www.domain.com/f/index.html")); - assertTrue(rules.isAllowed("http://www.domain.com/foo/bar/baz.html")); - assertTrue(rules.isAllowed("http://www.domain.com/f/")); - - rules = createRobotRules("Agent3", nutchRobotsTxt.getBytes(UTF_8)); - assertTrue(rules.isAllowed("http://www.domain.com/a")); - assertTrue(rules.isAllowed("http://www.domain.com/a/")); - assertTrue(rules.isAllowed("http://www.domain.com/a/bloh/foo.html")); - assertTrue(rules.isAllowed("http://www.domain.com/b")); - assertTrue(rules.isAllowed("http://www.domain.com/b/a")); - assertTrue(rules.isAllowed("http://www.domain.com/b/a/index.html")); - 
assertTrue(rules.isAllowed("http://www.domain.com/b/b/foo.html")); - assertTrue(rules.isAllowed("http://www.domain.com/c")); - assertTrue(rules.isAllowed("http://www.domain.com/c/a")); - assertTrue(rules.isAllowed("http://www.domain.com/c/a/index.html")); - assertTrue(rules.isAllowed("http://www.domain.com/c/b/foo.html")); - assertFalse(rules.isAllowed("http://www.domain.com/d")); - assertFalse(rules.isAllowed("http://www.domain.com/d/a")); - assertTrue(rules.isAllowed("http://www.domain.com/e/a/index.html")); - assertTrue(rules.isAllowed("http://www.domain.com/e/d")); - assertFalse(rules.isAllowed("http://www.domain.com/e/d/foo.html")); - assertTrue(rules.isAllowed("http://www.domain.com/e/doh.html")); - assertTrue(rules.isAllowed("http://www.domain.com/f/index.html")); - assertTrue(rules.isAllowed("http://www.domain.com/foo/bar/baz.html")); - assertTrue(rules.isAllowed("http://www.domain.com/f/")); - - rules = createRobotRules("Agent4", nutchRobotsTxt.getBytes(UTF_8)); - assertTrue(rules.isAllowed("http://www.domain.com/a")); - assertTrue(rules.isAllowed("http://www.domain.com/a/")); - assertTrue(rules.isAllowed("http://www.domain.com/a/bloh/foo.html")); - assertTrue(rules.isAllowed("http://www.domain.com/b")); - assertTrue(rules.isAllowed("http://www.domain.com/b/a")); - assertTrue(rules.isAllowed("http://www.domain.com/b/a/index.html")); - assertTrue(rules.isAllowed("http://www.domain.com/b/b/foo.html")); - assertTrue(rules.isAllowed("http://www.domain.com/c")); - assertTrue(rules.isAllowed("http://www.domain.com/c/a")); - assertTrue(rules.isAllowed("http://www.domain.com/c/a/index.html")); - assertTrue(rules.isAllowed("http://www.domain.com/c/b/foo.html")); - assertFalse(rules.isAllowed("http://www.domain.com/d")); - assertFalse(rules.isAllowed("http://www.domain.com/d/a")); - assertTrue(rules.isAllowed("http://www.domain.com/e/a/index.html")); - assertTrue(rules.isAllowed("http://www.domain.com/e/d")); - assertFalse(rules.isAllowed("http://www.domain.com/e/d/foo.html")); - assertTrue(rules.isAllowed("http://www.domain.com/e/doh.html")); - assertTrue(rules.isAllowed("http://www.domain.com/f/index.html")); - assertTrue(rules.isAllowed("http://www.domain.com/foo/bar/baz.html")); - assertTrue(rules.isAllowed("http://www.domain.com/f/")); - - rules = createRobotRules("Agent5", nutchRobotsTxt.getBytes(UTF_8)); - assertTrue(rules.isAllowed("http://www.domain.com/a")); - assertTrue(rules.isAllowed("http://www.domain.com/a/")); - assertTrue(rules.isAllowed("http://www.domain.com/a/bloh/foo.html")); - assertTrue(rules.isAllowed("http://www.domain.com/b")); - assertTrue(rules.isAllowed("http://www.domain.com/b/a")); - assertTrue(rules.isAllowed("http://www.domain.com/b/a/index.html")); - assertTrue(rules.isAllowed("http://www.domain.com/b/b/foo.html")); - assertTrue(rules.isAllowed("http://www.domain.com/c")); - assertTrue(rules.isAllowed("http://www.domain.com/c/a")); - assertTrue(rules.isAllowed("http://www.domain.com/c/a/index.html")); - assertTrue(rules.isAllowed("http://www.domain.com/c/b/foo.html")); - assertTrue(rules.isAllowed("http://www.domain.com/d")); - assertTrue(rules.isAllowed("http://www.domain.com/d/a")); - assertTrue(rules.isAllowed("http://www.domain.com/e/a/index.html")); - assertTrue(rules.isAllowed("http://www.domain.com/e/d")); - assertTrue(rules.isAllowed("http://www.domain.com/e/d/foo.html")); - assertTrue(rules.isAllowed("http://www.domain.com/e/doh.html")); - assertTrue(rules.isAllowed("http://www.domain.com/f/index.html")); - 
assertFalse(rules.isAllowed("http://www.domain.com/foo/bar/baz.html")); - assertTrue(rules.isAllowed("http://www.domain.com/f/")); + rules = createRobotRules("Agent1", nutchRobotsTxt); + assertEquals(isAllowed, rules.isAllowed(urlStr)); // Note that the SimpleRobotRulesParser only parses the rule set of the // first matching agent name. For the following example, the parser // returns only the rules matching 'Agent1'. - rules = createRobotRules("Agent5,Agent2,Agent1,Agent3,*", nutchRobotsTxt.getBytes(UTF_8)); - assertFalse(rules.isAllowed("http://www.domain.com/a")); - assertFalse(rules.isAllowed("http://www.domain.com/a/")); - assertFalse(rules.isAllowed("http://www.domain.com/a/bloh/foo.html")); - assertTrue(rules.isAllowed("http://www.domain.com/b")); - assertFalse(rules.isAllowed("http://www.domain.com/b/a")); - assertFalse(rules.isAllowed("http://www.domain.com/b/a/index.html")); - assertTrue(rules.isAllowed("http://www.domain.com/b/b/foo.html")); - assertTrue(rules.isAllowed("http://www.domain.com/c")); - assertTrue(rules.isAllowed("http://www.domain.com/c/a")); - assertTrue(rules.isAllowed("http://www.domain.com/c/a/index.html")); - assertTrue(rules.isAllowed("http://www.domain.com/c/b/foo.html")); - assertTrue(rules.isAllowed("http://www.domain.com/d")); - assertTrue(rules.isAllowed("http://www.domain.com/d/a")); - assertTrue(rules.isAllowed("http://www.domain.com/e/a/index.html")); - assertTrue(rules.isAllowed("http://www.domain.com/e/d")); - assertTrue(rules.isAllowed("http://www.domain.com/e/d/foo.html")); - assertTrue(rules.isAllowed("http://www.domain.com/e/doh.html")); - assertTrue(rules.isAllowed("http://www.domain.com/f/index.html")); - assertTrue(rules.isAllowed("http://www.domain.com/foo/bar/baz.html")); - assertTrue(rules.isAllowed("http://www.domain.com/f/")); + rules = createRobotRules("Agent5,Agent2,Agent1,Agent3,*", nutchRobotsTxt); + assertEquals(isAllowed, rules.isAllowed(urlStr)); + } + + @ParameterizedTest + @CsvSource({ "True, http://www.domain.com/a",// + "True, http://www.domain.com/a/",// + "True, http://www.domain.com/a/bloh/foo.html",// + "True, http://www.domain.com/b",// + "True, http://www.domain.com/b/a",// + "True, http://www.domain.com/b/a/index.html",// + "True, http://www.domain.com/b/b/foo.html",// + "True, http://www.domain.com/c",// + "True, http://www.domain.com/c/a",// + "True, http://www.domain.com/c/a/index.html",// + "True, http://www.domain.com/c/b/foo.html",// + "False, http://www.domain.com/d",// + "False, http://www.domain.com/d/a",// + "True, http://www.domain.com/e/a/index.html",// + "True, http://www.domain.com/e/d",// + "False, http://www.domain.com/e/d/foo.html",// + "True, http://www.domain.com/e/doh.html",// + "True, http://www.domain.com/f/index.html",// + "True, http://www.domain.com/foo/bar/baz.html",// + "True, http://www.domain.com/f/" }) + void testNutchCases2(boolean isAllowed, String urlStr) { + // Run through the Nutch test cases. 
+ final String nutchRobotsTxt = "User-Agent: Agent1 #foo" + CR + "Disallow: /a" + CR + "Disallow: /b/a" + CR + "#Disallow: /c" + CR + "" + CR + "" + CR + "User-Agent: Agent2 Agent3#foo" + CR + + "User-Agent: Agent4" + CR + "Disallow: /d" + CR + "Disallow: /e/d/" + CR + "" + CR + "User-Agent: *" + CR + "Disallow: /foo/bar/" + CR; + + BaseRobotRules rules; + + rules = createRobotRules("Agent2", nutchRobotsTxt); + assertEquals(isAllowed, rules.isAllowed(urlStr)); + + rules = createRobotRules("Agent3", nutchRobotsTxt); + assertEquals(isAllowed, rules.isAllowed(urlStr)); + + rules = createRobotRules("Agent4", nutchRobotsTxt); + assertEquals(isAllowed, rules.isAllowed(urlStr)); + } + + @ParameterizedTest + @CsvSource({ "True, http://www.domain.com/a",// + "True, http://www.domain.com/a/",// + "True, http://www.domain.com/a/bloh/foo.html",// + "True, http://www.domain.com/b",// + "True, http://www.domain.com/b/a",// + "True, http://www.domain.com/b/a/index.html",// + "True, http://www.domain.com/b/b/foo.html",// + "True, http://www.domain.com/c",// + "True, http://www.domain.com/c/a",// + "True, http://www.domain.com/c/a/index.html",// + "True, http://www.domain.com/c/b/foo.html",// + "True, http://www.domain.com/d",// + "True, http://www.domain.com/d/a",// + "True, http://www.domain.com/e/a/index.html",// + "True, http://www.domain.com/e/d",// + "True, http://www.domain.com/e/d/foo.html",// + "True, http://www.domain.com/e/doh.html",// + "True, http://www.domain.com/f/index.html",// + "False, http://www.domain.com/foo/bar/baz.html",// + "True, http://www.domain.com/f/" }) + void testNutchCases3(boolean isAllowed, String urlStr) { + // Run through the Nutch test cases. + final String nutchRobotsTxt = "User-Agent: Agent1 #foo" + CR + "Disallow: /a" + CR + "Disallow: /b/a" + CR + "#Disallow: /c" + CR + "" + CR + "" + CR + "User-Agent: Agent2 Agent3#foo" + CR + + "User-Agent: Agent4" + CR + "Disallow: /d" + CR + "Disallow: /e/d/" + CR + "" + CR + "User-Agent: *" + CR + "Disallow: /foo/bar/" + CR; + + BaseRobotRules rules = createRobotRules("Agent5", nutchRobotsTxt); + assertEquals(isAllowed, rules.isAllowed(urlStr)); } @Test - public void testHtmlMarkupInRobotsTxt() { + void testHtmlMarkupInRobotsTxt() { final String htmlRobotsTxt = "\n" + "\n" + "/robots.txt\n" + "\n" + "\n" + "User-agent: anybot
\n" + "Disallow:
\n" + "Crawl-Delay: 10
\n" + "\n" + "User-agent: *
\n" + "Disallow: /
\n" + "Crawl-Delay: 30
\n" + "\n" + "\n" + "\n"; BaseRobotRules rules; - rules = createRobotRules("anybot", htmlRobotsTxt.getBytes(UTF_8)); + rules = createRobotRules("anybot", htmlRobotsTxt); assertTrue(rules.isAllowed("http://www.domain.com/index.html")); assertEquals(10000, rules.getCrawlDelay()); - rules = createRobotRules("bogusbot", htmlRobotsTxt.getBytes(UTF_8)); + rules = createRobotRules("bogusbot", htmlRobotsTxt); assertFalse(rules.isAllowed("http://www.domain.com/index.html")); assertEquals(30000, rules.getCrawlDelay()); } @Test - public void testIgnoreOfHtml() { + void testIgnoreOfHtml() { final String htmlFile = "Site under Maintenance"; BaseRobotRules rules = createRobotRules("anybot", htmlFile.getBytes(US_ASCII)); @@ -458,39 +419,39 @@ public class SimpleRobotRulesParserTest { } @Test - public void testHeritrixCases() { + void testHeritrixCases() { final String heritrixRobotsTxt = "User-agent: *\n" + "Disallow: /cgi-bin/\n" + "Disallow: /details/software\n" + "\n" + "User-agent: denybot\n" + "Disallow: /\n" + "\n" + "User-agent: allowbot1\n" + "Disallow: \n" + "\n" + "User-agent: allowbot2\n" + "Disallow: /foo\n" + "Allow: /\n" + "\n" + "User-agent: delaybot\n" + "Disallow: /\n" + "Crawl-Delay: 20\n" + "Allow: /images/\n"; BaseRobotRules rules; - rules = createRobotRules("Mozilla allowbot1 99.9", heritrixRobotsTxt.getBytes(UTF_8)); + rules = createRobotRules("Mozilla allowbot1 99.9", heritrixRobotsTxt); assertTrue(rules.isAllowed("http://www.domain.com/path")); assertTrue(rules.isAllowed("http://www.domain.com/")); - rules = createRobotRules("Mozilla allowbot2 99.9", heritrixRobotsTxt.getBytes(UTF_8)); + rules = createRobotRules("Mozilla allowbot2 99.9", heritrixRobotsTxt); assertTrue(rules.isAllowed("http://www.domain.com/path")); assertTrue(rules.isAllowed("http://www.domain.com/")); assertFalse(rules.isAllowed("http://www.domain.com/foo")); - rules = createRobotRules("Mozilla denybot 99.9", heritrixRobotsTxt.getBytes(UTF_8)); + rules = createRobotRules("Mozilla denybot 99.9", heritrixRobotsTxt); assertFalse(rules.isAllowed("http://www.domain.com/path")); assertFalse(rules.isAllowed("http://www.domain.com/")); assertEquals(BaseRobotRules.UNSET_CRAWL_DELAY, rules.getCrawlDelay()); - rules = createRobotRules("Mozilla anonbot 99.9", heritrixRobotsTxt.getBytes(UTF_8)); + rules = createRobotRules("Mozilla anonbot 99.9", heritrixRobotsTxt); assertTrue(rules.isAllowed("http://www.domain.com/path")); assertFalse(rules.isAllowed("http://www.domain.com/cgi-bin/foo.pl")); - rules = createRobotRules("Mozilla delaybot 99.9", heritrixRobotsTxt.getBytes(UTF_8)); + rules = createRobotRules("Mozilla delaybot 99.9", heritrixRobotsTxt); assertEquals(20000, rules.getCrawlDelay()); } @Test - public void testCaseSensitivePaths() { + void testCaseSensitivePaths() { final String simpleRobotsTxt = "User-agent: *" + CRLF + "Allow: /AnyPage.html" + CRLF + "Allow: /somepage.html" + CRLF + "Disallow: /"; - BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes(UTF_8)); + BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt); assertTrue(rules.isAllowed("http://www.domain.com/AnyPage.html")); assertFalse(rules.isAllowed("http://www.domain.com/anypage.html")); assertTrue(rules.isAllowed("http://www.domain.com/somepage.html")); @@ -498,72 +459,72 @@ public class SimpleRobotRulesParserTest { } @Test - public void testEmptyDisallow() { + void testEmptyDisallow() { final String simpleRobotsTxt = "User-agent: *" + CRLF + "Disallow:"; - BaseRobotRules rules = 
createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes(UTF_8)); + BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt); assertTrue(rules.isAllowed("http://www.domain.com/anypage.html")); } @Test - public void testEmptyAllow() { + void testEmptyAllow() { final String simpleRobotsTxt = "User-agent: *" + CRLF + "Allow:"; - BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes(UTF_8)); + BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt); assertTrue(rules.isAllowed("http://www.domain.com/anypage.html")); } @Test - public void testMultiWildcard() { + void testMultiWildcard() { // Make sure we only take the first wildcard entry. final String simpleRobotsTxt = "User-agent: *" + CRLF + "Disallow: /index.html" + CRLF + "Allow: /" + CRLF + CRLF + "User-agent: *" + CRLF + "Disallow: /"; - BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes(UTF_8)); + BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt); assertFalse(rules.isAllowed("http://www.domain.com/index.html")); assertTrue(rules.isAllowed("http://www.domain.com/anypage.html")); } @Test - public void testMultiMatches() { + void testMultiMatches() { // Make sure we only take the first record that matches. final String simpleRobotsTxt = "User-agent: crawlerbot" + CRLF + "Disallow: /index.html" + CRLF + "Allow: /" + CRLF + CRLF + "User-agent: crawler" + CRLF + "Disallow: /"; - BaseRobotRules rules = createRobotRules("crawlerbot", simpleRobotsTxt.getBytes(UTF_8)); + BaseRobotRules rules = createRobotRules("crawlerbot", simpleRobotsTxt); assertFalse(rules.isAllowed("http://www.domain.com/index.html")); assertTrue(rules.isAllowed("http://www.domain.com/anypage.html")); } @Test - public void testMultiAgentNames() { + void testMultiAgentNames() { // When there are more than one agent name on a line. final String simpleRobotsTxt = "User-agent: crawler1 crawler2" + CRLF + "Disallow: /index.html" + CRLF + "Allow: /"; - BaseRobotRules rules = createRobotRules("crawler2", simpleRobotsTxt.getBytes(UTF_8)); + BaseRobotRules rules = createRobotRules("crawler2", simpleRobotsTxt); assertFalse(rules.isAllowed("http://www.domain.com/index.html")); assertTrue(rules.isAllowed("http://www.domain.com/anypage.html")); } @Test - public void testMultiWordAgentName() { + void testMultiWordAgentName() { // When the user agent name has a space in it. final String simpleRobotsTxt = "User-agent: Download Ninja" + CRLF + "Disallow: /index.html" + CRLF + "Allow: /"; - BaseRobotRules rules = createRobotRules("Download Ninja", simpleRobotsTxt.getBytes(UTF_8)); + BaseRobotRules rules = createRobotRules("Download Ninja", simpleRobotsTxt); assertFalse(rules.isAllowed("http://www.domain.com/index.html")); assertTrue(rules.isAllowed("http://www.domain.com/anypage.html")); } @Test - public void testUnsupportedFields() { + void testUnsupportedFields() { // When we have a new field type that we don't know about. 
final String simpleRobotsTxt = "User-agent: crawler1" + CRLF + "Disallow: /index.html" + CRLF + "Allow: /" + CRLF + "newfield: 234" + CRLF + "User-agent: crawler2" + CRLF + "Disallow: /"; - BaseRobotRules rules = createRobotRules("crawler2", simpleRobotsTxt.getBytes(UTF_8)); + BaseRobotRules rules = createRobotRules("crawler2", simpleRobotsTxt); assertFalse(rules.isAllowed("http://www.domain.com/anypage.html")); } @Test - public void testAcapFields() { + void testAcapFields() { final String robotsTxt = "acap-crawler: *" + CRLF + "acap-disallow-crawl: /ultima_ora/"; SimpleRobotRulesParser parser = new SimpleRobotRulesParser(); @@ -572,7 +533,7 @@ public class SimpleRobotRulesParserTest { } @Test - public void testStatusCodeCreation() { + void testStatusCodeCreation() { BaseRobotRules rules; SimpleRobotRulesParser robotParser = new SimpleRobotRulesParser(); @@ -605,71 +566,71 @@ public class SimpleRobotRulesParserTest { } @Test - public void testCrawlDelay() { + void testCrawlDelay() { final String delayRules1RobotsTxt = "User-agent: bixo" + CR + "Crawl-delay: 10" + CR + "User-agent: foobot" + CR + "Crawl-delay: 20" + CR + "User-agent: *" + CR + "Disallow:/baz" + CR; - BaseRobotRules rules = createRobotRules("bixo", delayRules1RobotsTxt.getBytes(UTF_8)); + BaseRobotRules rules = createRobotRules("bixo", delayRules1RobotsTxt); long crawlDelay = rules.getCrawlDelay(); assertEquals(10000, crawlDelay, "testing crawl delay for agent bixo - rule 1"); final String delayRules2RobotsTxt = "User-agent: foobot" + CR + "Crawl-delay: 20" + CR + "User-agent: *" + CR + "Disallow:/baz" + CR; - rules = createRobotRules("bixo", delayRules2RobotsTxt.getBytes(UTF_8)); + rules = createRobotRules("bixo", delayRules2RobotsTxt); crawlDelay = rules.getCrawlDelay(); assertEquals(BaseRobotRules.UNSET_CRAWL_DELAY, crawlDelay, "testing crawl delay for agent bixo - rule 2"); } @Test - public void testBigCrawlDelay() { + void testBigCrawlDelay() { final String robotsTxt = "User-agent: *" + CR + "Crawl-delay: 3600" + CR + "Disallow:" + CR; - BaseRobotRules rules = createRobotRules("bixo", robotsTxt.getBytes(UTF_8)); + BaseRobotRules rules = createRobotRules("bixo", robotsTxt); assertFalse(rules.isAllowed("http://www.domain.com/"), "disallow all if huge crawl delay"); } @Test - public void testBrokenKrugleRobotsTxtFile() { + void testBrokenKrugleRobotsTxtFile() { final String krugleRobotsTxt = "User-agent: *" + CR + "Disallow: /maintenance.html" + CR + "Disallow: /perl/" + CR + "Disallow: /cgi-bin/" + CR + "Disallow: /examples/" + CR + "Crawl-delay: 3" + CR + "" + CR + "User-agent: googlebot" + CR + "Crawl-delay: 1" + CR + "" + CR + "User-agent: qihoobot" + CR + "Disallow: /"; - BaseRobotRules rules = createRobotRules("googlebot/2.1", krugleRobotsTxt.getBytes(UTF_8)); + BaseRobotRules rules = createRobotRules("googlebot/2.1", krugleRobotsTxt); assertTrue(rules.isAllowed("http://www.krugle.com/examples/index.html")); } @Test - public void testRobotsWithUTF8BOM() throws Exception { + void testRobotsWithUTF8BOM() throws Exception { BaseRobotRules rules = createRobotRules("foobot", readFile("/robots/robots-with-utf8-bom.txt")); assertFalse(rules.isAllowed("http://www.domain.com/profile"), "Disallow match against *"); } @Test - public void testRobotsWithUTF16LEBOM() throws Exception { + void testRobotsWithUTF16LEBOM() throws Exception { BaseRobotRules rules = createRobotRules("foobot", readFile("/robots/robots-with-utf16le-bom.txt")); assertFalse(rules.isAllowed("http://www.domain.com/profile"), "Disallow match against *"); } 
@Test - public void testRobotsWithUTF16BEBOM() throws Exception { + void testRobotsWithUTF16BEBOM() throws Exception { BaseRobotRules rules = createRobotRules("foobot", readFile("/robots/robots-with-utf16be-bom.txt")); assertFalse(rules.isAllowed("http://www.domain.com/profile"), "Disallow match against *"); } @Test - public void testFloatingPointCrawlDelay() { + void testFloatingPointCrawlDelay() { final String robotsTxt = "User-agent: *" + CR + "Crawl-delay: 0.5" + CR + "Disallow:" + CR; - BaseRobotRules rules = createRobotRules("bixo", robotsTxt.getBytes(UTF_8)); + BaseRobotRules rules = createRobotRules("bixo", robotsTxt); assertEquals(500, rules.getCrawlDelay()); } @Test - public void testIgnoringHost() throws Exception { + void testIgnoringHost() throws Exception { BaseRobotRules rules = createRobotRules("foobot", readFile("/robots/www.flot.com-robots.txt")); assertFalse(rules.isAllowed("http://www.flot.com/img/"), "Disallow img directory"); } @Test - public void testDirectiveTypos() throws Exception { + void testDirectiveTypos() throws Exception { BaseRobotRules rules = createRobotRules("bot1", readFile("/robots/directive-typos-robots.txt")); assertFalse(rules.isAllowed("http://domain.com/desallow/"), "desallow"); assertFalse(rules.isAllowed("http://domain.com/dissalow/"), "dissalow"); @@ -685,7 +646,7 @@ public class SimpleRobotRulesParserTest { } @Test - public void testFormatErrors() throws Exception { + void testFormatErrors() throws Exception { BaseRobotRules rules = createRobotRules("bot1", readFile("/robots/format-errors-robots.txt")); assertFalse(rules.isAllowed("http://domain.com/whitespace-before-colon/"), "whitespace-before-colon"); assertFalse(rules.isAllowed("http://domain.com/no-colon/"), "no-colon"); @@ -699,14 +660,14 @@ public class SimpleRobotRulesParserTest { // See http://www.conman.org/people/spc/robots2.html @Test - public void testExtendedStandard() throws Exception { + void testExtendedStandard() throws Exception { SimpleRobotRulesParser robotParser = new SimpleRobotRulesParser(); robotParser.parseContent(FAKE_ROBOTS_URL, readFile("/robots/extended-standard-robots.txt"), "text/plain", "foobot"); assertEquals(0, robotParser.getNumWarnings(), "Zero warnings with expended directives"); } @Test - public void testSitemap() throws Exception { + void testSitemap() throws Exception { BaseRobotRules rules = createRobotRules("bot1", readFile("/robots/sitemap-robots.txt")); assertEquals(3, rules.getSitemaps().size(), "Found sitemap"); // check that the last one is not lowercase only @@ -716,13 +677,13 @@ public class SimpleRobotRulesParserTest { } @Test - public void testRelativeSitemap() throws Exception { + void testRelativeSitemap() throws Exception { BaseRobotRules rules = createRobotRules("bot1", readFile("/robots/relative-sitemap-robots.txt")); assertEquals(1, rules.getSitemaps().size(), "Found sitemap"); } @Test - public void testSitemapInvalidBaseUrl() throws Exception { + void testSitemapInvalidBaseUrl() { // test https://github.com/crawler-commons/crawler-commons/issues/240 // - should handle absolute sitemap URL even if base URL isn't valid @@ -737,7 +698,7 @@ public class SimpleRobotRulesParserTest { } @Test - public void testManyUserAgents() throws Exception { + void testManyUserAgents() throws Exception { BaseRobotRules rules = createRobotRules("wget", readFile("/robots/many-user-agents.txt")); assertFalse(rules.isAllowed("http://domain.com/"), "many-user-agents"); @@ -747,21 +708,21 @@ public class SimpleRobotRulesParserTest { } @Test - public void 
testMalformedPathInRobotsFile() throws Exception { + void testMalformedPathInRobotsFile() throws Exception { BaseRobotRules rules = createRobotRules("bot1", readFile("/robots/malformed-path.txt")); assertFalse(rules.isAllowed("http://en.wikipedia.org/wiki/Wikipedia_talk:Mediation_Committee/"), "Disallowed URL"); assertTrue(rules.isAllowed("http://en.wikipedia.org/wiki/"), "Regular URL"); } @Test - public void testDOSlineEndings() throws Exception { + void testDOSlineEndings() throws Exception { BaseRobotRules rules = createRobotRules("bot1", readFile("/robots/dos-line-endings.txt")); assertTrue(rules.isAllowed("http://ford.com/"), "Allowed URL"); assertEquals(1000L, rules.getCrawlDelay()); } @Test - public void testAmazonRobotsWithWildcards() throws Exception { + void testAmazonRobotsWithWildcards() throws Exception { BaseRobotRules rules = createRobotRules("Any-darn-crawler", readFile("/robots/wildcards.txt")); assertFalse(rules.isAllowed("http://www.fict.com/wishlist/bogus")); assertTrue(rules.isAllowed("http://www.fict.com/wishlist/universal/page")); @@ -769,38 +730,37 @@ public class SimpleRobotRulesParserTest { } @Test - public void testAllowBeforeDisallow() throws Exception { + void testAllowBeforeDisallow() { final String simpleRobotsTxt = "User-agent: *" + CRLF + "Disallow: /fish" + CRLF + "Allow: /fish" + CRLF; - - BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes(UTF_8)); + BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt); assertTrue(rules.isAllowed("http://www.fict.com/fish")); } @Test - public void testSpacesInMultipleUserAgentNames() throws Exception { + void testSpacesInMultipleUserAgentNames() { final String simpleRobotsTxt = "User-agent: One, Two, Three" + CRLF + "Disallow: /" + CRLF + "" + CRLF + "User-agent: *" + CRLF + "Allow: /" + CRLF; - BaseRobotRules rules = createRobotRules("One", simpleRobotsTxt.getBytes(UTF_8)); + BaseRobotRules rules = createRobotRules("One", simpleRobotsTxt); assertFalse(rules.isAllowed("http://www.fict.com/fish")); - rules = createRobotRules("Two", simpleRobotsTxt.getBytes(UTF_8)); + rules = createRobotRules("Two", simpleRobotsTxt); assertFalse(rules.isAllowed("http://www.fict.com/fish")); - rules = createRobotRules("Three", simpleRobotsTxt.getBytes(UTF_8)); + rules = createRobotRules("Three", simpleRobotsTxt); assertFalse(rules.isAllowed("http://www.fict.com/fish")); - rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes(UTF_8)); + rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt); assertTrue(rules.isAllowed("http://www.fict.com/fish")); } // https://github.com/crawler-commons/crawler-commons/issues/112 @Test - public void testSitemapAtEndOfFile() throws Exception { + void testSitemapAtEndOfFile() { final String simpleRobotsTxt = "User-agent: a" + CRLF + "Disallow: /content/dam/" + CRLF + CRLF + "User-agent: b" + CRLF + "Disallow: /content/dam/" + CRLF + CRLF + "User-agent: c" + CRLF + "Disallow: /content/dam/" + CRLF + CRLF + CRLF + "Sitemap: https://wwwfoocom/sitemapxml"; - BaseRobotRules rules = createRobotRules("a", simpleRobotsTxt.getBytes(UTF_8)); + BaseRobotRules rules = createRobotRules("a", simpleRobotsTxt); assertEquals(1, rules.getSitemaps().size()); assertEquals("https://wwwfoocom/sitemapxml", rules.getSitemaps().get(0)); }
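
As background for the approach described in the commit message, here is a minimal, self-contained sketch of a JUnit 5 @ParameterizedTest driven by @CsvSource. It is illustrative only and not part of the patch: the test class name, the path check, and the sample rows are invented, and it assumes the junit-jupiter-params artifact is on the test classpath (as it must be for the patched test class as well).

    import static org.junit.jupiter.api.Assertions.assertEquals;

    import org.junit.jupiter.params.ParameterizedTest;
    import org.junit.jupiter.params.provider.CsvSource;

    class CsvSourceSketchTest {

        @ParameterizedTest
        @CsvSource({ "True, /fish/salmon.html",
                     "False, /catfish" })
        void pathStartsWithFish(boolean expected, String path) {
            // JUnit's implicit argument conversion turns the "True"/"False"
            // column into a boolean; leading whitespace after the comma is trimmed.
            assertEquals(expected, path.startsWith("/fish"));
        }
    }

Each CSV row runs as its own test invocation with its own pass/fail result, which is how the patch turns 46 test methods into 172 reported tests and keeps one failing robots.txt rule from masking the others.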