diff --git a/src/main/java/crawlercommons/robots/SimpleRobotRulesParser.java b/src/main/java/crawlercommons/robots/SimpleRobotRulesParser.java
index c22461c..2f6eb80 100644
--- a/src/main/java/crawlercommons/robots/SimpleRobotRulesParser.java
+++ b/src/main/java/crawlercommons/robots/SimpleRobotRulesParser.java
@@ -44,7 +44,9 @@ import crawlercommons.robots.SimpleRobotRules.RobotRulesMode;
  *
  * This implementation of {@link BaseRobotsParser} retrieves a set of
  * {@link SimpleRobotRules rules} for an agent with the given name from the
- * robots.txt file of a given domain.
+ * robots.txt file of a given domain. The implementation follows
+ * RFC 9309.
  *
  *
  *
@@ -492,7 +494,17 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
 
         int bytesLen = content.length;
         int offset = 0;
-        Charset encoding = StandardCharsets.US_ASCII;
+
+        /*
+         * RFC 9309 requires that the robots.txt file is "UTF-8 encoded"
+         * (RFC 9309, section 2.3 Access Method), but
+         * "Implementors MAY bridge encoding mismatches if they detect that the robots.txt file is not UTF-8 encoded."
+         * (RFC 9309, section 2.2.2. The "Allow" and "Disallow" Lines)
+         */
+        Charset encoding = StandardCharsets.UTF_8;
 
         // Check for a UTF-8 BOM at the beginning (EF BB BF)
         if ((bytesLen >= 3) && (content[0] == (byte) 0xEF) && (content[1] == (byte) 0xBB) && (content[2] == (byte) 0xBF)) {
@@ -519,11 +531,11 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
         // Decide if we need to do special HTML processing.
         boolean isHtmlType = ((contentType != null) && contentType.toLowerCase(Locale.ROOT).startsWith("text/html"));
 
-        // If it looks like it contains HTML, but doesn't have a user agent
-        // field, then
-        // assume somebody messed up and returned back to us a random HTML page
-        // instead
-        // of a robots.txt file.
+        /*
+         * If it looks like it contains HTML, but doesn't have a user agent
+         * field, then assume somebody messed up and returned back to us a
+         * random HTML page instead of a robots.txt file.
+         */
         boolean hasHTML = false;
         if (isHtmlType || SIMPLE_HTML_PATTERN.matcher(contentAsStr).find()) {
             if (!USER_AGENT_PATTERN.matcher(contentAsStr).find()) {
@@ -550,12 +562,12 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
         while (lineParser.hasMoreTokens()) {
             String line = lineParser.nextToken();
 
-            // Get rid of HTML markup, in case some brain-dead webmaster has
-            // created an HTML
-            // page for robots.txt. We could do more sophisticated processing
-            // here to better
-            // handle bad HTML, but that's a very tiny percentage of all
-            // robots.txt files.
+            /*
+             * Get rid of HTML markup, in case some brain-dead webmaster has
+             * created an HTML page for robots.txt. We could do more
+             * sophisticated processing here to better handle bad HTML, but
+             * that's a very tiny percentage of all robots.txt files.
+             */
             if (hasHTML) {
                 line = line.replaceAll("<[^>]+>", "");
             }
@@ -855,9 +867,8 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
                     double delayValue = Double.parseDouble(delayString) * 1000.0;
                     state.setCrawlDelay(Math.round(delayValue));
                 } else {
-                    long delayValue = Integer.parseInt(delayString) * 1000L; // sec
-                                                                             // to
-                                                                             // millisec
+                    // seconds to milliseconds
+                    long delayValue = Integer.parseInt(delayString) * 1000L;
                     state.setCrawlDelay(delayValue);
                 }
             } catch (Exception e) {
diff --git a/src/test/java/crawlercommons/robots/SimpleRobotRulesParserTest.java b/src/test/java/crawlercommons/robots/SimpleRobotRulesParserTest.java
index 597498f..3fb6dc4 100644
--- a/src/test/java/crawlercommons/robots/SimpleRobotRulesParserTest.java
+++ b/src/test/java/crawlercommons/robots/SimpleRobotRulesParserTest.java
@@ -22,6 +22,7 @@ import org.junit.jupiter.params.provider.CsvSource;
 
 import java.io.InputStream;
 import java.net.HttpURLConnection;
+import java.nio.charset.StandardCharsets;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.List;
@@ -226,6 +227,44 @@ public class SimpleRobotRulesParserTest {
         assertTrue(rules.isAllowed("http://www.domain.com/anypage.html"));
     }
 
+    @Test
+    void testUnicodeUnescapedPaths() {
+        final String simpleRobotsTxt = "User-agent: *" + CRLF //
+                        + "Disallow: /bücher/" + CRLF //
+                        + "Disallow: /k%C3%B6nyvek/" + CRLF //
+                        + CRLF //
+                        + "User-agent: GoodBot" + CRLF //
+                        + "Allow: /";
+
+        BaseRobotRules rules = createRobotRules("mybot", simpleRobotsTxt);
+        assertTrue(rules.isAllowed("https://www.example.com/"));
+
+        // test using escaped and unescaped URLs
+        assertFalse(rules.isAllowed("https://www.example.com/b%C3%BCcher/book1.html"));
+        assertFalse(rules.isAllowed("https://www.example.com/bücher/book2.html"));
+
+        // (for completeness) check also escaped path in robots.txt
+        assertFalse(rules.isAllowed("https://www.example.com/k%C3%B6nyvek/book1.html"));
+        assertFalse(rules.isAllowed("https://www.example.com/könyvek/book2.html"));
+
+        // test invalid encoding: invalid encoded characters should not break
+        // parsing of rules below
+        rules = createRobotRules("goodbot", simpleRobotsTxt.getBytes(StandardCharsets.ISO_8859_1));
+        assertTrue(rules.isAllowed("https://www.example.com/"));
+        assertTrue(rules.isAllowed("https://www.example.com/b%C3%BCcher/book1.html"));
+
+        // test invalid encoding: only rules with invalid characters should be
+        // ignored
+        rules = createRobotRules("mybot", simpleRobotsTxt.getBytes(StandardCharsets.ISO_8859_1));
+        assertTrue(rules.isAllowed("https://www.example.com/"));
+        assertFalse(rules.isAllowed("https://www.example.com/k%C3%B6nyvek/book1.html"));
+        assertFalse(rules.isAllowed("https://www.example.com/könyvek/book2.html"));
+        // if URL paths in disallow rules are not properly encoded, these two
+        // URLs are not matched:
+        // assertFalse(rules.isAllowed("https://www.example.com/b%C3%BCcher/book2.html"));
+        // assertFalse(rules.isAllowed("https://www.example.com/bücher/book1.html"));
+    }
+
     @Test
     void testSimplestAllowAll() {
         final String simpleRobotsTxt = "User-agent: *" + CRLF //
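
Side note on the new testUnicodeUnescapedPaths test (not part of the patch itself): the assertions rely on the percent-encoded and the raw Unicode spelling of the same path selecting the same rule. The sketch below shows one way to get that behaviour; it is not the parser's actual matching code, and the helper names normalizePath and prefixMatch are illustrative only. Both the rule path and the URL path are normalized to a percent-encoded UTF-8 form before a plain prefix match.

import java.nio.charset.StandardCharsets;

public class PathMatchSketch {

    /**
     * Percent-encode every byte outside the visible ASCII range (0x21-0x7E),
     * working on the UTF-8 bytes of the input. '%' itself is visible ASCII,
     * so already-encoded sequences such as "%C3%BC" pass through unchanged.
     * Hex-case normalization and wildcard handling are omitted for brevity.
     */
    static String normalizePath(String path) {
        StringBuilder sb = new StringBuilder();
        for (byte b : path.getBytes(StandardCharsets.UTF_8)) {
            int c = b & 0xFF;
            if (c > 0x20 && c < 0x7F) {
                sb.append((char) c);
            } else {
                sb.append('%').append(String.format("%02X", c));
            }
        }
        return sb.toString();
    }

    /** Plain prefix match on the normalized forms. */
    static boolean prefixMatch(String rulePath, String urlPath) {
        return normalizePath(urlPath).startsWith(normalizePath(rulePath));
    }

    public static void main(String[] args) {
        // escaped and unescaped spellings of the same path hit the same rule
        System.out.println(prefixMatch("/bücher/", "/b%C3%BCcher/book1.html"));   // true
        System.out.println(prefixMatch("/bücher/", "/bücher/book2.html"));        // true
        System.out.println(prefixMatch("/k%C3%B6nyvek/", "/könyvek/book2.html")); // true
    }
}

Seen this way, the ISO_8859_1 part of the test is consistent as well: re-encoding the file as ISO-8859-1 turns "ü" into a byte that is not valid UTF-8, so only the rule containing it is affected, while the ASCII-only, percent-encoded rule and the GoodBot rules still parse and apply.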