
Merge pull request #408 from sebastian-nagel/cc-195-robotstxt-url-decode

[Robots.txt] Path matching bug caused by URL decoding when an allow/disallow path contains escaped wildcard characters
Sebastian Nagel 2023-05-23 15:17:43 +02:00 committed by GitHub
commit 962787f4fd
Signed by: GitHub
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 108 additions and 14 deletions
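For context (not part of the commit itself): the root cause is that rule paths and URL paths were run through a blanket URL decoder, which also decodes percent-encoded wildcard characters. A minimal standalone sketch of that pre-fix decoding step, using only the JDK (the class and variable names are illustrative):

import java.net.URLDecoder;
import java.nio.charset.StandardCharsets;

public class WildcardDecodeSketch {
    public static void main(String[] args) {
        // RFC 9309 example: "%2A" is meant to match a literal asterisk only.
        String rule = "/path/file-with-a-%2A.html";

        // Blanket decoding turns the escaped asterisk into a real '*' ...
        String decoded = URLDecoder.decode(rule, StandardCharsets.UTF_8);
        System.out.println(decoded); // /path/file-with-a-*.html

        // ... which the matcher then treats as a wildcard, so the rule would
        // wrongly disallow e.g. /path/file-with-a-foo.html as well.
    }
}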

BasicURLNormalizer.java

@@ -540,7 +540,11 @@ public class BasicURLNormalizer extends URLFilter {
* characters which should be escaped according to <a
* href="https://tools.ietf.org/html/rfc3986#section-2.2">RFC3986</a>.
*/
private static String escapePath(String path) {
public static String escapePath(String path) {
return escapePath(path, null);
}
public static String escapePath(String path, boolean[] extraEscapedBytes) {
StringBuilder sb = new StringBuilder(path.length());
// Traverse over all bytes in this URL
@@ -548,7 +552,7 @@ public class BasicURLNormalizer extends URLFilter {
for (int i = 0; i < bytes.length; i++) {
byte b = bytes[i];
// Is this a control character?
if (b < 0 || escapedCharacters[b]) {
if (b < 0 || escapedCharacters[b] || (extraEscapedBytes != null && extraEscapedBytes[b])) {
// Start escape sequence
sb.append('%');
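The new two-argument overload lets a caller mark additional ASCII bytes to be percent-encoded on top of the default set. A usage sketch (class and variable names are illustrative; the expected output follows the tests added in this commit):

import crawlercommons.filters.basic.BasicURLNormalizer;

public class EscapePathOverloadSketch {
    public static void main(String[] args) {
        // Request that '*' and '$' (ASCII range) are percent-encoded as well.
        boolean[] extraEscapedBytes = new boolean[128];
        extraEscapedBytes['*'] = true;
        extraEscapedBytes['$'] = true;

        System.out.println(BasicURLNormalizer.escapePath("/path/file-with-a-*.html", extraEscapedBytes));
        // expected: /path/file-with-a-%2A.html
    }
}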

SimpleRobotRules.java

@@ -18,11 +18,12 @@ package crawlercommons.robots;
import java.io.Serializable;
import java.net.URL;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import crawlercommons.filters.basic.BasicURLNormalizer;
/**
* Result from parsing a single robots.txt file - which means we get a set of
* rules, and an optional crawl-delay, and an optional sitemap URL. Note that we
@@ -121,6 +122,13 @@ public class SimpleRobotRules extends BaseRobotRules {
protected ArrayList<RobotRule> _rules;
protected RobotRulesMode _mode;
/** Special characters which require percent-encoding for path matching */
protected final static boolean[] specialCharactersPathMatching = new boolean[128];
static {
specialCharactersPathMatching['*'] = true;
specialCharactersPathMatching['$'] = true;
}
public SimpleRobotRules() {
this(RobotRulesMode.ALLOW_SOME);
}
@@ -200,6 +208,24 @@ public class SimpleRobotRules extends BaseRobotRules {
}
}
/**
* Encode/decode (using percent-encoding) all characters where necessary:
* encode Unicode/non-ASCII characters and decode printable ASCII
* characters without special semantics.
*
* @param urlPathQuery
* path and query component of the URL
* @param additionalEncodedBytes
* boolean array to request bytes (ASCII characters) to be
* percent-encoded in addition to other characters requiring
* encoding (Unicode/non-ASCII and characters not allowed in
* URLs).
* @return properly percent-encoded URL path and query
*/
public static String escapePath(String urlPathQuery, boolean[] additionalEncodedBytes) {
return BasicURLNormalizer.escapePath(BasicURLNormalizer.unescapePath(urlPathQuery), additionalEncodedBytes);
}
private String getPath(String url, boolean getWithQuery) {
try {
@@ -214,9 +240,17 @@ public class SimpleRobotRules extends BaseRobotRules {
path += "?" + query;
}
// We used to lower-case the path, but Google says we need to do
// case-sensitive matching.
return URLDecoder.decode(path, "UTF-8");
/*
* We used to lower-case the path, but Google says we need to do
* case-sensitive matching.
*
* However, we need to properly decode percent-encoded characters,
* while preserving those escaped characters which have special
* semantics in path matching, e.g. the slash `/`. In addition, the
* path matching implementation requires that asterisk `*` and
* dollar `$` are exceptionally percent-encoded.
*/
return escapePath(path, specialCharactersPathMatching);
} catch (Exception e) {
// If the URL is invalid, we don't really care since the fetch
// will fail, so return the root.
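Taken together, getPath() now brings a URL path into the same canonical form as the rules: decodable percent-escapes are decoded, while non-ASCII bytes and the two wildcard characters are (re-)encoded. A sketch of the round trip (class and variable names are illustrative; the boolean table mirrors specialCharactersPathMatching, and the expected output follows the new test data):

import crawlercommons.robots.SimpleRobotRules;

public class RobotsUrlPathSketch {
    public static void main(String[] args) {
        // Same table as SimpleRobotRules.specialCharactersPathMatching: '*' and '$'.
        boolean[] special = new boolean[128];
        special['*'] = true;
        special['$'] = true;

        // %2f stays encoded (only the hex case is normalized) and a literal '*'
        // becomes %2A, so a rule like "Disallow: /*%2F*" cannot match a real slash.
        System.out.println(SimpleRobotRules.escapePath("/topic/9%2f11/file-with-a-*.html", special));
        // expected: /topic/9%2F11/file-with-a-%2A.html
    }
}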

SimpleRobotRulesParser.java

@@ -21,7 +21,6 @@ import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLDecoder;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
@@ -778,13 +777,23 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
/**
* Apply uniform rules to clean up path directives:
* <ul>
* <li>trim leading and trailing white space and control characters not
* handled by the tokenizer</li>
* <li>properly percent-encode all characters where necessary</li>
* <li>but make sure that characters with special semantics for path
* matching (asterisk <code>*</code>, slash <code>/</code>, dollar
* <code>$</code>, etc.) are left as is (not decoded if percent-encoded)</li>
* </ul>
*
* This method uses {@link SimpleRobotRules#escapePath(String, boolean[])}
* to percent-encode the path directive before it is used for matching.
*
* @param path
* @return clean path
* @return clean and encoded path
*/
private String normalizePathDirective(String path) {
path = path.trim();
return path;
return SimpleRobotRules.escapePath(path.trim(), null);
}
/**
@@ -803,7 +812,6 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
String path = token.getData();
try {
path = URLDecoder.decode(path, "UTF-8");
path = normalizePathDirective(path);
if (path.length() == 0) {
// Disallow: <nothing> => allow all.
@@ -832,7 +840,6 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
String path = token.getData();
try {
path = URLDecoder.decode(path, "UTF-8");
path = normalizePathDirective(path);
} catch (Exception e) {
reportWarning(state, "Error parsing robots rules - can't decode path: {}", path);
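On the rule side, normalizePathDirective() now applies the same encode/decode step (without the extra table), so percent-encoded characters that matter for matching survive parsing instead of being decoded by URLDecoder first. A sketch of the effect on two directives from the new tests (class name is illustrative; expected values follow those tests):

import crawlercommons.robots.SimpleRobotRules;

public class RuleDirectiveSketch {
    public static void main(String[] args) {
        // An encoded trailing space is no longer decoded and then trimmed away
        // (previously: decode -> "/* ", trim -> "/*").
        System.out.println(SimpleRobotRules.escapePath("/*%20", null));
        // expected: /*%20

        // An encoded asterisk stays encoded, so it matches a literal '*' (or "%2A")
        // in the URL rather than acting as a wildcard.
        System.out.println(SimpleRobotRules.escapePath("/path/file-with-a-%2A.html", null));
        // expected: /path/file-with-a-%2A.html
    }
}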

SimpleRobotRulesParserTest.java

@@ -265,6 +265,49 @@ public class SimpleRobotRulesParserTest {
// assertFalse(rules.isAllowed("https://www.example.com/bücher/book1.html"));
}
@ParameterizedTest
@CsvSource({ // Tests for percent-encoded characters with special semantics
// in allow/disallow statements:
// (a) must not trim percent-encoded white space
"True, /*%20, https://www.example.com/", //
"False, /*%20, https://www.example.com/foobar%20/", //
"True, /*%20, https://www.example.com/foobar/", //
// (b) match literal %2F in URL path, but do not match a
// slash
"True, /*%2F*, https://www.example.com/path/index.html", //
"False, /*%2F*, https://www.example.com/topic/9%2F11/index.html", //
"False, /topic/9%2F11/, https://www.example.com/topic/9%2F11/index.html", //
"False, /topic/9%2F11/, https://www.example.com/topic/9%2f11/index.html", //
"False, /q?*mime=application%2Fpdf, https://www.example.com/q?mime=application%2Fpdf", //
// (c) percent-encoded dollar and asterisk (*)
"False, /$, https://www.example.com/", //
"True, /$, https://www.example.com/foobar", //
"True, /%24, https://www.example.com/", //
"False, /%24, https://www.example.com/%24100", //
"False, /%24, https://www.example.com/$100", //
"True, /search/%2A/, https://www.example.com/search/foobar/", //
"False, /search/%2A/, https://www.example.com/search/%2A/", //
"False, /search/%2A/, https://www.example.com/search/%2a/", //
"False, /search/%2a/, https://www.example.com/search/%2a/", //
"False, /search/%2a/, https://www.example.com/search/*/", //
"False, /search/*/, https://www.example.com/search/foobar/", //
// examples from RFC 9309
"False, /path/file-with-a-%2A.html, https://www.example.com/path/file-with-a-*.html", //
"True, /path/file-with-a-%2A.html, https://www.example.com/path/file-with-a-foo.html", //
"False, /path/file-with-a-%2A.html, https://www.example.com/path/file-with-a-%2A.html", //
"False, /path/foo-%24, https://www.example.com/path/foo-$", //
"True, /path/foo-%24, https://www.example.com/path/foo-", //
"False, /path/foo-%24, https://www.example.com/path/foo-%24", //
})
void testEscapedPaths(boolean isAllowed, String disallowPath, String urlStr) {
final String simpleRobotsTxt = "User-agent: *" + CRLF //
+ "Disallow: " + disallowPath + CRLF //
+ "Allow: /";
BaseRobotRules rules = createRobotRules("mybot", simpleRobotsTxt);
String msg = urlStr + " should " + (isAllowed ? "not" : "") + " be disallowed by rule Disallow: " + disallowPath;
assertEquals(isAllowed, rules.isAllowed(urlStr), msg);
}
@Test
void testSimplestAllowAll() {
final String simpleRobotsTxt = "User-agent: *" + CRLF //
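One row of the parameterized testEscapedPaths above, spelled out as plain assertions (a fragment that would live inside SimpleRobotRulesParserTest, reusing its existing CRLF constant and createRobotRules helper):

final String robotsTxt = "User-agent: *" + CRLF //
        + "Disallow: /path/file-with-a-%2A.html" + CRLF //
        + "Allow: /";
BaseRobotRules rules = createRobotRules("mybot", robotsTxt);
// "%2A" matches a literal asterisk (or "%2A") but is not a wildcard:
assertFalse(rules.isAllowed("https://www.example.com/path/file-with-a-*.html"));
assertTrue(rules.isAllowed("https://www.example.com/path/file-with-a-foo.html"));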

URL normalizer test data (CSV: input URL, expected normalized URL)

@@ -18,6 +18,8 @@ http://foo.com/%66oo.htm%1A, http://foo.com/foo.htm%1A
# check that % decoder converts to upper case letters
http://foo.com/%66oo.htm%c0, http://foo.com/foo.htm%C0
https://www.example.com/search/%2a/, https://www.example.com/search/%2A/
https://www.example.com/topic/9%2f11/, https://www.example.com/topic/9%2F11/
# check that % decoder leaves encoded spaces alone
http://foo.com/you%20too.html, http://foo.com/you%20too.html
@@ -210,4 +212,8 @@ http://example.com/?, http://example.com/
# Should not decode URL query data
https://foo.com/?one/valid_query/without_%2F_params, https://foo.com/?one/valid_query/without_%2F_params
http://foo.com/asdf/page.php?article%2F1234, http://foo.com/asdf/page.php?article%2F1234
# examples from the robots.txt RFC 9309 - * and $ should be unchanged
https://www.example.com/path/file-with-a-*.html, https://www.example.com/path/file-with-a-*.html
https://www.example.com/path/foo-$, https://www.example.com/path/foo-$