mirror of
https://github.com/crawler-commons/crawler-commons
synced 2024-05-04 14:36:04 +02:00
Merge pull request #408 from sebastian-nagel/cc-195-robotstxt-url-decode
[Robots.txt] Path analyse bug with url-decode if allow/disallow path contains escaped wild-card characters
This commit is contained in:
commit
962787f4fd
|
@ -540,7 +540,11 @@ public class BasicURLNormalizer extends URLFilter {
|
|||
* characters which should be escaped according to <a
|
||||
* href="https://tools.ietf.org/html/rfc3986#section-2.2">RFC3986</a>.
|
||||
*/
|
||||
private static String escapePath(String path) {
|
||||
public static String escapePath(String path) {
|
||||
return escapePath(path, null);
|
||||
}
|
||||
|
||||
public static String escapePath(String path, boolean[] extraEscapedBytes) {
|
||||
StringBuilder sb = new StringBuilder(path.length());
|
||||
|
||||
// Traverse over all bytes in this URL
|
||||
|
@ -548,7 +552,7 @@ public class BasicURLNormalizer extends URLFilter {
|
|||
for (int i = 0; i < bytes.length; i++) {
|
||||
byte b = bytes[i];
|
||||
// Is this a control character?
|
||||
if (b < 0 || escapedCharacters[b]) {
|
||||
if (b < 0 || escapedCharacters[b] || (extraEscapedBytes != null && extraEscapedBytes[b])) {
|
||||
// Start escape sequence
|
||||
sb.append('%');
|
||||
|
||||
|
|
|
@ -18,11 +18,12 @@ package crawlercommons.robots;
|
|||
|
||||
import java.io.Serializable;
|
||||
import java.net.URL;
|
||||
import java.net.URLDecoder;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
import crawlercommons.filters.basic.BasicURLNormalizer;
|
||||
|
||||
/**
|
||||
* Result from parsing a single robots.txt file - which means we get a set of
|
||||
* rules, and an optional crawl-delay, and an optional sitemap URL. Note that we
|
||||
|
@ -121,6 +122,13 @@ public class SimpleRobotRules extends BaseRobotRules {
|
|||
protected ArrayList<RobotRule> _rules;
|
||||
protected RobotRulesMode _mode;
|
||||
|
||||
/** Special characters which require percent-encoding for path matching */
|
||||
protected final static boolean[] specialCharactersPathMatching = new boolean[128];
|
||||
static {
|
||||
specialCharactersPathMatching['*'] = true;
|
||||
specialCharactersPathMatching['$'] = true;
|
||||
}
|
||||
|
||||
public SimpleRobotRules() {
|
||||
this(RobotRulesMode.ALLOW_SOME);
|
||||
}
|
||||
|
@ -200,6 +208,24 @@ public class SimpleRobotRules extends BaseRobotRules {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Encode/decode (using percent-encoding) all characters where necessary:
|
||||
* encode Unicode/non-ASCII characters and decode printable ASCII
|
||||
* characters without special semantics.
|
||||
*
|
||||
* @param urlPathQuery
|
||||
* path and query component of the URL
|
||||
* @param additionalEncodedBytes
|
||||
* boolean array to request bytes (ASCII characters) to be
|
||||
* percent-encoded in addition to other characters requiring
|
||||
* encoding (Unicode/non-ASCII and characters not allowed in
|
||||
* URLs).
|
||||
* @return properly percent-encoded URL path and query
|
||||
*/
|
||||
public static String escapePath(String urlPathQuery, boolean[] additionalEncodedBytes) {
|
||||
return BasicURLNormalizer.escapePath(BasicURLNormalizer.unescapePath(urlPathQuery), additionalEncodedBytes);
|
||||
}
|
||||
|
||||
private String getPath(String url, boolean getWithQuery) {
|
||||
|
||||
try {
|
||||
|
@ -214,9 +240,17 @@ public class SimpleRobotRules extends BaseRobotRules {
|
|||
path += "?" + query;
|
||||
}
|
||||
|
||||
// We used to lower-case the path, but Google says we need to do
|
||||
// case-sensitive matching.
|
||||
return URLDecoder.decode(path, "UTF-8");
|
||||
/*
|
||||
* We used to lower-case the path, but Google says we need to do
|
||||
* case-sensitive matching.
|
||||
*
|
||||
* However, we need to properly decode percent-encoded characters,
|
||||
* but preserve those escaped characters which have special
|
||||
* semantics in path matching, e.g. slash `/`. However, the
|
||||
* implementation of the path matching requires that asterisk `*`
|
||||
* and dollar `$` are exceptionally percent-encoded.
|
||||
*/
|
||||
return escapePath(path, specialCharactersPathMatching);
|
||||
} catch (Exception e) {
|
||||
// If the URL is invalid, we don't really care since the fetch
|
||||
// will fail, so return the root.
|
||||
|
|
|
@ -21,7 +21,6 @@ import java.net.HttpURLConnection;
|
|||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.net.URLConnection;
|
||||
import java.net.URLDecoder;
|
||||
import java.nio.charset.Charset;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Arrays;
|
||||
|
@ -778,13 +777,23 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
|
|||
|
||||
/**
|
||||
* Add any uniform rules to clean up path directives
|
||||
* <ul>
|
||||
* <li>trim leading and trailing white space and control characters not
|
||||
* handled by the tokenizer</li>
|
||||
* <li>properly percent-encode all characters where necessary</li>
|
||||
* <li>but make sure that characters with special semantics for path
|
||||
* matching (asterisk <code>*</code>, slash <code>/</code>, dollar
|
||||
* <code>$</code>, etc.) are left as is (do not decode if percent-encoded).</li>
|
||||
* </ul>
|
||||
*
|
||||
* This method uses {@link SimpleRobotRules#escapePath(String, boolean[])}
|
||||
* to normalize the URL path before matching against allow/disallow rules.
|
||||
*
|
||||
* @param path
|
||||
* @return clean path
|
||||
* @return clean and encoded path
|
||||
*/
|
||||
private String normalizePathDirective(String path) {
|
||||
path = path.trim();
|
||||
|
||||
return path;
|
||||
return SimpleRobotRules.escapePath(path.trim(), null);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -803,7 +812,6 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
|
|||
String path = token.getData();
|
||||
|
||||
try {
|
||||
path = URLDecoder.decode(path, "UTF-8");
|
||||
path = normalizePathDirective(path);
|
||||
if (path.length() == 0) {
|
||||
// Disallow: <nothing> => allow all.
|
||||
|
@ -832,7 +840,6 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
|
|||
String path = token.getData();
|
||||
|
||||
try {
|
||||
path = URLDecoder.decode(path, "UTF-8");
|
||||
path = normalizePathDirective(path);
|
||||
} catch (Exception e) {
|
||||
reportWarning(state, "Error parsing robots rules - can't decode path: {}", path);
|
||||
|
|
|
@ -265,6 +265,49 @@ public class SimpleRobotRulesParserTest {
|
|||
// assertFalse(rules.isAllowed("https://www.example.com/bücher/book1.html"));
|
||||
}
|
||||
|
||||
@ParameterizedTest
|
||||
@CsvSource({ // Tests for percent-encoded characters with special semantics
|
||||
// in allow/disallow statements:
|
||||
// (a) must not trim percent-encoded white space
|
||||
"True, /*%20, https://www.example.com/", //
|
||||
"False, /*%20, https://www.example.com/foobar%20/", //
|
||||
"True, /*%20, https://www.example.com/foobar/", //
|
||||
// (b) match literal %2F in URL path, but do not match a
|
||||
// slash
|
||||
"True, /*%2F*, https://www.example.com/path/index.html", //
|
||||
"False, /*%2F*, https://www.example.com/topic/9%2F11/index.html", //
|
||||
"False, /topic/9%2F11/, https://www.example.com/topic/9%2F11/index.html", //
|
||||
"False, /topic/9%2F11/, https://www.example.com/topic/9%2f11/index.html", //
|
||||
"False, /q?*mime=application%2Fpdf, https://www.example.com/q?mime=application%2Fpdf", //
|
||||
// (c) percent-encoded dollar and asterisk (*)
|
||||
"False, /$, https://www.example.com/", //
|
||||
"True, /$, https://www.example.com/foobar", //
|
||||
"True, /%24, https://www.example.com/", //
|
||||
"False, /%24, https://www.example.com/%24100", //
|
||||
"False, /%24, https://www.example.com/$100", //
|
||||
"True, /search/%2A/, https://www.example.com/search/foobar/", //
|
||||
"False, /search/%2A/, https://www.example.com/search/%2A/", //
|
||||
"False, /search/%2A/, https://www.example.com/search/%2a/", //
|
||||
"False, /search/%2a/, https://www.example.com/search/%2a/", //
|
||||
"False, /search/%2a/, https://www.example.com/search/*/", //
|
||||
"False, /search/*/, https://www.example.com/search/foobar/", //
|
||||
// examples from RFC 9309
|
||||
"False, /path/file-with-a-%2A.html, https://www.example.com/path/file-with-a-*.html", //
|
||||
"True, /path/file-with-a-%2A.html, https://www.example.com/path/file-with-a-foo.html", //
|
||||
"False, /path/file-with-a-%2A.html, https://www.example.com/path/file-with-a-%2A.html", //
|
||||
"False, /path/foo-%24, https://www.example.com/path/foo-$", //
|
||||
"True, /path/foo-%24, https://www.example.com/path/foo-", //
|
||||
"False, /path/foo-%24, https://www.example.com/path/foo-%24", //
|
||||
})
|
||||
void testEscapedPaths(boolean isAllowed, String disallowPath, String urlStr) {
|
||||
final String simpleRobotsTxt = "User-agent: *" + CRLF //
|
||||
+ "Disallow: " + disallowPath + CRLF //
|
||||
+ "Allow: /";
|
||||
BaseRobotRules rules = createRobotRules("mybot", simpleRobotsTxt);
|
||||
String msg = urlStr + " should " + (isAllowed ? "not" : "") + " be disallowed by rule Disallow: " + disallowPath;
|
||||
assertEquals(isAllowed, rules.isAllowed(urlStr), msg);
|
||||
}
|
||||
|
||||
@Test
|
||||
void testSimplestAllowAll() {
|
||||
final String simpleRobotsTxt = "User-agent: *" + CRLF //
|
||||
|
|
|
@ -18,6 +18,8 @@ http://foo.com/%66oo.htm%1A, http://foo.com/foo.htm%1A
|
|||
|
||||
# check that % decoder converts to upper case letters
|
||||
http://foo.com/%66oo.htm%c0, http://foo.com/foo.htm%C0
|
||||
https://www.example.com/search/%2a/, https://www.example.com/search/%2A/
|
||||
https://www.example.com/topic/9%2f11/, https://www.example.com/topic/9%2F11/
|
||||
|
||||
# check that % decoder leaves encoded spaces alone
|
||||
http://foo.com/you%20too.html, http://foo.com/you%20too.html
|
||||
|
@ -210,4 +212,8 @@ http://example.com/?, http://example.com/
|
|||
|
||||
# Should not decode URL query data
|
||||
https://foo.com/?one/valid_query/without_%2F_params, https://foo.com/?one/valid_query/without_%2F_params
|
||||
http://foo.com/asdf/page.php?article%2F1234, http://foo.com/asdf/page.php?article%2F1234
|
||||
http://foo.com/asdf/page.php?article%2F1234, http://foo.com/asdf/page.php?article%2F1234
|
||||
|
||||
# examples from the robots.txt RFC 9309 - * and $ should be unchanged
|
||||
https://www.example.com/path/file-with-a-*.html, https://www.example.com/path/file-with-a-*.html
|
||||
https://www.example.com/path/foo-$, https://www.example.com/path/foo-$
|
Can't render this file because it has a wrong number of fields in line 3.
|
Loading…
Reference in New Issue