1
0
Fork 0
mirror of https://github.com/crawler-commons/crawler-commons synced 2024-05-22 19:06:03 +02:00

Improvements to BasicURLNormalizer (#292)

- better percent-encoding of URL paths and queries, fixes #263
- hostnames:
  * convert IDNs from Unicode to Punycode, fixes #248
  * remove trailing dot
- normalize path `/..` to `/`
- also normalize path of file:/ URLs
This commit is contained in:
Sebastian Nagel 2020-06-22 14:51:39 +02:00 committed by GitHub
parent 78d7e7e85f
commit 774c5c8092
Signed by: GitHub
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 189 additions and 40 deletions

View File

@ -21,6 +21,7 @@ import static java.nio.charset.StandardCharsets.UTF_8;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.IDN;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
@ -70,7 +71,7 @@ public class BasicURLNormalizer extends URLFilter {
* when found in a URI, should be decoded to their corresponding
* unreserved characters by URI normalizers.
*/
if ((0x41 <= c && c <= 0x5A) || (0x61 <= c && c <= 0x7A) || (0x30 <= c && c <= 0x39) || c == 0x2D || c == 0x2E || c == 0x5F || c == 0x7E) {
if (isAlphaNumeric(c) || c == 0x2D || c == 0x2E || c == 0x5F || c == 0x7E) {
unescapedCharacters[c] = true;
} else {
unescapedCharacters[c] = false;
@ -78,6 +79,56 @@ public class BasicURLNormalizer extends URLFilter {
}
}
/**
 * Look-up table, indexed by ASCII code point, flagging the characters which
 * must always be percent-encoded when they occur in a URL path or query, cf.
 * https://url.spec.whatwg.org/#percent-encoded-bytes and
 * https://en.wikipedia.org/wiki/Percent-encoding
 */
private static final boolean[] escapedCharacters = new boolean[128];

static {
    // printable ASCII characters which are always escaped
    final String alwaysEscaped = " \"#<>[]^`{|}";
    for (int c = 0; c < 128; c++) {
        if (unescapedCharacters[c]) {
            // characters which are always unescaped are never escaped
            escapedCharacters[c] = false;
            continue;
        }
        // control characters (0x00 - 0x1F) and DEL (0x7F)
        boolean isControl = c <= 0x1F || c == 0x7F;
        if (isControl || alwaysEscaped.indexOf(c) >= 0) {
            escapedCharacters[c] = true;
        } else {
            // neither in the "escaped" nor in the "unescaped" set: the
            // table entry keeps its default value (false)
            LOG.debug("Character {} ({}) not handled as escaped or unescaped", c, (char) c);
        }
    }
}
/**
 * Check whether a code point is an ASCII letter (A-Z, a-z) or an ASCII
 * digit (0-9).
 *
 * @param c
 *            code point to test
 * @return true if <code>c</code> is an ASCII letter or digit
 */
private static boolean isAlphaNumeric(int c) {
    if ('0' <= c && c <= '9') {
        return true;
    }
    if ('A' <= c && c <= 'Z') {
        return true;
    }
    return 'a' <= c && c <= 'z';
}
/**
 * Check whether a code point is an ASCII hexadecimal digit (0-9, A-F or
 * a-f).
 *
 * @param c
 *            code point to test
 * @return true if <code>c</code> is a hexadecimal digit
 */
private static boolean isHexCharacter(int c) {
    if ('0' <= c && c <= '9') {
        return true;
    }
    if ('A' <= c && c <= 'F') {
        return true;
    }
    return 'a' <= c && c <= 'f';
}
/**
 * Check whether a string consists only of ASCII characters, i.e. contains
 * no char above 127.
 *
 * @param str
 *            string to test
 * @return false if any char of <code>str</code> is greater than 127, true
 *         otherwise (including for the empty string)
 */
private static boolean isAscii(String str) {
    final int length = str.length();
    for (int i = 0; i < length; i++) {
        if (str.charAt(i) >= 0x80) {
            return false;
        }
    }
    return true;
}
@Override
public String filter(String urlString) {
@ -100,6 +151,7 @@ public class BasicURLNormalizer extends URLFilter {
String file = url.getFile();
boolean changed = false;
boolean normalizePath = false;
if (!urlString.startsWith(protocol)) // protocol was lowercased
changed = true;
@ -107,8 +159,13 @@ public class BasicURLNormalizer extends URLFilter {
if ("http".equals(protocol) || "https".equals(protocol) || "ftp".equals(protocol)) {
if (host != null && url.getAuthority() != null) {
String newHost = host.toLowerCase(Locale.ROOT); // lowercase
// host
String newHost;
try {
newHost = normalizeHostName(host);
} catch (IllegalArgumentException | IndexOutOfBoundsException e) {
LOG.info("Invalid hostname: {}", host, e);
return null;
}
if (!host.equals(newHost)) {
host = newHost;
changed = true;
@ -127,30 +184,22 @@ public class BasicURLNormalizer extends URLFilter {
changed = true;
}
normalizePath = true;
if (file == null || "".equals(file)) { // add a slash
file = "/";
changed = true;
normalizePath = false; // no further path normalization required
} else if (!file.startsWith("/")) {
file = "/" + file;
changed = true;
file = "/" + file;
changed = true;
normalizePath = false; // no further path normalization required
}
if (url.getRef() != null) { // remove the ref
changed = true;
}
// check for unnecessary use of "/../", "/./", and "//"
String file2 = null;
try {
file2 = getFileWithNormalizedPath(url);
} catch (MalformedURLException e) {
LOG.info("Malformed URL {}", url);
return null;
}
if (!file.equals(file2)) {
changed = true;
file = file2;
}
} else if (protocol.equals("file")) {
normalizePath = true;
}
// properly encode characters in path/file using percent-encoding
@ -161,11 +210,28 @@ public class BasicURLNormalizer extends URLFilter {
file = file2;
}
if (normalizePath) {
// check for unnecessary use of "/../", "/./", and "//"
try {
if (changed) {
url = new URL(protocol, host, port, file);
}
file2 = getFileWithNormalizedPath(url);
if (!file.equals(file2)) {
changed = true;
file = file2;
}
} catch (MalformedURLException e) {
LOG.info("Malformed URL {}://{}{}{}", protocol, host, (port == -1 ? "" : ":" + port), file);
return null;
}
}
if (changed)
try {
urlString = new URL(protocol, host, port, file).toString();
} catch (MalformedURLException e) {
LOG.info("Malformed URL {}{}{}{}", protocol, host, port, file);
LOG.info("Malformed URL {}://{}{}{}", protocol, host, (port == -1 ? "" : ":" + port), file);
return null;
}
@ -183,7 +249,7 @@ public class BasicURLNormalizer extends URLFilter {
// URI.normalize() does not normalize leading dot segments,
// see also http://tools.ietf.org/html/rfc3986#section-5.2.4
int start = 0;
while (file.startsWith("/../", start)) {
while (file.startsWith("/..", start) && ((start + 3) == file.length() || file.charAt(3) == '/')) {
start += 3;
}
if (start > 0) {
@ -208,8 +274,8 @@ public class BasicURLNormalizer extends URLFilter {
/**
* Remove % encoding from path segment in URL for characters which should be
* unescaped according to <a
* href="https://tools.ietf.org/html/rfc3986#section-2.2">RFC3986</a>.
* unescaped according to
* <a href="https://tools.ietf.org/html/rfc3986#section-2.2">RFC3986</a>.
*/
private String unescapePath(String path) {
StringBuilder sb = new StringBuilder();
@ -230,7 +296,7 @@ public class BasicURLNormalizer extends URLFilter {
if (letter < 128 && unescapedCharacters[letter]) {
// character should be unescaped in URLs
sb.append(new Character((char) letter));
sb.append(Character.valueOf((char) letter));
} else {
// Append the encoded character as uppercase
sb.append(matcher.group().toUpperCase(Locale.ROOT));
@ -246,22 +312,23 @@ public class BasicURLNormalizer extends URLFilter {
sb.append(path.substring(end + 1, letter));
}
// Ok!
return sb.toString();
}
/**
* Convert path segment of URL from Unicode to UTF-8 and escape all
* characters which should be escaped according to <a
* href="https://tools.ietf.org/html/rfc3986#section-2.2">RFC3986</a>..
* characters which should be escaped according to
* <a href="https://tools.ietf.org/html/rfc3986#section-2.2">RFC3986</a>.
*/
private String escapePath(String path) {
StringBuilder sb = new StringBuilder(path.length());
// Traverse over all bytes in this URL
for (byte b : path.getBytes(UTF_8)) {
byte[] bytes = path.getBytes(UTF_8);
for (int i = 0; i < bytes.length; i++) {
byte b = bytes[i];
// Is this a control character?
if (b < 33 || b == 91 || b == 93) {
if (b < 0 || escapedCharacters[b]) {
// Start escape sequence
sb.append('%');
@ -276,6 +343,25 @@ public class BasicURLNormalizer extends URLFilter {
// No, append this hexadecimal representation
sb.append(hex);
}
} else if (b == 0x25) {
// percent sign (%): read-ahead to check whether a valid escape
// sequence
if ((i + 2) >= bytes.length) {
// need at least two more characters
sb.append("%25");
} else {
byte e1 = bytes[i + 1];
byte e2 = bytes[i + 2];
if (isHexCharacter(e1) && isHexCharacter(e2)) {
// valid percent encoding, output and fast-forward
i += 2;
sb.append((char) b);
sb.append((char) e1);
sb.append((char) e2);
} else {
sb.append("%25");
}
}
} else {
// No, just append this character as-is
sb.append((char) b);
@ -285,6 +371,35 @@ public class BasicURLNormalizer extends URLFilter {
return sb.toString();
}
/**
 * Normalize a host name in three steps: lowercase it, convert it to its
 * ASCII (Punycode) form if it contains non-ASCII characters, and strip a
 * single trailing dot.
 *
 * @param host
 *            host name to normalize
 * @return the normalized host name
 * @throws IllegalArgumentException
 *             raised by {@link IDN#toASCII(String)} if the host name
 *             contains non-convertible Unicode code points
 * @throws IndexOutOfBoundsException
 *             raised (undocumented) by {@link IDN#toASCII(String)} if one
 *             "label" (non-ASCII dot-separated segment) is longer than 256
 *             characters, cf.
 *             https://bugs.openjdk.java.net/browse/JDK-6806873
 */
private String normalizeHostName(String host) throws IllegalArgumentException, IndexOutOfBoundsException {
    // step 1: lowercase, independent of the default locale
    String normalized = host.toLowerCase(Locale.ROOT);

    // step 2: Internationalized Domain Names (IDNs) are converted from
    // their Unicode form to the ASCII form; pure ASCII names are left as is
    boolean needsIdnConversion = !isAscii(normalized);
    if (needsIdnConversion) {
        normalized = IDN.toASCII(normalized);
    }

    // step 3: drop one trailing dot, if any
    int lastIndex = normalized.length() - 1;
    return normalized.endsWith(".") ? normalized.substring(0, lastIndex) : normalized;
}
public static void main(String args[]) throws IOException {
BasicURLNormalizer normalizer = new BasicURLNormalizer();
String line, normUrl;

View File

@ -1,7 +1,5 @@
# Weird URL, Normalized URL
# testNUTCH1098
# -------------
# check that % encoding is normalized
http://foo.com/%66oo.html, http://foo.com/foo.html
@ -34,17 +32,15 @@ http://foo.com/file.html%23cz, http://foo.com/file.html%23cz
http://foo.com/fast/dir%2fcz, http://foo.com/fast/dir%2Fcz
# check that control chars are encoded
#http://foo.com/\u001a!, http://foo.com/%1A!
http://foo.com/!, http://foo.com/%1A!
# check that control chars are always encoded into 2 digits
#http://foo.com/\u0001!, http://foo.com/%01!
http://foo.com/!, http://foo.com/%01!
# check encoding of spanish chars
#http://mydomain.com/en Espa\u00F1ol.aspx, http://mydomain.com/en%20Espa%C3%B1ol.aspx
# encoding of Spanish chars
http://mydomain.com/en Español.aspx, http://mydomain.com/en%20Espa%C3%B1ol.aspx
# testNUTCH2064
# -------------
# Ampersand and colon and other punctuation characters are not to be unescaped
http://x.com/s?q=a%26b&m=10, http://x.com/s?q=a%26b&m=10
http://x.com/show?http%3A%2F%2Fx.com%2Fb, http://x.com/show?http%3A%2F%2Fx.com%2Fb
@ -53,9 +49,11 @@ http://google.com/search?q=c%2B%2B, http://google.com/search?q=c%2B%2B
# do also not touch the query part which is application/x-www-form-urlencoded
http://x.com/s?q=a+b, http://x.com/s?q=a+b
# and keep Internationalized domain names http://bücher.de/ may be http://xn--bcher-kva.de/
# but definitely not http://b%C3%BCcher.de/
http://b\u00fccher.de/, http://b\u00fccher.de/
# convert Internationalized Domain Names (IDNs) from Unicode to Punycode #248
# (definitely do not apply percent-encoding: http://b%C3%BCcher.de/)
http://bücher.de/, http://xn--bcher-kva.de/
http://êxample.com, http://xn--xample-hva.com/
https://нэб.рф/, https://xn--90ax2c.xn--p1ai/
# test whether percent-encoding works together with other normalizations
http://x.com/./a/../%66.html, http://x.com/f.html
@ -64,7 +62,7 @@ http://x.com/./a/../%66.html, http://x.com/f.html
http://x.com/?x[y]=1, http://x.com/?x%5By%5D=1
# boundary test for first character outside the ASCII range (U+0080)
#http://x.com/foo\u0080, http://x.com/foo%C2%80
http://x.com/foo€, http://x.com/foo%C2%80
http://x.com/foo%c2%80, http://x.com/foo%C2%80
@ -119,7 +117,10 @@ http://foo.com/aa//bb/foo.html, http://foo.com/aa/bb/foo.html
http://foo.com/aa/bb//foo.html, http://foo.com/aa/bb/foo.html
http://foo.com//aa//bb//foo.html, http://foo.com/aa/bb/foo.html
http://foo.com////aa////bb//foo.html, http://foo.com/aa/bb/foo.html
http://foo.com////aa////bb////foo.html, http://foo.com/aa/bb/foo.html
http://foo.com/aa?referer=http://bar.com, http://foo.com/aa?referer=http://bar.com
# also normalize /.. (already in the root directory)
http://foo.com/.., http://foo.com/
# check URLs without host (authority)
file:///foo/bar.txt, file:///foo/bar.txt
@ -132,3 +133,36 @@ http:///////, http:/
http://example.com?,http://example.com/?
http://example.com?a=1,http://example.com/?a=1
# normalizing percent escapes #263
https://www.last.fm/music/Prefuse+73/_/90%+of+My+Mind+Is+With+You,https://www.last.fm/music/Prefuse+73/_/90%25+of+My+Mind+Is+With+You
# escape curly braces properly
http://foo.com/{{stuff}} , http://foo.com/%7B%7Bstuff%7D%7D
# special characters in path/query
"http://www.example.com/a/c/../b/search?q=foobar""", http://www.example.com/a/b/search?q=foobar%22
http://www.example.com/a/c/../b/search?q=foobar%, http://www.example.com/a/b/search?q=foobar%25
http://www.example.com/a/c/../b/search?q=foobar<, http://www.example.com/a/b/search?q=foobar%3C
http://www.example.com/a/c/../b/search?q=foobar>, http://www.example.com/a/b/search?q=foobar%3E
http://www.example.com/a/c/../b/search?q=foobar^, http://www.example.com/a/b/search?q=foobar%5E
http://www.example.com/a/c/../b/search?q=foobar`, http://www.example.com/a/b/search?q=foobar%60
http://www.example.com/a/c/../b/search?q=foobar|, http://www.example.com/a/b/search?q=foobar%7C
# escape percent sign if it's initial to an invalid escape sequence
http://www.example.com/p%zz%77%v, http://www.example.com/p%25zzw%25v
# boundary test: percent sign close to the end of string
http://www.example.com/search?q=foobar%, http://www.example.com/search?q=foobar%25
http://www.example.com/search?q=foobar%2, http://www.example.com/search?q=foobar%252
http://www.example.com/search?q=foobar%25, http://www.example.com/search?q=foobar%25
http://www.example.com/search?q=foobar%252, http://www.example.com/search?q=foobar%252
# protocol to be lowercased
HTTP://foo.com/, http://foo.com/
# removal of trailing dot in hostname
https://www.example.org./, https://www.example.org/
# file:/ URLs
file:/var/www/html/////./bar/index.html, file:/var/www/html/bar/index.html
file:/var/www/html/foo/../bar/index.html, file:/var/www/html/bar/index.html

Can't render this file because it has a wrong number of fields in line 3.