mirror of
https://github.com/crawler-commons/crawler-commons
synced 2024-05-22 19:06:03 +02:00
Improvements to BasicURLNormalizer (#292)
- better percent-encoding of URL paths and queries, fixes #263 - hostnames: * convert IDNs from Unicode to Punycode, fixes #248 * remove trailing dot - normalize path `/..` to `/` - also normalize path of file:/ URLs
This commit is contained in:
parent
78d7e7e85f
commit
774c5c8092
|
@ -21,6 +21,7 @@ import static java.nio.charset.StandardCharsets.UTF_8;
|
|||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.net.IDN;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URISyntaxException;
|
||||
import java.net.URL;
|
||||
|
@ -70,7 +71,7 @@ public class BasicURLNormalizer extends URLFilter {
|
|||
* when found in a URI, should be decoded to their corresponding
|
||||
* unreserved characters by URI normalizers.
|
||||
*/
|
||||
if ((0x41 <= c && c <= 0x5A) || (0x61 <= c && c <= 0x7A) || (0x30 <= c && c <= 0x39) || c == 0x2D || c == 0x2E || c == 0x5F || c == 0x7E) {
|
||||
if (isAlphaNumeric(c) || c == 0x2D || c == 0x2E || c == 0x5F || c == 0x7E) {
|
||||
unescapedCharacters[c] = true;
|
||||
} else {
|
||||
unescapedCharacters[c] = false;
|
||||
|
@ -78,6 +79,56 @@ public class BasicURLNormalizer extends URLFilter {
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Look-up table, indexed by ASCII code, of the characters which must always
 * be percent-encoded in URL path and query, cf.
 * https://url.spec.whatwg.org/#percent-encoded-bytes and
 * https://en.wikipedia.org/wiki/Percent-encoding
 */
private final static boolean[] escapedCharacters = new boolean[128];

static {
    for (int code = 0; code < 128; code++) {
        if (unescapedCharacters[code]) {
            // characters which are always unescaped are never escaped
            escapedCharacters[code] = false;
            continue;
        }
        boolean mustEscape;
        switch (code) {
        case 0x20: // space
        case 0x22: // "
        case 0x23: // #
        case 0x3C: // <
        case 0x3E: // >
        case 0x5B: // [
        case 0x5D: // ]
        case 0x5E: // ^
        case 0x60: // `
        case 0x7B: // {
        case 0x7C: // |
        case 0x7D: // }
        case 0x7F: // DEL
            mustEscape = true;
            break;
        default:
            // control characters
            mustEscape = code <= 0x1F;
            break;
        }
        if (mustEscape) {
            escapedCharacters[code] = true;
        } else {
            // neither always escaped nor always unescaped: entry stays false
            LOG.debug("Character {} ({}) not handled as escaped or unescaped", code, (char) code);
        }
    }
}
|
||||
|
||||
private static boolean isAlphaNumeric(int c) {
|
||||
return (0x41 <= c && c <= 0x5A) || (0x61 <= c && c <= 0x7A) || (0x30 <= c && c <= 0x39);
|
||||
}
|
||||
|
||||
private static boolean isHexCharacter(int c) {
|
||||
return (0x41 <= c && c <= 0x46) || (0x61 <= c && c <= 0x66) || (0x30 <= c && c <= 0x39);
|
||||
}
|
||||
|
||||
private static boolean isAscii(String str) {
|
||||
char[] chars = str.toCharArray();
|
||||
for (char c : chars) {
|
||||
if (c > 127) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String filter(String urlString) {
|
||||
|
||||
|
@ -100,6 +151,7 @@ public class BasicURLNormalizer extends URLFilter {
|
|||
String file = url.getFile();
|
||||
|
||||
boolean changed = false;
|
||||
boolean normalizePath = false;
|
||||
|
||||
if (!urlString.startsWith(protocol)) // protocol was lowercased
|
||||
changed = true;
|
||||
|
@ -107,8 +159,13 @@ public class BasicURLNormalizer extends URLFilter {
|
|||
if ("http".equals(protocol) || "https".equals(protocol) || "ftp".equals(protocol)) {
|
||||
|
||||
if (host != null && url.getAuthority() != null) {
|
||||
String newHost = host.toLowerCase(Locale.ROOT); // lowercase
|
||||
// host
|
||||
String newHost;
|
||||
try {
|
||||
newHost = normalizeHostName(host);
|
||||
} catch (IllegalArgumentException | IndexOutOfBoundsException e) {
|
||||
LOG.info("Invalid hostname: {}", host, e);
|
||||
return null;
|
||||
}
|
||||
if (!host.equals(newHost)) {
|
||||
host = newHost;
|
||||
changed = true;
|
||||
|
@ -127,30 +184,22 @@ public class BasicURLNormalizer extends URLFilter {
|
|||
changed = true;
|
||||
}
|
||||
|
||||
normalizePath = true;
|
||||
if (file == null || "".equals(file)) { // add a slash
|
||||
file = "/";
|
||||
changed = true;
|
||||
normalizePath = false; // no further path normalization required
|
||||
} else if (!file.startsWith("/")) {
|
||||
file = "/" + file;
|
||||
changed = true;
|
||||
file = "/" + file;
|
||||
changed = true;
|
||||
normalizePath = false; // no further path normalization required
|
||||
}
|
||||
|
||||
if (url.getRef() != null) { // remove the ref
|
||||
changed = true;
|
||||
}
|
||||
|
||||
// check for unnecessary use of "/../", "/./", and "//"
|
||||
String file2 = null;
|
||||
try {
|
||||
file2 = getFileWithNormalizedPath(url);
|
||||
} catch (MalformedURLException e) {
|
||||
LOG.info("Malformed URL {}", url);
|
||||
return null;
|
||||
}
|
||||
if (!file.equals(file2)) {
|
||||
changed = true;
|
||||
file = file2;
|
||||
}
|
||||
} else if (protocol.equals("file")) {
|
||||
normalizePath = true;
|
||||
}
|
||||
|
||||
// properly encode characters in path/file using percent-encoding
|
||||
|
@ -161,11 +210,28 @@ public class BasicURLNormalizer extends URLFilter {
|
|||
file = file2;
|
||||
}
|
||||
|
||||
if (normalizePath) {
|
||||
// check for unnecessary use of "/../", "/./", and "//"
|
||||
try {
|
||||
if (changed) {
|
||||
url = new URL(protocol, host, port, file);
|
||||
}
|
||||
file2 = getFileWithNormalizedPath(url);
|
||||
if (!file.equals(file2)) {
|
||||
changed = true;
|
||||
file = file2;
|
||||
}
|
||||
} catch (MalformedURLException e) {
|
||||
LOG.info("Malformed URL {}://{}{}{}", protocol, host, (port == -1 ? "" : ":" + port), file);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
if (changed)
|
||||
try {
|
||||
urlString = new URL(protocol, host, port, file).toString();
|
||||
} catch (MalformedURLException e) {
|
||||
LOG.info("Malformed URL {}{}{}{}", protocol, host, port, file);
|
||||
LOG.info("Malformed URL {}://{}{}{}", protocol, host, (port == -1 ? "" : ":" + port), file);
|
||||
return null;
|
||||
}
|
||||
|
||||
|
@ -183,7 +249,7 @@ public class BasicURLNormalizer extends URLFilter {
|
|||
// URI.normalize() does not normalize leading dot segments,
|
||||
// see also http://tools.ietf.org/html/rfc3986#section-5.2.4
|
||||
int start = 0;
|
||||
while (file.startsWith("/../", start)) {
|
||||
// Skip every leading "/.." segment. A segment only counts if it is
// followed by "/" or ends the path; the delimiter must be checked
// relative to the current offset (start + 3), not at the fixed index 3,
// otherwise "/../..foo" would be wrongly stripped down to "foo".
while (file.startsWith("/..", start) && ((start + 3) == file.length() || file.charAt(start + 3) == '/')) {
    start += 3;
}
|
||||
if (start > 0) {
|
||||
|
@ -208,8 +274,8 @@ public class BasicURLNormalizer extends URLFilter {
|
|||
|
||||
/**
|
||||
* Remove % encoding from path segment in URL for characters which should be
|
||||
* unescaped according to <a
|
||||
* href="https://tools.ietf.org/html/rfc3986#section-2.2">RFC3986</a>.
|
||||
* unescaped according to
|
||||
* <a href="https://tools.ietf.org/html/rfc3986#section-2.2">RFC3986</a>.
|
||||
*/
|
||||
private String unescapePath(String path) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
@ -230,7 +296,7 @@ public class BasicURLNormalizer extends URLFilter {
|
|||
|
||||
if (letter < 128 && unescapedCharacters[letter]) {
|
||||
// character should be unescaped in URLs
|
||||
sb.append(new Character((char) letter));
|
||||
sb.append(Character.valueOf((char) letter));
|
||||
} else {
|
||||
// Append the encoded character as uppercase
|
||||
sb.append(matcher.group().toUpperCase(Locale.ROOT));
|
||||
|
@ -246,22 +312,23 @@ public class BasicURLNormalizer extends URLFilter {
|
|||
sb.append(path.substring(end + 1, letter));
|
||||
}
|
||||
|
||||
// Ok!
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert path segment of URL from Unicode to UTF-8 and escape all
|
||||
* characters which should be escaped according to <a
|
||||
* href="https://tools.ietf.org/html/rfc3986#section-2.2">RFC3986</a>..
|
||||
* characters which should be escaped according to
|
||||
* <a href="https://tools.ietf.org/html/rfc3986#section-2.2">RFC3986</a>.
|
||||
*/
|
||||
private String escapePath(String path) {
|
||||
StringBuilder sb = new StringBuilder(path.length());
|
||||
|
||||
// Traverse over all bytes in this URL
|
||||
for (byte b : path.getBytes(UTF_8)) {
|
||||
byte[] bytes = path.getBytes(UTF_8);
|
||||
for (int i = 0; i < bytes.length; i++) {
|
||||
byte b = bytes[i];
|
||||
// Is this a non-ASCII byte (negative value) or a character which must always be escaped?
|
||||
if (b < 33 || b == 91 || b == 93) {
|
||||
if (b < 0 || escapedCharacters[b]) {
|
||||
// Start escape sequence
|
||||
sb.append('%');
|
||||
|
||||
|
@ -276,6 +343,25 @@ public class BasicURLNormalizer extends URLFilter {
|
|||
// No, append this hexadecimal representation
|
||||
sb.append(hex);
|
||||
}
|
||||
} else if (b == 0x25) {
|
||||
// percent sign (%): read-ahead to check whether a valid escape
|
||||
// sequence
|
||||
if ((i + 2) >= bytes.length) {
|
||||
// need at least two more characters
|
||||
sb.append("%25");
|
||||
} else {
|
||||
byte e1 = bytes[i + 1];
|
||||
byte e2 = bytes[i + 2];
|
||||
if (isHexCharacter(e1) && isHexCharacter(e2)) {
|
||||
// valid percent encoding, output and fast-forward
|
||||
i += 2;
|
||||
sb.append((char) b);
|
||||
sb.append((char) e1);
|
||||
sb.append((char) e2);
|
||||
} else {
|
||||
sb.append("%25");
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// No, just append this character as-is
|
||||
sb.append((char) b);
|
||||
|
@ -285,6 +371,35 @@ public class BasicURLNormalizer extends URLFilter {
|
|||
return sb.toString();
|
||||
}
|
||||
|
||||
private String normalizeHostName(String host) throws IllegalArgumentException, IndexOutOfBoundsException {
|
||||
|
||||
/* 1. lowercase host name */
|
||||
host = host.toLowerCase(Locale.ROOT);
|
||||
|
||||
/*
|
||||
* 2. convert between Unicode and ASCII forms for Internationalized
|
||||
* Domain Names (IDNs)
|
||||
*/
|
||||
if (!isAscii(host)) {
|
||||
/*
|
||||
* IllegalArgumentException: thrown if the input string contains
|
||||
* non-convertible Unicode codepoints
|
||||
*
|
||||
* IndexOutOfBoundsException: thrown (undocumented) if one "label"
|
||||
* (non-ASCII dot-separated segment) is longer than 256 characters,
|
||||
* cf. https://bugs.openjdk.java.net/browse/JDK-6806873
|
||||
*/
|
||||
host = IDN.toASCII(host);
|
||||
}
|
||||
|
||||
/* 3. trim a trailing dot */
|
||||
if (host.endsWith(".")) {
|
||||
host = host.substring(0, host.length() - 1);
|
||||
}
|
||||
|
||||
return host;
|
||||
}
|
||||
|
||||
public static void main(String args[]) throws IOException {
|
||||
BasicURLNormalizer normalizer = new BasicURLNormalizer();
|
||||
String line, normUrl;
|
||||
|
|
|
@ -1,7 +1,5 @@
|
|||
# Weird URL, Normalized URL
|
||||
|
||||
# testNUTCH1098
|
||||
# -------------
|
||||
# check that % encoding is normalized
|
||||
http://foo.com/%66oo.html, http://foo.com/foo.html
|
||||
|
||||
|
@ -34,17 +32,15 @@ http://foo.com/file.html%23cz, http://foo.com/file.html%23cz
|
|||
http://foo.com/fast/dir%2fcz, http://foo.com/fast/dir%2Fcz
|
||||
|
||||
# check that control chars are encoded
|
||||
#http://foo.com/\u001a!, http://foo.com/%1A!
|
||||
http://foo.com/!, http://foo.com/%1A!
|
||||
|
||||
# check that control chars are always encoded into 2 digits
|
||||
#http://foo.com/\u0001!, http://foo.com/%01!
|
||||
http://foo.com/!, http://foo.com/%01!
|
||||
|
||||
# check encoding of spanish chars
|
||||
#http://mydomain.com/en Espa\u00F1ol.aspx, http://mydomain.com/en%20Espa%C3%B1ol.aspx
|
||||
# encoding of Spanish chars
|
||||
http://mydomain.com/en Español.aspx, http://mydomain.com/en%20Espa%C3%B1ol.aspx
|
||||
|
||||
|
||||
# testNUTCH2064
|
||||
# -------------
|
||||
# Ampersand and colon and other punctuation characters are not to be unescaped
|
||||
http://x.com/s?q=a%26b&m=10, http://x.com/s?q=a%26b&m=10
|
||||
http://x.com/show?http%3A%2F%2Fx.com%2Fb, http://x.com/show?http%3A%2F%2Fx.com%2Fb
|
||||
|
@ -53,9 +49,11 @@ http://google.com/search?q=c%2B%2B, http://google.com/search?q=c%2B%2B
|
|||
# do also not touch the query part which is application/x-www-form-urlencoded
|
||||
http://x.com/s?q=a+b, http://x.com/s?q=a+b
|
||||
|
||||
# and keep Internationalized domain names http://bücher.de/ may be http://xn--bcher-kva.de/
|
||||
# but definitely not http://b%C3%BCcher.de/
|
||||
http://b\u00fccher.de/, http://b\u00fccher.de/
|
||||
# convert Internationalized Domain Names (IDNs) from Unicode to Punycode #248
|
||||
# (definitely do not apply percent-encoding: http://b%C3%BCcher.de/)
|
||||
http://bücher.de/, http://xn--bcher-kva.de/
|
||||
http://êxample.com, http://xn--xample-hva.com/
|
||||
https://нэб.рф/, https://xn--90ax2c.xn--p1ai/
|
||||
|
||||
# test whether percent-encoding works together with other normalizations
|
||||
http://x.com/./a/../%66.html, http://x.com/f.html
|
||||
|
@ -64,7 +62,7 @@ http://x.com/./a/../%66.html, http://x.com/f.html
|
|||
http://x.com/?x[y]=1, http://x.com/?x%5By%5D=1
|
||||
|
||||
# boundary test for first character outside the ASCII range (U+0080)
|
||||
#http://x.com/foo\u0080, http://x.com/foo%C2%80
|
||||
http://x.com/foo, http://x.com/foo%C2%80
|
||||
http://x.com/foo%c2%80, http://x.com/foo%C2%80
|
||||
|
||||
|
||||
|
@ -119,7 +117,10 @@ http://foo.com/aa//bb/foo.html, http://foo.com/aa/bb/foo.html
|
|||
http://foo.com/aa/bb//foo.html, http://foo.com/aa/bb/foo.html
|
||||
http://foo.com//aa//bb//foo.html, http://foo.com/aa/bb/foo.html
|
||||
http://foo.com////aa////bb//foo.html, http://foo.com/aa/bb/foo.html
|
||||
http://foo.com////aa////bb////foo.html, http://foo.com/aa/bb/foo.html
|
||||
http://foo.com/aa?referer=http://bar.com, http://foo.com/aa?referer=http://bar.com
|
||||
# also normalize /.. (already in the root directory)
|
||||
http://foo.com/.., http://foo.com/
|
||||
|
||||
# check URLs without host (authority)
|
||||
file:///foo/bar.txt, file:///foo/bar.txt
|
||||
|
@ -132,3 +133,36 @@ http:///////, http:/
|
|||
http://example.com?,http://example.com/?
|
||||
http://example.com?a=1,http://example.com/?a=1
|
||||
|
||||
# normalizing percent escapes #263
|
||||
https://www.last.fm/music/Prefuse+73/_/90%+of+My+Mind+Is+With+You,https://www.last.fm/music/Prefuse+73/_/90%25+of+My+Mind+Is+With+You
|
||||
|
||||
# escape curly braces properly
|
||||
http://foo.com/{{stuff}} , http://foo.com/%7B%7Bstuff%7D%7D
|
||||
|
||||
# special characters in path/query
|
||||
"http://www.example.com/a/c/../b/search?q=foobar""", http://www.example.com/a/b/search?q=foobar%22
|
||||
http://www.example.com/a/c/../b/search?q=foobar%, http://www.example.com/a/b/search?q=foobar%25
|
||||
http://www.example.com/a/c/../b/search?q=foobar<, http://www.example.com/a/b/search?q=foobar%3C
|
||||
http://www.example.com/a/c/../b/search?q=foobar>, http://www.example.com/a/b/search?q=foobar%3E
|
||||
http://www.example.com/a/c/../b/search?q=foobar^, http://www.example.com/a/b/search?q=foobar%5E
|
||||
http://www.example.com/a/c/../b/search?q=foobar`, http://www.example.com/a/b/search?q=foobar%60
|
||||
http://www.example.com/a/c/../b/search?q=foobar|, http://www.example.com/a/b/search?q=foobar%7C
|
||||
|
||||
# escape percent sign if it's initial to an invalid escape sequence
|
||||
http://www.example.com/p%zz%77%v, http://www.example.com/p%25zzw%25v
|
||||
|
||||
# boundary test: percent sign close to the end of string
|
||||
http://www.example.com/search?q=foobar%, http://www.example.com/search?q=foobar%25
|
||||
http://www.example.com/search?q=foobar%2, http://www.example.com/search?q=foobar%252
|
||||
http://www.example.com/search?q=foobar%25, http://www.example.com/search?q=foobar%25
|
||||
http://www.example.com/search?q=foobar%252, http://www.example.com/search?q=foobar%252
|
||||
|
||||
# protocol to be lowercased
|
||||
HTTP://foo.com/, http://foo.com/
|
||||
|
||||
# removal of trailing dot in hostname
|
||||
https://www.example.org./, https://www.example.org/
|
||||
|
||||
# file:/ URLs
|
||||
file:/var/www/html/////./bar/index.html, file:/var/www/html/bar/index.html
|
||||
file:/var/www/html/foo/../bar/index.html, file:/var/www/html/bar/index.html
|
||||
|
|
Can't render this file because it has a wrong number of fields in line 3.
|
Loading…
Reference in New Issue