Improvements to BasicURLNormalizer (#292)

- better percent-encoding of URL paths and queries, fixes #263 - hostnames: * convert IDNs from Unicode to Punycode, fixes #248 * remove trailing dot - normalize path `/..` to `/` - also normalize path of file:/ URLs
2024-05-22 19:06:03 +02:00 · 2020-06-22 14:51:39 +02:00 · 2020-06-22 14:51:39 +02:00 · 774c5c8092
parent 78d7e7e85f
commit 774c5c8092
2 changed files with 189 additions and 40 deletions
--- a/src/main/java/crawlercommons/filters/basic/BasicURLNormalizer.java
+++ b/src/main/java/crawlercommons/filters/basic/BasicURLNormalizer.java
@ -21,6 +21,7 @@ import static java.nio.charset.StandardCharsets.UTF_8;
 import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStreamReader;
+import java.net.IDN;
 import java.net.MalformedURLException;
 import java.net.URISyntaxException;
 import java.net.URL;
@ -70,7 +71,7 @@ public class BasicURLNormalizer extends URLFilter {
             * when found in a URI, should be decoded to their corresponding
             * unreserved characters by URI normalizers.
             */
-            if ((0x41 <= c && c <= 0x5A) || (0x61 <= c && c <= 0x7A) || (0x30 <= c && c <= 0x39) || c == 0x2D || c == 0x2E || c == 0x5F || c == 0x7E) {
+            if (isAlphaNumeric(c) || c == 0x2D || c == 0x2E || c == 0x5F || c == 0x7E) {
                unescapedCharacters[c] = true;
            } else {
                unescapedCharacters[c] = false;
@ -78,6 +79,56 @@ public class BasicURLNormalizer extends URLFilter {
        }
    }

+    /**
+     * look-up table for characters which should always be escaped in URL path
+     * and query, cf. https://url.spec.whatwg.org/#percent-encoded-bytes and
+     * https://en.wikipedia.org/wiki/Percent-encoding
+     */
+    private final static boolean[] escapedCharacters = new boolean[128];
+    static {
+        for (int c = 0; c < 128; c++) {
+            if (unescapedCharacters[c]) {
+                escapedCharacters[c] = false;
+            } else if (c <= 0x1F // control characters
+                            || c == 0x20 // space
+                            || c == 0x22 // "
+                            || c == 0x23 // #
+                            || c == 0x3C // <
+                            || c == 0x3E // >
+                            || c == 0x5B // [
+                            || c == 0x5D // ]
+                            || c == 0x5E // ^
+                            || c == 0x60 // `
+                            || c == 0x7B // {
+                            || c == 0x7C // |
+                            || c == 0x7D // }
+                            || c == 0x7F // DEL
+            ) {
+                escapedCharacters[c] = true;
+            } else {
+                LOG.debug("Character {} ({}) not handled as escaped or unescaped", c, (char) c);
+            }
+        }
+    }
+
+    private static boolean isAlphaNumeric(int c) {
+        return (0x41 <= c && c <= 0x5A) || (0x61 <= c && c <= 0x7A) || (0x30 <= c && c <= 0x39);
+    }
+
+    private static boolean isHexCharacter(int c) {
+        return (0x41 <= c && c <= 0x46) || (0x61 <= c && c <= 0x66) || (0x30 <= c && c <= 0x39);
+    }
+
+    private static boolean isAscii(String str) {
+        char[] chars = str.toCharArray();
+        for (char c : chars) {
+            if (c > 127) {
+                return false;
+            }
+        }
+        return true;
+    }
+
    @Override
    public String filter(String urlString) {

@ -100,6 +151,7 @@ public class BasicURLNormalizer extends URLFilter {
        String file = url.getFile();

        boolean changed = false;
+        boolean normalizePath = false;

        if (!urlString.startsWith(protocol)) // protocol was lowercased
            changed = true;
@ -107,8 +159,13 @@ public class BasicURLNormalizer extends URLFilter {
        if ("http".equals(protocol) || "https".equals(protocol) || "ftp".equals(protocol)) {

            if (host != null && url.getAuthority() != null) {
-                String newHost = host.toLowerCase(Locale.ROOT); // lowercase
-                                                                // host
+                String newHost;
+                try {
+                    newHost = normalizeHostName(host);
+                } catch (IllegalArgumentException | IndexOutOfBoundsException e) {
+                    LOG.info("Invalid hostname: {}", host, e);
+                    return null;
+                }
                if (!host.equals(newHost)) {
                    host = newHost;
                    changed = true;
@ -127,30 +184,22 @@ public class BasicURLNormalizer extends URLFilter {
                changed = true;
            }

+            normalizePath = true;
            if (file == null || "".equals(file)) { // add a slash
                file = "/";
                changed = true;
+                normalizePath = false; // no further path normalization required
            } else if (!file.startsWith("/")) {
-              file = "/" + file;
-              changed = true;
+                file = "/" + file;
+                changed = true;
+                normalizePath = false; // no further path normalization required
            }

            if (url.getRef() != null) { // remove the ref
                changed = true;
            }
-
-            // check for unnecessary use of "/../", "/./", and "//"
-            String file2 = null;
-            try {
-                file2 = getFileWithNormalizedPath(url);
-            } catch (MalformedURLException e) {
-                LOG.info("Malformed URL {}", url);
-                return null;
-            }
-            if (!file.equals(file2)) {
-                changed = true;
-                file = file2;
-            }
+        } else if (protocol.equals("file")) {
+            normalizePath = true;
        }

        // properly encode characters in path/file using percent-encoding
@ -161,11 +210,28 @@ public class BasicURLNormalizer extends URLFilter {
            file = file2;
        }

+        if (normalizePath) {
+            // check for unnecessary use of "/../", "/./", and "//"
+            try {
+                if (changed) {
+                    url = new URL(protocol, host, port, file);
+                }
+                file2 = getFileWithNormalizedPath(url);
+                if (!file.equals(file2)) {
+                    changed = true;
+                    file = file2;
+                }
+            } catch (MalformedURLException e) {
+                LOG.info("Malformed URL {}://{}{}{}", protocol, host, (port == -1 ? "" : ":" + port), file);
+                return null;
+            }
+        }
+
        if (changed)
            try {
                urlString = new URL(protocol, host, port, file).toString();
            } catch (MalformedURLException e) {
-                LOG.info("Malformed URL {}{}{}{}", protocol, host, port, file);
+                LOG.info("Malformed URL {}://{}{}{}", protocol, host, (port == -1 ? "" : ":" + port), file);
                return null;
            }

@ -183,7 +249,7 @@ public class BasicURLNormalizer extends URLFilter {
                // URI.normalize() does not normalize leading dot segments,
                // see also http://tools.ietf.org/html/rfc3986#section-5.2.4
                int start = 0;
-                while (file.startsWith("/../", start)) {
+                // Skip every leading "/.." segment. The character following
+                // the segment must be looked up relative to `start`: checking
+                // the fixed index 3 is only correct for the first iteration
+                // and would strip "/.." from e.g. "/../..foo".
+                while (file.startsWith("/..", start) && ((start + 3) == file.length() || file.charAt(start + 3) == '/')) {
                     start += 3;
                 }
                if (start > 0) {
@ -208,8 +274,8 @@ public class BasicURLNormalizer extends URLFilter {

    /**
     * Remove % encoding from path segment in URL for characters which should be
-     * unescaped according to <a
-     * href="https://tools.ietf.org/html/rfc3986#section-2.2">RFC3986</a>.
+     * unescaped according to
+     * <a href="https://tools.ietf.org/html/rfc3986#section-2.2">RFC3986</a>.
     */
    private String unescapePath(String path) {
        StringBuilder sb = new StringBuilder();
@ -230,7 +296,7 @@ public class BasicURLNormalizer extends URLFilter {

            if (letter < 128 && unescapedCharacters[letter]) {
                // character should be unescaped in URLs
-                sb.append(new Character((char) letter));
+                sb.append(Character.valueOf((char) letter));
            } else {
                // Append the encoded character as uppercase
                sb.append(matcher.group().toUpperCase(Locale.ROOT));
@ -246,22 +312,23 @@ public class BasicURLNormalizer extends URLFilter {
            sb.append(path.substring(end + 1, letter));
        }

-        // Ok!
        return sb.toString();
    }

    /**
     * Convert path segment of URL from Unicode to UTF-8 and escape all
-     * characters which should be escaped according to <a
-     * href="https://tools.ietf.org/html/rfc3986#section-2.2">RFC3986</a>..
+     * characters which should be escaped according to
+     * <a href="https://tools.ietf.org/html/rfc3986#section-2.2">RFC3986</a>.
     */
    private String escapePath(String path) {
        StringBuilder sb = new StringBuilder(path.length());

        // Traverse over all bytes in this URL
-        for (byte b : path.getBytes(UTF_8)) {
+        byte[] bytes = path.getBytes(UTF_8);
+        for (int i = 0; i < bytes.length; i++) {
+            byte b = bytes[i];
            // Is this a control character?
-            if (b < 33 || b == 91 || b == 93) {
+            if (b < 0 || escapedCharacters[b]) {
                // Start escape sequence
                sb.append('%');

@ -276,6 +343,25 @@ public class BasicURLNormalizer extends URLFilter {
                    // No, append this hexadecimal representation
                    sb.append(hex);
                }
+            } else if (b == 0x25) {
+                // percent sign (%): read ahead to check whether it starts a
+                // valid escape sequence
+                if ((i + 2) >= bytes.length) {
+                    // need at least two more characters
+                    sb.append("%25");
+                } else {
+                    byte e1 = bytes[i + 1];
+                    byte e2 = bytes[i + 2];
+                    if (isHexCharacter(e1) && isHexCharacter(e2)) {
+                        // valid percent encoding, output and fast-forward
+                        i += 2;
+                        sb.append((char) b);
+                        sb.append((char) e1);
+                        sb.append((char) e2);
+                    } else {
+                        sb.append("%25");
+                    }
+                }
            } else {
                // No, just append this character as-is
                sb.append((char) b);
@ -285,6 +371,35 @@ public class BasicURLNormalizer extends URLFilter {
        return sb.toString();
    }

+    private String normalizeHostName(String host) throws IllegalArgumentException, IndexOutOfBoundsException {
+
+        /* 1. lowercase host name */
+        host = host.toLowerCase(Locale.ROOT);
+
+        /*
+         * 2. convert between Unicode and ASCII forms for Internationalized
+         * Domain Names (IDNs)
+         */
+        if (!isAscii(host)) {
+            /*
+             * IllegalArgumentException: thrown if the input string contains
+             * non-convertible Unicode codepoints
+             * 
+             * IndexOutOfBoundsException: thrown (undocumented) if one "label"
+             * (non-ASCII dot-separated segment) is longer than 256 characters,
+             * cf. https://bugs.openjdk.java.net/browse/JDK-6806873
+             */
+            host = IDN.toASCII(host);
+        }
+
+        /* 3. trim a trailing dot */
+        if (host.endsWith(".")) {
+            host = host.substring(0, host.length() - 1);
+        }
+
+        return host;
+    }
+
    public static void main(String args[]) throws IOException {
        BasicURLNormalizer normalizer = new BasicURLNormalizer();
        String line, normUrl;
--- a/src/test/resources/normalizer/weirdToNormalizedUrls.csv
+++ b/src/test/resources/normalizer/weirdToNormalizedUrls.csv
@ -1,7 +1,5 @@
 # Weird URL, Normalized URL

-# testNUTCH1098
-# -------------
 # check that % encoding is normalized
 http://foo.com/%66oo.html, http://foo.com/foo.html

@ -34,17 +32,15 @@ http://foo.com/file.html%23cz, http://foo.com/file.html%23cz
 http://foo.com/fast/dir%2fcz, http://foo.com/fast/dir%2Fcz

 # check that control chars are encoded
-#http://foo.com/\u001a!, http://foo.com/%1A!
+http://foo.com/!, http://foo.com/%1A!

 # check that control chars are always encoded into 2 digits
-#http://foo.com/\u0001!, http://foo.com/%01!
+http://foo.com/!, http://foo.com/%01!

-# check encoding of spanish chars
-#http://mydomain.com/en Espa\u00F1ol.aspx, http://mydomain.com/en%20Espa%C3%B1ol.aspx
+# encoding of Spanish chars
+http://mydomain.com/en Español.aspx, http://mydomain.com/en%20Espa%C3%B1ol.aspx


-# testNUTCH2064
-# -------------
 # Ampersand and colon and other punctuation characters are not to be unescaped
 http://x.com/s?q=a%26b&m=10, http://x.com/s?q=a%26b&m=10
 http://x.com/show?http%3A%2F%2Fx.com%2Fb, http://x.com/show?http%3A%2F%2Fx.com%2Fb
@ -53,9 +49,11 @@ http://google.com/search?q=c%2B%2B, http://google.com/search?q=c%2B%2B
 # do also not touch the query part which is application/x-www-form-urlencoded
 http://x.com/s?q=a+b, http://x.com/s?q=a+b

-# and keep Internationalized domain names http://bücher.de/ may be http://xn--bcher-kva.de/
-# but definitely not http://b%C3%BCcher.de/
-http://b\u00fccher.de/, http://b\u00fccher.de/
+# convert Internationalized Domain Names (IDNs) from Unicode to Punycode #248
+# (definitely do not apply percent-encoding: http://b%C3%BCcher.de/)
+http://bücher.de/, http://xn--bcher-kva.de/
+http://êxample.com, http://xn--xample-hva.com/
+https://нэб.рф/, https://xn--90ax2c.xn--p1ai/

 # test whether percent-encoding works together with other normalizations
 http://x.com/./a/../%66.html, http://x.com/f.html
@ -64,7 +62,7 @@ http://x.com/./a/../%66.html, http://x.com/f.html
 http://x.com/?x[y]=1, http://x.com/?x%5By%5D=1

 # boundary test for first character outside the ASCII range (U+0080)
-#http://x.com/foo\u0080, http://x.com/foo%C2%80
+http://x.com/foo, http://x.com/foo%C2%80
 http://x.com/foo%c2%80, http://x.com/foo%C2%80


@ -119,7 +117,10 @@ http://foo.com/aa//bb/foo.html, http://foo.com/aa/bb/foo.html
 http://foo.com/aa/bb//foo.html, http://foo.com/aa/bb/foo.html
 http://foo.com//aa//bb//foo.html, http://foo.com/aa/bb/foo.html
 http://foo.com////aa////bb//foo.html, http://foo.com/aa/bb/foo.html
+http://foo.com////aa////bb////foo.html, http://foo.com/aa/bb/foo.html
 http://foo.com/aa?referer=http://bar.com, http://foo.com/aa?referer=http://bar.com
+# also normalize  /..  (already in the root directory)
+http://foo.com/.., http://foo.com/

 # check URLs without host (authority)
 file:///foo/bar.txt, file:///foo/bar.txt
@ -132,3 +133,36 @@ http:///////, http:/
 http://example.com?,http://example.com/?
 http://example.com?a=1,http://example.com/?a=1

+# normalizing percent escapes #263
+https://www.last.fm/music/Prefuse+73/_/90%+of+My+Mind+Is+With+You,https://www.last.fm/music/Prefuse+73/_/90%25+of+My+Mind+Is+With+You
+
+# escape curly braces properly
+http://foo.com/{{stuff}} , http://foo.com/%7B%7Bstuff%7D%7D
+
+# special characters in path/query
+"http://www.example.com/a/c/../b/search?q=foobar""", http://www.example.com/a/b/search?q=foobar%22
+http://www.example.com/a/c/../b/search?q=foobar%, http://www.example.com/a/b/search?q=foobar%25
+http://www.example.com/a/c/../b/search?q=foobar<, http://www.example.com/a/b/search?q=foobar%3C
+http://www.example.com/a/c/../b/search?q=foobar>, http://www.example.com/a/b/search?q=foobar%3E
+http://www.example.com/a/c/../b/search?q=foobar^, http://www.example.com/a/b/search?q=foobar%5E
+http://www.example.com/a/c/../b/search?q=foobar`, http://www.example.com/a/b/search?q=foobar%60
+http://www.example.com/a/c/../b/search?q=foobar|, http://www.example.com/a/b/search?q=foobar%7C
+
+# escape the percent sign if it starts an invalid escape sequence
+http://www.example.com/p%zz%77%v, http://www.example.com/p%25zzw%25v
+
+# boundary test: percent sign close to the end of string
+http://www.example.com/search?q=foobar%, http://www.example.com/search?q=foobar%25
+http://www.example.com/search?q=foobar%2, http://www.example.com/search?q=foobar%252
+http://www.example.com/search?q=foobar%25, http://www.example.com/search?q=foobar%25
+http://www.example.com/search?q=foobar%252, http://www.example.com/search?q=foobar%252
+
+# protocol to be lowercased
+HTTP://foo.com/, http://foo.com/
+
+# removal of trailing dot in hostname
+https://www.example.org./, https://www.example.org/
+
+# file:/ URLs
+file:/var/www/html/////./bar/index.html, file:/var/www/html/bar/index.html
+file:/var/www/html/foo/../bar/index.html, file:/var/www/html/bar/index.html