mirror of
https://github.com/crawler-commons/crawler-commons
synced 2024-05-18 01:56:06 +02:00
e5563c3049
closes #308 - add unit test to prove that an empty query is removed
9.2 KiB
9.2 KiB
1 | # Weird URL | Normalized URL |
---|---|---|
2 | http://foo.com/%66oo.html | http://foo.com/foo.html |
3 | http://foo.com/%66oo.htm%6c | http://foo.com/foo.html |
4 | http://foo.com/%66oo.ht%6dl | http://foo.com/foo.html |
5 | http://foo.com/%66oo.ht%6d%6c | http://foo.com/foo.html |
6 | http://foo.com/%66oo.htm%C0 | http://foo.com/foo.htm%C0 |
7 | http://foo.com/%66oo.htm%1A | http://foo.com/foo.htm%1A |
8 | http://foo.com/%66oo.htm%c0 | http://foo.com/foo.htm%C0 |
9 | https://www.example.com/search/%2a/ | https://www.example.com/search/%2A/ |
10 | https://www.example.com/topic/9%2f11/ | https://www.example.com/topic/9%2F11/ |
11 | http://foo.com/you%20too.html | http://foo.com/you%20too.html |
12 | http://foo.com/you too.html | http://foo.com/you%20too.html |
13 | http://foo.com/file.html%23cz | http://foo.com/file.html%23cz |
14 | http://foo.com/fast/dir%2fcz | http://foo.com/fast/dir%2Fcz |
15 | http://foo.com/! | http://foo.com/%1A! |
16 | http://foo.com/! | http://foo.com/%01! |
17 | http://mydomain.com/en Español.aspx | http://mydomain.com/en%20Espa%C3%B1ol.aspx |
18 | http://x.com/s?m=10&q=a%26b | http://x.com/s?m=10&q=a%26b |
19 | http://x.com/show?http%3A%2F%2Fx.com%2Fb | http://x.com/show?http%3A%2F%2Fx.com%2Fb |
20 | http://google.com/search?q=c%2B%2B | http://google.com/search?q=c%2B%2B |
21 | http://x.com/s?q=a+b | http://x.com/s?q=a+b |
22 | http://bücher.de/ | http://xn--bcher-kva.de/ |
23 | http://êxample.com | http://xn--xample-hva.com/ |
24 | https://нэб.рф/ | https://xn--90ax2c.xn--p1ai/ |
25 | https://www.0251-sachverst%c3%a4ndiger.de/ | https://www.xn--0251-sachverstndiger-ozb.de/ |
26 | http://x.com/./a/../%66.html | http://x.com/f.html |
27 | http://x.com/?x[y]=1 | http://x.com/?x%5By%5D=1 |
28 | http://x.com/foo | http://x.com/foo%C2%80 |
29 | http://x.com/foo%c2%80 | http://x.com/foo%C2%80 |
30 | http://foo.com/ | http://foo.com/ |
31 | http://foo.com/ | http://foo.com/ |
32 | http://Foo.Com/index.html | http://foo.com/index.html |
33 | http://Foo.Com/index.html | http://foo.com/index.html |
34 | https://example%2Ecom/ | https://example.com/ |
35 | http://foo.com:80/index.html | http://foo.com/index.html |
36 | http://foo.com:81/ | http://foo.com:81/ |
37 | http://example.com:/ | http://example.com/ |
38 | https://example.com:/foobar.html | https://example.com/foobar.html |
39 | http://foo.com | http://foo.com/ |
40 | http://foo.com/foo.html#ref | http://foo.com/foo.html |
41 | http://foo.com/%66oo.html | http://foo.com/foo.html |
42 | http://foo.com/aa/./foo.html | http://foo.com/aa/foo.html |
43 | http://foo.com/aa/../ | http://foo.com/ |
44 | http://foo.com/aa/bb/../ | http://foo.com/aa/ |
45 | http://foo.com/aa/.. | http://foo.com/ |
46 | http://foo.com/aa/bb/cc/../../foo.html | http://foo.com/aa/foo.html |
47 | http://foo.com/aa/bb/../cc/dd/../ee/foo.html | http://foo.com/aa/cc/ee/foo.html |
48 | http://foo.com/../foo.html | http://foo.com/foo.html |
49 | http://foo.com/../../foo.html | http://foo.com/foo.html |
50 | http://foo.com/../aa/../foo.html | http://foo.com/foo.html |
51 | http://foo.com/aa/../../foo.html | http://foo.com/foo.html |
52 | http://foo.com/aa/../bb/../foo.html/../../ | http://foo.com/ |
53 | http://foo.com/../aa/foo.html | http://foo.com/aa/foo.html |
54 | http://foo.com/../aa/../foo.html | http://foo.com/foo.html |
55 | http://foo.com/a..a/foo.html | http://foo.com/a..a/foo.html |
56 | http://foo.com/a..a/../foo.html | http://foo.com/foo.html |
57 | http://foo.com/foo.foo/../foo.html | http://foo.com/foo.html |
58 | http://foo.com//aa/bb/foo.html | http://foo.com/aa/bb/foo.html |
59 | http://foo.com/aa//bb/foo.html | http://foo.com/aa/bb/foo.html |
60 | http://foo.com/aa/bb//foo.html | http://foo.com/aa/bb/foo.html |
61 | http://foo.com//aa//bb//foo.html | http://foo.com/aa/bb/foo.html |
62 | http://foo.com////aa////bb//foo.html | http://foo.com/aa/bb/foo.html |
63 | http://foo.com////aa////bb////foo.html | http://foo.com/aa/bb/foo.html |
64 | http://foo.com/aa?referer=http://bar.com | http://foo.com/aa?referer=http://bar.com |
65 | http://foo.com/.. | http://foo.com/ |
66 | file:///foo/bar.txt | file:///foo/bar.txt |
67 | ftp:/ | ftp:/ |
68 | http: | http:/ |
69 | http://// | http:/ |
70 | http://///// | http:/ |
71 | http://example.com? | http://example.com/ |
72 | http://example.com?a=1 | http://example.com/?a=1 |
73 | http://example.com/? | http://example.com/ |
74 | https://www.last.fm/music/Prefuse+73/_/90%+of+My+Mind+Is+With+You | https://www.last.fm/music/Prefuse+73/_/90%25+of+My+Mind+Is+With+You |
75 | http://foo.com/{{stuff}} | http://foo.com/%7B%7Bstuff%7D%7D |
76 | http://www.example.com/a/c/../b/search?q=foobar" | http://www.example.com/a/b/search?q=foobar%22 |
77 | http://www.example.com/a/c/../b/search?q=foobar% | http://www.example.com/a/b/search?q=foobar%25 |
78 | http://www.example.com/a/c/../b/search?q=foobar< | http://www.example.com/a/b/search?q=foobar%3C |
79 | http://www.example.com/a/c/../b/search?q=foobar> | http://www.example.com/a/b/search?q=foobar%3E |
80 | http://www.example.com/a/c/../b/search?q=foobar^ | http://www.example.com/a/b/search?q=foobar%5E |
81 | http://www.example.com/a/c/../b/search?q=foobar` | http://www.example.com/a/b/search?q=foobar%60 |
82 | http://www.example.com/a/c/../b/search?q=foobar| | http://www.example.com/a/b/search?q=foobar%7C |
83 | http://www.example.com/p%zz%77%v | http://www.example.com/p%25zzw%25v |
84 | http://www.example.com/search?q=foobar% | http://www.example.com/search?q=foobar%25 |
85 | http://www.example.com/search?q=foobar%2 | http://www.example.com/search?q=foobar%252 |
86 | http://www.example.com/search?q=foobar%25 | http://www.example.com/search?q=foobar%25 |
87 | http://www.example.com/search?q=foobar%252 | http://www.example.com/search?q=foobar%252 |
88 | HTTP://foo.com/ | http://foo.com/ |
89 | # no protocol/scheme | see #271 |
90 | foo.com/index.html | http://foo.com/index.html |
91 | ftp://foo.com/index.html | ftp://foo.com/index.html |
92 | file:/path/index.html | file:/path/index.html |
93 | https://www.example.org./ | https://www.example.org/ |
94 | file:/var/www/html/////./bar/index.html | file:/var/www/html/bar/index.html |
95 | file:/var/www/html/foo/../bar/index.html | file:/var/www/html/bar/index.html |
96 | http://example.com/?b=1&a=1 | http://example.com/?a=1&b=1 |
97 | http://foo.com/foo.html?b=1&a=1 | http://foo.com/foo.html?a=1&b=1 |
98 | http://foo.com/index?a=1&b=2 | http://foo.com/index?a=1&b=2 |
99 | http://foo.com/index?b=2&a=1 | http://foo.com/index?a=1&b=2 |
100 | http://foo.com/index?b=2&a=1#c | http://foo.com/index?a=1&b=2 |
101 | https://foo.com/search?q=tl;dr | https://foo.com/search?q=tl;dr |
102 | http://foo.com/index?a=1&b | http://foo.com/index?a=1&b |
103 | http://foo.com/index?a=1&b= | http://foo.com/index?a=1&b |
104 | http://foo.com/index?a=1&b#c | http://foo.com/index?a=1&b |
105 | http://foo.com/index?b&a=1 | http://foo.com/index?a=1&b |
106 | http://foo.com/index?b=&a=1 | http://foo.com/index?a=1&b |
107 | http://foo.com/index?b=1&a=1& | http://foo.com/index?a=1&b=1 |
108 | http://foo.com/index?&b=1&a=1 | http://foo.com/index?a=1&b=1 |
109 | http://foo.com/index?&=1&a=1 | http://foo.com/index?a=1 |
110 | http://foo.com/index?=1&b=1&a=1 | http://foo.com/index?a=1&b=1 |
111 | http://example.com/? | http://example.com/ |
112 | https://foo.com/?one/valid_query/without_%2F_params | https://foo.com/?one/valid_query/without_%2F_params |
113 | http://foo.com/asdf/page.php?article%2F1234 | http://foo.com/asdf/page.php?article%2F1234 |
114 | https://www.example.com/path/file-with-a-*.html | https://www.example.com/path/file-with-a-*.html |
115 | https://www.example.com/path/foo-$ | https://www.example.com/path/foo-$ |