mirror of
https://github.com/crawler-commons/crawler-commons
synced 2024-05-24 19:36:06 +02:00
4.7 KiB
4.7 KiB
1 | # Weird URL, Normalized URL |
---|---|
2 | # testNUTCH1098 |
3 | # ------------- |
4 | # check that % encoding is normalized |
5 | http://foo.com/%66oo.html, http://foo.com/foo.html |
6 | # check that % encoding works correctly at end of URL |
7 | http://foo.com/%66oo.htm%6c, http://foo.com/foo.html |
8 | http://foo.com/%66oo.ht%6dl, http://foo.com/foo.html |
9 | # check that the % decoder does not overlap adjacent escape sequences |
10 | http://foo.com/%66oo.ht%6d%6c, http://foo.com/foo.html |
11 | # check that % decoder leaves high bit chars alone |
12 | http://foo.com/%66oo.htm%C0, http://foo.com/foo.htm%C0 |
13 | # check that % decoder leaves control chars alone |
14 | http://foo.com/%66oo.htm%1A, http://foo.com/foo.htm%1A |
15 | # check that % decoder converts to upper case letters |
16 | http://foo.com/%66oo.htm%c0, http://foo.com/foo.htm%C0 |
17 | # check that % decoder leaves encoded spaces alone |
18 | http://foo.com/you%20too.html, http://foo.com/you%20too.html |
19 | # check that spaces are encoded into %20 |
20 | http://foo.com/you too.html, http://foo.com/you%20too.html |
21 | # check that encoded # are not decoded |
22 | http://foo.com/file.html%23cz, http://foo.com/file.html%23cz |
23 | # check that encoded / are not decoded |
24 | http://foo.com/fast/dir%2fcz, http://foo.com/fast/dir%2Fcz |
25 | # check that control chars are encoded |
26 | #http://foo.com/\u001a!, http://foo.com/%1A! |
27 | # check that control chars are always encoded into 2 digits |
28 | #http://foo.com/\u0001!, http://foo.com/%01! |
29 | # check encoding of spanish chars |
30 | #http://mydomain.com/en Espa\u00F1ol.aspx, http://mydomain.com/en%20Espa%C3%B1ol.aspx |
31 | # testNUTCH2064 |
32 | # ------------- |
33 | # Ampersand and colon and other punctuation characters are not to be unescaped |
34 | http://x.com/s?q=a%26b&m=10, http://x.com/s?q=a%26b&m=10 |
35 | http://x.com/show?http%3A%2F%2Fx.com%2Fb, http://x.com/show?http%3A%2F%2Fx.com%2Fb |
36 | http://google.com/search?q=c%2B%2B, http://google.com/search?q=c%2B%2B |
37 | # also do not touch the query part, which is application/x-www-form-urlencoded |
38 | http://x.com/s?q=a+b, http://x.com/s?q=a+b |
39 | # and keep internationalized domain names: http://bücher.de/ may become http://xn--bcher-kva.de/ |
40 | # but definitely not http://b%C3%BCcher.de/ |
41 | http://b\u00fccher.de/, http://b\u00fccher.de/ |
42 | # test whether percent-encoding works together with other normalizations |
43 | http://x.com/./a/../%66.html, http://x.com/f.html |
44 | # [ and ] need escaping as well |
45 | http://x.com/?x[y]=1, http://x.com/?x%5By%5D=1 |
46 | # boundary test for first character outside the ASCII range (U+0080) |
47 | #http://x.com/foo\u0080, http://x.com/foo%C2%80 |
48 | http://x.com/foo%c2%80, http://x.com/foo%C2%80 |
49 | # testNormalizer |
50 | # -------------- |
51 | # check that leading and trailing spaces are removed |
52 | http://foo.com/ , http://foo.com/ |
53 | # check that protocol is lower cased |
54 | HTTP://foo.com/, http://foo.com/ |
55 | # check that host is lower cased |
56 | http://Foo.Com/index.html, http://foo.com/index.html |
57 | http://Foo.Com/index.html, http://foo.com/index.html |
58 | # check that port number is normalized |
59 | http://foo.com:80/index.html, http://foo.com/index.html |
60 | http://foo.com:81/, http://foo.com:81/ |
61 | # check that empty port is removed |
62 | http://example.com:/, http://example.com/ |
63 | https://example.com:/foobar.html, https://example.com/foobar.html |
64 | # check that null path is normalized |
65 | http://foo.com, http://foo.com/ |
66 | # check that references are removed |
67 | http://foo.com/foo.html#ref, http://foo.com/foo.html |
68 | # check that encoding is normalized |
69 | http://foo.com/%66oo.html, http://foo.com/foo.html |
70 | # check that unnecessary ./ and ../ segments and duplicate slashes are removed |
71 | http://foo.com/aa/./foo.html, http://foo.com/aa/foo.html |
72 | http://foo.com/aa/../, http://foo.com/ |
73 | http://foo.com/aa/bb/../, http://foo.com/aa/ |
74 | http://foo.com/aa/.., http://foo.com/ |
75 | http://foo.com/aa/bb/cc/../../foo.html, http://foo.com/aa/foo.html |
76 | http://foo.com/aa/bb/../cc/dd/../ee/foo.html, http://foo.com/aa/cc/ee/foo.html |
77 | http://foo.com/../foo.html, http://foo.com/foo.html |
78 | http://foo.com/../../foo.html, http://foo.com/foo.html |
79 | http://foo.com/../aa/../foo.html, http://foo.com/foo.html |
80 | http://foo.com/aa/../../foo.html, http://foo.com/foo.html |
81 | http://foo.com/aa/../bb/../foo.html/../../, http://foo.com/ |
82 | http://foo.com/../aa/foo.html, http://foo.com/aa/foo.html |
83 | http://foo.com/../aa/../foo.html, http://foo.com/foo.html |
84 | http://foo.com/a..a/foo.html, http://foo.com/a..a/foo.html |
85 | http://foo.com/a..a/../foo.html, http://foo.com/foo.html |
86 | http://foo.com/foo.foo/../foo.html, http://foo.com/foo.html |
87 | http://foo.com//aa/bb/foo.html, http://foo.com/aa/bb/foo.html |
88 | http://foo.com/aa//bb/foo.html, http://foo.com/aa/bb/foo.html |
89 | http://foo.com/aa/bb//foo.html, http://foo.com/aa/bb/foo.html |
90 | http://foo.com//aa//bb//foo.html, http://foo.com/aa/bb/foo.html |
91 | http://foo.com////aa////bb//foo.html, http://foo.com/aa/bb/foo.html |
92 | http://foo.com/aa?referer=http://bar.com, http://foo.com/aa?referer=http://bar.com |
93 | # check URLs without host (authority) |
94 | file:///foo/bar.txt, file:///foo/bar.txt |
95 | ftp:/, ftp:/ |
96 | http:, http:/ |
97 | http:////, http:/ |
98 | http:///////, http:/ |