1
0
Fork 0
mirror of https://github.com/crawler-commons/crawler-commons synced 2024-05-24 19:36:06 +02:00
crawler-commons/src/test/resources/normalizer/weirdToNormalizedUrls.csv

4.7 KiB

1# Weird URL, Normalized URL
2# testNUTCH1098
3# -------------
4# check that % encoding is normalized
5http://foo.com/%66oo.html, http://foo.com/foo.html
6# check that % encoding works correctly at end of URL
7http://foo.com/%66oo.htm%6c, http://foo.com/foo.html
8http://foo.com/%66oo.ht%6dl, http://foo.com/foo.html
9# check that % decoder does not overlap strings
10http://foo.com/%66oo.ht%6d%6c, http://foo.com/foo.html
11# check that % decoder leaves high bit chars alone
12http://foo.com/%66oo.htm%C0, http://foo.com/foo.htm%C0
13# check that % decoder leaves control chars alone
14http://foo.com/%66oo.htm%1A, http://foo.com/foo.htm%1A
15# check that % decoder converts to upper case letters
16http://foo.com/%66oo.htm%c0, http://foo.com/foo.htm%C0
17# check that % decoder leaves encoded spaces alone
18http://foo.com/you%20too.html, http://foo.com/you%20too.html
19# check that spaces are encoded into %20
20http://foo.com/you too.html, http://foo.com/you%20too.html
21# check that encoded # are not decoded
22http://foo.com/file.html%23cz, http://foo.com/file.html%23cz
23# check that encoded / are not decoded
24http://foo.com/fast/dir%2fcz, http://foo.com/fast/dir%2Fcz
25# check that control chars are encoded
26#http://foo.com/\u001a!, http://foo.com/%1A!
27# check that control chars are always encoded into 2 digits
28#http://foo.com/\u0001!, http://foo.com/%01!
29# check encoding of spanish chars
30#http://mydomain.com/en Espa\u00F1ol.aspx, http://mydomain.com/en%20Espa%C3%B1ol.aspx
31# testNUTCH2064
32# -------------
33# Ampersand and colon and other punctuation characters are not to be unescaped
34http://x.com/s?q=a%26b&m=10, http://x.com/s?q=a%26b&m=10
35http://x.com/show?http%3A%2F%2Fx.com%2Fb, http://x.com/show?http%3A%2F%2Fx.com%2Fb
36http://google.com/search?q=c%2B%2B, http://google.com/search?q=c%2B%2B
37# also do not touch the query part, which is application/x-www-form-urlencoded
38http://x.com/s?q=a+b, http://x.com/s?q=a+b
39# and keep internationalized domain names: http://bücher.de/ may become http://xn--bcher-kva.de/
40# but definitely not http://b%C3%BCcher.de/
41http://b\u00fccher.de/, http://b\u00fccher.de/
42# test whether percent-encoding works together with other normalizations
43http://x.com/./a/../%66.html, http://x.com/f.html
44# [ and ] need escaping as well
45http://x.com/?x[y]=1, http://x.com/?x%5By%5D=1
46# boundary test for first character outside the ASCII range (U+0080)
47#http://x.com/foo\u0080, http://x.com/foo%C2%80
48http://x.com/foo%c2%80, http://x.com/foo%C2%80
49# testNormalizer
50# --------------
51# check that leading and trailing spaces are removed
52http://foo.com/ , http://foo.com/
53# check that protocol is lower cased
54HTTP://foo.com/, http://foo.com/
55# check that host is lower cased
56http://Foo.Com/index.html, http://foo.com/index.html
57http://Foo.Com/index.html, http://foo.com/index.html
58# check that port number is normalized
59http://foo.com:80/index.html, http://foo.com/index.html
60http://foo.com:81/, http://foo.com:81/
61# check that empty port is removed
62http://example.com:/, http://example.com/
63https://example.com:/foobar.html, https://example.com/foobar.html
64# check that null path is normalized
65http://foo.com, http://foo.com/
66# check that references are removed
67http://foo.com/foo.html#ref, http://foo.com/foo.html
68# check that encoding is normalized
69http://foo.com/%66oo.html, http://foo.com/foo.html
70# check that unnecessary ../ are removed
71http://foo.com/aa/./foo.html, http://foo.com/aa/foo.html
72http://foo.com/aa/../, http://foo.com/
73http://foo.com/aa/bb/../, http://foo.com/aa/
74http://foo.com/aa/.., http://foo.com/
75http://foo.com/aa/bb/cc/../../foo.html, http://foo.com/aa/foo.html
76http://foo.com/aa/bb/../cc/dd/../ee/foo.html, http://foo.com/aa/cc/ee/foo.html
77http://foo.com/../foo.html, http://foo.com/foo.html
78http://foo.com/../../foo.html, http://foo.com/foo.html
79http://foo.com/../aa/../foo.html, http://foo.com/foo.html
80http://foo.com/aa/../../foo.html, http://foo.com/foo.html
81http://foo.com/aa/../bb/../foo.html/../../, http://foo.com/
82http://foo.com/../aa/foo.html, http://foo.com/aa/foo.html
83http://foo.com/../aa/../foo.html, http://foo.com/foo.html
84http://foo.com/a..a/foo.html, http://foo.com/a..a/foo.html
85http://foo.com/a..a/../foo.html, http://foo.com/foo.html
86http://foo.com/foo.foo/../foo.html, http://foo.com/foo.html
87http://foo.com//aa/bb/foo.html, http://foo.com/aa/bb/foo.html
88http://foo.com/aa//bb/foo.html, http://foo.com/aa/bb/foo.html
89http://foo.com/aa/bb//foo.html, http://foo.com/aa/bb/foo.html
90http://foo.com//aa//bb//foo.html, http://foo.com/aa/bb/foo.html
91http://foo.com////aa////bb//foo.html, http://foo.com/aa/bb/foo.html
92http://foo.com/aa?referer=http://bar.com, http://foo.com/aa?referer=http://bar.com
93# check URLs without host (authority)
94file:///foo/bar.txt, file:///foo/bar.txt
95ftp:/, ftp:/
96http:, http:/
97http:////, http:/
98http:///////, http:/