1
0
Fork 0
mirror of git://git.code.sf.net/p/zsh/code synced 2024-06-01 04:46:08 +02:00
zsh/Test/D07multibyte.ztst

387 lines
8.9 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

%prep
# Preparation: select a locale in which the shell treats a non-ASCII
# character as a single character.  mb_ok records whether one was found.
# Find a UTF-8 locale.
setopt multibyte
# Don't let LC_* override our choice of locale.
unset -m LC_\*
mb_ok=
# Candidates: three fixed names, followed by whatever `locale -a` lists
# (its "utf8" spelling rewritten to "UTF-8"); errors from locale are dropped.
langs=(en_US.UTF-8 en_GB.UTF-8 en.UTF-8
$(locale -a 2>/dev/null | sed -e 's/utf8/UTF-8/' | grep UTF-8))
# LANG itself is the loop variable, so each candidate takes effect as it
# is tried and the successful one stays set after the break.
for LANG in $langs; do
# The pattern ? matches exactly one character, so this only succeeds
# when é is handled as one character.
if [[ é = ? ]]; then
mb_ok=1
break;
fi
done
if [[ -z $mb_ok ]]; then
# No usable locale: leave a message in ZTST_unimplemented (presumably
# read by the test harness to skip this file -- confirm).
ZTST_unimplemented="no UTF-8 locale or multibyte mode is not implemented"
else
# Report the chosen locale on the descriptor number held in $ZTST_fd and
# run the tests from a newly created scratch directory.
print -u $ZTST_fd Testing multibyte with locale $LANG
mkdir multibyte.tmp && cd multibyte.tmp
fi
%test
# For every start index i, print the single character a[i]; then, for
# every end index j >= i, print the forward slice a[i,j] next to the
# slice a[-j,-i] counted from the end of the string.  The expected output
# treats each é as one character, so the word has indices 1..9.
# Note that $a is reused by several later tests.
a=ténébreux
for i in {1..9}; do
print ${a[i]}
for j in {$i..9}; do
print $i $j ${a[i,j]} ${a[-j,-i]}
done
done
0:Basic indexing with multibyte characters
>t
>1 1 t x
>1 2 té ux
>1 3 tén eux
>1 4 téné reux
>1 5 ténéb breux
>1 6 ténébr ébreux
>1 7 ténébre nébreux
>1 8 ténébreu énébreux
>1 9 ténébreux ténébreux
>é
>2 2 é u
>2 3 én eu
>2 4 éné reu
>2 5 énéb breu
>2 6 énébr ébreu
>2 7 énébre nébreu
>2 8 énébreu énébreu
>2 9 énébreux ténébreu
>n
>3 3 n e
>3 4 né re
>3 5 néb bre
>3 6 nébr ébre
>3 7 nébre nébre
>3 8 nébreu énébre
>3 9 nébreux ténébre
>é
>4 4 é r
>4 5 éb br
>4 6 ébr ébr
>4 7 ébre nébr
>4 8 ébreu énébr
>4 9 ébreux ténébr
>b
>5 5 b b
>5 6 br éb
>5 7 bre néb
>5 8 breu énéb
>5 9 breux ténéb
>r
>6 6 r é
>6 7 re né
>6 8 reu éné
>6 9 reux téné
>e
>7 7 e n
>7 8 eu én
>7 9 eux tén
>u
>8 8 u é
>8 9 ux té
>x
>9 9 x t
# s holds exactly one character.  Per the expected output, only the
# subscripts -1 and 1 yield it; -2, 0 and 2 expand to nothing.
s=é
print A${s[-2]}A B${s[-1]}B C${s[0]}C D${s[1]}D E${s[2]}E
0:Out of range subscripts with multibyte characters
>AA BéB CC DéD EE
# (i) and (I) give the character index of the first and the last é in $a
# (2 and 4 in the expected output); the third word uses those two indices
# as the bounds of a slice.
print ${a[(i)é]} ${a[(I)é]} ${a[${a[(i)é]},${a[(I)é]}]}
0:Reverse indexing with multibyte characters
>2 4 éné
# Both slice bounds are written as (r) pattern searches instead of
# numbers.  Per the expected output the slice runs from the start of the
# "én" match to the end of the "éb" match.
print ${a[(r)én,(r)éb]}
0:Subscript searching with multibyte characters
>énéb
# b:N: makes the search for é begin at character position N.  Each line
# prints from the match that was found up to the end of the string; for
# N=5 no é remains, and the expected output is an empty line.
print ${a[(rb:1:)é,-1]}
print ${a[(rb:2:)é,-1]}
print ${a[(rb:3:)é,-1]}
print ${a[(rb:4:)é,-1]}
print ${a[(rb:5:)é,-1]}
0:Subscript searching with initial offset
>énébreux
>énébreux
>ébreux
>ébreux
>
# n:N: selects the Nth match of é as the start of the slice.  $a contains
# only two é characters, so N=3 is expected to print an empty line.
print ${a[(rn:1:)é,-1]}
print ${a[(rn:2:)é,-1]}
print ${a[(rn:3:)é,-1]}
0:Subscript searching with count
>énébreux
>ébreux
>
# Both slice bounds use the (R) subscript flag (search starting from the
# end).  The expected result for this input is "énéb".
print ${a[(R)én,(R)éb]}
0:Backward subscript searching with multibyte characters
>énéb
# Starting offsets with (R) seem to be so strange as to be hardly
# worth testing.
# With extendedglob set, (#b) makes the parenthesised groups of the
# pattern record what they matched in $match and where they matched in
# $mbegin / $mend.  The loop prints every group together with its
# recorded positions and re-slices $a with them; the expected positions
# (2 3 and 4 5) count characters rather than bytes.
# A failed match is reported on standard error, which would make the
# test fail because no error output is expected.
setopt extendedglob
[[ $a = (#b)t(én)(éb)reux ]] || print "Failed to match." >&2
for i in {1..${#match}}; do
print $match[i] $mbegin[i] $mend[i] ${a[$mbegin[i],$mend[i]]}
done
0:Multibyte offsets in pattern tests
>én 2 3 én
>éb 4 5 éb
# (U) upper-cases, (L) lower-cases and (C) capitalises each word.  The
# expected output shows that accented letters are converted as well, and
# that the letter after an apostrophe is capitalised like the start of
# a word (L'Inconsolé, L'État, C'Est).
b=${(U)a}
print $b
print ${(L)b}
desdichado="Je suis le $a, le veuf, l'inconsolé"
print ${(C)desdichado}
lxiv="l'état c'est moi"
print ${(C)lxiv}
0:Case modification of multibyte strings
>TÉNÉBREUX
>ténébreux
>Je Suis Le Ténébreux, Le Veuf, L'Inconsolé
>L'État C'Est Moi
# The same (U), (L) and (C) flags applied to arrays: every element is
# converted.  (C) is applied to both the lower-case and the upper-case
# array; the expected output is identical for the two, i.e. the letters
# after the first are lowered.
array=(ølaf ødd øpened án encyclopædia)
barray=(${(U)array})
print $barray
print ${(L)barray}
print ${(C)array}
print ${(C)barray}
0:Case modification of arrays with multibyte strings
>ØLAF ØDD ØPENED ÁN ENCYCLOPÆDIA
>ølaf ødd øpened án encyclopædia
>Ølaf Ødd Øpened Án Encyclopædia
>Ølaf Ødd Øpened Án Encyclopædia
# In arithmetic, ##c gives the numeric code of the literal character c,
# and #name gives the code of the first character of parameter name.
# The expected values (165, 163, 945) are the Unicode code points of
# ¥, £ and α.
print $(( ##¥ ))
pound=£
print $(( #pound ))
alpha=α
print $(( ##α )) $(( #alpha ))
0:Conversion to Unicode in mathematical expressions
>165
>163
>945 945
# The same command string is evaluated twice.  With posix_identifiers
# unset, "hähä=3" is an assignment and the value 3 is printed.  With the
# option set, the non-ASCII name is not accepted as an identifier, so
# the word is looked up as a command, fails with "command not found",
# and "|| exit 1" produces the expected status 1.  The second eval runs
# in a subshell so that the exit does not leave the test shell itself.
unsetopt posix_identifiers
expr='hähä=3 || exit 1; print $hähä'
eval $expr
setopt posix_identifiers
(eval $expr)
1:POSIX_IDENTIFIERS option
>3
?(eval):1: command not found: hähä=3
# Three kinds of splitting on strings containing multibyte characters:
#  - (s.«.) splits $foo at each « separator;
#  - ${=ioh} applies word splitting to the Greek sentence;
#  - ${(w)#ioh} counts its words (17 in the expected output).
# print -l writes one resulting word per line.
foo="Ølaf«Ødd«øpénëd«ån«àpple"
print -l ${(s.«.)foo}
ioh="Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος."
print -l ${=ioh}
print ${(w)#ioh}
0:Splitting with multibyte characters
>Ølaf
>Ødd
>øpénëd
>ån
>àpple
>Ἐν
>ἀρχῇ
>ἦν
>ὁ
>λόγος,
>καὶ
>ὁ
>λόγος
>ἦν
>πρὸς
>τὸν
>θεόν,
>καὶ
>θεὸς
>ἦν
>ὁ
>λόγος.
>17
# read -d takes the multibyte character £ as the input terminator, so
# two successive reads pick up the two £-terminated fields supplied on
# the test's standard input.
read -d £ one
read -d £ two
print $one
print $two
0:read with multibyte delimiter
<first£second£
>first
>second
# Runs in a subshell, so the IFS change does not affect later tests.
# Input is read up to the » terminator and split on « into the array;
# per the expected output, the text after » is not part of the result.
(IFS=«
read -d » -A array
print -l $array)
0:read -A with multibyte IFS
<dominus«illuminatio«mea»ignored
>dominus
>illuminatio
>mea
# -k2 reads two characters and -u0 takes them from file descriptor 0.
# The expected output shows that two whole multibyte characters («»)
# are read, not two bytes.
read -k2 -u0 twochars
print $twochars
0:read multibyte characters
<«»ignored
>«»
# See if the system groks first-century Greek...
# Walk through the sentence one character at a time.  Lower-case
# characters are skipped; every other character is reported with the
# first class it matches among upper, space and punct, or as "invalid"
# if it matches none of them.
ioh="Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος."
for (( i = 1; i <= ${#ioh}; i++ )); do
# FC3 doesn't recognise ῇ (U+1FC7: Greek small letter eta with
# perispomeni and ypogegrammeni, of course) as a lower case character.
# That character is at position 7, hence the explicit exclusion below.
if [[ $ioh[i] != [[:lower:]] && $i != 7 ]]; then
for tp in upper space punct invalid; do
# "invalid" is the last candidate and always matches, acting as the
# fallback when no real character class applied.
if [[ $tp = invalid || $ioh[i] = [[:${tp}:]] ]]; then
print "$i: $tp"
break
fi
done
fi
done
0:isw* functions on non-ASCII wide characters
>1: upper
>3: space
>8: space
>11: space
>13: space
>19: punct
>20: space
>24: space
>26: space
>32: space
>35: space
>40: space
>44: space
>49: punct
>50: space
>54: space
>59: space
>62: space
>64: space
>70: punct
# Pattern removal on a multibyte string.  This copy of the sentence has
# no final full stop, so it ends in a letter.
#  - # and ## strip the shortest / longest match from the start;
#  - % and %% strip the shortest / longest match from the end;
#  - with (S) the pattern may match a substring instead of being
#    anchored at an end of the value.
# [[:alpha:]]## means one or more alphabetic characters (extendedglob
# was switched on by an earlier test in this file).
ioh="Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος"
print ${ioh#[[:alpha:]]##}
print ${ioh##[[:alpha:]]##}
print ${ioh%[[:alpha:]]##}
print ${ioh%%[[:alpha:]]##}
print ${(S)ioh#λ*ς}
print ${(S)ioh##λ*ς}
print ${(S)ioh%θ*ς}
print ${(S)ioh%%θ*ς}
0:Parameter #, ##, %, %% with multibyte characters
>ν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος
> ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος
>Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγο
>Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ
>Ἐν ἀρχῇ ἦν ὁ , καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος
>Ἐν ἀρχῇ ἦν ὁ
>Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ ἦν ὁ λόγος
>Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ
# Each array element is padded on the left (l) and on the right (r) in
# the same expansion.  The first argument of each flag is the width and
# the second the fill string.  In the last two lines a third argument is
# given, which per the expected output appears once, next to the value.
# The expected output counts widths in characters and truncates
# elements that are too long.
foo=(κατέβην χθὲς εἰς Πειραιᾶ)
print ${(l.3..¥.r.3..£.)foo}
print ${(l.4..¥.r.2..£.)foo}
print ${(l.5..¥.r.1..£.)foo}
print ${(l.4..¥..«.r.4..£..».)foo}
print ${(l.4..¥..Σωκράτης.r.4..£..Γλαύκωνος.)foo}
0:simultaneous left and right padding
>κατέβη ¥χθὲς£ ¥¥εἰς£ Πειραι
>¥κατέβ ¥¥χθὲς ¥¥¥εἰς ¥Πειρα
>¥¥κατέ ¥¥¥χθὲ ¥¥¥¥εἰ ¥¥Πειρ
>«κατέβην ¥«χθὲς»£ ¥¥«εἰς»£ «Πειραιᾶ
>ςκατέβην ηςχθὲςΓλ τηςεἰςΓλ ςΠειραιᾶ
# er... yeah, that looks right...
# The :s substitution modifier is used twice in a row, each time with a
# multibyte character (£, then ¥) as the delimiter instead of the usual
# slash: bar -> rod, then rod -> stick.
foo=picobarn
print ${foo:s£bar£rod£:s¥rod¥stick¥}
0:Delimiters in modifiers
>picostickn
# TODO: if we get paired multibyte bracket delimiters to work
# (as Emacs does, the smug so-and-so), the following should change.
# For now « is used as an ordinary delimiter that is simply repeated,
# not as the opening half of a « » pair.
# The r flag pads on the right to width 5 with X, delimited by £; the
# l flag pads on the left to width 10 with Y, with HI placed once next
# to the value, delimited by «.
foo=bar
print ${(r£5££X£)foo}
print ${(l«10««Y««HI«)foo}
0:Delimiters in parameter flags
>barXX
>YYYYYHIbar
# %4.3s keeps at most three characters and pads to a field width of
# four.  The expected output (one space followed by "főo") shows both
# numbers are applied in characters, not bytes.
printf "%4.3s\n" főobar
0:Multibyte characters in printf widths
> főo
# We ask for case-insensitive sorting here (and supply upper case
# characters) so that we exercise the logic in the shell that lowers the
# case of the string for case-insensitive sorting.
# The same words are sorted twice: once in the locale chosen during
# preparation, and once in a subshell with LC_ALL=C.  The expected
# output places the accented words differently in the two cases.
print -oi HAH HUH HEH HÉH HÈH
(LC_ALL=C; print -oi HAH HUH HEH HÉH HÈH)
0:Multibyte characters in print sorting
>HAH HEH HÉH HÈH HUH
>HAH HEH HUH HÈH HÉH
# These are control characters in Unicode, so don't show up.
# We just want to check they're not being treated as tokens.
# ${(#)x} turns the number x into the character with that code.  Each
# character is written on its own line and read back by the loop, which
# prints its length and the code of its first character; the expected
# output is length 1 and the original number for every value.
for x in {128..150}; do
print ${(#)x}
done | while read line; do
print ${#line} $(( #line ))
done
0:evaluated character number with multibyte characters
>1 128
>1 129
>1 130
>1 131
>1 132
>1 133
>1 134
>1 135
>1 136
>1 137
>1 138
>1 139
>1 140
>1 141
>1 142
>1 143
>1 144
>1 145
>1 146
>1 147
>1 148
>1 149
>1 150
# Create six files whose names differ only in an embedded number and
# list them with the same glob twice.  With numericglobsort set, the
# expected order is by numeric value (1, 2, 10, 20, 100, 200); with it
# unset, the expected order is the plain string ordering shown in the
# second half of the expected output.
touch ngs1.txt ngs2.txt ngs10.txt ngs20.txt ngs100.txt ngs200.txt
setopt numericglobsort
print -l ngs*
unsetopt numericglobsort
print -l ngs*
0:NUMERIC_GLOB_SORT option in UTF-8 locale
>ngs1.txt
>ngs2.txt
>ngs10.txt
>ngs20.txt
>ngs100.txt
>ngs200.txt
>ngs100.txt
>ngs10.txt
>ngs1.txt
>ngs200.txt
>ngs20.txt
>ngs2.txt
# Not strictly multibyte, but gives us a well-defined locale for testing.
# foo mixes plain letters with the bytes 0xc0, 0x07 and 0x7f.  The (q)
# flag is expected to leave the letters alone and to render each of
# those bytes as its own $'...' string.  print -r stops print from
# interpreting the backslashes in the quoted result.
foo=$'X\xc0Y\x07Z\x7fT'
print -r ${(q)foo}
0:Backslash-quoting of unprintable/invalid characters uses $'...'
>X$'\300'Y$'\a'Z$'\177'T