# zsh/Test/D07multibyte.ztst

%prep

# Find a UTF-8 locale.
#
# LANG is set to each candidate locale in turn until one is found in which
# a single non-ASCII character is matched by the one-character pattern "?".
# If no candidate passes, ZTST_unimplemented is set with an explanatory
# message; otherwise the chosen locale is reported on $ZTST_fd and a
# scratch directory, multibyte.tmp, is created and entered.
  setopt multibyte
# Don't let LC_* override our choice of locale.
  unset -m LC_\*
  mb_ok=
# Candidates: a few common names first, then every UTF-8 locale that
# "locale -a" reports.  The reported names are used exactly as printed
# (the codeset may be spelled "utf8" or "UTF-8"): rewriting the spelling
# can produce a name the system does not accept, so the filter matches
# either form and leaves the name alone.  Errors from a missing "locale"
# command are discarded, which leaves just the fixed names.
  langs=(en_US.UTF-8 en_GB.UTF-8 en.UTF-8
	 $(locale -a 2>/dev/null | grep -i -e 'utf-8' -e 'utf8'))
  for LANG in $langs; do
    # The pattern only matches if é is treated as one character under
    # the LANG value being tried.
    if [[ é = ? ]]; then
      mb_ok=1
      break;
    fi
  done
  if [[ -z $mb_ok ]]; then
    ZTST_unimplemented="no UTF-8 locale or multibyte mode is not implemented"
  else
    print -u $ZTST_fd Testing multibyte with locale $LANG
    mkdir multibyte.tmp && cd multibyte.tmp
  fi

%test

  # For every start index i (1..9) of the nine-character string, print the
  # single character at i; then for each end index j from i to 9 print the
  # slice counted from the front (a[i,j]) next to the mirror-image slice
  # counted from the back (a[-j,-i]).  Two of the characters are é, so the
  # expected output only lines up if subscripts count whole characters.
  # $a is subscripted again by later chunks in this file.
  a=ténébreux
  for i in {1..9}; do
      print ${a[i]}
      for j in {$i..9}; do
	  print $i $j ${a[i,j]} ${a[-j,-i]}
      done
  done
0:Basic indexing with multibyte characters
>t
>1 1 t x
>1 2 té ux
>1 3 tén eux
>1 4 téné reux
>1 5 ténéb breux
>1 6 ténébr ébreux
>1 7 ténébre nébreux
>1 8 ténébreu énébreux
>1 9 ténébreux ténébreux
>é
>2 2 é u
>2 3 én eu
>2 4 éné reu
>2 5 énéb breu
>2 6 énébr ébreu
>2 7 énébre nébreu
>2 8 énébreu énébreu
>2 9 énébreux ténébreu
>n
>3 3 n e
>3 4 né re
>3 5 néb bre
>3 6 nébr ébre
>3 7 nébre nébre
>3 8 nébreu énébre
>3 9 nébreux ténébre
>é
>4 4 é r
>4 5 éb br
>4 6 ébr ébr
>4 7 ébre nébr
>4 8 ébreu énébr
>4 9 ébreux ténébr
>b
>5 5 b b
>5 6 br éb
>5 7 bre néb
>5 8 breu énéb
>5 9 breux ténéb
>r
>6 6 r é
>6 7 re né
>6 8 reu éné
>6 9 reux téné
>e
>7 7 e n
>7 8 eu én
>7 9 eux tén
>u
>8 8 u é
>8 9 ux té
>x
>9 9 x t

  # Subscript a one-character string at -2, -1, 0, 1 and 2.  Each result is
  # wrapped in a pair of marker letters so that an empty expansion is still
  # visible; only -1 and 1 are expected to yield the character.
  s=é
  print A${s[-2]}A B${s[-1]}B C${s[0]}C D${s[1]}D E${s[2]}E
0:Out of range subscripts with multibyte characters
>AA BéB CC DéD EE

  # (i) and (I) are expected to give 2 and 4, the positions of the first
  # and last é in $a (ténébreux, set in an earlier chunk).  The third word
  # feeds those two results back in as the ends of a range.
  print ${a[(i)é]} ${a[(I)é]} ${a[${a[(i)é]},${a[(I)é]}]}
0:Reverse indexing with multibyte characters
>2 4 éné

  # Both ends of the range are given as (r) pattern searches instead of
  # numbers.  The expected slice starts at the match of én and runs up to
  # and including the match of éb.
  print ${a[(r)én,(r)éb]}
0:Subscript searching with multibyte characters
>énéb

  # Search $a for é with the b argument set to 1 through 5 (the initial
  # offset of the test title), slicing from the match to the end of the
  # string.  Expected: 1 and 2 find the é at position 2, 3 and 4 find the
  # é at position 4, and 5 produces an empty result.
  print ${a[(rb:1:)é,-1]}
  print ${a[(rb:2:)é,-1]}
  print ${a[(rb:3:)é,-1]}
  print ${a[(rb:4:)é,-1]}
  print ${a[(rb:5:)é,-1]}
0:Subscript searching with initial offset
>énébreux
>énébreux
>ébreux
>ébreux
>

  # Search $a for é with the n argument set to 1, 2 and 3 (the count of
  # the test title), slicing from the match to the end of the string.
  # Expected: the slice starts at the first é, then at the second é, and
  # is empty for 3 because $a contains only two.
  print ${a[(rn:1:)é,-1]}
  print ${a[(rn:2:)é,-1]}
  print ${a[(rn:3:)é,-1]}
0:Subscript searching with count
>énébreux
>ébreux
>

  # Same range expression as the (r) search test, but with (R) at both
  # ends; per the test title this is the backward search.  The expected
  # slice is the same, énéb.
  print ${a[(R)én,(R)éb]}
0:Backward subscript searching with multibyte characters
>énéb

# Starting offsets with (R) seem to be so strange as to be hardly
# worth testing.

  # Match $a against a pattern with two parenthesised groups under (#b),
  # then for each entry of $match print the captured text, its $mbegin and
  # $mend values, and the slice of $a between those two values.  The slice
  # should reproduce the captured text, so the offsets must be usable as
  # character subscripts.  A failed match is reported on stderr.
  # extendedglob is enabled first (presumably because (#b) needs it);
  # nothing in this chunk turns it off again.
  setopt extendedglob
  [[ $a = (#b)t(én)(éb)reux ]] || print "Failed to match." >&2
  for i in {1..${#match}}; do
    print $match[i] $mbegin[i] $mend[i] ${a[$mbegin[i],$mend[i]]}
  done
0:Multibyte offsets in pattern tests
>én 2 3 én
>éb 4 5 éb

  # Upper-case $a with (U), lower-case the result again with (L), and
  # apply (C) to two sentences.  The expected (C) output has a capital at
  # the start of each space-separated word and also directly after an
  # apostrophe (L'Inconsolé, C'Est), including the accented É.
  b=${(U)a}
  print $b
  print ${(L)b}
  desdichado="Je suis le $a, le veuf, l'inconsolé"
  print ${(C)desdichado}
  lxiv="l'état c'est moi"
  print ${(C)lxiv}
0:Case modification of multibyte strings
>TÉNÉBREUX
>ténébreux
>Je Suis Le Ténébreux, Le Veuf, L'Inconsolé
>L'État C'Est Moi

  # The same three flags applied to an array whose words start with, or
  # contain, non-ASCII letters.  (C) is applied both to the lower-case
  # original and to the upper-cased copy; the expected result is the same
  # for both.
  array=(ølaf ødd øpened án encyclopædia)
  barray=(${(U)array})
  print $barray
  print ${(L)barray}
  print ${(C)array}
  print ${(C)barray}
0:Case modification of arrays with multibyte strings
>ØLAF ØDD ØPENED ÁN ENCYCLOPÆDIA
>ølaf ødd øpened án encyclopædia
>Ølaf Ødd Øpened Án Encyclopædia
>Ølaf Ødd Øpened Án Encyclopædia

  # In arithmetic, ## is followed by a literal character and # by the name
  # of a parameter holding one character.  The expected values are the
  # Unicode code points: 165 for ¥, 163 for £ and 945 for α, with both
  # forms agreeing for α.
  print $(( ##¥ ))
  pound=£
  print $(( #pound ))
  alpha=α
  print $(( ##α )) $(( #alpha ))
0:Conversion to Unicode in mathematical expressions
>165
>163
>945 945

  # Evaluate the same assign-then-print string twice.  With
  # posix_identifiers unset, hähä=3 is expected to act as an assignment
  # and 3 is printed.  With the option set, hähä=3 is expected to be
  # reported as a command that was not found, after which the "|| exit 1"
  # in the string ends the evaluation.  The second eval runs in a
  # subshell, presumably so that this exit does not leave the current
  # shell; its status 1 is the expected status of the whole chunk.
  unsetopt posix_identifiers
  expr='hähä=3 || exit 1; print $hähä'
  eval $expr
  setopt posix_identifiers
  (eval $expr)
1:POSIX_IDENTIFIERS option
>3
?(eval):1: command not found: hähä=3

  # Split one string on the non-ASCII separator « with the (s) flag, split
  # a Greek sentence into words with ${=...}, and count its words with
  # (w)#.  Expected: five fields, then seventeen words one per line, then
  # the number 17.
  foo="Ølaf«Ødd«øpénëd«ån«àpple"
  print -l ${(s.«.)foo}
  ioh="Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος."
  print -l ${=ioh}
  print ${(w)#ioh}
0:Splitting with multibyte characters
>Ølaf
>Ødd
>øpénëd
>ån
>àpple
>Ἐν
>ἀρχῇ
>ἦν
>ὁ
>λόγος,
>καὶ
>ὁ
>λόγος
>ἦν
>πρὸς
>τὸν
>θεόν,
>καὶ
>θεὸς
>ἦν
>ὁ
>λόγος.
>17

  # Read two records from the chunk's standard input, each terminated by
  # the non-ASCII delimiter £ given to read -d, and print them.  The
  # input supplied is first£second£.
  read -d £ one
  read -d £ two
  print $one
  print $two
0:read with multibyte delimiter
<first£second£
>first
>second

  # Inside a subshell, set IFS to « and read one record, terminated by »,
  # into an array with read -A.  The expected output is the three fields
  # before the »; the input text after it should not appear.
  (IFS=«
  read -d » -A array
  print -l $array)
0:read -A with multibyte IFS
<dominus«illuminatio«mea»ignored
>dominus
>illuminatio
>mea

  # Read with -k2 from file descriptor 0.  The expected result is the two
  # whole characters «», so the count has to be taken in characters; the
  # remaining input should not appear in the output.
  read -k2 -u0 twochars
  print $twochars
0:read multibyte characters
<«»ignored
>«»

  # See if the system grokks first-century Greek...
  #
  # Walk the sentence one character at a time.  Characters matching
  # [[:lower:]] are skipped silently.  For every other character the
  # classes upper, space and punct are tried in that order and the
  # position is printed with the first class that matches; the word
  # "invalid" is the fallback, printed when none of the three matches.
  ioh="Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος."
  for (( i = 1; i <= ${#ioh}; i++ )); do
    # FC3 doesn't recognise ῇ (U+1FC7: Greek small letter eta with
    # perispomeni and ypogegrammeni, of course) as a lower case character.
    # Position 7 is therefore always skipped, whatever its class.
    if [[ $ioh[i] != [[:lower:]] && $i != 7 ]]; then
      for tp in upper space punct invalid; do
        if [[ $tp = invalid || $ioh[i] = [[:${tp}:]] ]]; then
          print "$i: $tp"
	  break
	fi
      done
    fi
  done
0:isw* functions on non-ASCII wide characters
>1: upper
>3: space
>8: space
>11: space
>13: space
>19: punct
>20: space
>24: space
>26: space
>32: space
>35: space
>40: space
>44: space
>49: punct
>50: space
>54: space
>59: space
>62: space
>64: space
>70: punct

  # Exercise the #, ##, % and %% operators on a Greek sentence.  This
  # copy of the sentence has no final full stop, so it ends in a letter.
  # The first four expansions use the pattern [[:alpha:]]##: going by the
  # expected output, # and % remove a single letter from the start or end
  # and ## and %% remove the whole first or last word.
  # The last four add the (S) flag with the patterns λ*ς and θ*ς; the
  # expected output shows the removed text lying in the middle of the
  # string, with the doubled operators removing the longer stretch.
  ioh="Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος"
  print ${ioh#[[:alpha:]]##}
  print ${ioh##[[:alpha:]]##}
  print ${ioh%[[:alpha:]]##}
  print ${ioh%%[[:alpha:]]##}
  print ${(S)ioh#λ*ς}
  print ${(S)ioh##λ*ς}
  print ${(S)ioh%θ*ς}
  print ${(S)ioh%%θ*ς}
0:Parameter #, ##, %, %% with multibyte characters
>ν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος
> ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος
>Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγο
>Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ
>Ἐν ἀρχῇ ἦν ὁ , καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος
>Ἐν ἀρχῇ ἦν ὁ
>Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ  ἦν ὁ λόγος
>Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ

  # Apply the l and r padding flags together to every element of an array
  # of Greek words.  The first three lines use widths 3+3, 4+2 and 5+1
  # with the single-character fill strings ¥ and £; the last two lines
  # use 4+4 and give each flag a second string argument as well.  The
  # expected output shows elements being truncated as well as padded.
  foo=(κατέβην χθὲς εἰς Πειραιᾶ)
  print ${(l.3..¥.r.3..£.)foo}
  print ${(l.4..¥.r.2..£.)foo}
  print ${(l.5..¥.r.1..£.)foo}
  print ${(l.4..¥..«.r.4..£..».)foo}
  print ${(l.4..¥..Σωκράτης.r.4..£..Γλαύκωνος.)foo}
0:simultaneous left and right padding
>κατέβη ¥χθὲς£ ¥¥εἰς£ Πειραι
>¥κατέβ ¥¥χθὲς ¥¥¥εἰς ¥Πειρα
>¥¥κατέ ¥¥¥χθὲ ¥¥¥¥εἰ ¥¥Πειρ
>«κατέβην ¥«χθὲς»£ ¥¥«εἰς»£ «Πειραιᾶ
>ςκατέβην ηςχθὲςΓλ τηςεἰςΓλ ςΠειραιᾶ
# er... yeah, that looks right...

  # Chain two :s substitution modifiers, using £ and then ¥ as the
  # delimiter between the parts of each.  The first turns bar into rod
  # and the second turns rod into stick, giving picostickn.
  foo=picobarn
  print ${foo:s£bar£rod£:s¥rod¥stick¥}
0:Delimiters in modifiers
>picostickn

# TODO: if we get paired multibyte bracket delimiters to work
# (as Emacs does, the smug so-and-so), the following should change.
#
# Use a non-ASCII character as the delimiter around the arguments of the
# r and l padding flags.  The same character both opens and closes each
# argument, « included.  Expected: bar padded on the right with X to
# width 5, and bar padded on the left to width 10 with HI next to the
# word and Y filling the rest.
  foo=bar
  print ${(r£5££X£)foo}
  print ${(l«10««Y««HI«)foo}
0:Delimiters in parameter flags
>barXX
>YYYYYHIbar

  # Precision 3 should keep the first three characters (főo) and width 4
  # should then add one leading space.  ő is non-ASCII, so the expected
  # output requires both numbers to count characters.
  printf "%4.3s\n" főobar
0:Multibyte characters in printf widths
> főo

# We ask for case-insensitive sorting here (and supply upper case
# characters) so that we exercise the logic in the shell that lowers the
# case of the string for case-insensitive sorting.
#
# The sort is run twice: once in the locale chosen in %prep, and once in
# a subshell with LC_ALL=C.  In the C locale the expected order puts both
# accented words after all the unaccented ones.
# NOTE(review): the first expected line presumably depends on the
# collation rules of whichever UTF-8 locale was picked -- confirm that
# the relative order of É and È is the same on all target platforms.
  print -oi HAH HUH HEH HÉH HÈH
  (LC_ALL=C; print -oi HAH HUH HEH HÉH HÈH)
0:Multibyte characters in print sorting
>HAH HEH HÉH HÈH HUH
>HAH HEH HUH HÈH HÉH

# These are control characters in Unicode, so don't show up.
# We just want to check they're not being treated as tokens.
#
# For each number from 128 to 150, print the (#) expansion of it on its
# own line; a reader loop on the other side of the pipe prints the length
# of each line it reads and the value of $(( #line )).  Every line is
# expected to have length 1 and to give back the original number.
  for x in {128..150}; do
     print ${(#)x}
  done | while read line; do
    print ${#line} $(( #line ))
  done
0:evaluated character number with multibyte characters
>1 128
>1 129
>1 130
>1 131
>1 132
>1 133
>1 134
>1 135
>1 136
>1 137
>1 138
>1 139
>1 140
>1 141
>1 142
>1 143
>1 144
>1 145
>1 146
>1 147
>1 148
>1 149
>1 150

  # Create six files in the current directory whose names differ only in
  # an embedded number, then list them with the glob ngs* twice: first
  # with numericglobsort set (expected order 1, 2, 10, 20, 100, 200) and
  # then with it unset (expected order 100, 10, 1, 200, 20, 2).
  touch ngs1.txt ngs2.txt ngs10.txt ngs20.txt ngs100.txt ngs200.txt
  setopt numericglobsort
  print -l ngs*
  unsetopt numericglobsort
  print -l ngs*
0:NUMERIC_GLOB_SORT option in UTF-8 locale
>ngs1.txt
>ngs2.txt
>ngs10.txt
>ngs20.txt
>ngs100.txt
>ngs200.txt
>ngs100.txt
>ngs10.txt
>ngs1.txt
>ngs200.txt
>ngs20.txt
>ngs2.txt

# Not strictly multibyte, but gives us a well-defined locale for testing.
#
# Build a string from ASCII letters separated by the bytes 0xc0, 0x07
# and 0x7f, then print it through the (q) flag.  The letters are expected
# to come out unchanged and each of the three bytes as its own $'...'
# group: \300, \a and \177.  (The lone 0xc0 is presumably the "invalid"
# character of the test title.)
  foo=$'X\xc0Y\x07Z\x7fT'
  print -r ${(q)foo}
0:Backslash-quoting of unprintable/invalid characters uses $'...'
>X$'\300'Y$'\a'Z$'\177'T