1
0
Fork 0
mirror of git://git.code.sf.net/p/zsh/code synced 2024-06-08 00:06:04 +02:00

22524: searchable parameter subscripts with multibyte chars,

a few other fixes
This commit is contained in:
Peter Stephenson 2006-06-27 16:28:45 +00:00
parent ec1274f0a3
commit f95a6a913c
3 changed files with 206 additions and 24 deletions

View File

@ -1,5 +1,8 @@
2006-06-27 Peter Stephenson <pws@csr.com>
* 22524: Src/params.c, Test/D07multibyte.ztst: searchable
subscripts with multibyte characters.
* 22522: Src/utils.c: infinite loop with invalid character
in mb_metastrlen().

View File

@ -934,11 +934,13 @@ isident(char *s)
* If supplied they are set to the length of the character before
* the index position and the one at the index position. If
* multibyte characters are not in use they are set to 1 for
* consistency.
* consistency. Note they aren't fully handled if a2 is non-zero,
* since they aren't needed.
*
* Returns a raw offset into the value from the start or end (i.e.
* after the arithmetic for Meta and possible multibyte characters has
* been taken into account).
* been taken into account). This actually gives the offset *after*
* the character in question; subtract *prevcharlen if necessary.
*/
/**/
@ -1178,16 +1180,23 @@ getarg(char **str, int *inv, Value v, int a2, zlong *w,
t += (lastcharlen = MB_METACHARLEN(t));
/* for consistency, keep any remainder off the end */
r = (zlong)(t - s) + nchars;
if (prevcharlen)
if (prevcharlen && !nchars /* ignore if off the end */)
*prevcharlen = lastcharlen;
if (nextcharlen && *t)
*nextcharlen = MB_METACHARLEN(t);
} else if (r == 0) {
if (prevcharlen)
*prevcharlen = 0;
if (nextcharlen && *s) {
MB_METACHARINIT();
*nextcharlen = MB_METACHARLEN(s);
}
} else {
zlong nchars = (zlong)MB_METASTRLEN(s) + r;
if (nchars < 0) {
/* invalid but keep index anyway */
r = nchars;
/* make sure this isn't valid as a raw pointer */
r -= (zlong)strlen(s);
} else {
MB_METACHARINIT();
for (t = s; nchars && *t; nchars--)
@ -1300,57 +1309,188 @@ getarg(char **str, int *inv, Value v, int a2, zlong *w,
}
return a2 ? -1 : 0;
} else {
/* Searching characters */
int slen;
d = getstrvalue(v);
if (!d || !*d)
return 0;
len = strlen(d);
/*
* beg and len are character counts, not raw offsets.
* Remember we need to return a raw offset.
*/
len = MB_METASTRLEN(d);
slen = strlen(d);
if (beg < 0)
beg += len;
MB_METACHARINIT();
if (beg >= 0 && beg < len) {
char *de = d + len;
char *de = d + slen;
if (a2) {
/*
* Second argument: we don't need to
* handle prevcharlen or nextcharlen, but
* we do need to handle characters appropriately.
*/
if (down) {
int nmatches = 0;
char *lastpos = NULL;
if (!hasbeg)
beg = len;
for (r = beg, t = d + beg; t >= d; r--, t--) {
/*
* See below: we have to move forward,
* but need to count from the end.
*/
for (t = d, r = 0; r <= beg; r++) {
sav = *t;
*t = '\0';
if (pattry(pprog, d)
&& !--num) {
*t = sav;
return r;
if (pattry(pprog, d)) {
nmatches++;
lastpos = t;
}
*t = sav;
if (t == de)
break;
t += MB_METACHARLEN(t);
}
} else
for (r = beg, t = d + beg; t <= de; r++, t++) {
if (nmatches >= num) {
if (num > 1) {
nmatches -= num;
MB_METACHARINIT();
for (t = d, r = 0; ; r++) {
sav = *t;
*t = '\0';
if (pattry(pprog, d) &&
nmatches-- == 0) {
lastpos = t;
*t = sav;
break;
}
*t = sav;
t += MB_METACHARLEN(t);
}
}
/* else lastpos is already OK */
return lastpos - d;
}
} else {
/*
* This handling of the b flag
* gives odd results, but this is the
* way it's always worked.
*/
for (t = d; beg && t <= de; beg--)
t += MB_METACHARLEN(t);
for (;;) {
sav = *t;
*t = '\0';
if (pattry(pprog, d) &&
!--num) {
if (pattry(pprog, d) && !--num) {
*t = sav;
return r;
/*
* This time, don't increment
* pointer, since it's already
* after everything we matched.
*/
return t - d;
}
*t = sav;
if (t == de)
break;
t += MB_METACHARLEN(t);
}
}
} else {
/*
* First argument: this is the only case
* where we need prevcharlen and nextcharlen.
*/
int lastcharlen;
if (down) {
int nmatches = 0;
char *lastpos = NULL;
if (!hasbeg)
beg = len;
/*
* We can only move forward through
* multibyte strings, so record the
* matches.
* Unfortunately the count num works
* from the end, so it's easy to get the
* last one but we need to repeat if
* we want another one.
*/
for (t = d, r = 0; r <= beg; r++) {
if (pattry(pprog, t)) {
nmatches++;
lastpos = t;
}
if (t == de)
break;
t += MB_METACHARLEN(t);
}
if (nmatches >= num) {
if (num > 1) {
/*
* Need to start again and repeat
* to get the right match.
*/
nmatches -= num;
MB_METACHARINIT();
for (t = d, r = 0; ; r++) {
if (pattry(pprog, t) &&
nmatches-- == 0) {
lastpos = t;
break;
}
t += MB_METACHARLEN(t);
}
}
/* else lastpos is already OK */
/* return pointer after matched char */
lastpos +=
(lastcharlen = MB_METACHARLEN(lastpos));
if (prevcharlen)
*prevcharlen = lastcharlen;
if (nextcharlen)
*nextcharlen = MB_METACHARLEN(lastpos);
return lastpos - d;
}
for (r = beg + 1, t = d + beg; t >= d; r--, t--) {
if (pattry(pprog, t) &&
!--num)
return r;
}
} else
for (r = beg + 1, t = d + beg; t <= de; r++, t++)
if (pattry(pprog, t) &&
!--num)
return r;
} else {
for (t = d; beg && t <= de; beg--)
t += MB_METACHARLEN(t);
for (;;) {
if (pattry(pprog, t) && !--num) {
/* return pointer after matched char */
t += (lastcharlen = MB_METACHARLEN(t));
if (prevcharlen)
*prevcharlen = lastcharlen;
if (nextcharlen)
*nextcharlen = MB_METACHARLEN(t);
return t - d;
}
if (t == de)
break;
t += MB_METACHARLEN(t);
}
}
}
}
return down ? 0 : len + 1;
return down ? 0 : slen + 1;
}
}
}
@ -1429,9 +1569,12 @@ getindex(char **pptr, Value v, int dq)
}
}
/* if start was too big, keep the difference */
start = nstart + (target - p) + startprevlen;
start = nstart + (target - p) + 1;
} else {
zlong startoff = start + strlen(t);
#ifdef DEBUG
dputs("BUG: can't have negative inverse offsets???");
#endif
if (startoff < 0) {
/* invalid: keep index but don't dereference */
start = startoff;

View File

@ -82,6 +82,42 @@
>x
>9 9 x t
s=é
print A${s[-2]}A B${s[-1]}B C${s[0]}C D${s[1]}D E${s[2]}E
0:Out of range subscripts with multibyte characters
>AA BéB CéC DéD EE
print ${a[(i)é]} ${a[(I)é]} ${a[${a[(i)é]},${a[(I)é]}]}
0:Reverse indexing with multibyte characters
>2 4 éné
print ${a[(r)én,(r)éb]}
0:Subscript searching with multibyte characters
>énéb
print ${a[(rb:1:)é,-1]}
print ${a[(rb:2:)é,-1]}
print ${a[(rb:3:)é,-1]}
print ${a[(rb:4:)é,-1]}
print ${a[(rb:5:)é,-1]}
0:Subscript searching with initial offset
>énébreux
>énébreux
>ébreux
>ébreux
>
print ${a[(rn:1:)é,-1]}
print ${a[(rn:2:)é,-1]}
print ${a[(rn:3:)é,-1]}
0:Subscript searching with count
>énébreux
>ébreux
>
print ${a[(R)én,(R)éb]}
0:Backward subscript searching with multibyte characters
>énéb
# Starting offsets with (R) seem to be so strange as to be hardly
# worth testing.