mirror of
git://git.code.sf.net/p/zsh/code
synced 2024-09-28 15:01:21 +02:00
22518: Initial go at making parameter subscripts
use multibyte characters.
This commit is contained in:
parent
bd50a3c516
commit
05bd0b2dd1
@ -1,5 +1,9 @@
|
||||
2006-06-26 Peter Stephenson <pws@csr.com>
|
||||
|
||||
* 22518: Src/params.c, Src/utils.c, Src/zsh.h,
|
||||
Test/B02typeset.ztst: Initial go at making parameter subscripts
|
||||
use multibyte characters.
|
||||
|
||||
* 22516: Src/parse.c: error evaluating "func()" didn't pop
|
||||
the command stack.
|
||||
|
||||
|
152
Src/params.c
152
Src/params.c
@ -918,9 +918,33 @@ isident(char *s)
|
||||
return !ss[1];
|
||||
}
|
||||
|
||||
/*
|
||||
* Parse a single argument to a parameter subscript.
|
||||
* The subscript starts at *str; *str is updated (input/output)
|
||||
*
|
||||
* *inv is set to indicate if the subscript is reversed (output)
|
||||
* v is the Value for the parameter being accessed (input; note
|
||||
* v->isarr may be modified, and if v is a hash the parameter will
|
||||
* be updated to the element of the hash)
|
||||
* a2 is 1 if this is the second subscript of a range (input)
|
||||
* *w is only set if we need to find the end of a word (input; should
|
||||
* be set to 0 by the caller).
|
||||
*
|
||||
* The final two arguments are to support multibyte characters.
|
||||
* If supplied they are set to the length of the character before
|
||||
* the index position and the one at the index position. If
|
||||
* multibyte characters are not in use they are set to 1 for
|
||||
* consistency.
|
||||
*
|
||||
* Returns a raw offset into the value from the start or end (i.e.
|
||||
* after the arithmetic for Meta and possible multibyte characters has
|
||||
* been taken into account).
|
||||
*/
|
||||
|
||||
/**/
|
||||
static zlong
|
||||
getarg(char **str, int *inv, Value v, int a2, zlong *w)
|
||||
getarg(char **str, int *inv, Value v, int a2, zlong *w,
|
||||
int *prevcharlen, int *nextcharlen)
|
||||
{
|
||||
int hasbeg = 0, word = 0, rev = 0, ind = 0, down = 0, l, i, ishash;
|
||||
int keymatch = 0, needtok = 0;
|
||||
@ -929,6 +953,10 @@ getarg(char **str, int *inv, Value v, int a2, zlong *w)
|
||||
Patprog pprog = NULL;
|
||||
|
||||
ishash = (v->pm && PM_TYPE(v->pm->node.flags) == PM_HASHED);
|
||||
if (prevcharlen)
|
||||
*prevcharlen = 1;
|
||||
if (nextcharlen)
|
||||
*nextcharlen = 1;
|
||||
|
||||
/* first parse any subscription flags */
|
||||
if (v->pm && (*s == '(' || *s == Inpar)) {
|
||||
@ -1133,17 +1161,43 @@ getarg(char **str, int *inv, Value v, int a2, zlong *w)
|
||||
|
||||
return (a2 ? s : d + 1) - t;
|
||||
} else if (!v->isarr && !word) {
|
||||
int lastcharlen = 1;
|
||||
s = getstrvalue(v);
|
||||
/*
|
||||
* Note for the confused (= pws): the index r we
|
||||
* have so far is that specified by the user. The value
|
||||
* passed back is an offset from the start or end of
|
||||
* the string. Hence it needs correcting at least
|
||||
* for Meta characters and maybe for multibyte characters.
|
||||
*/
|
||||
if (r > 0) {
|
||||
for (t = s + r - 1; *s && s < t;)
|
||||
if (*s++ == Meta)
|
||||
s++, t++, r++;
|
||||
zlong nchars = r;
|
||||
|
||||
MB_METACHARINIT();
|
||||
for (t = s; nchars && *t; nchars--)
|
||||
t += (lastcharlen = MB_METACHARLEN(t));
|
||||
/* for consistency, keep any remainder off the end */
|
||||
r = (zlong)(t - s) + nchars;
|
||||
if (prevcharlen)
|
||||
*prevcharlen = lastcharlen;
|
||||
if (nextcharlen && *t)
|
||||
*nextcharlen = MB_METACHARLEN(t);
|
||||
} else {
|
||||
r += ztrlen(s);
|
||||
for (t = s + r; *s && s < t; r--)
|
||||
if (*s++ == Meta)
|
||||
t++, r++;
|
||||
r -= strlen(s);
|
||||
zlong nchars = (zlong)MB_METASTRLEN(s) + r;
|
||||
|
||||
if (nchars < 0) {
|
||||
/* invalid but keep index anyway */
|
||||
r = nchars;
|
||||
} else {
|
||||
MB_METACHARINIT();
|
||||
for (t = s; nchars && *t; nchars--)
|
||||
t += (lastcharlen = MB_METACHARLEN(t));
|
||||
r = - (zlong)strlen(t); /* keep negative */
|
||||
if (prevcharlen)
|
||||
*prevcharlen = lastcharlen;
|
||||
if (nextcharlen && *t)
|
||||
*nextcharlen = MB_METACHARLEN(t);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
@ -1338,19 +1392,57 @@ getindex(char **pptr, Value v, int dq)
|
||||
s += 2;
|
||||
} else {
|
||||
zlong we = 0, dummy;
|
||||
int startprevlen, startnextlen;
|
||||
|
||||
start = getarg(&s, &inv, v, 0, &we);
|
||||
start = getarg(&s, &inv, v, 0, &we, &startprevlen, &startnextlen);
|
||||
|
||||
if (inv) {
|
||||
if (!v->isarr && start != 0) {
|
||||
char *t, *p;
|
||||
t = getstrvalue(v);
|
||||
/*
|
||||
* Note for the confused (= pws): this is an inverse
|
||||
* offset so at this stage we need to convert from
|
||||
* the immediate offset into the value that we have
|
||||
* into a logical character position.
|
||||
*/
|
||||
if (start > 0) {
|
||||
for (p = t + start - 1; p-- > t; )
|
||||
if (*p == Meta)
|
||||
start--;
|
||||
} else
|
||||
start = -ztrlen(t + start + strlen(t));
|
||||
int nstart = 0;
|
||||
char *target = t + start - startprevlen;
|
||||
|
||||
p = t;
|
||||
MB_METACHARINIT();
|
||||
while (*p) {
|
||||
/*
|
||||
* move up characters, counting how many we
|
||||
* found
|
||||
*/
|
||||
p += MB_METACHARLEN(p);
|
||||
if (p < target)
|
||||
nstart++;
|
||||
else {
|
||||
if (p == target)
|
||||
nstart++;
|
||||
else
|
||||
p = target; /* pretend we hit exactly */
|
||||
break;
|
||||
}
|
||||
}
|
||||
/* if start was too big, keep the difference */
|
||||
start = nstart + (target - p) + startprevlen;
|
||||
} else {
|
||||
zlong startoff = start + strlen(t);
|
||||
if (startoff < 0) {
|
||||
/* invalid: keep index but don't dereference */
|
||||
start = startoff;
|
||||
} else {
|
||||
/* find start in full characters */
|
||||
MB_METACHARINIT();
|
||||
for (p = t; p < t + startoff;)
|
||||
p += MB_METACHARLEN(p);
|
||||
start = - MB_METASTRLEN(p);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (start > 0 && (isset(KSHARRAYS) || (v->pm->node.flags & PM_HASHED)))
|
||||
start--;
|
||||
@ -1373,15 +1465,21 @@ getindex(char **pptr, Value v, int dq)
|
||||
|
||||
if ((com = (*s == ','))) {
|
||||
s++;
|
||||
end = getarg(&s, &inv, v, 1, &dummy);
|
||||
end = getarg(&s, &inv, v, 1, &dummy, NULL, NULL);
|
||||
} else {
|
||||
end = we ? we : start;
|
||||
}
|
||||
if (start != end) com = 1;
|
||||
if (start != end)
|
||||
com = 1;
|
||||
/*
|
||||
* Somehow the logic sometimes forces us to use the previous
|
||||
* or next character to what we would expect, which is
|
||||
* why we had to calculate them in getarg().
|
||||
*/
|
||||
if (start > 0)
|
||||
start--;
|
||||
start -= startprevlen;
|
||||
else if (start == 0 && end == 0)
|
||||
end++;
|
||||
end = startnextlen;
|
||||
if (s == tbrack) {
|
||||
s++;
|
||||
if (v->isarr && !com &&
|
||||
@ -1578,13 +1676,19 @@ getstrvalue(Value v)
|
||||
if (v->start < 0)
|
||||
v->start = 0;
|
||||
}
|
||||
if (v->end < 0)
|
||||
v->end += strlen(s) + 1;
|
||||
if (v->end < 0) {
|
||||
v->end += strlen(s);
|
||||
if (v->end >= 0) {
|
||||
char *eptr = s + v->end;
|
||||
if (*eptr)
|
||||
v->end += MB_METACHARLEN(eptr);
|
||||
}
|
||||
}
|
||||
s = (v->start > (int)strlen(s)) ? dupstring("") : dupstring(s + v->start);
|
||||
if (v->end <= v->start)
|
||||
s[0] = '\0';
|
||||
else if (v->end - v->start <= (int)strlen(s))
|
||||
s[v->end - v->start + (s[v->end - v->start - 1] == Meta)] = '\0';
|
||||
s[v->end - v->start] = '\0';
|
||||
|
||||
return s;
|
||||
}
|
||||
@ -2791,7 +2895,7 @@ char *
|
||||
tiedarrgetfn(Param pm)
|
||||
{
|
||||
struct tieddata *dptr = (struct tieddata *)pm->u.data;
|
||||
return *dptr->arrptr ? zjoin(*dptr->arrptr, dptr->joinchar, 1) : "";
|
||||
return *dptr->arrptr ? zjoin(*dptr->arrptr, STOUC(dptr->joinchar), 1) : "";
|
||||
}
|
||||
|
||||
/**/
|
||||
@ -3463,7 +3567,7 @@ arrfixenv(char *s, char **t)
|
||||
return;
|
||||
|
||||
if (pm->node.flags & PM_TIED)
|
||||
joinchar = ((struct tieddata *)pm->u.data)->joinchar;
|
||||
joinchar = STOUC(((struct tieddata *)pm->u.data)->joinchar);
|
||||
else
|
||||
joinchar = ':';
|
||||
|
||||
|
106
Src/utils.c
106
Src/utils.c
@ -3683,6 +3683,112 @@ mb_width(const char *s)
|
||||
return width;
|
||||
}
|
||||
|
||||
static mbstate_t mb_shiftstate;
|
||||
|
||||
/*
|
||||
* Initialise multibyte state: called before a sequence of
|
||||
* mb_metacharlen().
|
||||
*/
|
||||
|
||||
/**/
|
||||
void
|
||||
mb_metacharinit(void)
|
||||
{
|
||||
memset(&mb_shiftstate, 0, sizeof(mb_shiftstate));
|
||||
}
|
||||
|
||||
/*
|
||||
* Length of metafied string s which contains the next multibyte
|
||||
* character; single (possibly metafied) character if string is not null
|
||||
* but character is not valid (e.g. possibly incomplete at end of string).
|
||||
* Returned value is guaranteed not to reach beyond the end of the
|
||||
* string (assuming correct metafication).
|
||||
*/
|
||||
|
||||
/**/
|
||||
int
|
||||
mb_metacharlen(char *s)
|
||||
{
|
||||
char inchar, *ptr;
|
||||
size_t ret;
|
||||
wchar_t wc;
|
||||
|
||||
if (!isset(MULTIBYTE))
|
||||
return 1 + (*s == Meta);
|
||||
|
||||
ret = MB_INVALID;
|
||||
for (ptr = s; *ptr; ) {
|
||||
if (*ptr == Meta)
|
||||
inchar = *++ptr ^ 32;
|
||||
else
|
||||
inchar = *ptr;
|
||||
ptr++;
|
||||
ret = mbrtowc(&wc, &inchar, 1, &mb_shiftstate);
|
||||
|
||||
if (ret == MB_INVALID)
|
||||
break;
|
||||
if (ret == MB_INCOMPLETE)
|
||||
continue;
|
||||
return ptr - s;
|
||||
}
|
||||
|
||||
/* No valid multibyte sequence */
|
||||
memset(&mb_shiftstate, 0, sizeof(mb_shiftstate));
|
||||
if (ptr > s)
|
||||
return 1 + (*s == Meta); /* Treat as single byte character */
|
||||
else
|
||||
return 0; /* Probably shouldn't happen */
|
||||
}
|
||||
|
||||
/*
|
||||
* Total number of multibyte characters in metafied string s.
|
||||
* Same answer as iterating mb_metacharlen() and counting calls
|
||||
* until end of string.
|
||||
*/
|
||||
|
||||
/**/
|
||||
int
|
||||
mb_metastrlen(char *ptr)
|
||||
{
|
||||
char inchar, *laststart;
|
||||
size_t ret;
|
||||
wchar_t wc;
|
||||
int num, num_in_char;
|
||||
|
||||
if (!isset(MULTIBYTE))
|
||||
return ztrlen(ptr);
|
||||
|
||||
laststart = ptr;
|
||||
ret = MB_INVALID;
|
||||
num = num_in_char = 0;
|
||||
|
||||
memset(&mb_shiftstate, 0, sizeof(mb_shiftstate));
|
||||
while (*ptr) {
|
||||
if (*ptr == Meta)
|
||||
inchar = *++ptr ^ 32;
|
||||
else
|
||||
inchar = *ptr;
|
||||
ptr++;
|
||||
ret = mbrtowc(&wc, &inchar, 1, &mb_shiftstate);
|
||||
|
||||
if (ret == MB_INCOMPLETE) {
|
||||
num_in_char++;
|
||||
} else {
|
||||
if (ret == MB_INVALID) {
|
||||
/* Reset, treat as single character */
|
||||
memset(&mb_shiftstate, 0, sizeof(mb_shiftstate));
|
||||
ptr = laststart + (*laststart == Meta) + 1;
|
||||
} else
|
||||
laststart = ptr;
|
||||
num++;
|
||||
num_in_char = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/* If incomplete, treat remainder as trailing single bytes */
|
||||
return num + num_in_char;
|
||||
}
|
||||
|
||||
/**/
|
||||
#endif /* MULTIBYTE_SUPPORT */
|
||||
|
||||
|
@ -1926,6 +1926,9 @@ typedef char *(*ZleGetLineFn) _((int *, int *));
|
||||
|
||||
#ifdef MULTIBYTE_SUPPORT
|
||||
#define nicezputs(str, outs) (void)mb_niceformat((str), (outs), NULL, 0)
|
||||
#define MB_METACHARINIT() mb_metacharinit()
|
||||
#define MB_METACHARLEN(str) mb_metacharlen(str)
|
||||
#define MB_METASTRLEN(str) mb_metastrlen(str)
|
||||
|
||||
#define MB_INCOMPLETE ((size_t)-2)
|
||||
#define MB_INVALID ((size_t)-1)
|
||||
@ -1946,6 +1949,9 @@ typedef char *(*ZleGetLineFn) _((int *, int *));
|
||||
#define ZWS(s) L ## s
|
||||
|
||||
#else
|
||||
#define MB_METACHARINIT()
|
||||
#define MB_METACHARLEN(str) (*(str) == Meta ? 2 : 1)
|
||||
#define MB_METASTRLEN(str) ztrlen(str)
|
||||
|
||||
/* Leave character or string as is. */
|
||||
#define ZWC(c) c
|
||||
|
@ -182,13 +182,26 @@
|
||||
>l o c a l
|
||||
>l:o:c:a l o c a
|
||||
|
||||
(setopt NO_multibyte cbases
|
||||
LC_ALL=C 2>/dev/null
|
||||
typeset -T SCALAR=$'l\x83o\x83c\x83a\x83l' array $'\x83'
|
||||
print $array
|
||||
typeset -U SCALAR
|
||||
print $SCALAR $array
|
||||
for (( i = 1; i <= ${#SCALAR}; i++ )); do
|
||||
char=$SCALAR[i]
|
||||
print $(( [#16] #char ))
|
||||
done
|
||||
print $array)
|
||||
0:Tied parameters and uniquified arrays with meta-character as separator
|
||||
>l o c a l
|
||||
>lƒoƒcƒa l o c a
|
||||
>0x6C
|
||||
>0x83
|
||||
>0x6F
|
||||
>0x83
|
||||
>0x63
|
||||
>0x83
|
||||
>0x61
|
||||
>l o c a
|
||||
|
||||
typeset -T SCALAR=$'l\000o\000c\000a\000l' array $'\000'
|
||||
typeset -U SCALAR
|
||||
|
Loading…
Reference in New Issue
Block a user