22518: Initial go at making parameter subscripts

use multibyte characters.
2024-09-28 15:01:21 +02:00 · 2006-06-26 18:17:31 +00:00 · 2006-06-26 18:17:31 +00:00 · 05bd0b2dd1
commit 05bd0b2dd1
parent bd50a3c516
5 changed files with 259 additions and 26 deletions
--- a/4
+++ b/4
@ -1,5 +1,9 @@
 2006-06-26  Peter Stephenson  <pws@csr.com>

+	* 22518: Src/params.c, Src/utils,c, Src/zsh.h,
+	Test/B02typeset.ztst: Initial go at making parameter subscripts
+	use multibyte characters.
+
 	* 22516: Src/parse.c: error evaluating "func()" didn't pop
 	the command stack.

--- a/Src/params.c
+++ b/Src/params.c
@ -918,9 +918,33 @@ isident(char *s)
    return !ss[1];
 }

+/*
+ * Parse a single argument to a parameter subscript.
+ * The subscripts starts at *str; *str is updated (input/output)
+ *
+ * *inv is set to indicate if the subscript is reversed (output)
+ * v is the Value for the parameter being accessed (input; note
+ *  v->isarr may be modified, and if v is a hash the parameter will
+ *  be updated to the element of the hash)
+ * a2 is 1 if this is the second subscript of a range (input)
+ * *w is only set if we need to find the end of a word (input; should
+ *  be set to 0 by the caller).
+ *
+ * The final two arguments are to support multibyte characters.
+ * If supplied they are set to the length of the character before
+ * the index position and the one at the index position.  If
+ * multibyte characters are not in use they are set to 1 for
+ * consistency.
+ *
+ * Returns a raw offset into the value from the start or end (i.e.
+ * after the arithmetic for Meta and possible multibyte characters has
+ * been taken into account).
+ */
+
 /**/
 static zlong
-getarg(char **str, int *inv, Value v, int a2, zlong *w)
+getarg(char **str, int *inv, Value v, int a2, zlong *w,
+       int *prevcharlen, int *nextcharlen)
 {
    int hasbeg = 0, word = 0, rev = 0, ind = 0, down = 0, l, i, ishash;
    int keymatch = 0, needtok = 0;
@ -929,6 +953,10 @@ getarg(char **str, int *inv, Value v, int a2, zlong *w)
    Patprog pprog = NULL;

    ishash = (v->pm && PM_TYPE(v->pm->node.flags) == PM_HASHED);
+    if (prevcharlen)
+	*prevcharlen = 1;
+    if (nextcharlen)
+	*nextcharlen = 1;

    /* first parse any subscription flags */
    if (v->pm && (*s == '(' || *s == Inpar)) {
@ -1133,17 +1161,43 @@ getarg(char **str, int *inv, Value v, int a2, zlong *w)

 	    return (a2 ? s : d + 1) - t;
 	} else if (!v->isarr && !word) {
+	    int lastcharlen = 1;
 	    s = getstrvalue(v);
+	    /*
+	     * Note for the confused (= pws):  the index r we
+	     * have so far is that specified by the user.  The value
+	     * passed back is an offset from the start or end of
+	     * the string.  Hence it needs correcting at least
+	     * for Meta characters and maybe for multibyte characters.
+	     */
 	    if (r > 0) {
-		for (t = s + r - 1; *s && s < t;)
-		    if (*s++ == Meta)
-			s++, t++, r++;
+		zlong nchars = r;
+
+		MB_METACHARINIT();
+		for (t = s; nchars && *t; nchars--)
+		    t += (lastcharlen = MB_METACHARLEN(t));
+		/* for consistency, keep any remainder off the end */
+		r = (zlong)(t - s) + nchars;
+		if (prevcharlen)
+		    *prevcharlen = lastcharlen;
+		if (nextcharlen && *t)
+		    *nextcharlen = MB_METACHARLEN(t);
 	    } else {
-		r += ztrlen(s);
-		for (t = s + r; *s && s < t; r--)
-		    if (*s++ == Meta)
-			t++, r++;
-		r -= strlen(s);
+		zlong nchars = (zlong)MB_METASTRLEN(s) + r;
+
+		if (nchars < 0) {
+		    /* invalid but keep index anyway */
+		    r = nchars;
+		} else {
+		    MB_METACHARINIT();
+		    for (t = s; nchars && *t; nchars--)
+			t += (lastcharlen = MB_METACHARLEN(t));
+		    r = - (zlong)strlen(t); /* keep negative */
+		    if (prevcharlen)
+			*prevcharlen = lastcharlen;
+		    if (nextcharlen && *t)
+			*nextcharlen = MB_METACHARLEN(t);
+		}
 	    }
 	}
    } else {
@ -1338,19 +1392,57 @@ getindex(char **pptr, Value v, int dq)
 	s += 2;
    } else {
 	zlong we = 0, dummy;
+	int startprevlen, startnextlen;

-	start = getarg(&s, &inv, v, 0, &we);
+	start = getarg(&s, &inv, v, 0, &we, &startprevlen, &startnextlen);

 	if (inv) {
 	    if (!v->isarr && start != 0) {
 		char *t, *p;
 		t = getstrvalue(v);
+		/*
+		 * Note for the confused (= pws): this is an inverse
+		 * offset so at this stage we need to convert from
+		 * the immediate offset into the value that we have
+		 * into a logical character position.
+		 */
 		if (start > 0) {
-		    for (p = t + start - 1; p-- > t; )
-			if (*p == Meta)
-			    start--;
-		} else
-		    start = -ztrlen(t + start + strlen(t));
+		    int nstart = 0;
+		    char *target = t + start - startprevlen;
+
+		    p = t;
+		    MB_METACHARINIT();
+		    while (*p) {
+			/*
+			 * move up characters, counting how many we
+			 * found
+			 */
+			p += MB_METACHARLEN(p);
+			if (p < target)
+			    nstart++;
+			else {
+			    if (p == target)
+				nstart++;
+			    else
+				p = target; /* pretend we hit exactly */
+			    break;
+			}
+		    }
+		    /* if start was too big, keep the difference */
+		    start = nstart + (target - p) + startprevlen;
+		} else {
+		    zlong startoff = start + strlen(t);
+		    if (startoff < 0) {
+			/* invalid: keep index but don't dereference */
+			start = startoff;
+		    } else {
+			/* find start in full characters */
+			MB_METACHARINIT();
+			for (p = t; p < t + startoff;)
+			    p += MB_METACHARLEN(p);
+			start = - MB_METASTRLEN(p);
+		    }
+		}
 	    }
 	    if (start > 0 && (isset(KSHARRAYS) || (v->pm->node.flags & PM_HASHED)))
 		start--;
@ -1373,15 +1465,21 @@ getindex(char **pptr, Value v, int dq)

 	    if ((com = (*s == ','))) {
 		s++;
-		end = getarg(&s, &inv, v, 1, &dummy);
+		end = getarg(&s, &inv, v, 1, &dummy, NULL, NULL);
 	    } else {
 		end = we ? we : start;
 	    }
-	    if (start != end) com = 1;
+	    if (start != end)
+		com = 1;
+	    /*
+	     * Somehow the logic sometimes forces us to use the previous
+	     * or next character to what we would expect, which is
+	     * why we had to calculate them in getarg().
+	     */
 	    if (start > 0)
-		start--;
+		start -= startprevlen;
 	    else if (start == 0 && end == 0)
-		end++;
+		end = startnextlen;
 	    if (s == tbrack) {
 		s++;
 		if (v->isarr && !com &&
@ -1578,13 +1676,19 @@ getstrvalue(Value v)
 	if (v->start < 0)
 	    v->start = 0;
    }
-    if (v->end < 0)
-	v->end += strlen(s) + 1;
+    if (v->end < 0) {
+	v->end += strlen(s);
+	if (v->end >= 0) {
+	    char *eptr = s + v->end;
+	    if (*eptr)
+		v->end += MB_METACHARLEN(eptr);
+	}
+    }
    s = (v->start > (int)strlen(s)) ? dupstring("") : dupstring(s + v->start);
    if (v->end <= v->start)
 	s[0] = '\0';
    else if (v->end - v->start <= (int)strlen(s))
-	s[v->end - v->start + (s[v->end - v->start - 1] == Meta)] = '\0';
+	s[v->end - v->start] = '\0';

    return s;
 }
@ -2791,7 +2895,7 @@ char *
 tiedarrgetfn(Param pm)
 {
    struct tieddata *dptr = (struct tieddata *)pm->u.data;
-    return *dptr->arrptr ? zjoin(*dptr->arrptr, dptr->joinchar, 1) : "";
+    return *dptr->arrptr ? zjoin(*dptr->arrptr, STOUC(dptr->joinchar), 1) : "";
 }

 /**/
@ -3463,7 +3567,7 @@ arrfixenv(char *s, char **t)
 	return;

    if (pm->node.flags & PM_TIED)
-	joinchar = ((struct tieddata *)pm->u.data)->joinchar;
+	joinchar = STOUC(((struct tieddata *)pm->u.data)->joinchar);
    else
 	joinchar = ':';

--- a/Src/utils.c
+++ b/Src/utils.c
@ -3683,6 +3683,112 @@ mb_width(const char *s)
    return width;
 }

+static mbstate_t mb_shiftstate;
+
+/*
+ * Initialise multibyte state: called before a sequence of
+ * mb_metacharlen().
+ */
+
+/**/
+void
+mb_metacharinit(void)
+{
+    memset(&mb_shiftstate, 0, sizeof(mb_shiftstate));
+}
+
+/*
+ * Length of metafied string s which contains the next multibyte
+ * character; single (possibly metafied) character if string is not null
+ * but character is not valid (e.g. possibly incomplete at end of string).
+ * Returned value is guaranteed not to reach beyond the end of the
+ * string (assuming correct metafication).
+ */
+
+/**/
+int
+mb_metacharlen(char *s)
+{
+    char inchar, *ptr;
+    size_t ret;
+    wchar_t wc;
+
+    if (!isset(MULTIBYTE))
+	return 1 + (*s == Meta);
+
+    ret = MB_INVALID;
+    for (ptr = s; *ptr; ) {
+	if (*ptr == Meta)
+	    inchar = *++ptr ^ 32;
+	else
+	    inchar = *ptr;
+	ptr++;
+	ret = mbrtowc(&wc, &inchar, 1, &mb_shiftstate);
+
+	if (ret == MB_INVALID)
+	    break;
+	if (ret == MB_INCOMPLETE)
+	    continue;
+	return ptr - s;
+    }
+
+    /* No valid multibyte sequence */
+    memset(&mb_shiftstate, 0, sizeof(mb_shiftstate));
+    if (ptr > s)
+	return 1 + (*s == Meta);	/* Treat as single byte character */
+    else
+	return 0;		/* Probably shouldn't happen */
+}
+
+/*
+ * Total number of multibyte characters in metafied string s.
+ * Same answer as iterating mb_metacharlen() and counting calls
+ * until end of string.
+ */
+
+/**/
+int
+mb_metastrlen(char *ptr)
+{
+    char inchar, *laststart;
+    size_t ret;
+    wchar_t wc;
+    int num, num_in_char;
+
+    if (!isset(MULTIBYTE))
+	return ztrlen(ptr);
+
+    laststart = ptr;
+    ret = MB_INVALID;
+    num = num_in_char = 0;
+
+    memset(&mb_shiftstate, 0, sizeof(mb_shiftstate));
+    while (*ptr) {
+	if (*ptr == Meta)
+	    inchar = *++ptr ^ 32;
+	else
+	    inchar = *ptr;
+	ptr++;
+	ret = mbrtowc(&wc, &inchar, 1, &mb_shiftstate);
+
+	if (ret == MB_INCOMPLETE) {
+	    num_in_char++;
+	} else {
+	    if (ret == MB_INVALID) {
+		/* Reset, treat as single character */
+		memset(&mb_shiftstate, 0, sizeof(mb_shiftstate));
+		ptr = laststart + (*laststart == Meta) + 1;
+	    } else
+		laststart = ptr;
+	    num++;
+	    num_in_char = 0;
+	}
+    }
+
+    /* If incomplete, treat remainder as trailing single bytes */
+    return num + num_in_char;
+}
+
 /**/
 #endif /* MULTIBYTE_SUPPORT */

--- a/Src/zsh.h
+++ b/Src/zsh.h
@ -1926,6 +1926,9 @@ typedef char *(*ZleGetLineFn) _((int *, int *));

 #ifdef MULTIBYTE_SUPPORT
 #define nicezputs(str, outs)	(void)mb_niceformat((str), (outs), NULL, 0)
+#define MB_METACHARINIT()	mb_metacharinit()
+#define MB_METACHARLEN(str)	mb_metacharlen(str)
+#define MB_METASTRLEN(str)	mb_metastrlen(str)

 #define MB_INCOMPLETE	((size_t)-2)
 #define MB_INVALID	((size_t)-1)
@ -1946,6 +1949,9 @@ typedef char *(*ZleGetLineFn) _((int *, int *));
 #define ZWS(s)	L ## s

 #else
+#define MB_METACHARINIT()
+#define MB_METACHARLEN(str)	(*(str) == Meta ? 2 : 1)
+#define MB_METASTRLEN(str)	ztrlen(str)

 /* Leave character or string as is. */
 #define ZWC(c)	c
--- a/Test/B02typeset.ztst
+++ b/Test/B02typeset.ztst
@ -182,13 +182,26 @@
 >l o c a l
 >l:o:c:a l o c a

+ (setopt NO_multibyte cbases
+ LC_ALL=C 2>/dev/null
 typeset -T SCALAR=$'l\x83o\x83c\x83a\x83l' array $'\x83'
 print $array
 typeset -U SCALAR
- print $SCALAR $array
+ for (( i = 1; i <= ${#SCALAR}; i++ )); do
+   char=$SCALAR[i]
+   print $(( [#16] #char ))
+ done
+ print $array)
 0:Tied parameters and uniquified arrays with meta-character as separator
 >l o c a l
->lƒoƒcƒa l o c a
+>0x6C
+>0x83
+>0x6F
+>0x83
+>0x63
+>0x83
+>0x61
+>l o c a

 typeset -T SCALAR=$'l\000o\000c\000a\000l' array $'\000'
 typeset -U SCALAR