22524: searchable parameter subscripts with multibyte chars,

a few other fixes
2024-06-08 00:06:04 +02:00 · 2006-06-27 16:28:45 +00:00 · 2006-06-27 16:28:45 +00:00 · f95a6a913c
parent ec1274f0a3
commit f95a6a913c
3 changed files with 206 additions and 24 deletions
--- a/3
+++ b/3
@ -1,5 +1,8 @@
 2006-06-27  Peter Stephenson  <pws@csr.com>

+	* 22524: Src/params.c, Test/D07multibyte.ztst: searchable
+	subscripts with multibyte characters.
+
 	* 22522: Src/utils.c: infinite loop with invalid character
 	in mb_metastrlen().

--- a/Src/params.c
+++ b/Src/params.c
@ -934,11 +934,13 @@ isident(char *s)
 * If supplied they are set to the length of the character before
 * the index position and the one at the index position.  If
 * multibyte characters are not in use they are set to 1 for
- * consistency.
+ * consistency.  Note they aren't fully handled if a2 is non-zero,
+ * since they aren't needed.
 *
 * Returns a raw offset into the value from the start or end (i.e.
 * after the arithmetic for Meta and possible multibyte characters has
- * been taken into account).
+ * been taken into account).  This actually gives the offset *after*
+ * the character in question; subtract *prevcharlen if necessary.
 */

 /**/
@ -1178,16 +1180,23 @@ getarg(char **str, int *inv, Value v, int a2, zlong *w,
 		    t += (lastcharlen = MB_METACHARLEN(t));
 		/* for consistency, keep any remainder off the end */
 		r = (zlong)(t - s) + nchars;
-		if (prevcharlen)
+		if (prevcharlen && !nchars /* ignore if off the end */)
 		    *prevcharlen = lastcharlen;
 		if (nextcharlen && *t)
 		    *nextcharlen = MB_METACHARLEN(t);
+	    } else if (r == 0) {
+		if (prevcharlen)
+		    *prevcharlen = 0;
+		if (nextcharlen && *s) {
+		    MB_METACHARINIT();
+		    *nextcharlen = MB_METACHARLEN(s);
+		}
 	    } else {
 		zlong nchars = (zlong)MB_METASTRLEN(s) + r;

 		if (nchars < 0) {
-		    /* invalid but keep index anyway */
-		    r = nchars;
+		    /* make sure this isn't valid as a raw pointer */
+		    r -= (zlong)strlen(s);
 		} else {
 		    MB_METACHARINIT();
 		    for (t = s; nchars && *t; nchars--)
@ -1300,57 +1309,188 @@ getarg(char **str, int *inv, Value v, int a2, zlong *w,
 		    }
 		return a2 ? -1 : 0;
 	    } else {
+		/* Searching characters */
+		int slen;
 		d = getstrvalue(v);
 		if (!d || !*d)
 		    return 0;
-		len = strlen(d);
+		/*
+		 * beg and len are character counts, not raw offsets.
+		 * Remember we need to return a raw offset.
+		 */
+		len = MB_METASTRLEN(d);
+		slen = strlen(d);
 		if (beg < 0)
 		    beg += len;
+		MB_METACHARINIT();
 		if (beg >= 0 && beg < len) {
-                    char *de = d + len;
+                    char *de = d + slen;

 		    if (a2) {
+			/*
+			 * Second argument: we don't need to
+			 * handle prevcharlen or nextcharlen, but
+			 * we do need to handle characters appropriately.
+			 */
 			if (down) {
+			    int nmatches = 0;
+			    char *lastpos = NULL;
+
 			    if (!hasbeg)
 				beg = len;
-			    for (r = beg, t = d + beg; t >= d; r--, t--) {
+
+			    /*
+			     * See below: we have to move forward,
+			     * but need to count from the end.
+			     */
+			    for (t = d, r = 0; r <= beg; r++) {
 				sav = *t;
 				*t = '\0';
-				if (pattry(pprog, d)
-				    && !--num) {
-				    *t = sav;
-				    return r;
+				if (pattry(pprog, d)) {
+				    nmatches++;
+				    lastpos = t;
 				}
 				*t = sav;
+				if (t == de)
+				    break;
+				t += MB_METACHARLEN(t);
 			    }
-			} else
-			    for (r = beg, t = d + beg; t <= de; r++, t++) {
+
+			    if (nmatches >= num) {
+				if (num > 1) {
+				    nmatches -= num;
+				    MB_METACHARINIT();
+				    for (t = d, r = 0; ; r++) {
+					sav = *t;
+					*t = '\0';
+					if (pattry(pprog, d) &&
+					    nmatches-- == 0) {
+					    lastpos = t;
+					    *t = sav;
+					    break;
+					}
+					*t = sav;
+					t += MB_METACHARLEN(t);
+				    }
+				}
+				/* else lastpos is already OK */
+
+				return lastpos - d;
+			    }
+			} else {
+			    /*
+			     * This handling of the b flag
+			     * gives odd results, but this is the
+			     * way it's always worked.
+			     */
+			    for (t = d; beg && t <= de; beg--)
+				t += MB_METACHARLEN(t);
+			    for (;;) {
 				sav = *t;
 				*t = '\0';
-				if (pattry(pprog, d) &&
-				    !--num) {
+				if (pattry(pprog, d) && !--num) {
 				    *t = sav;
-				    return r;
+				    /*
+				     * This time, don't increment
+				     * pointer, since it's already
+				     * after everything we matched.
+				     */
+				    return t - d;
 				}
 				*t = sav;
+				if (t == de)
+				    break;
+				t += MB_METACHARLEN(t);
 			    }
+			}
 		    } else {
+			/*
+			 * First argument: this is the only case
+			 * where we need prevcharlen and nextcharlen.
+			 */
+			int lastcharlen;
+
 			if (down) {
+			    int nmatches = 0;
+			    char *lastpos = NULL;
+
 			    if (!hasbeg)
 				beg = len;
+
+			    /*
+			     * We can only move forward through
+			     * multibyte strings, so record the
+			     * matches.
+			     * Unfortunately the count num works
+			     * from the end, so it's easy to get the
+			     * last one but we need to repeat if
+			     * we want another one.
+			     */
+			    for (t = d, r = 0; r <= beg; r++) {
+				if (pattry(pprog, t)) {
+				    nmatches++;
+				    lastpos = t;
+				}
+				if (t == de)
+				    break;
+				t += MB_METACHARLEN(t);
+			    }
+
+			    if (nmatches >= num) {
+				if (num > 1) {
+				    /*
+				     * Need to start again and repeat
+				     * to get the right match.
+				     */
+				    nmatches -= num;
+				    MB_METACHARINIT();
+				    for (t = d, r = 0; ; r++) {
+					if (pattry(pprog, t) &&
+					    nmatches-- == 0) {
+					    lastpos = t;
+					    break;
+					}
+					t += MB_METACHARLEN(t);
+				    }
+				}
+				/* else lastpos is already OK */
+
+				/* return pointer after matched char */
+				lastpos +=
+				    (lastcharlen = MB_METACHARLEN(lastpos));
+				if (prevcharlen)
+				    *prevcharlen = lastcharlen;
+				if (nextcharlen)
+				    *nextcharlen = MB_METACHARLEN(lastpos);
+				return lastpos - d;
+			    }
+
 			    for (r = beg + 1, t = d + beg; t >= d; r--, t--) {
 				if (pattry(pprog, t) &&
 				    !--num)
 				    return r;
 			    }
-			} else
-			    for (r = beg + 1, t = d + beg; t <= de; r++, t++)
-				if (pattry(pprog, t) &&
-				    !--num)
-				    return r;
+			} else {
+			    for (t = d; beg && t <= de; beg--)
+				t += MB_METACHARLEN(t);
+			    for (;;) {
+				if (pattry(pprog, t) && !--num) {
+				    /* return pointer after matched char */
+				    t += (lastcharlen = MB_METACHARLEN(t));
+				    if (prevcharlen)
+					*prevcharlen = lastcharlen;
+				    if (nextcharlen)
+					*nextcharlen = MB_METACHARLEN(t);
+				    return t - d;
+				}
+				if (t == de)
+				    break;
+				t += MB_METACHARLEN(t);
+			    }
+			}
 		    }
 		}
-		return down ? 0 : len + 1;
+		return down ? 0 : slen + 1;
 	    }
 	}
    }
@ -1429,9 +1569,12 @@ getindex(char **pptr, Value v, int dq)
 			}
 		    }
 		    /* if start was too big, keep the difference */
-		    start = nstart + (target - p) + startprevlen;
+		    start = nstart + (target - p) + 1;
 		} else {
 		    zlong startoff = start + strlen(t);
+#ifdef DEBUG
+		    dputs("BUG: can't have negative inverse offsets???");
+#endif
 		    if (startoff < 0) {
 			/* invalid: keep index but don't dereference */
 			start = startoff;
--- a/Test/D07multibyte.ztst
+++ b/Test/D07multibyte.ztst
@ -82,6 +82,42 @@
 >x
 >9 9 x t

+  s=é
+  print A${s[-2]}A B${s[-1]}B C${s[0]}C D${s[1]}D E${s[2]}E
+0:Out of range subscripts with multibyte characters
+>AA BéB CéC DéD EE
+
  print ${a[(i)é]} ${a[(I)é]} ${a[${a[(i)é]},${a[(I)é]}]}
 0:Reverse indexing with multibyte characters
 >2 4 éné
+
+  print ${a[(r)én,(r)éb]}
+0:Subscript searching with multibyte characters
+>énéb
+
+  print ${a[(rb:1:)é,-1]}
+  print ${a[(rb:2:)é,-1]}
+  print ${a[(rb:3:)é,-1]}
+  print ${a[(rb:4:)é,-1]}
+  print ${a[(rb:5:)é,-1]}
+0:Subscript searching with initial offset
+>énébreux
+>énébreux
+>ébreux
+>ébreux
+>
+
+  print ${a[(rn:1:)é,-1]}
+  print ${a[(rn:2:)é,-1]}
+  print ${a[(rn:3:)é,-1]}
+0:Subscript searching with count
+>énébreux
+>ébreux
+>
+
+  print ${a[(R)én,(R)éb]}
+0:Backward subscript searching with multibyte characters
+>énéb
+
+# Starting offsets with (R) seem to be so strange as to be hardly
+# worth testing.