1
0
Fork 0
mirror of git://git.code.sf.net/p/zsh/code synced 2024-06-08 00:06:04 +02:00

20500: Unmetafy patterns where possible and other minor pattern fixes

This commit is contained in:
Peter Stephenson 2004-10-18 11:56:14 +00:00
parent bc704ef05c
commit b115ca307a
7 changed files with 538 additions and 249 deletions

View File

@ -1,3 +1,11 @@
2004-10-18 Peter Stephenson <pws@csr.com>
* 20500: Misc/globtests, Src/glob.c, Src/pattern.c, Src/zsh.h,
Src/Zle/complist.c, Test/D02glob.ztst: Use unmetafied strings
in patterns more; improve glob.c/pattern.c interface; fix
minor <num-> issue; add bogus quotation from Proust (it's
my file and I'll do what I like with it).
2004-10-17 Wayne Davison <wayned@users.sourceforge.net> 2004-10-17 Wayne Davison <wayned@users.sourceforge.net>
* 20496: Src/utils.c: made zclose() not call close() when the * 20496: Src/utils.c: made zclose() not call close() when the

View File

@ -134,6 +134,9 @@ t 633 <1-1000>33
t 633 <-1000>33 t 633 <-1000>33
t 633 <1->33 t 633 <1->33
t 633 <->33 t 633 <->33
# An open top end of a range will match any integer, even
# if not representable in the internal integer type.
t 12345678901234567890123456789012345678901234567890123456789012345678901234567890foo <42->foo
# Approximate matching # Approximate matching
t READ.ME (#ia1)readme t READ.ME (#ia1)readme
f READ..ME (#ia1)readme f READ..ME (#ia1)readme

View File

@ -601,7 +601,7 @@ putmatchcol(Listcols c, char *group, char *n)
for (pc = c->pats; pc; pc = pc->next) for (pc = c->pats; pc; pc = pc->next)
if ((!pc->prog || !group || pattry(pc->prog, group)) && if ((!pc->prog || !group || pattry(pc->prog, group)) &&
pattryrefs(pc->pat, n, &nrefs, begpos, endpos)) { pattryrefs(pc->pat, n, -1, 0, &nrefs, begpos, endpos)) {
if (pc->cols[1]) { if (pc->cols[1]) {
patcols = pc->cols; patcols = pc->cols;
@ -639,7 +639,7 @@ putfilecol(Listcols c, char *group, char *n, mode_t m)
for (pc = c->pats; pc; pc = pc->next) for (pc = c->pats; pc; pc = pc->next)
if ((!pc->prog || !group || pattry(pc->prog, group)) && if ((!pc->prog || !group || pattry(pc->prog, group)) &&
pattryrefs(pc->pat, n, &nrefs, begpos, endpos)) { pattryrefs(pc->pat, n, -1, 0, &nrefs, begpos, endpos)) {
if (pc->cols[1]) { if (pc->cols[1]) {
patcols = pc->cols; patcols = pc->cols;

View File

@ -2195,8 +2195,13 @@ set_pat_end(Patprog p, char null_me)
static int static int
igetmatch(char **sp, Patprog p, int fl, int n, char *replstr) igetmatch(char **sp, Patprog p, int fl, int n, char *replstr)
{ {
char *s = *sp, *t, sav; char *s = *sp, *t;
int i, l = strlen(*sp), ml = ztrlen(*sp), matched = 1; /*
* Note that ioff and ml count characters in the character
* set (Meta's are not included), while l counts bytes in the
* metafied string (Meta's are included).
*/
int ioff, l = strlen(*sp), ml = ztrlen(*sp), matched = 1;
repllist = NULL; repllist = NULL;
@ -2208,7 +2213,7 @@ igetmatch(char **sp, Patprog p, int fl, int n, char *replstr)
p->flags &= ~(PAT_NOTSTART|PAT_NOTEND); p->flags &= ~(PAT_NOTSTART|PAT_NOTEND);
if (fl & SUB_ALL) { if (fl & SUB_ALL) {
i = matched && pattry(p, s); int i = matched && pattry(p, s);
*sp = get_match_ret(*sp, 0, i ? l : 0, fl, i ? replstr : 0); *sp = get_match_ret(*sp, 0, i ? l : 0, fl, i ? replstr : 0);
if (! **sp && (((fl & SUB_MATCH) && !i) || ((fl & SUB_REST) && i))) if (! **sp && (((fl & SUB_MATCH) && !i) || ((fl & SUB_REST) && i)))
return 0; return 0;
@ -2223,25 +2228,22 @@ igetmatch(char **sp, Patprog p, int fl, int n, char *replstr)
* First get the longest match... * First get the longest match...
*/ */
if (pattry(p, s)) { if (pattry(p, s)) {
char *mpos = patinput; /* patmatchlen returns metafied length, as we need */
int mlen = patmatchlen();
if (!(fl & SUB_LONG) && !(p->flags & PAT_PURES)) { if (!(fl & SUB_LONG) && !(p->flags & PAT_PURES)) {
/* /*
* ... now we know whether it's worth looking for the * ... now we know whether it's worth looking for the
* shortest, which we do by brute force. * shortest, which we do by brute force.
*/ */
for (t = s; t < mpos; METAINC(t)) { for (t = s; t < s + mlen; METAINC(t)) {
sav = *t; set_pat_end(p, *t);
set_pat_end(p, sav); if (pattrylen(p, s, t - s, 0)) {
*t = '\0'; mlen = patmatchlen();
if (pattry(p, s)) {
mpos = patinput;
*t = sav;
break; break;
} }
*t = sav;
} }
} }
*sp = get_match_ret(*sp, 0, mpos-s, fl, replstr); *sp = get_match_ret(*sp, 0, mlen, fl, replstr);
return 1; return 1;
} }
break; break;
@ -2250,35 +2252,30 @@ igetmatch(char **sp, Patprog p, int fl, int n, char *replstr)
/* Smallest possible match at tail of string: * /* Smallest possible match at tail of string: *
* move back down string until we get a match. * * move back down string until we get a match. *
* There's no optimization here. */ * There's no optimization here. */
patoffset = ml; for (ioff = ml, t = s + l; t >= s; t--, ioff--) {
for (t = s + l; t >= s; t--, patoffset--) {
set_pat_start(p, t-s); set_pat_start(p, t-s);
if (pattry(p, t)) { if (pattrylen(p, t, -1, ioff)) {
*sp = get_match_ret(*sp, t - s, l, fl, replstr); *sp = get_match_ret(*sp, t - s, l, fl, replstr);
patoffset = 0;
return 1; return 1;
} }
if (t > s+1 && t[-2] == Meta) if (t > s+1 && t[-2] == Meta)
t--; t--;
} }
patoffset = 0;
break; break;
case (SUB_END|SUB_LONG): case (SUB_END|SUB_LONG):
/* Largest possible match at tail of string: * /* Largest possible match at tail of string: *
* move forward along string until we get a match. * * move forward along string until we get a match. *
* Again there's no optimisation. */ * Again there's no optimisation. */
for (i = 0, t = s; i < l; i++, t++, patoffset++) { for (ioff = 0, t = s; t < s + l; ioff++, t++) {
set_pat_start(p, t-s); set_pat_start(p, t-s);
if (pattry(p, t)) { if (pattrylen(p, t, -1, ioff)) {
*sp = get_match_ret(*sp, i, l, fl, replstr); *sp = get_match_ret(*sp, t-s, l, fl, replstr);
patoffset = 0;
return 1; return 1;
} }
if (*t == Meta) if (*t == Meta)
i++, t++; t++;
} }
patoffset = 0;
break; break;
case SUB_SUBSTR: case SUB_SUBSTR:
@ -2293,26 +2290,23 @@ igetmatch(char **sp, Patprog p, int fl, int n, char *replstr)
t = s; t = s;
if (fl & SUB_GLOBAL) if (fl & SUB_GLOBAL)
repllist = newlinklist(); repllist = newlinklist();
ioff = 0; /* offset into string */
do { do {
/* loop over all matches for global substitution */ /* loop over all matches for global substitution */
matched = 0; matched = 0;
for (; t < s + l; t++, patoffset++) { for (; t < s + l; t++, ioff++) {
/* Find the longest match from this position. */ /* Find the longest match from this position. */
set_pat_start(p, t-s); set_pat_start(p, t-s);
if (pattry(p, t)) { if (pattrylen(p, t, -1, ioff)) {
char *mpos = patinput; char *mpos = t + patmatchlen();
if (!(fl & SUB_LONG) && !(p->flags & PAT_PURES)) { if (!(fl & SUB_LONG) && !(p->flags & PAT_PURES)) {
char *ptr; char *ptr;
for (ptr = t; ptr < mpos; METAINC(ptr)) { for (ptr = t; ptr < mpos; METAINC(ptr)) {
sav = *ptr; set_pat_end(p, *ptr);
set_pat_end(p, sav); if (pattrylen(p, t, ptr - t, ioff)) {
*ptr = '\0'; mpos = t + patmatchlen();
if (pattry(p, t)) {
mpos = patinput;
*ptr = sav;
break; break;
} }
*ptr = sav;
} }
} }
if (!--n || (n <= 0 && (fl & SUB_GLOBAL))) { if (!--n || (n <= 0 && (fl & SUB_GLOBAL))) {
@ -2330,7 +2324,6 @@ igetmatch(char **sp, Patprog p, int fl, int n, char *replstr)
*/ */
continue; continue;
} else { } else {
patoffset = 0;
return 1; return 1;
} }
} }
@ -2339,7 +2332,7 @@ igetmatch(char **sp, Patprog p, int fl, int n, char *replstr)
* which is already marked for replacement. * which is already marked for replacement.
*/ */
matched = 1; matched = 1;
for ( ; t < mpos; t++, patoffset++) for ( ; t < mpos; t++, ioff++)
if (*t == Meta) if (*t == Meta)
t++; t++;
break; break;
@ -2348,7 +2341,6 @@ igetmatch(char **sp, Patprog p, int fl, int n, char *replstr)
t++; t++;
} }
} while (matched); } while (matched);
patoffset = 0;
/* /*
* check if we can match a blank string, if so do it * check if we can match a blank string, if so do it
* at the start. Goodness knows if this is a good idea * at the start. Goodness knows if this is a good idea
@ -2365,50 +2357,39 @@ igetmatch(char **sp, Patprog p, int fl, int n, char *replstr)
case (SUB_END|SUB_SUBSTR): case (SUB_END|SUB_SUBSTR):
case (SUB_END|SUB_LONG|SUB_SUBSTR): case (SUB_END|SUB_LONG|SUB_SUBSTR):
/* Longest/shortest at end, matching substrings. */ /* Longest/shortest at end, matching substrings. */
patoffset = ml;
if (!(fl & SUB_LONG)) { if (!(fl & SUB_LONG)) {
set_pat_start(p, l); set_pat_start(p, l);
if (pattry(p, s + l) && !--n) { if (pattrylen(p, s + l, -1, ml) && !--n) {
*sp = get_match_ret(*sp, l, l, fl, replstr); *sp = get_match_ret(*sp, l, l, fl, replstr);
patoffset = 0;
return 1; return 1;
} }
} }
patoffset--; for (ioff = ml - 1, t = s + l - 1; t >= s; t--, ioff--) {
for (t = s + l - 1; t >= s; t--, patoffset--) {
if (t > s && t[-1] == Meta) if (t > s && t[-1] == Meta)
t--; t--;
set_pat_start(p, t-s); set_pat_start(p, t-s);
if (pattry(p, t) && !--n) { if (pattrylen(p, t, -1, ioff) && !--n) {
/* Found the longest match */ /* Found the longest match */
char *mpos = patinput; char *mpos = t + patmatchlen();
if (!(fl & SUB_LONG) && !(p->flags & PAT_PURES)) { if (!(fl & SUB_LONG) && !(p->flags & PAT_PURES)) {
char *ptr; char *ptr;
for (ptr = t; ptr < mpos; METAINC(ptr)) { for (ptr = t; ptr < mpos; METAINC(ptr)) {
sav = *ptr; set_pat_end(p, *ptr);
set_pat_end(p, sav); if (pattrylen(p, t, ptr - t, ioff)) {
*ptr = '\0'; mpos = t + patmatchlen();
if (pattry(p, t)) {
mpos = patinput;
*ptr = sav;
break; break;
} }
*ptr = sav;
} }
} }
*sp = get_match_ret(*sp, t-s, mpos-s, fl, replstr); *sp = get_match_ret(*sp, t-s, mpos-s, fl, replstr);
patoffset = 0;
return 1; return 1;
} }
} }
patoffset = ml;
set_pat_start(p, l); set_pat_start(p, l);
if ((fl & SUB_LONG) && pattry(p, s + l) && !--n) { if ((fl & SUB_LONG) && pattrylen(p, s + l, -1, ml) && !--n) {
*sp = get_match_ret(*sp, l, l, fl, replstr); *sp = get_match_ret(*sp, l, l, fl, replstr);
patoffset = 0;
return 1; return 1;
} }
patoffset = 0;
break; break;
} }
} }
@ -2419,6 +2400,7 @@ igetmatch(char **sp, Patprog p, int fl, int n, char *replstr)
Repldata rd; Repldata rd;
int lleft = 0; /* size of returned string */ int lleft = 0; /* size of returned string */
char *ptr, *start; char *ptr, *start;
int i;
i = 0; /* start of last chunk we got from *sp */ i = 0; /* start of last chunk we got from *sp */
for (nd = firstnode(repllist); nd; incnode(nd)) { for (nd = firstnode(repllist); nd; incnode(nd)) {

View File

@ -47,6 +47,26 @@
* *
* Eagle-eyed readers will notice this is an altered version. Incredibly * Eagle-eyed readers will notice this is an altered version. Incredibly
* sharp-eyed readers might even find bits that weren't altered. * sharp-eyed readers might even find bits that weren't altered.
*
*
* And I experienced a sense that, like certain regular
* expressions, seemed to match the day from beginning to end, so
* that I did not need to identify the parenthesised subexpression
* that told of dawn, nor the group of characters that indicated
* the moment when my grandfather returned home with news of
* Swann's departure for Paris; and the whole length of the month
* of May, as if matched by a closure, fitted into the buffer of my
* life with no sign of overflowing, turning the days, like a
* procession of insects that could consist of this or that
* species, into a random and unstructured repetition of different
* sequences, anchored from the first day of the month to the last
* in the same fashion as the weeks when I knew I would not see
* Gilberte and would search in vain for any occurrences of the
* string in the avenue of hawthorns by Tansonville, without my
* having to delimit explicitly the start or finish of the pattern.
*
* M. Proust, "In Search of Lost Files",
* bk I, "The Walk by Bourne's Place".
*/ */
#include "zsh.mdh" #include "zsh.mdh"
@ -78,7 +98,7 @@ typedef union upat *Upat;
#define P_EXCSYNC 0x01 /* no Test if following exclude already failed */ #define P_EXCSYNC 0x01 /* no Test if following exclude already failed */
#define P_EXCEND 0x02 /* no Test if exclude matched orig branch */ #define P_EXCEND 0x02 /* no Test if exclude matched orig branch */
#define P_BACK 0x03 /* no Match "", "next" ptr points backward. */ #define P_BACK 0x03 /* no Match "", "next" ptr points backward. */
#define P_EXACTLY 0x04 /* str Match this string. */ #define P_EXACTLY 0x04 /* lstr Match this string. */
#define P_NOTHING 0x05 /* no Match empty string. */ #define P_NOTHING 0x05 /* no Match empty string. */
#define P_ONEHASH 0x06 /* node Match this (simple) thing 0 or more times. */ #define P_ONEHASH 0x06 /* node Match this (simple) thing 0 or more times. */
#define P_TWOHASH 0x07 /* node Match this (simple) thing 1 or more times. */ #define P_TWOHASH 0x07 /* node Match this (simple) thing 1 or more times. */
@ -103,10 +123,14 @@ typedef union upat *Upat;
/* spaces left for P_OPEN+n,... for backreferences */ /* spaces left for P_OPEN+n,... for backreferences */
#define P_OPEN 0x80 /* no Mark this point in input as start of n. */ #define P_OPEN 0x80 /* no Mark this point in input as start of n. */
#define P_CLOSE 0x90 /* no Analogous to OPEN. */ #define P_CLOSE 0x90 /* no Analogous to OPEN. */
/* zl is the range type zrange_t: may be zlong or unsigned long /*
* char is a single char * no no argument
* uc* is a pointer to unsigned char, used at run time and initialised * zr the range type zrange_t: may be zlong or unsigned long
* char a single char
* uc* a pointer to unsigned char, used at run time and initialised
* to NULL. * to NULL.
* str null-terminated, metafied string
* lstr length as long then string, not null-terminated, unmetafied.
*/ */
/* /*
@ -179,16 +203,24 @@ typedef union upat *Upat;
#define P_ISEXCLUDE(p) (((p)->l & 0x30) == 0x30) #define P_ISEXCLUDE(p) (((p)->l & 0x30) == 0x30)
#define P_NOTDOT(p) ((p)->l & 0x40) #define P_NOTDOT(p) ((p)->l & 0x40)
/*
 * Accessors specific to the lstr operand type, i.e. P_EXACTLY.
 * p points at the node itself: the length lives in the l member
 * of the element immediately after it, and the string bytes begin
 * at the element after that.
 */
#define P_LS_LEN(p) (((p) + 1)->l) /* can be used as lvalue */
#define P_LS_STR(p) ((char *)&(p)[2])
/* Flags needed when pattern is executed */ /* Flags needed when pattern is executed */
#define P_SIMPLE 0x01 /* Simple enough to be #/## operand. */ #define P_SIMPLE 0x01 /* Simple enough to be #/## operand. */
#define P_HSTART 0x02 /* Starts with # or ##'d pattern. */ #define P_HSTART 0x02 /* Starts with # or ##'d pattern. */
#define P_PURESTR 0x04 /* Can be matched with a strcmp */ #define P_PURESTR 0x04 /* Can be matched with a strcmp */
/*
 * Pointer to the character following the one at x (x is any char *):
 * skip two bytes if x is on a Meta, otherwise one.
 */
#define METANEXT(x) ((x) + (*(x) == Meta ? 2 : 1))
/* /*
* Increment pointer which may be on a Meta (x is a pointer variable), * Increment pointer which may be on a Meta (x is a pointer variable),
* returning the incremented value (i.e. like pre-increment). * returning the incremented value (i.e. like pre-increment).
*
* In future the following will need to refer to metafied multibyte
* characters. References to individual characters are not turned
* into a macro when the character is metafied (c.f. CHARREF()
* below, used when the character is not metafied) and will need
* tracking down.
*/ */
#define METAINC(x) ((x) += (*(x) == Meta) ? 2 : 1) #define METAINC(x) ((x) += (*(x) == Meta) ? 2 : 1)
/* /*
@ -254,13 +286,18 @@ static int patglobflags; /* globbing flags & approx */
/* Add n more characters, ensuring there is enough space. */ /* Add n more characters, ensuring there is enough space. */
/* Option bits for the paflags argument of patadd(). */
enum {
    /* Don't round the new size up to a multiple of sizeof(union upat). */
    PA_NOALIGN = 1 << 0,
    /* Unmetafy and untokenize the added string while copying it. */
    PA_UNMETA = 1 << 1
};
/**/ /**/
static void static void
patadd(char *add, int ch, long n, int noalgn) patadd(char *add, int ch, long n, int paflags)
{ {
/* Make sure everything gets aligned unless we get noalgn. */ /* Make sure everything gets aligned unless we get PA_NOALIGN. */
long newpatsize = patsize + n; long newpatsize = patsize + n;
if (!noalgn) if (!(paflags & PA_NOALIGN))
newpatsize = (newpatsize + sizeof(union upat) - 1) & newpatsize = (newpatsize + sizeof(union upat) - 1) &
~(sizeof(union upat) - 1); ~(sizeof(union upat) - 1);
if (patalloc < newpatsize) { if (patalloc < newpatsize) {
@ -272,8 +309,25 @@ patadd(char *add, int ch, long n, int noalgn)
} }
patsize = newpatsize; patsize = newpatsize;
if (add) { if (add) {
while (n--) if (paflags & PA_UNMETA) {
*patcode++ = *add++; /*
* Unmetafy and untokenize the string as we go.
* The Meta characters in add aren't counted in n.
*/
while (n--) {
if (itok(*add))
*patcode++ = ztokens[*add++ - Pound];
else if (*add == Meta) {
add++;
*patcode++ = *add++ ^ 32;
} else {
*patcode++ = *add++;
}
}
} else {
while (n--)
*patcode++ = *add++;
}
} else } else
*patcode++ = ch; *patcode++ = ch;
patcode = patout + patsize; patcode = patout + patsize;
@ -297,13 +351,22 @@ patcompstart(void)
patglobflags = GF_IGNCASE; patglobflags = GF_IGNCASE;
} }
/* Top level pattern compilation subroutine */ /*
* Top level pattern compilation subroutine
* exp is a null-terminated, metafied string.
* inflags is an or of some PAT_* flags.
* endexp, if non-null, is set to a pointer to the end of the
* part of exp which was compiled. This is used when
* compiling patterns for directories which must be
* matched recursively.
*/
/**/ /**/
mod_export Patprog mod_export Patprog
patcompile(char *exp, int inflags, char **endexp) patcompile(char *exp, int inflags, char **endexp)
{ {
int flags = 0, len = 0; int flags = 0;
long len = 0;
long startoff; long startoff;
Upat pscan; Upat pscan;
char *lng, *strp = NULL; char *lng, *strp = NULL;
@ -324,7 +387,7 @@ patcompile(char *exp, int inflags, char **endexp)
* in struct is actual count of parentheses. * in struct is actual count of parentheses.
*/ */
patnpar = 1; patnpar = 1;
patflags = inflags & ~PAT_PURES; patflags = inflags & ~(PAT_PURES|PAT_HAS_EXCLUDP);
patendseg = endseg; patendseg = endseg;
patendseglen = isset(EXTENDEDGLOB) ? PATENDSEGLEN_EXT : PATENDSEGLEN_NORM; patendseglen = isset(EXTENDEDGLOB) ? PATENDSEGLEN_EXT : PATENDSEGLEN_NORM;
@ -366,7 +429,12 @@ patcompile(char *exp, int inflags, char **endexp)
if (patcompswitch(0, &flags) == 0) if (patcompswitch(0, &flags) == 0)
return NULL; return NULL;
} else { } else {
/* Yes, copy the string and skip compilation altogether */ /*
* Yes, copy the string, and skip compilation altogether.
* Null terminate for the benefit of globbing.
* Leave metafied both for globbing and for our own
* efficiency.
*/
patparse = strp; patparse = strp;
len = strp - exp; len = strp - exp;
patadd(exp, 0, len + 1, 0); patadd(exp, 0, len + 1, 0);
@ -404,19 +472,52 @@ patcompile(char *exp, int inflags, char **endexp)
for (; pscan; pscan = next) { for (; pscan; pscan = next) {
next = PATNEXT(pscan); next = PATNEXT(pscan);
if (P_OP(pscan) == P_EXACTLY) { if (P_OP(pscan) == P_EXACTLY) {
char *opnd = (char *)P_OPERAND(pscan); char *opnd = P_LS_STR(pscan), *mtest;
while ((*dst = *opnd++)) long oplen = P_LS_LEN(pscan), ilen;
dst++; int nmeta = 0;
/*
* Unfortunately we unmetafied the string
* and we need to put any metacharacters
* back now we know it's a pure string.
* This shouldn't happen too often, it's
* just that there are some cases such
* as . and .. in files where we really
* need a pure string even if there are
* pattern characters flying around.
*/
for (mtest = opnd, ilen = oplen; ilen;
mtest++, ilen--)
if (imeta(*mtest))
nmeta++;
if (nmeta) {
char *oldpatout = patout;
patadd(NULL, 0, nmeta, 0);
/*
* Yuk.
*/
p = (Patprog)patout;
opnd = patout + (opnd - oldpatout);
dst = patout + startoff;
}
while (oplen--) {
if (imeta(*opnd)) {
*dst++ = Meta;
*dst++ = *opnd ^ 32;
} else {
*dst++ = *opnd++;
}
}
} }
} }
*dst++ = '\0';
p->size = dst - patout; p->size = dst - patout;
/* patmlen is really strlen, don't include null byte */ /* patmlen is really strlen. We don't need a null. */
p->patmlen = p->size - startoff - 1; p->patmlen = p->size - startoff;
} else { } else {
/* starting point info */ /* starting point info */
if (P_OP(pscan) == P_EXACTLY && !p->globflags) if (P_OP(pscan) == P_EXACTLY && !p->globflags &&
p->patstartch = *(char *)P_OPERAND(pscan); P_LS_LEN(pscan))
p->patstartch = *P_LS_STR(pscan);
/* /*
* Find the longest literal string in something expensive. * Find the longest literal string in something expensive.
* This is itself not all that cheap if we have * This is itself not all that cheap if we have
@ -427,9 +528,9 @@ patcompile(char *exp, int inflags, char **endexp)
len = 0; len = 0;
for (; pscan; pscan = PATNEXT(pscan)) for (; pscan; pscan = PATNEXT(pscan))
if (P_OP(pscan) == P_EXACTLY && if (P_OP(pscan) == P_EXACTLY &&
(int)strlen((char *)P_OPERAND(pscan)) >= len) { P_LS_LEN(pscan) >= len) {
lng = (char *)P_OPERAND(pscan); lng = P_LS_STR(pscan);
len = strlen(lng); len = P_LS_LEN(pscan);
} }
if (lng) { if (lng) {
p->mustoff = lng - patout; p->mustoff = lng - patout;
@ -844,9 +945,9 @@ static long
patcomppiece(int *flagp) patcomppiece(int *flagp)
{ {
long starter = 0, next, pound, op; long starter = 0, next, pound, op;
int flags, flags2, kshchar, len, ch, patch; int flags, flags2, kshchar, len, ch, patch, nmeta;
union upat up; union upat up;
char *nptr, *str0, cbuf[2]; char *nptr, *str0, *ptr, cbuf[2];
zrange_t from, to; zrange_t from, to;
flags = 0; flags = 0;
@ -881,6 +982,9 @@ patcomppiece(int *flagp)
} }
if (patparse > str0) { if (patparse > str0) {
long slen = patparse - str0;
int morelen;
/* Ordinary string: cancel kshchar lookahead */ /* Ordinary string: cancel kshchar lookahead */
kshchar = '\0'; kshchar = '\0';
/* /*
@ -889,25 +993,40 @@ patcomppiece(int *flagp)
flags |= P_PURESTR; flags |= P_PURESTR;
DPUTS(patparse == str0, "BUG: matched nothing in patcomppiece."); DPUTS(patparse == str0, "BUG: matched nothing in patcomppiece.");
/* more than one character matched? */ /* more than one character matched? */
len = str0 + (*str0 == Meta ? 2 : 1) < patparse; morelen = str0 + (*str0 == Meta ? 2 : 1) < patparse;
/* /*
* If we have more than one character, a following hash only * If we have more than one character, a following hash only
* applies to the last, so decrement. * applies to the last, so decrement.
*/ */
if (isset(EXTENDEDGLOB) && *patparse == Pound && len) if (isset(EXTENDEDGLOB) && *patparse == Pound && morelen)
patparse -= (patparse > str0 + 1 && patparse[-2] == Meta) ? 2 : 1; patparse -= (patparse > str0 + 1 && patparse[-2] == Meta) ? 2 : 1;
/* /*
* If len is 1, we can't have an active # following, so doesn't * If len is 1, we can't have an active # following, so doesn't
* matter that we don't make X in `XX#' simple. * matter that we don't make X in `XX#' simple.
*/ */
if (!len) if (!morelen)
flags |= P_SIMPLE; flags |= P_SIMPLE;
starter = patnode(P_EXACTLY); starter = patnode(P_EXACTLY);
/* add enough space including null byte */
len = patparse - str0; /* Get length of string without metafication. */
patadd(str0, 0, len + 1, 0); nmeta = 0;
nptr = (char *)P_OPERAND((Upat)patout + starter); for (ptr = str0; ptr < patparse; ptr++) {
nptr[len] = '\0'; if (*ptr == Meta) {
nmeta++;
ptr++;
}
}
slen = (patparse - str0) - nmeta;
/* First add length, which is a long */
patadd((char *)&slen, 0, sizeof(long), 0);
/*
* Then the string, not null terminated.
* Unmetafy and untokenize; pass the final length,
* which is what we need to allocate, i.e. not including
* a count for each Meta in the string.
*/
patadd(str0, 0, slen, PA_UNMETA);
nptr = P_LS_STR((Upat)patout + starter);
/* /*
* It's much simpler to turn off pure string mode for * It's much simpler to turn off pure string mode for
* any case-insensitive or approximate matching; usually, * any case-insensitive or approximate matching; usually,
@ -918,13 +1037,13 @@ patcomppiece(int *flagp)
* ..(#a1).. (i.e. the (#a1) has no effect), but if you're * ..(#a1).. (i.e. the (#a1) has no effect), but if you're
* going to write funny patterns, you get no sympathy from me. * going to write funny patterns, you get no sympathy from me.
*/ */
if (patglobflags && if (patglobflags) {
(!(patflags & PAT_FILE) || (strcmp(nptr, ".") && if (!(patflags & PAT_FILE))
strcmp(nptr, "..")))) flags &= ~P_PURESTR;
flags &= ~P_PURESTR; else if (!(nptr[0] == '.' &&
for (; *nptr; METAINC(nptr)) (slen == 1 || (nptr[1] == '.' && slen == 2))))
if (itok(*nptr)) flags &= ~P_PURESTR;
*nptr = ztokens[*nptr - Pound]; }
} else { } else {
if (kshchar) if (kshchar)
patparse++; patparse++;
@ -950,7 +1069,7 @@ patcomppiece(int *flagp)
starter = patnode(P_ANYOF); starter = patnode(P_ANYOF);
if (*patparse == Outbrack) { if (*patparse == Outbrack) {
patparse++; patparse++;
patadd(NULL, ']', 1, 1); patadd(NULL, ']', 1, PA_NOALIGN);
} }
while (*patparse && *patparse != Outbrack) { while (*patparse && *patparse != Outbrack) {
/* Meta is not a token */ /* Meta is not a token */
@ -990,7 +1109,7 @@ patcomppiece(int *flagp)
ch = PP_UNKWN; ch = PP_UNKWN;
patparse = nptr + 2; patparse = nptr + 2;
if (ch != PP_UNKWN) if (ch != PP_UNKWN)
patadd(NULL, STOUC(Meta+ch), 1, 1); patadd(NULL, STOUC(Meta+ch), 1, PA_NOALIGN);
continue; continue;
} }
if (itok(*patparse)) { if (itok(*patparse)) {
@ -1003,15 +1122,17 @@ patcomppiece(int *flagp)
patparse++; patparse++;
if (*patparse == '-' && patparse[1] != Outbrack) { if (*patparse == '-' && patparse[1] != Outbrack) {
patadd(NULL, STOUC(Meta+PP_RANGE), 1, 1); patadd(NULL, STOUC(Meta+PP_RANGE), 1, PA_NOALIGN);
patadd(cbuf, 0, (cbuf[0] == Meta) ? 2 : 1, 1); patadd(cbuf, 0, (cbuf[0] == Meta) ? 2 : 1, PA_NOALIGN);
if (itok(*++patparse)) { if (itok(*++patparse)) {
patadd(0, STOUC(ztokens[*patparse - Pound]), 1, 1); patadd(0, STOUC(ztokens[*patparse - Pound]), 1,
PA_NOALIGN);
} else } else
patadd(patparse, 0, (*patparse == Meta) ? 2 : 1, 1); patadd(patparse, 0, (*patparse == Meta) ? 2 : 1,
PA_NOALIGN);
METAINC(patparse); METAINC(patparse);
} else } else
patadd(cbuf, 0, (cbuf[0] == Meta) ? 2 : 1, 1); patadd(cbuf, 0, (cbuf[0] == Meta) ? 2 : 1, PA_NOALIGN);
} }
if (*patparse != Outbrack) if (*patparse != Outbrack)
return 0; return 0;
@ -1309,19 +1430,11 @@ static void patoptail(long p, long val)
*/ */
static char *patinstart; /* Start of input string */ static char *patinstart; /* Start of input string */
static char *patinend; /* End of input string */ static char *patinend; /* End of input string */
static char *patinput; /* String input pointer */
static char *patinpath; /* Full path for use with ~ exclusions */ static char *patinpath; /* Full path for use with ~ exclusions */
static int patinlen; /* Length of last successful match.
/**/ * Includes count of Meta characters.
char *patinput; /* String input pointer */ */
/*
* Offset of string at which we are trying to match.
* This is added in to the positions recorded in patbeginp and patendp
* when we are looking for substrings. Currently this only happens
* in the parameter substitution code.
*/
/**/
int patoffset;
static char *patbeginp[NSUBEXP]; /* Pointer to backref beginnings */ static char *patbeginp[NSUBEXP]; /* Pointer to backref beginnings */
static char *patendp[NSUBEXP]; /* Pointer to backref ends */ static char *patendp[NSUBEXP]; /* Pointer to backref ends */
@ -1329,9 +1442,24 @@ static int parsfound; /* parentheses (with backrefs) found */
static int globdots; /* Glob initial dots? */ static int globdots; /* Glob initial dots? */
/*
 * Macros which are currently trivial but are likely to be less
 * so when we handle multibyte characters.  They operate on
 * unmetafied strings.
 *
 * Every macro argument is parenthesised in the expansion so that
 * compound arguments such as CHARSUB(p + n, q + 1) or
 * CHARNEXT(cond ? a : b) expand as intended.
 */
/* Get a character from the start point in a string */
#define CHARREF(x) (STOUC(*(x)))
/* Get a pointer to the next character */
#define CHARNEXT(x) ((x)+1)
/* Increment a pointer past the current character (x must be an lvalue). */
#define CHARINC(x) ((x)++)
/* Count the number of characters between two pointers, largest first */
#define CHARSUB(x,y) ((x)-(y))
/* /*
* The following need to be accessed in the globbing scanner for * The following need to be accessed in the globbing scanner for
* a multi-component file path. See horror story there. * a multi-component file path. See horror story in glob.c.
*/ */
/**/ /**/
int errsfound; /* Total error count so far */ int errsfound; /* Total error count so far */
@ -1347,23 +1475,59 @@ pattrystart(void)
errsfound = 0; errsfound = 0;
} }
/*
* Test prog against null-terminated, metafied string.
*/
/**/ /**/
mod_export int mod_export int
pattry(Patprog prog, char *string) pattry(Patprog prog, char *string)
{ {
return pattryrefs(prog, string, NULL, NULL, NULL); return pattryrefs(prog, string, -1, 0, NULL, NULL, NULL);
} }
/* The last three arguments are used to report the positions for the /*
* backreferences. On entry, *nump should contain the maximum number * Test prog against string of given length, no null termination
* positions to report. */ * but still metafied at this point. offset gives an offset
* to include in reported match indices
*/
/**/ /**/
mod_export int mod_export int
pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp) pattrylen(Patprog prog, char *string, int len, int offset)
{ {
int i, maxnpos = 0, ret; return pattryrefs(prog, string, len, offset, NULL, NULL, NULL);
char **sp, **ep; }
/*
* Test prog against string with given length stringlen, which
* may be -1 to indicate a null-terminated string. The input
* string is metafied; the length is the raw string length, not the
* number of possibly metafied characters.
*
* offset is the position in the original string (not seen by
* the pattern module) at which we are trying to match.
* This is added in to the positions recorded in patbeginp and patendp
* when we are looking for substrings. Currently this only happens
* in the parameter substitution code.
*
* Note this is a character offset, i.e. a metafied character
* counts as 1.
*
* The last three arguments are used to report the positions for the
* backreferences. On entry, *nump should contain the maximum number
* of positions to report. In this case the match, mbegin, mend
* arrays are not altered.
*/
/**/
mod_export int
pattryrefs(Patprog prog, char *string, int stringlen, int patoffset,
int *nump, int *begp, int *endp)
{
int i, maxnpos = 0, ret, needfullpath, unmetalen, unmetalenp;
int origlen;
char **sp, **ep, *tryalloced, *ptr;
char *progstr = (char *)prog + prog->startoff; char *progstr = (char *)prog + prog->startoff;
if (nump) { if (nump) {
@ -1374,12 +1538,87 @@ pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp)
if (*string == Nularg) if (*string == Nularg)
string++; string++;
patinstart = patinput = string; if (stringlen < 0)
patinend = patinstart + strlen(patinstart); stringlen = strlen(string);
origlen = stringlen;
patflags = prog->flags;
/*
* For a top-level ~-exclusion, we will need the full
* path to exclude, so copy the path so far and append the
* current test string.
*/
needfullpath = (patflags & PAT_HAS_EXCLUDP) && pathpos;
/* Get the length of the full string when unmetafied. */
unmetalen = ztrsub(string + stringlen, string);
if (needfullpath)
unmetalenp = ztrsub(pathbuf + pathpos, pathbuf);
else
unmetalenp = 0;
DPUTS(needfullpath && (patflags & (PAT_PURES|PAT_ANY)),
"rum sort of file exclusion");
/*
* Partly for efficiency, and partly for the convenience of
* globbing, we don't unmetafy pure string patterns, and
* there's no reason to if the pattern is just a *.
*/
if (!(patflags & (PAT_PURES|PAT_ANY))
&& (needfullpath || unmetalen != stringlen)) {
/*
* We need to copy if we need to prepend the path so far
* (in which case we copy both chunks), or if we have
* Meta characters.
*/
char *dst;
int icopy, ncopy;
dst = tryalloced = zalloc(unmetalen + unmetalenp);
if (needfullpath) {
/* loop twice, copy path buffer first time */
ptr = pathbuf;
ncopy = unmetalenp;
} else {
/* just loop once, copy string with unmetafication */
ptr = string;
ncopy = unmetalen;
}
for (icopy = 0; icopy < 2; icopy++) {
for (i = 0; i < ncopy; i++) {
if (*ptr == Meta) {
ptr++;
*dst++ = *ptr++ ^ 32;
} else {
*dst++ = *ptr++;
}
}
if (!needfullpath)
break;
/* next time append test string to path so far */
ptr = string;
ncopy = unmetalen;
}
if (needfullpath) {
patinstart = tryalloced + unmetalenp;
patinpath = tryalloced;
} else {
patinstart = tryalloced;
patinpath = NULL;
}
stringlen = unmetalen;
} else {
patinstart = string;
tryalloced = patinpath = NULL;
}
patinend = patinstart + stringlen;
/* /*
* From now on we do not require NULL termination of * From now on we do not require NULL termination of
* the test string. It is still metafied, as is string * the test string. There should also be no more references
* data in the prog. * to the variable string.
*/ */
if (prog->flags & (PAT_PURES|PAT_ANY)) { if (prog->flags & (PAT_PURES|PAT_ANY)) {
@ -1399,11 +1638,11 @@ pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp)
* Testing a pure string. See if initial * Testing a pure string. See if initial
* components match. * components match.
*/ */
int lendiff = (patinend - patinstart) - prog->patmlen; int lendiff = stringlen - prog->patmlen;
if (lendiff < 0) { if (lendiff < 0) {
/* No, the pattern string is too long. */ /* No, the pattern string is too long. */
ret = 0; ret = 0;
} else if (!memcmp(progstr, string, prog->patmlen)) { } else if (!memcmp(progstr, patinstart, prog->patmlen)) {
/* /*
* Initial component matches. Matches either * Initial component matches. Matches either
* if lengths are the same or we are not anchored * if lengths are the same or we are not anchored
@ -1420,47 +1659,61 @@ pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp)
* For files, we won't match initial "."s unless * For files, we won't match initial "."s unless
* glob_dots is set. * glob_dots is set.
*/ */
if ((prog->flags & PAT_NOGLD) && *string == '.') if ((prog->flags & PAT_NOGLD) && *patinstart == '.') {
return 0; ret = 0;
/* in case used for ${..#..} etc. */ } else {
patinput = string + prog->patmlen; /*
/* if matching files, must update globbing flags */ * Remember the length in case used for ${..#..} etc.
patglobflags = prog->globend; * In this case, we didn't unmetafy the string.
return 1; */
} else patinlen = (int)prog->patmlen;
return 0; /* if matching files, must update globbing flags */
patglobflags = prog->globend;
}
}
if (tryalloced)
zfree(tryalloced, unmetalen + unmetalenp);
return ret;
} else { } else {
/* /*
* Test for a `must match' string, unless we're scanning for a match * Test for a `must match' string, unless we're scanning for a match
* in which case we don't need to do this each time. * in which case we don't need to do this each time.
*/ */
ret = 1;
if (!(prog->flags & PAT_SCAN) && prog->mustoff) if (!(prog->flags & PAT_SCAN) && prog->mustoff)
{ {
char *testptr; /* start pointer into test string */ char *testptr; /* start pointer into test string */
char *teststop; /* last point from which we can match */ char *teststop; /* last point from which we can match */
char *patptr = (char *)prog + prog->mustoff; char *patptr = (char *)prog + prog->mustoff;
int patlen = strlen(patptr); int patlen = prog->patmlen;
int found = 0; int found = 0;
if (patlen > patinend - patinstart) { if (patlen > stringlen) {
/* Too long, can't match. */ /* Too long, can't match. */
return 0; ret = 0;
} } else {
teststop = patinend - patlen; teststop = patinend - patlen;
for (testptr = patinstart; testptr <= teststop; testptr++) for (testptr = patinstart; testptr <= teststop; testptr++)
{ {
if (!memcmp(testptr, patptr, patlen)) { if (!memcmp(testptr, patptr, patlen)) {
found = 1; found = 1;
break; break;
}
} }
}
if (!found) if (!found)
return 0; ret = 0;
}
}
if (!ret) {
if (tryalloced)
zfree(tryalloced, unmetalen + unmetalenp);
return 0;
} }
patflags = prog->flags;
patglobflags = prog->globflags; patglobflags = prog->globflags;
if (!(patflags & PAT_FILE)) { if (!(patflags & PAT_FILE)) {
forceerrs = -1; forceerrs = -1;
@ -1469,26 +1722,7 @@ pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp)
globdots = !(patflags & PAT_NOGLD); globdots = !(patflags & PAT_NOGLD);
parsfound = 0; parsfound = 0;
if ((patflags & PAT_HAS_EXCLUDP) && pathpos) { patinput = patinstart;
/*
* For a top-level ~-exclusion, we will need the full
* path to exclude, so copy the path so far and append the
* current test string.
*
* There are some advantages in making patinstart etc.
* point into this new string; however, that gets confusing
* if we need patinput outside this file. That's
* not likely for files but I don't think it's worth
* the risk.
*/
int len = patinend - patinstart;
patinpath = (char *)zalloc(pathpos + len);
memcpy(patinpath, pathbuf, pathpos);
memcpy(patinpath + pathpos, patinstart, len);
} else {
patinpath = NULL;
}
if (patmatch((Upat)progstr)) { if (patmatch((Upat)progstr)) {
/* /*
@ -1496,6 +1730,23 @@ pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp)
* failed, so set it now * failed, so set it now
*/ */
patglobflags = prog->globend; patglobflags = prog->globend;
/*
* Record length of successful match, including Meta
* characters. Do it here so that patmatchlen() can return
* it even if we delete the pattern strings.
*/
patinlen = patinput - patinstart;
/*
* Optimization: if we didn't find any Meta characters
* to begin with, we don't need to look for them now.
*/
if (unmetalen != origlen) {
for (ptr = patinstart; ptr < patinput; ptr++)
if (imeta(*ptr))
patinlen++;
}
/* /*
* Should we clear backreferences and matches on a failed * Should we clear backreferences and matches on a failed
* match? * match?
@ -1504,15 +1755,18 @@ pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp)
/* /*
* m flag: for global match. This carries no overhead * m flag: for global match. This carries no overhead
* in the pattern matching part. * in the pattern matching part.
*
* Remember the test pattern is already unmetafied.
*/ */
char *str; char *str;
int mlen = ztrsub(patinput, patinstart); int mlen = CHARSUB(patinput, patinstart);
str = ztrduppfx(patinstart, patinput - patinstart); str = metafy(patinstart, patinput - patinstart, META_DUP);
setsparam("MATCH", str); setsparam("MATCH", str);
setiparam("MBEGIN", (zlong)(patoffset + !isset(KSHARRAYS))); setiparam("MBEGIN", (zlong)(patoffset + !isset(KSHARRAYS)));
setiparam("MEND", setiparam("MEND",
(zlong)(mlen + patoffset + !isset(KSHARRAYS) - 1)); (zlong)(mlen + patoffset +
!isset(KSHARRAYS) - 1));
} }
if (prog->patnpar && nump) { if (prog->patnpar && nump) {
/* /*
@ -1527,9 +1781,10 @@ pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp)
for (i = 0; i < prog->patnpar && i < maxnpos; i++) { for (i = 0; i < prog->patnpar && i < maxnpos; i++) {
if (parsfound & (1 << i)) { if (parsfound & (1 << i)) {
if (begp) if (begp)
*begp++ = ztrsub(*sp, patinstart) + patoffset; *begp++ = CHARSUB(*sp, patinstart) + patoffset;
if (endp) if (endp)
*endp++ = ztrsub(*ep, patinstart) + patoffset - 1; *endp++ = CHARSUB(*ep, patinstart) + patoffset
- 1;
} else { } else {
if (begp) if (begp)
*begp++ = -1; *begp++ = -1;
@ -1557,7 +1812,7 @@ pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp)
for (i = 0; i < prog->patnpar; i++) { for (i = 0; i < prog->patnpar; i++) {
if (parsfound & (1 << i)) { if (parsfound & (1 << i)) {
matcharr[i] = ztrduppfx(*sp, *ep - *sp); matcharr[i] = metafy(*sp, *ep - *sp, META_DUP);
/* /*
* mbegin and mend give indexes into the string * mbegin and mend give indexes into the string
* in the standard notation, i.e. respecting * in the standard notation, i.e. respecting
@ -1568,12 +1823,12 @@ pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp)
* corresponds to indexing as ${foo[1,1]}. * corresponds to indexing as ${foo[1,1]}.
*/ */
sprintf(numbuf, "%ld", sprintf(numbuf, "%ld",
(long)(ztrsub(*sp, patinstart) + (long)(CHARSUB(*sp, patinstart) +
patoffset + patoffset +
!isset(KSHARRAYS))); !isset(KSHARRAYS)));
mbeginarr[i] = ztrdup(numbuf); mbeginarr[i] = ztrdup(numbuf);
sprintf(numbuf, "%ld", sprintf(numbuf, "%ld",
(long)(ztrsub(*ep, patinstart) + (long)(CHARSUB(*ep, patinstart) +
patoffset + patoffset +
!isset(KSHARRAYS) - 1)); !isset(KSHARRAYS) - 1));
mendarr[i] = ztrdup(numbuf); mendarr[i] = ztrdup(numbuf);
@ -1593,19 +1848,31 @@ pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp)
setaparam("mbegin", mbeginarr); setaparam("mbegin", mbeginarr);
setaparam("mend", mendarr); setaparam("mend", mendarr);
} }
ret = 1; ret = 1;
} else } else
ret = 0; ret = 0;
if (patinpath) { if (tryalloced)
zfree(patinpath, pathpos + (patinend - patinstart)); zfree(tryalloced, unmetalen + unmetalenp);
patinpath = NULL;
}
return ret; return ret;
} }
} }
/*
 * Return length of previous successful match. This is
 * in metafied bytes, i.e. includes a count of Meta characters.
 * The value is the one recorded in patinlen at match time, so it
 * stays valid even after any temporary unmetafied copy of the
 * test string has been freed.
 * Unusual and futile attempt at modular encapsulation.
 */
/**/
int
patmatchlen(void)
{
return patinlen;
}
/* /*
* Match literal characters with case insensitivity test: the first * Match literal characters with case insensitivity test: the first
* comes from the input string, the second the current pattern. * comes from the input string, the second the current pattern.
@ -1627,8 +1894,11 @@ pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp)
* exactpos is used to remember how far down an exact string we have * exactpos is used to remember how far down an exact string we have
* matched, if we are doing approximation and can therefore redo from * matched, if we are doing approximation and can therefore redo from
* the same point; we never need to otherwise. * the same point; we never need to otherwise.
*
* exactend is a pointer to the end of the string, which isn't
* null-terminated.
*/ */
static char *exactpos; static char *exactpos, *exactend;
/* /*
* Main matching routine. * Main matching routine.
@ -1643,7 +1913,7 @@ patmatch(Upat prog)
{ {
/* Current and next nodes */ /* Current and next nodes */
Upat scan = prog, next, opnd; Upat scan = prog, next, opnd;
char *start, *save, *chrop, *compend; char *start, *save, *chrop, *chrend, *compend;
int savglobflags, op, no, min, nextch, fail = 0, saverrsfound; int savglobflags, op, no, min, nextch, fail = 0, saverrsfound;
zrange_t from, to, comp; zrange_t from, to, comp;
@ -1659,45 +1929,52 @@ patmatch(Upat prog)
if (patinput == patinend) if (patinput == patinend)
fail = 1; fail = 1;
else else
METAINC(patinput); CHARINC(patinput);
break; break;
case P_EXACTLY: case P_EXACTLY:
/* /*
* acts as nothing if *chrop is null: this is used by * acts as nothing if *chrop is null: this is used by
* approx code. * approx code.
*/ */
chrop = exactpos ? exactpos : (char *)P_OPERAND(scan); if (exactpos) {
chrop = exactpos;
chrend = exactend;
} else {
chrop = P_LS_STR(scan);
chrend = chrop + P_LS_LEN(scan);
}
exactpos = NULL; exactpos = NULL;
while (*chrop && patinput < patinend) { while (chrop < chrend && patinput < patinend) {
int chin = STOUC(UNMETA(patinput)); int chin = CHARREF(patinput);
int chpa = STOUC(UNMETA(chrop)); int chpa = CHARREF(chrop);
if (!CHARMATCH(chin, chpa)) { if (!CHARMATCH(chin, chpa)) {
fail = 1; fail = 1;
break; break;
} }
METAINC(chrop); CHARINC(chrop);
METAINC(patinput); CHARINC(patinput);
} }
if (*chrop) { if (chrop < chrend) {
exactpos = chrop; exactpos = chrop;
exactend = chrend;
fail = 1; fail = 1;
} }
break; break;
case P_ANYOF: case P_ANYOF:
if (patinput == patinend || if (patinput == patinend ||
!patmatchrange((char *)P_OPERAND(scan), !patmatchrange((char *)P_OPERAND(scan),
STOUC(UNMETA(patinput)))) CHARREF(patinput)))
fail = 1; fail = 1;
else else
METAINC(patinput); CHARINC(patinput);
break; break;
case P_ANYBUT: case P_ANYBUT:
if (patinput == patinend || if (patinput == patinend ||
patmatchrange((char *)P_OPERAND(scan), patmatchrange((char *)P_OPERAND(scan),
STOUC(UNMETA(patinput)))) CHARREF(patinput)))
fail = 1; fail = 1;
else else
METAINC(patinput); CHARINC(patinput);
break; break;
case P_NUMRNG: case P_NUMRNG:
case P_NUMFROM: case P_NUMFROM:
@ -1771,7 +2048,8 @@ patmatch(Upat prog)
return 1; return 1;
} }
if (!no && P_OP(next) == P_EXACTLY && if (!no && P_OP(next) == P_EXACTLY &&
!idigit(STOUC(*(char *)P_OPERAND(next))) && (!P_LS_LEN(next) ||
!idigit(STOUC(*P_LS_STR(next)))) &&
!(patglobflags & 0xff)) !(patglobflags & 0xff))
return 0; return 0;
patinput = --save; patinput = --save;
@ -1791,7 +2069,7 @@ patmatch(Upat prog)
case P_NUMANY: case P_NUMANY:
/* This is <->: any old set of digits, don't bother comparing */ /* This is <->: any old set of digits, don't bother comparing */
start = patinput; start = patinput;
while (patinput < patinend && idigit(STOUC(*patinput))) while (patinput < patinend && idigit(CHARREF(patinput)))
patinput++; patinput++;
save = patinput; save = patinput;
no = 0; no = 0;
@ -1799,7 +2077,8 @@ patmatch(Upat prog)
if (patmatch(next)) if (patmatch(next))
return 1; return 1;
if (!no && P_OP(next) == P_EXACTLY && if (!no && P_OP(next) == P_EXACTLY &&
!idigit(STOUC(*(char *)P_OPERAND(next))) && (!P_LS_LEN(next) ||
!idigit(CHARREF(P_LS_STR(next)))) &&
!(patglobflags & 0xff)) !(patglobflags & 0xff))
return 0; return 0;
patinput = --save; patinput = --save;
@ -1963,7 +2242,7 @@ patmatch(Upat prog)
origpatinend = patinend; origpatinend = patinend;
while ((ret = patmatch(P_OPERAND(scan)))) { while ((ret = patmatch(P_OPERAND(scan)))) {
unsigned char *syncpt; unsigned char *syncpt;
char *savpatinstart, *savpatinend; char *savpatinstart;
int savforce = forceerrs; int savforce = forceerrs;
int savpatflags = patflags, synclen; int savpatflags = patflags, synclen;
forceerrs = -1; forceerrs = -1;
@ -1998,7 +2277,6 @@ patmatch(Upat prog)
patflags |= PAT_NOTEND; patflags |= PAT_NOTEND;
} }
savpatinstart = patinstart; savpatinstart = patinstart;
savpatinend = patinend;
next = PATNEXT(scan); next = PATNEXT(scan);
while (next && P_ISEXCLUDE(next)) { while (next && P_ISEXCLUDE(next)) {
patinput = save; patinput = save;
@ -2013,14 +2291,15 @@ patmatch(Upat prog)
opnd = P_OPERAND(next) + 1; opnd = P_OPERAND(next) + 1;
if (P_OP(next) == P_EXCLUDP && patinpath) { if (P_OP(next) == P_EXCLUDP && patinpath) {
/* /*
* top level exclusion with a file, * Top level exclusion with a file,
* applies to whole path so add the * applies to whole path so add the
* segments already matched * segments already matched.
* We copied these in front of the
* test pattern, so patinend doesn't
* need moving.
*/ */
DPUTS(patinput != patinstart, DPUTS(patinput != patinstart,
"BUG: not at start excluding path"); "BUG: not at start excluding path");
patinend = patinpath + pathpos +
(patinend - patinstart);
patinput = patinstart = patinpath; patinput = patinstart = patinpath;
} }
if (patmatch(opnd)) { if (patmatch(opnd)) {
@ -2036,7 +2315,6 @@ patmatch(Upat prog)
patinput = savpatinstart + patinput = savpatinstart +
(patinput - patinstart); (patinput - patinstart);
patinstart = savpatinstart; patinstart = savpatinstart;
patinend = savpatinend;
} }
if (!ret) if (!ret)
break; break;
@ -2146,7 +2424,7 @@ patmatch(Upat prog)
/* Note that no counts possibly metafied characters */ /* Note that no counts possibly metafied characters */
start = patinput; start = patinput;
if (op == P_STAR) { if (op == P_STAR) {
for (no = 0; patinput < patinend; METAINC(patinput)) for (no = 0; patinput < patinend; CHARINC(patinput))
no++; no++;
/* simple optimization for reasonably common case */ /* simple optimization for reasonably common case */
if (P_OP(next) == P_END) if (P_OP(next) == P_END)
@ -2156,7 +2434,7 @@ patmatch(Upat prog)
"BUG: wrong backtracking with approximation."); "BUG: wrong backtracking with approximation.");
if (!globdots && P_NOTDOT(P_OPERAND(scan)) && if (!globdots && P_NOTDOT(P_OPERAND(scan)) &&
patinput == patinstart && patinput < patinend && patinput == patinstart && patinput < patinend &&
*patinput == '.') CHARREF(patinput) == '.')
return 0; return 0;
no = patrepeat(P_OPERAND(scan)); no = patrepeat(P_OPERAND(scan));
} }
@ -2165,8 +2443,9 @@ patmatch(Upat prog)
* Lookahead to avoid useless matches. This is not possible * Lookahead to avoid useless matches. This is not possible
* with approximation. * with approximation.
*/ */
if (P_OP(next) == P_EXACTLY && !(patglobflags & 0xff)) { if (P_OP(next) == P_EXACTLY && P_LS_LEN(next) &&
char *nextop = (char *)P_OPERAND(next); !(patglobflags & 0xff)) {
char *nextop = P_LS_STR(next);
/* /*
* If that P_EXACTLY is last (common in simple patterns, * If that P_EXACTLY is last (common in simple patterns,
* such as *.c), then it can be only be matched at one * such as *.c), then it can be only be matched at one
@ -2175,21 +2454,20 @@ patmatch(Upat prog)
if (P_OP(PATNEXT(next)) == P_END && if (P_OP(PATNEXT(next)) == P_END &&
!(patflags & PAT_NOANCH)) { !(patflags & PAT_NOANCH)) {
int ptlen = patinend - patinput; int ptlen = patinend - patinput;
int oplen = strlen(nextop); int lenmatch = patinend - (min ? CHARNEXT(start) : start);
int lenmatch = patinend - (min ? METANEXT(start) : start);
/* Are we in the right range? */ /* Are we in the right range? */
if (oplen > lenmatch || oplen < ptlen) if (P_LS_LEN(next) > lenmatch || P_LS_LEN(next) < ptlen)
return 0; return 0;
/* Yes, just position appropriately and test. */ /* Yes, just position appropriately and test. */
patinput += ptlen - oplen; patinput += ptlen - P_LS_LEN(next);
if (patinput > start && patinput[-1] == Meta) { /*
/* doesn't align properly, no go */ * Here we will need to be careful that patinput is not
return 0; * in the middle of a multibyte character.
} */
/* Continue loop with P_EXACTLY test. */ /* Continue loop with P_EXACTLY test. */
break; break;
} }
nextch = STOUC(UNMETA(nextop)); nextch = CHARREF(nextop);
} else } else
nextch = -1; nextch = -1;
save = patinput; save = patinput;
@ -2199,14 +2477,17 @@ patmatch(Upat prog)
int charmatch_cache; int charmatch_cache;
if (nextch < 0 || if (nextch < 0 ||
(patinput < patinend && (patinput < patinend &&
CHARMATCH_EXPR(STOUC(UNMETA(patinput)), nextch))) { CHARMATCH_EXPR(CHARREF(patinput), nextch))) {
if (patmatch(next)) if (patmatch(next))
return 1; return 1;
} }
no--; no--;
save--; save--;
if (save > start && save[-1] == Meta) /*
save--; * Here we will need to make sure save is
* decremented properly to the start of
* the preceding multibyte character.
*/
patinput = save; patinput = save;
patglobflags = savglobflags; patglobflags = savglobflags;
errsfound = saverrsfound; errsfound = saverrsfound;
@ -2270,7 +2551,7 @@ patmatch(Upat prog)
/* Try omitting a character from the input string */ /* Try omitting a character from the input string */
if (patinput < patinend) { if (patinput < patinend) {
METAINC(patinput); CHARINC(patinput);
/* If we are not on an exact match, then this is /* If we are not on an exact match, then this is
* our last gasp effort, so we can optimize out * our last gasp effort, so we can optimize out
* the recursive call. * the recursive call.
@ -2285,11 +2566,11 @@ patmatch(Upat prog)
char *nextexact = savexact; char *nextexact = savexact;
DPUTS(!savexact || !*savexact, DPUTS(!savexact || !*savexact,
"BUG: exact match has not set exactpos"); "BUG: exact match has not set exactpos");
METAINC(nextexact); CHARINC(nextexact);
if (save < patinend) { if (save < patinend) {
char *nextin = save; char *nextin = save;
METAINC(nextin); CHARINC(nextin);
patglobflags = savglobflags; patglobflags = savglobflags;
errsfound = saverrsfound; errsfound = saverrsfound;
exactpos = savexact; exactpos = savexact;
@ -2299,18 +2580,18 @@ patmatch(Upat prog)
* exactpos * exactpos
*/ */
if (save < patinend && nextin < patinend && if (save < patinend && nextin < patinend &&
*nextexact) { nextexact < exactend) {
int cin0 = UNMETA(save); int cin0 = CHARREF(save);
int cpa0 = UNMETA(exactpos); int cpa0 = CHARREF(exactpos);
int cin1 = UNMETA(nextin); int cin1 = CHARREF(nextin);
int cpa1 = UNMETA(nextexact); int cpa1 = CHARREF(nextexact);
if (CHARMATCH(cin0, cpa1) && if (CHARMATCH(cin0, cpa1) &&
CHARMATCH(cin1, cpa0)) { CHARMATCH(cin1, cpa0)) {
patinput = nextin; patinput = nextin;
METAINC(patinput); CHARINC(patinput);
exactpos = nextexact; exactpos = nextexact;
METAINC(exactpos); CHARINC(exactpos);
if (patmatch(scan)) if (patmatch(scan))
return 1; return 1;
@ -2333,12 +2614,13 @@ patmatch(Upat prog)
exactpos = savexact; exactpos = savexact;
} }
DPUTS(exactpos == exactend, "approximating too far");
/* /*
* Try moving up the exact match pattern. * Try moving up the exact match pattern.
* This must be the last attempt, so just loop * This must be the last attempt, so just loop
* instead of calling recursively. * instead of calling recursively.
*/ */
METAINC(exactpos); CHARINC(exactpos);
continue; continue;
} }
} }
@ -2358,6 +2640,11 @@ patmatchrange(char *range, int ch)
{ {
int r1, r2; int r1, r2;
/*
* Careful here: unlike other strings, range is a NUL-terminated,
* metafied string, because we need to treat the Posix and hyphenated
* ranges specially.
*/
for (; *range; range++) { for (; *range; range++) {
if (imeta(STOUC(*range))) { if (imeta(STOUC(*range))) {
switch (STOUC(*range)-STOUC(Meta)) { switch (STOUC(*range)-STOUC(Meta)) {
@ -2459,23 +2746,24 @@ static int patrepeat(Upat p)
break; break;
#endif #endif
case P_EXACTLY: case P_EXACTLY:
tch = STOUC(UNMETA(opnd)); DPUTS(P_LS_LEN(p) != 1, "closure following more than one character");
tch = CHARREF(P_LS_STR(p));
while (scan < patinend && while (scan < patinend &&
CHARMATCH_EXPR(STOUC(UNMETA(scan)), tch)) { CHARMATCH_EXPR(CHARREF(scan), tch)) {
count++; count++;
METAINC(scan); CHARINC(scan);
} }
break; break;
case P_ANYOF: case P_ANYOF:
while (scan < patinend && patmatchrange(opnd, STOUC(UNMETA(scan)))) { while (scan < patinend && patmatchrange(opnd, CHARREF(scan))) {
count++; count++;
METAINC(scan); CHARINC(scan);
} }
break; break;
case P_ANYBUT: case P_ANYBUT:
while (scan < patinend && !patmatchrange(opnd, STOUC(UNMETA(scan)))) { while (scan < patinend && !patmatchrange(opnd, CHARREF(scan))) {
count++; count++;
METAINC(scan); CHARINC(scan);
} }
break; break;
#ifdef DEBUG #ifdef DEBUG
@ -2528,7 +2816,14 @@ patdump(Patprog r)
next = PATNEXT(up); next = PATNEXT(up);
printf("(%d)", next ? next-codestart : 0); printf("(%d)", next ? next-codestart : 0);
s += sizeof(union upat); s += sizeof(union upat);
if (op == P_ANYOF || op == P_ANYBUT || op == P_EXACTLY) { if (op == P_EXACTLY) {
long llen = *(long *)s;
s += sizeof(long);
while (llen--) {
putchar(CHARREF(s));
CHARINC(s);
}
} else if (op == P_ANYOF || op == P_ANYBUT) {
while (*s != '\0') { while (*s != '\0') {
if (itok(*s)) { if (itok(*s)) {
if (*s == Meta + PP_RANGE) { if (*s == Meta + PP_RANGE) {

View File

@ -1088,10 +1088,10 @@ struct patprog {
long startoff; /* length before start of programme */ long startoff; /* length before start of programme */
long size; /* total size from start of struct */ long size; /* total size from start of struct */
long mustoff; /* offset to string that must be present */ long mustoff; /* offset to string that must be present */
long patmlen; /* length of pure string or longest match */
int globflags; /* globbing flags to set at start */ int globflags; /* globbing flags to set at start */
int globend; /* globbing flags set after finish */ int globend; /* globbing flags set after finish */
int flags; /* PAT_* flags */ int flags; /* PAT_* flags */
int patmlen; /* length of pure string or longest match */
int patnpar; /* number of active parentheses */ int patnpar; /* number of active parentheses */
char patstartch; char patstartch;
}; };

View File

@ -132,6 +132,7 @@
>0: [[ 633 = <-1000>33 ]] >0: [[ 633 = <-1000>33 ]]
>0: [[ 633 = <1->33 ]] >0: [[ 633 = <1->33 ]]
>0: [[ 633 = <->33 ]] >0: [[ 633 = <->33 ]]
>0: [[ 12345678901234567890123456789012345678901234567890123456789012345678901234567890foo = <42->foo ]]
>0: [[ READ.ME = (#ia1)readme ]] >0: [[ READ.ME = (#ia1)readme ]]
>1: [[ READ..ME = (#ia1)readme ]] >1: [[ READ..ME = (#ia1)readme ]]
>0: [[ README = (#ia1)readm ]] >0: [[ README = (#ia1)readm ]]