mirror of
git://git.code.sf.net/p/zsh/code
synced 2024-06-01 12:56:04 +02:00
29838: metafy/unmetafy strings for PCRE matching (UTF-8 fixes)
This commit is contained in:
parent
332fbbd700
commit
2f3c16d40f
10
ChangeLog
10
ChangeLog
|
@ -1,3 +1,11 @@
|
|||
2011-10-24 Phil Pennock <pdpennock@users.sourceforge.net>
|
||||
|
||||
* 29838: Src/Modules/pcre.c: metafy/unmetafy strings, to
|
||||
correctly handle non-ASCII characters in UTF-8 for regexp
|
||||
matches.
|
||||
|
||||
* unposted: Test/V07pcre.ztst: some PCRE tests
|
||||
|
||||
2011-10-23 Peter Stephenson <p.w.stephenson@ntlworld.com>
|
||||
|
||||
* users/16492: MACHINES: OpenIndiana issue.
|
||||
|
@ -15484,5 +15492,5 @@
|
|||
|
||||
*****************************************************
|
||||
* This is used by the shell to define $ZSH_PATCHLEVEL
|
||||
* $Revision: 1.5481 $
|
||||
* $Revision: 1.5482 $
|
||||
*****************************************************
|
||||
|
|
|
@ -77,6 +77,7 @@ bin_pcre_compile(char *nam, char **args, Options ops, UNUSED(int func))
|
|||
{
|
||||
int pcre_opts = 0, pcre_errptr;
|
||||
const char *pcre_error;
|
||||
char *target;
|
||||
|
||||
if(OPT_ISSET(ops,'a')) pcre_opts |= PCRE_ANCHORED;
|
||||
if(OPT_ISSET(ops,'i')) pcre_opts |= PCRE_CASELESS;
|
||||
|
@ -92,8 +93,13 @@ bin_pcre_compile(char *nam, char **args, Options ops, UNUSED(int func))
|
|||
if (pcre_pattern)
|
||||
pcre_free(pcre_pattern);
|
||||
|
||||
pcre_pattern = pcre_compile(*args, pcre_opts, &pcre_error, &pcre_errptr, NULL);
|
||||
target = ztrdup(*args);
|
||||
unmetafy(target, NULL);
|
||||
|
||||
pcre_pattern = pcre_compile(target, pcre_opts, &pcre_error, &pcre_errptr, NULL);
|
||||
|
||||
free(target);
|
||||
|
||||
if (pcre_pattern == NULL)
|
||||
{
|
||||
zwarnnam(nam, "error in regex: %s", pcre_error);
|
||||
|
@ -161,7 +167,7 @@ zpcre_get_substrings(char *arg, int *ovec, int ret, char *matchvar,
|
|||
sprintf(offset_all, "%d %d", ovec[0], ovec[1]);
|
||||
setsparam("ZPCRE_OP", ztrdup(offset_all));
|
||||
}
|
||||
match_all = ztrdup(captures[0]);
|
||||
match_all = metafy(captures[0], -1, META_DUP);
|
||||
setsparam(matchvar, match_all);
|
||||
/*
|
||||
* If we're setting match, mbegin, mend we only do
|
||||
|
@ -169,7 +175,15 @@ zpcre_get_substrings(char *arg, int *ovec, int ret, char *matchvar,
|
|||
* (cf. regex.c).
|
||||
*/
|
||||
if (!want_begin_end || nelem) {
|
||||
matches = zarrdup(&captures[capture_start]);
|
||||
char **x, **y;
|
||||
y = &captures[capture_start];
|
||||
matches = x = (char **) zalloc(sizeof(char *) * (arrlen(y) + 1));
|
||||
do {
|
||||
if (*y)
|
||||
*x++ = metafy(*y, -1, META_DUP);
|
||||
else
|
||||
*x++ = NULL;
|
||||
} while (*y++);
|
||||
setaparam(substravar, matches);
|
||||
}
|
||||
|
||||
|
@ -255,6 +269,7 @@ bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func))
|
|||
{
|
||||
int ret, capcount, *ovec, ovecsize, c;
|
||||
char *matched_portion = NULL;
|
||||
char *plaintext = NULL;
|
||||
char *receptacle = NULL;
|
||||
int return_value = 1;
|
||||
/* The subject length and offset start are both int values in pcre_exec */
|
||||
|
@ -278,7 +293,7 @@ bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func))
|
|||
}
|
||||
/* For the entire match, 'Return' the offset byte positions instead of the matched string */
|
||||
if(OPT_ISSET(ops,'b')) want_offset_pair = 1;
|
||||
|
||||
|
||||
if(!*args) {
|
||||
zwarnnam(nam, "not enough arguments");
|
||||
}
|
||||
|
@ -288,26 +303,28 @@ bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func))
|
|||
zwarnnam(nam, "error %d in fullinfo", ret);
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
ovecsize = (capcount+1)*3;
|
||||
ovec = zalloc(ovecsize*sizeof(int));
|
||||
|
||||
subject_len = (int)strlen(*args);
|
||||
|
||||
plaintext = ztrdup(*args);
|
||||
unmetafy(plaintext, NULL);
|
||||
subject_len = (int)strlen(plaintext);
|
||||
|
||||
if (offset_start < 0 || offset_start >= subject_len)
|
||||
ret = PCRE_ERROR_NOMATCH;
|
||||
else
|
||||
ret = pcre_exec(pcre_pattern, pcre_hints, *args, subject_len, offset_start, 0, ovec, ovecsize);
|
||||
ret = pcre_exec(pcre_pattern, pcre_hints, plaintext, subject_len, offset_start, 0, ovec, ovecsize);
|
||||
|
||||
if (ret==0) return_value = 0;
|
||||
else if (ret==PCRE_ERROR_NOMATCH) /* no match */;
|
||||
else if (ret>0) {
|
||||
zpcre_get_substrings(*args, ovec, ret, matched_portion, receptacle,
|
||||
zpcre_get_substrings(plaintext, ovec, ret, matched_portion, receptacle,
|
||||
want_offset_pair, 0, 0);
|
||||
return_value = 0;
|
||||
}
|
||||
else {
|
||||
zwarnnam(nam, "error in pcre_exec");
|
||||
zwarnnam(nam, "error in pcre_exec [%d]", ret);
|
||||
}
|
||||
|
||||
if (ovec)
|
||||
|
@ -322,7 +339,8 @@ cond_pcre_match(char **a, int id)
|
|||
{
|
||||
pcre *pcre_pat;
|
||||
const char *pcre_err;
|
||||
char *lhstr, *rhre, *avar=NULL;
|
||||
char *lhstr, *rhre, *lhstr_plain, *rhre_plain, *avar=NULL;
|
||||
char *p;
|
||||
int r = 0, pcre_opts = 0, pcre_errptr, capcnt, *ov, ovsize;
|
||||
int return_value = 0;
|
||||
|
||||
|
@ -331,6 +349,10 @@ cond_pcre_match(char **a, int id)
|
|||
|
||||
lhstr = cond_str(a,0,0);
|
||||
rhre = cond_str(a,1,0);
|
||||
lhstr_plain = ztrdup(lhstr);
|
||||
rhre_plain = ztrdup(rhre);
|
||||
unmetafy(lhstr_plain, NULL);
|
||||
unmetafy(rhre_plain, NULL);
|
||||
pcre_pat = NULL;
|
||||
ov = NULL;
|
||||
|
||||
|
@ -339,7 +361,7 @@ cond_pcre_match(char **a, int id)
|
|||
|
||||
switch(id) {
|
||||
case CPCRE_PLAIN:
|
||||
pcre_pat = pcre_compile(rhre, pcre_opts, &pcre_err, &pcre_errptr, NULL);
|
||||
pcre_pat = pcre_compile(rhre_plain, pcre_opts, &pcre_err, &pcre_errptr, NULL);
|
||||
if (pcre_pat == NULL) {
|
||||
zwarn("failed to compile regexp /%s/: %s", rhre, pcre_err);
|
||||
break;
|
||||
|
@ -347,7 +369,7 @@ cond_pcre_match(char **a, int id)
|
|||
pcre_fullinfo(pcre_pat, NULL, PCRE_INFO_CAPTURECOUNT, &capcnt);
|
||||
ovsize = (capcnt+1)*3;
|
||||
ov = zalloc(ovsize*sizeof(int));
|
||||
r = pcre_exec(pcre_pat, NULL, lhstr, strlen(lhstr), 0, 0, ov, ovsize);
|
||||
r = pcre_exec(pcre_pat, NULL, lhstr_plain, strlen(lhstr_plain), 0, 0, ov, ovsize);
|
||||
/* r < 0 => error; r==0 match but not enough size in ov
|
||||
* r > 0 => (r-1) substrings found; r==1 => no substrings
|
||||
*/
|
||||
|
@ -356,13 +378,16 @@ cond_pcre_match(char **a, int id)
|
|||
return_value = 1;
|
||||
break;
|
||||
}
|
||||
else if (r==PCRE_ERROR_NOMATCH) return 0; /* no match */
|
||||
else if (r==PCRE_ERROR_NOMATCH) {
|
||||
return_value = 0; /* no match */
|
||||
break;
|
||||
}
|
||||
else if (r<0) {
|
||||
zwarn("pcre_exec() error: %d", r);
|
||||
zwarn("pcre_exec() error [%d]", r);
|
||||
break;
|
||||
}
|
||||
else if (r>0) {
|
||||
zpcre_get_substrings(lhstr, ov, r, NULL, avar, 0,
|
||||
zpcre_get_substrings(lhstr_plain, ov, r, NULL, avar, 0,
|
||||
isset(BASHREMATCH),
|
||||
!isset(BASHREMATCH));
|
||||
return_value = 1;
|
||||
|
@ -371,6 +396,10 @@ cond_pcre_match(char **a, int id)
|
|||
break;
|
||||
}
|
||||
|
||||
if (lhstr_plain)
|
||||
free(lhstr_plain);
|
||||
if(rhre_plain)
|
||||
free(rhre_plain);
|
||||
if (pcre_pat)
|
||||
pcre_free(pcre_pat);
|
||||
if (ov)
|
||||
|
|
|
@ -0,0 +1,106 @@
|
|||
%prep
|
||||
|
||||
zmodload zsh/pcre
|
||||
setopt rematch_pcre
|
||||
# Find a UTF-8 locale.
|
||||
setopt multibyte
|
||||
# Don't let LC_* override our choice of locale.
|
||||
unset -m LC_\*
|
||||
mb_ok=
|
||||
langs=(en_{US,GB}.{UTF-,utf}8 en.UTF-8
|
||||
$(locale -a 2>/dev/null | egrep 'utf8|UTF-8'))
|
||||
for LANG in $langs; do
|
||||
if [[ é = ? ]]; then
|
||||
mb_ok=1
|
||||
break;
|
||||
fi
|
||||
done
|
||||
if [[ -z $mb_ok ]]; then
|
||||
ZTST_unimplemented="no UTF-8 locale or multibyte mode is not implemented"
|
||||
else
|
||||
print -u $ZTST_fd Testing PCRE multibyte with locale $LANG
|
||||
mkdir multibyte.tmp && cd multibyte.tmp
|
||||
fi
|
||||
|
||||
%test
|
||||
|
||||
[[ 'foo→bar' =~ .([^[:ascii:]]). ]]
|
||||
print $MATCH
|
||||
print $match[1]
|
||||
0:Basic non-ASCII regexp matching
|
||||
>o→b
|
||||
>→
|
||||
|
||||
[[ foo =~ f.+ ]] ; print $?
|
||||
[[ foo =~ x.+ ]] ; print $?
|
||||
[[ ! foo =~ f.+ ]] ; print $?
|
||||
[[ ! foo =~ x.+ ]] ; print $?
|
||||
[[ foo =~ f.+ && bar =~ b.+ ]] ; print $?
|
||||
[[ foo =~ x.+ && bar =~ b.+ ]] ; print $?
|
||||
[[ foo =~ f.+ && bar =~ x.+ ]] ; print $?
|
||||
[[ ! foo =~ f.+ && bar =~ b.+ ]] ; print $?
|
||||
[[ foo =~ f.+ && ! bar =~ b.+ ]] ; print $?
|
||||
[[ ! ( foo =~ f.+ && bar =~ b.+ ) ]] ; print $?
|
||||
[[ ! foo =~ x.+ && bar =~ b.+ ]] ; print $?
|
||||
[[ foo =~ x.+ && ! bar =~ b.+ ]] ; print $?
|
||||
[[ ! ( foo =~ x.+ && bar =~ b.+ ) ]] ; print $?
|
||||
0:Regex result inversion detection
|
||||
>0
|
||||
>1
|
||||
>1
|
||||
>0
|
||||
>0
|
||||
>1
|
||||
>1
|
||||
>1
|
||||
>1
|
||||
>1
|
||||
>0
|
||||
>1
|
||||
>0
|
||||
|
||||
# Note that PCRE_ANCHORED only means anchored at the start
|
||||
# Also note that we don't unset MATCH/match on failed match (and it's an
|
||||
# open issue as to whether or not we should)
|
||||
pcre_compile '.(→.)'
|
||||
pcre_match foo→bar
|
||||
print $? $MATCH $match ; unset MATCH match
|
||||
pcre_match foo.bar
|
||||
print $? $MATCH $match ; unset MATCH match
|
||||
pcre_match foo†bar
|
||||
print $? $MATCH $match ; unset MATCH match
|
||||
pcre_match foo→†ar
|
||||
print $? $MATCH $match ; unset MATCH match
|
||||
pcre_study
|
||||
pcre_match foo→bar
|
||||
print $? $MATCH $match ; unset MATCH match
|
||||
pcre_compile -a '.(→.)'
|
||||
pcre_match foo→bar
|
||||
print $? $MATCH $match ; unset MATCH match
|
||||
pcre_match o→bar
|
||||
print $? $MATCH $match ; unset MATCH match
|
||||
pcre_match o→b
|
||||
print $? $MATCH $match ; unset MATCH match
|
||||
pcre_compile 'x.(→.)'
|
||||
pcre_match xo→t
|
||||
print $? $MATCH $match ; unset MATCH match
|
||||
pcre_match Xo→t
|
||||
print $? $MATCH $match ; unset MATCH match
|
||||
pcre_compile -i 'x.(→.)'
|
||||
pcre_match xo→t
|
||||
print $? $MATCH $match ; unset MATCH match
|
||||
pcre_match Xo→t
|
||||
print $? $MATCH $match ; unset MATCH match
|
||||
0:pcre_compile interface testing: basic, anchored & case-insensitive
|
||||
>0 o→b →b
|
||||
>1
|
||||
>1
|
||||
>0 o→† →†
|
||||
>0 o→b →b
|
||||
>1
|
||||
>0 o→b →b
|
||||
>0 o→b →b
|
||||
>0 xo→t →t
|
||||
>1
|
||||
>0 xo→t →t
|
||||
>0 Xo→t →t
|
Loading…
Reference in New Issue