develooper Front page | perl.perl5.porters | Postings from April 2001

Re: [PATCH bleadperl] [ID 20010426.002] Word boundry regex [...]

Thread Previous | Thread Next
From:
Hugo
Date:
April 29, 2001 10:07
Subject:
Re: [PATCH bleadperl] [ID 20010426.002] Word boundry regex [...]
Message ID:
200104291609.RAA17790@crypt.compulink.co.uk
In <20010428164714.B22905@math.ohio-state.edu>, Ilya Zakharevich writes:
:On Sat, Apr 28, 2001 at 08:41:13PM +0100, Hugo wrote:
:> :The only suggestion I have is that there might have been a case when
:> :this chunk is entered *before* PL_regprev is set.  Can this happen?
:> 
:> Not that I can see: regexec_flags() sets it whenever startpos == strbeg,
:> before any calls to find_byclass either indirectly (through intuit_start)
:> or directly.
:
:This means that I'm right: intuit_start() is usually called *before*
:regexec_flags() is entered.

However, it is called either at the start of the string or after a
successful call to regexec_flags(), and in both cases PL_regprev should
already be correctly set. The only exception is when we get a successful
match without calling regexec_flags(), in pp_match (look for 'goto yup'),
so I guess we additionally need to set PL_regprev in this case.

:> Is it possible that lookbehind could get back to strbeg when PL_regprev
:> is not set, or not set correctly?
:
:I would hope that lookbehind resets PL_regprev.

Not that I can see.

It seems really difficult to get PL_regprev correct in all cases. Below is
a patch to remove PL_regprev entirely, which doesn't fail any of the
existing tests and which I believe should offer a performance improvement.
I assume there may be problems not currently tested, particularly in the
area of in-place substitutions - can you suggest test cases that fail?

The remaining mention of PL_regprev is in Porting/findvars - I'm not sure
what that is used for, so I have left it alone.

Hugo
--- thrdvar.h.old	Tue Jan 30 18:14:40 2001
+++ thrdvar.h	Sun Apr 29 16:18:07 2001
@@ -183,7 +183,6 @@
 PERLVAR(Tregendp,	I32 *)		/* Ditto for endp. */
 PERLVAR(Treglastparen,	U32 *)		/* Similarly for lastparen. */
 PERLVAR(Tregtill,	char *)		/* How far we are required to go. */
-PERLVAR(Tregprev,	char)		/* char before regbol, \n if none */
 PERLVAR(Treg_start_tmp,	char **)	/* from regexec.c */
 PERLVAR(Treg_start_tmpl,U32)		/* from regexec.c */
 PERLVAR(Tregdata,	struct reg_data *)
--- sv.c.old	Mon Apr 23 15:43:21 2001
+++ sv.c	Sun Apr 29 16:16:16 2001
@@ -9295,7 +9295,6 @@
     PL_regendp		= (I32*)NULL;
     PL_reglastparen	= (U32*)NULL;
     PL_regtill		= Nullch;
-    PL_regprev		= '\n';
     PL_reg_start_tmp	= (char**)NULL;
     PL_reg_start_tmpl	= 0;
     PL_regdata		= (struct reg_data*)NULL;
--- regcomp.c.old	Thu Apr 26 15:21:19 2001
+++ regcomp.c	Sun Apr 29 16:15:33 2001
@@ -4683,7 +4683,6 @@
     SAVEVPTR(PL_regendp);		/* Ditto for endp. */
     SAVEVPTR(PL_reglastparen);		/* Similarly for lastparen. */
     SAVEPPTR(PL_regtill);		/* How far we are required to go. */
-    SAVEI8(PL_regprev);			/* char before regbol, \n if none */
     SAVEGENERICPV(PL_reg_start_tmp);		/* from regexec.c */
     PL_reg_start_tmp = 0;
     SAVEI32(PL_reg_start_tmpl);		/* from regexec.c */
--- regexec.c.old	Wed Apr 11 14:14:26 2001
+++ regexec.c	Sun Apr 29 17:00:30 2001
@@ -946,7 +946,7 @@
 	    /* FALL THROUGH */
 	case BOUND:
 	    if (do_utf8) {
-		if (s == startpos)
+		if (s == PL_bostr)
 		    tmp = '\n';
 		else {
 		    U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
@@ -969,7 +969,7 @@
 		}
 	    }
 	    else {
-		tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
+		tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
 		tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
 		while (s < strend) {
 		    if (tmp ==
@@ -989,7 +989,7 @@
 	    /* FALL THROUGH */
 	case NBOUND:
 	    if (do_utf8) {
-		if (s == startpos)
+		if (s == PL_bostr)
 		    tmp = '\n';
 		else {
 		    U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
@@ -1010,7 +1010,7 @@
 		}
 	    }
 	    else {
-		tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
+		tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
 		tmp = ((OP(c) == NBOUND ?
 			isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
 		while (s < strend) {
@@ -1429,19 +1429,6 @@
       if (strend - startpos < minlen) goto phooey;
     }
 
-    if (startpos == strbeg)	/* is ^ valid at stringarg? */
-	PL_regprev = '\n';
-    else {
-        if (prog->reganch & ROPT_UTF8 && do_utf8) {
-	    U8 *s = reghop3((U8*)stringarg, -1, (U8*)strbeg);
-	    PL_regprev = utf8n_to_uvchr(s, (U8*)stringarg - s, NULL, 0);
-	}
-	else
-	    PL_regprev = (U32)stringarg[-1];
-	if (!PL_multiline && PL_regprev == '\n')
-	    PL_regprev = '\0';		/* force ^ to NOT match */
-    }
-
     /* Check validity of program. */
     if (UCHARAT(prog->program) != REG_MAGIC) {
 	Perl_croak(aTHX_ "corrupted regexp program");
@@ -2044,19 +2031,16 @@
 
 	switch (OP(scan)) {
 	case BOL:
-	    if (locinput == PL_bostr
-		? PL_regprev == '\n'
-		: (PL_multiline &&
-		   (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
+	    if (locinput == PL_bostr || (PL_multiline &&
+		(nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
 	    {
 		/* regtill = regbol; */
 		break;
 	    }
 	    sayNO;
 	case MBOL:
-	    if (locinput == PL_bostr
-		? PL_regprev == '\n'
-		: ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
+	    if (locinput == PL_bostr ||
+		((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
 	    {
 		break;
 	    }
@@ -2259,8 +2243,8 @@
 	case NBOUND:
 	    /* was last char in word? */
 	    if (do_utf8) {
-		if (locinput == PL_regbol)
-		    ln = PL_regprev;
+		if (locinput == PL_bostr)
+		    ln = '\n';
 		else {
 		    U8 *r = reghop((U8*)locinput, -1);
 		
@@ -2277,8 +2261,8 @@
 		}
 	    }
 	    else {
-		ln = (locinput != PL_regbol) ?
-		    UCHARAT(locinput - 1) : PL_regprev;
+		ln = (locinput != PL_bostr) ?
+		    UCHARAT(locinput - 1) : '\n';
 		if (OP(scan) == BOUND || OP(scan) == NBOUND) {
 		    ln = isALNUM(ln);
 		    n = isALNUM(nextchr);
--- t/op/re_tests.old	Thu Mar 22 06:06:54 2001
+++ t/op/re_tests	Sun Apr 29 16:58:51 2001
@@ -787,3 +787,4 @@
 (a)?(a)+	a	y	$1:$2	:a	-
 (ab)?(ab)+	ab	y	$1:$2	:ab	-
 (abc)?(abc)+	abc	y	$1:$2	:abc	-
+'b\s^'m	a\nb\n	n	-	-
--- t/op/subst.t.old	Tue Aug 29 13:54:13 2000
+++ t/op/subst.t	Sat Apr 28 13:57:22 2001
@@ -6,7 +6,7 @@
     require Config; import Config;
 }
 
-print "1..84\n";
+print "1..85\n";
 
 $x = 'foo';
 $_ = "x";
@@ -378,4 +378,8 @@
 $_ = "C:/";
 s/^([a-z]:)/\u$1/ and print "not ";
 print "ok 84\n";
+
+$_ = "Charles Bronson";
+s/\B\w//g;
+print $_ eq "C B" ? "ok 85\n" : "not ok 85\n# \$_ eq '$_'\n";
 

Thread Previous | Thread Next


nntp.perl.org: Perl Programming lists via nntp and http.
Comments to Ask Bjørn Hansen at ask@perl.org | Group listing | About