develooper Front page | perl.perl5.porters | Postings from January 2003

[perl #18232] [PATCH] store PL_reg_match_utf8 in reganch

Thread Next
From:
Rafael Garcia-Suarez
Date:
January 22, 2003 13:30
Subject:
[perl #18232] [PATCH] store PL_reg_match_utf8 in reganch
Message ID:
20030122223556.57d597a3.rgarciasuarez@free.fr
The following patch fixes bug #18232. The bug is that the utf8 flag on
$<digit> variables is dropped when an inner scope also modifies the
$<digit> vars. (See the included regression test.) That's because this
flag is stored in a global variable PL_reg_match_utf8.

I don't get rid completely of PL_reg_match_utf8 but I duplicate it with
a new flag in regexp.reganch. PL_reg_match_utf8 is still used in
regexec.c to avoid the need to carry on the current regexp everywhere. I
suspect this is also more efficient.


Index: pp.c
===================================================================
--- pp.c	(revision 622)
+++ pp.c	(working copy)
@@ -4391,7 +4391,7 @@
     TAINT_IF((pm->op_pmflags & PMf_LOCALE) &&
 	     (pm->op_pmflags & (PMf_WHITE | PMf_SKIPWHITE)));
 
-    PL_reg_match_utf8 = do_utf8;
+    RX_MATCH_UTF8_set(rx, do_utf8);
 
     if (pm->op_pmreplroot) {
 #ifdef USE_ITHREADS
Index: pp_ctl.c
===================================================================
--- pp_ctl.c	(revision 629)
+++ pp_ctl.c	(working copy)
@@ -158,7 +158,7 @@
     register REGEXP *rx = cx->sb_rx;
 
     rxres_restore(&cx->sb_rxres, rx);
-    PL_reg_match_utf8 = SvUTF8(cx->sb_targ) ? 1 : 0;
+    RX_MATCH_UTF8_set(rx, SvUTF8(cx->sb_targ));
 
     if (cx->sb_iters++) {
 	I32 saviters = cx->sb_iters;
Index: regexp.h
===================================================================
--- regexp.h	(revision 622)
+++ regexp.h	(working copy)
@@ -71,6 +71,7 @@
 #define ROPT_NAUGHTY		0x20000 /* how exponential is this pattern? */
 #define ROPT_COPY_DONE		0x40000	/* subbeg is a copy of the string */
 #define ROPT_TAINTED_SEEN	0x80000
+#define ROPT_MATCH_UTF8		0x10000000 /* subbeg is utf-8 */
 
 #define RE_USE_INTUIT_NOML	0x0100000 /* Best to intuit before matching */
 #define RE_USE_INTUIT_ML	0x0200000
@@ -99,6 +100,13 @@
 					 ? RX_MATCH_COPIED_on(prog) \
 					 : RX_MATCH_COPIED_off(prog))
 
+#define RX_MATCH_UTF8(prog)		((prog)->reganch & ROPT_MATCH_UTF8)
+#define RX_MATCH_UTF8_on(prog)		((prog)->reganch |= ROPT_MATCH_UTF8)
+#define RX_MATCH_UTF8_off(prog)		((prog)->reganch &= ~ROPT_MATCH_UTF8)
+#define RX_MATCH_UTF8_set(prog, t)	((t) \
+			? (RX_MATCH_UTF8_on(prog), (PL_reg_match_utf8 = 1)) \
+			: (RX_MATCH_UTF8_off(prog), (PL_reg_match_utf8 = 0)))
+    
 #define REXEC_COPY_STR	0x01		/* Need to copy the string. */
 #define REXEC_CHECKED	0x02		/* check_substr already checked. */
 #define REXEC_SCREAM	0x04		/* use scream table. */
Index: t/op/pat.t
===================================================================
--- t/op/pat.t	(revision 622)
+++ t/op/pat.t	(working copy)
@@ -6,7 +6,7 @@
 
 $| = 1;
 
-print "1..968\n";
+print "1..971\n";
 
 BEGIN {
     chdir 't' if -d 't';
@@ -3054,5 +3054,14 @@
     ok($a !~ /^\C{4}y/,     'not match \C{4}y');
 }
 
-# last test 968
+{
+    # [perl #18232]
+    "\x{100}" =~ /(.)/;
+    ok( $1 eq "\x{100}", '$1 is utf-8' );
+    { 'a' =~ /./; }
+    ok( $1 eq "\x{100}", '$1 is still utf-8' );
+    ok( $1 ne "\xC4\x80", '$1 is not non-utf-8' );
+}
+
+# last test 971
 
Index: regexec.c
===================================================================
--- regexec.c	(revision 634)
+++ regexec.c	(working copy)
@@ -400,6 +400,7 @@
     char *i_strpos = strpos;
     SV *dsv = PERL_DEBUG_PAD_ZERO(0);
 #endif
+    RX_MATCH_UTF8_set(prog,do_utf8);
 
     if (prog->reganch & ROPT_UTF8) {
 	DEBUG_r(PerlIO_printf(Perl_debug_log,
@@ -1615,6 +1616,7 @@
     SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
     SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
 #endif
+    RX_MATCH_UTF8_set(prog,do_utf8);
 
     PL_regcc = 0;
 
Index: mg.c
===================================================================
--- mg.c	(revision 631)
+++ mg.c	(working copy)
@@ -418,7 +418,7 @@
 		else			/* @- */
 		    i = s;
 
-		if (i > 0 && PL_reg_match_utf8) {
+		if (i > 0 && RX_MATCH_UTF8(rx)) {
 		    char *b = rx->subbeg;
 		    if (b)
 		        i = Perl_utf8_length(aTHX_ (U8*)b, (U8*)(b+i));
@@ -459,7 +459,7 @@
 	    {
 		i = t1 - s1;
 	      getlen:
-		if (i > 0 && PL_reg_match_utf8) {
+		if (i > 0 && RX_MATCH_UTF8(rx)) {
 		    char *s    = rx->subbeg + s1;
 		    char *send = rx->subbeg + t1;
 
@@ -707,7 +707,7 @@
 	      getrx:
 		if (i >= 0) {
 		    sv_setpvn(sv, s, i);
-		    if (PL_reg_match_utf8 && is_utf8_string((U8*)s, i))
+		    if (RX_MATCH_UTF8(rx) && is_utf8_string((U8*)s, i))
 			SvUTF8_on(sv);
 		    else
 			SvUTF8_off(sv);
Index: pp_hot.c
===================================================================
--- pp_hot.c	(revision 624)
+++ pp_hot.c	(working copy)
@@ -1180,7 +1180,7 @@
 		 (PL_tainted && (pm->op_pmflags & PMf_RETAINT)));
     TAINT_NOT;
 
-    PL_reg_match_utf8 = DO_UTF8(TARG);
+    RX_MATCH_UTF8_set(rx, DO_UTF8(TARG));
 
     /* PMdf_USED is set after a ?? matches once */
     if (pm->op_pmdynflags & PMdf_USED) {
@@ -1355,7 +1355,7 @@
     if (global) {
 	rx->subbeg = truebase;
 	rx->startp[0] = s - truebase;
-	if (PL_reg_match_utf8) {
+	if (RX_MATCH_UTF8(rx)) {
 	    char *t = (char*)utf8_hop((U8*)s, rx->minlen);
 	    rx->endp[0] = t - truebase;
 	}
@@ -1907,14 +1907,14 @@
 	rxtainted |= 2;
     TAINT_NOT;
 
-    PL_reg_match_utf8 = DO_UTF8(TARG);
+    RX_MATCH_UTF8_set(rx, DO_UTF8(TARG));
 
   force_it:
     if (!pm || !s)
 	DIE(aTHX_ "panic: pp_subst");
 
     strend = s + len;
-    slen = PL_reg_match_utf8 ? utf8_length((U8*)s, (U8*)strend) : len;
+    slen = RX_MATCH_UTF8(rx) ? utf8_length((U8*)s, (U8*)strend) : len;
     maxiters = 2 * slen + 10;	/* We can match twice at each
 				   position, once with zero-length,
 				   second time with non-zero. */

Thread Next


nntp.perl.org: Perl Programming lists via nntp and http.
Comments to Ask Bjørn Hansen at ask@perl.org | Group listing | About