develooper Front page | perl.perl5.porters | Postings from March 2001

[PATCH 5.6.1] m// and s/// with utf8

Thread Next
From:
Ilya Zakharevich
Date:
March 5, 2001 11:44
Subject:
[PATCH 5.6.1] m// and s/// with utf8
Message ID:
20010305144440.A13178@math.ohio-state.edu
This patch does not:

  a) make s///g work (a similar chunk is needed for substcont);

  b) make split work;

  c) some additional work may be needed to upgrade-during compile the
     RExen which do not make sense in UTF-less situation;

I may have some time later today to fix a) and b).  Maybe c) as well...

Some pragma/utf8.t subtests break, but the behaviour they test was
buggy, and 'use utf8' should not mean anything now.

Enjoy,
Ilya

--- ./embed.pl-ppp	Wed Jan 31 10:56:46 2001
+++ ./embed.pl	Mon Mar  5 10:43:22 2001
@@ -1880,6 +1880,7 @@ Ap	|char*	|re_intuit_start|regexp* prog|
 				|char* strend|U32 flags \
 				|struct re_scream_pos_data_s *data
 Ap	|SV*	|re_intuit_string|regexp* prog
+Ap	|void	|re_utf8_upgrade|PMOP* pm
 Ap	|I32	|regexec_flags	|regexp* prog|char* stringarg \
 				|char* strend|char* strbeg|I32 minend \
 				|SV* screamer|void* data|U32 flags
--- ./pp_hot.c-ppp	Wed Jan 31 10:56:50 2001
+++ ./pp_hot.c	Mon Mar  5 11:53:16 2001
@@ -1007,13 +1007,6 @@ PP(pp_match)
 	EXTEND(SP,1);
     }
     PUTBACK;				/* EVAL blocks need stack_sp. */
-    s = SvPV(TARG, len);
-    strend = s + len;
-    if (!s)
-	DIE(aTHX_ "panic: pp_match");
-    rxtainted = ((pm->op_pmdynflags & PMdf_TAINTED) ||
-		 (PL_tainted && (pm->op_pmflags & PMf_RETAINT)));
-    TAINT_NOT;
 
     if (pm->op_pmdynflags & PMdf_USED) {
       failure:
@@ -1026,6 +1019,22 @@ PP(pp_match)
 	pm = PL_curpm;
 	rx = pm->op_pmregexp;
     }
+    if (DO_UTF8(TARG)) {
+	if (!(pm->op_pmdynflags & PMdf_UTF8)) {
+	    if (!pm->op_pmregexp->utf8_re)
+		Perl_re_utf8_upgrade(aTHX_ pm);
+	    rx = pm->op_pmregexp->utf8_re;
+	}
+    } else if ((pm->op_pmdynflags & PMdf_UTF8) && !IN_BYTE)
+	sv_utf8_upgrade(TARG);
+    s = SvPV(TARG, len);
+    strend = s + len;
+    if (!s)
+	DIE(aTHX_ "panic: pp_match");
+
+    rxtainted = ((pm->op_pmdynflags & PMdf_TAINTED) ||
+		 (PL_tainted && (pm->op_pmflags & PMf_RETAINT)));
+    TAINT_NOT;
     if (rx->minlen > len) goto failure;
 
     truebase = t = s;
@@ -1795,6 +1804,20 @@ PP(pp_subst)
 	TARG = DEFSV;
 	EXTEND(SP,1);
     }
+    if (!rx->prelen && PL_curpm) {
+	pm = PL_curpm;
+	rx = pm->op_pmregexp;
+    }
+    if (!pm)
+	DIE(aTHX_ "panic: pp_subst");
+    if (DO_UTF8(TARG)) {
+	if (!(pm->op_pmdynflags & PMdf_UTF8)) {
+	    if (!pm->op_pmregexp->utf8_re)
+		Perl_re_utf8_upgrade(aTHX_ pm);
+	    rx = pm->op_pmregexp->utf8_re;
+	}
+    } else if ((pm->op_pmdynflags & PMdf_UTF8) && !IN_BYTE)
+	sv_utf8_upgrade(TARG);
     do_utf8 = DO_UTF8(TARG);
     if (SvFAKE(TARG) && SvREADONLY(TARG))
 	sv_force_normal(TARG);
@@ -1814,7 +1837,7 @@ PP(pp_subst)
     TAINT_NOT;
 
   force_it:
-    if (!pm || !s)
+    if (!s)
 	DIE(aTHX_ "panic: pp_subst");
 
     strend = s + len;
@@ -1823,10 +1846,6 @@ PP(pp_subst)
 				   position, once with zero-length,
 				   second time with non-zero. */
 
-    if (!rx->prelen && PL_curpm) {
-	pm = PL_curpm;
-	rx = pm->op_pmregexp;
-    }
     r_flags = (rx->nparens || SvTEMP(TARG) || PL_sawampersand)
 		? REXEC_COPY_STR : 0;
     if (SvSCREAM(TARG))
--- ./regcomp.c-ppp	Wed Jan 31 10:56:50 2001
+++ ./regcomp.c	Mon Mar  5 10:50:06 2001
@@ -1615,6 +1615,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xen
     r->substrs = 0;			/* Useful during FAIL. */
     r->startp = 0;			/* Useful during FAIL. */
     r->endp = 0;			/* Useful during FAIL. */
+    r->utf8_re = 0;			/* We do not have utf8 flavor yet */
 
     PL_regcomp_rx = r;
 
@@ -4424,6 +4425,8 @@ Perl_pregfree(pTHX_ struct regexp *r)
 
     if (r->precomp)
 	Safefree(r->precomp);
+    if (r->utf8_re)
+	Perl_pregfree(aTHX_ r->utf8_re);
     if (RX_MATCH_COPIED(r))
 	Safefree(r->subbeg);
     if (r->substrs) {
@@ -4601,3 +4604,13 @@ clear_re(pTHXo_ void *r)
     ReREFCNT_dec((regexp *)r);
 }
 
+void
+Perl_re_utf8_upgrade(pTHX_ PMOP *pm)
+{
+    regexp *r = pm->op_pmregexp;
+    U8 tmp = pm->op_pmdynflags;
+
+    pm->op_pmdynflags |= PMdf_UTF8;
+    r->utf8_re = CALLREGCOMP(aTHX_ r->precomp, r->precomp + r->prelen, pm);
+    pm->op_pmdynflags = tmp;		/* Do not  */
+}
--- ./regexp.h-ppp	Wed Jan 31 10:56:50 2001
+++ ./regexp.h	Mon Mar  5 10:47:50 2001
@@ -38,6 +38,7 @@ typedef struct regexp {
 	U32 lastparen;		/* last paren matched */
 	U32 reganch;		/* Internal use only +
 				   Tainted information used by regexec? */
+	struct regexp *utf8_re;	/* The utf8 flavor. */
 	regnode program[1];	/* Unwarranted chumminess with compiler. */
 } regexp;
 

Thread Next


nntp.perl.org: Perl Programming lists via nntp and http.
Comments to Ask Bjørn Hansen at ask@perl.org | Group listing | About