Front page | perl.perl5.porters |
Postings from March 2001
[PATCH 5.6.1] m// and s/// with utf8
Thread Next
From:
Ilya Zakharevich
Date:
March 5, 2001 11:44
Subject:
[PATCH 5.6.1] m// and s/// with utf8
Message ID:
20010305144440.A13178@math.ohio-state.edu
This patch does not:
a) make s///g work (a similar chunk is needed for substcont);
b) make split work;
c) some additional work may be needed to upgrade-during compile the
RExen which do not make sense in UTF-less situation;
I may have some time later today to fix a) and b). Maybe c) as well...
Some pragma/utf8.t subtests break, but the behaviour they test was
buggy, and 'use utf8' should not mean anything now.
Enjoy,
Ilya
--- ./embed.pl-ppp Wed Jan 31 10:56:46 2001
+++ ./embed.pl Mon Mar 5 10:43:22 2001
@@ -1880,6 +1880,7 @@ Ap |char* |re_intuit_start|regexp* prog|
|char* strend|U32 flags \
|struct re_scream_pos_data_s *data
Ap |SV* |re_intuit_string|regexp* prog
+Ap |void |re_utf8_upgrade|PMOP* pm
Ap |I32 |regexec_flags |regexp* prog|char* stringarg \
|char* strend|char* strbeg|I32 minend \
|SV* screamer|void* data|U32 flags
--- ./pp_hot.c-ppp Wed Jan 31 10:56:50 2001
+++ ./pp_hot.c Mon Mar 5 11:53:16 2001
@@ -1007,13 +1007,6 @@ PP(pp_match)
EXTEND(SP,1);
}
PUTBACK; /* EVAL blocks need stack_sp. */
- s = SvPV(TARG, len);
- strend = s + len;
- if (!s)
- DIE(aTHX_ "panic: pp_match");
- rxtainted = ((pm->op_pmdynflags & PMdf_TAINTED) ||
- (PL_tainted && (pm->op_pmflags & PMf_RETAINT)));
- TAINT_NOT;
if (pm->op_pmdynflags & PMdf_USED) {
failure:
@@ -1026,6 +1019,22 @@ PP(pp_match)
pm = PL_curpm;
rx = pm->op_pmregexp;
}
+ if (DO_UTF8(TARG)) {
+ if (!(pm->op_pmdynflags & PMdf_UTF8)) {
+ if (!pm->op_pmregexp->utf8_re)
+ Perl_re_utf8_upgrade(aTHX_ pm);
+ rx = pm->op_pmregexp->utf8_re;
+ }
+ } else if ((pm->op_pmdynflags & PMdf_UTF8) && !IN_BYTE)
+ sv_utf8_upgrade(TARG);
+ s = SvPV(TARG, len);
+ strend = s + len;
+ if (!s)
+ DIE(aTHX_ "panic: pp_match");
+
+ rxtainted = ((pm->op_pmdynflags & PMdf_TAINTED) ||
+ (PL_tainted && (pm->op_pmflags & PMf_RETAINT)));
+ TAINT_NOT;
if (rx->minlen > len) goto failure;
truebase = t = s;
@@ -1795,6 +1804,20 @@ PP(pp_subst)
TARG = DEFSV;
EXTEND(SP,1);
}
+ if (!rx->prelen && PL_curpm) {
+ pm = PL_curpm;
+ rx = pm->op_pmregexp;
+ }
+ if (!pm)
+ DIE(aTHX_ "panic: pp_subst");
+ if (DO_UTF8(TARG)) {
+ if (!(pm->op_pmdynflags & PMdf_UTF8)) {
+ if (!pm->op_pmregexp->utf8_re)
+ Perl_re_utf8_upgrade(aTHX_ pm);
+ rx = pm->op_pmregexp->utf8_re;
+ }
+ } else if ((pm->op_pmdynflags & PMdf_UTF8) && !IN_BYTE)
+ sv_utf8_upgrade(TARG);
do_utf8 = DO_UTF8(TARG);
if (SvFAKE(TARG) && SvREADONLY(TARG))
sv_force_normal(TARG);
@@ -1814,7 +1837,7 @@ PP(pp_subst)
TAINT_NOT;
force_it:
- if (!pm || !s)
+ if (!s)
DIE(aTHX_ "panic: pp_subst");
strend = s + len;
@@ -1823,10 +1846,6 @@ PP(pp_subst)
position, once with zero-length,
second time with non-zero. */
- if (!rx->prelen && PL_curpm) {
- pm = PL_curpm;
- rx = pm->op_pmregexp;
- }
r_flags = (rx->nparens || SvTEMP(TARG) || PL_sawampersand)
? REXEC_COPY_STR : 0;
if (SvSCREAM(TARG))
--- ./regcomp.c-ppp Wed Jan 31 10:56:50 2001
+++ ./regcomp.c Mon Mar 5 10:50:06 2001
@@ -1615,6 +1615,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xen
r->substrs = 0; /* Useful during FAIL. */
r->startp = 0; /* Useful during FAIL. */
r->endp = 0; /* Useful during FAIL. */
+ r->utf8_re = 0; /* We do not have utf8 flavor yet */
PL_regcomp_rx = r;
@@ -4424,6 +4425,8 @@ Perl_pregfree(pTHX_ struct regexp *r)
if (r->precomp)
Safefree(r->precomp);
+ if (r->utf8_re)
+ Perl_pregfree(aTHX_ r->utf8_re);
if (RX_MATCH_COPIED(r))
Safefree(r->subbeg);
if (r->substrs) {
@@ -4601,3 +4604,13 @@ clear_re(pTHXo_ void *r)
ReREFCNT_dec((regexp *)r);
}
+void
+Perl_re_utf8_upgrade(pTHX_ PMOP *pm)
+{
+ regexp *r = pm->op_pmregexp;
+ U8 tmp = pm->op_pmdynflags;
+
+ pm->op_pmdynflags |= PMdf_UTF8;
+ r->utf8_re = CALLREGCOMP(aTHX_ r->precomp, r->precomp + r->prelen, pm);
+ pm->op_pmdynflags = tmp; /* Do not */
+}
--- ./regexp.h-ppp Wed Jan 31 10:56:50 2001
+++ ./regexp.h Mon Mar 5 10:47:50 2001
@@ -38,6 +38,7 @@ typedef struct regexp {
U32 lastparen; /* last paren matched */
U32 reganch; /* Internal use only +
Tainted information used by regexec? */
+ struct regexp *utf8_re; /* The utf8 flavor. */
regnode program[1]; /* Unwarranted chumminess with compiler. */
} regexp;
Thread Next
-
[PATCH 5.6.1] m// and s/// with utf8
by Ilya Zakharevich