develooper Front page | perl.perl5.porters | Postings from March 2003

[PATCH] new m//w flag to return whole match with submatches

Thread Next
From:
sthoenna
Date:
March 26, 2003 20:29
Subject:
[PATCH] new m//w flag to return whole match with submatches
Message ID:
wOmg+gzkgqPQ092yn@efn.org
There was a recent discussion on clp.moderated about adding a @MATCH
(aka @&) array, where $MATCH[0] is $&, $MATCH[1] is $1, etc.

While ways were pointed out to achieve this with tie, @+, and @-, it
became clear to me that the existing return values of m// in list
context are inadequate, since there is a disjunction in the returned
values between no paren groups and some paren groups--and in the case
of an arbitrary regexp, you may not know if there are paren groups.

This is an attempt to give the user the control to remove the disjuction
in return values:
                   without //w              with //w
  "a"=~/./           (1)                      ("a")
  "a"=~/(.)/         ("a")                    ("a","a")
  "a"=~/((.))/       ("a","a")                ("a","a","a")

  "a"=~/./g          ("a")                    ("a")
  "a"=~/(.)/g        ("a")                    ("a","a")
  "a"=~/((.))/g      ("a","a")                ("a","a","a")

It also lets you conveniently get the result of a match (or undef if no
match):

if (defined($fud = ($foo =~ /$bar/w)[0])) ...

(I thought of making scalar context //w return the whole match on
success and undef on failure, but that would require additional work
to get C<!~ /foo/w> to work.)

Patch also removes a block of dead code from pp_match (the yup: label
can never be reached if PL_sawampersand).

As an aside, since I think both pat.t and regexp.t use $&, that yup:
code is never exercised by them.  I'm looking into fixing this.

--- perl/pod/perlop.pod.orig	Mon Mar 17 17:40:36 2003
+++ perl/pod/perlop.pod	Fri Mar 21 16:47:26 2003
@@ -850,6 +850,7 @@
     o	Compile pattern only once.
     s	Treat string as single line.
     x	Use extended regular expressions.
+    w	Return whole match before submatches.
 
 If "/" is the delimiter then the initial C<m> is optional.  With the C<m>
 you can use any pair of non-alphanumeric, non-whitespace characters 
@@ -888,10 +889,12 @@
 If the C</g> option is not used, C<m//> in list context returns a
 list consisting of the subexpressions matched by the parentheses in the
 pattern, i.e., (C<$1>, C<$2>, C<$3>...).  (Note that here C<$1> etc. are
-also set, and that this differs from Perl 4's behavior.)  When there are
-no parentheses in the pattern, the return value is the list C<(1)> for
-success.  With or without parentheses, an empty list is returned upon
-failure.
+also set, and that this differs from Perl 4's behavior.)  If the C</w>
+option is used, the whole match (i.e. C<$&>) is returned at the
+beginning of the list.  When there are no parentheses in the pattern,
+the return value is the list C<(1)> for success (or (C<$&>) with the
+C</w> option).  With or without parentheses, an empty list is returned
+upon failure.
 
 Examples:
 
@@ -919,9 +922,10 @@
 matching as many times as possible within the string.  How it behaves
 depends on the context.  In list context, it returns a list of the
 substrings matched by any capturing parentheses in the regular
-expression.  If there are no parentheses, it returns a list of all
-the matched strings, as if there were parentheses around the whole
-pattern.
+expression.  If the C</w> option is used, the list includes the
+whole matched strings, as if there were parentheses around the whole
+pattern.  If there are no parentheses, it returns a list of all
+the matched strings (whether or not C</w> was used).
 
 In scalar context, each execution of C<m//g> finds the next match,
 returning true if it matches, and false if there is no further match.
--- perl/op.h.orig	Sun Mar  2 08:34:00 2003
+++ perl/op.h	Thu Mar 20 22:05:02 2003
@@ -288,8 +288,10 @@
 #define PMf_SINGLELINE	0x2000		/* assume single line */
 #define PMf_FOLD	0x4000		/* case insensitivity */
 #define PMf_EXTENDED	0x8000		/* chuck embedded whitespace */
+#define PMf_WHOLE	0x10000		/* return whole match in list context */
 
 /* mask of bits stored in regexp->reganch */
+/* N.B.: The COMPILETIME values are hardcoded in sv_2pv_flags */
 #define PMf_COMPILETIME	(PMf_MULTILINE|PMf_SINGLELINE|PMf_LOCALE|PMf_FOLD|PMf_EXTENDED)
 
 #ifdef USE_ITHREADS
--- perl/toke.c.orig	Fri Mar 14 13:39:54 2003
+++ perl/toke.c	Thu Mar 20 19:43:30 2003
@@ -6341,6 +6341,8 @@
 	*pmfl |= PMf_SINGLELINE;
     else if (ch == 'x')
 	*pmfl |= PMf_EXTENDED;
+    else if (ch == 'w')
+	*pmfl |= PMf_WHOLE;
 }
 
 STATIC char *
@@ -6361,7 +6363,7 @@
 	    pmflag(&pm->op_pmflags,*s++);
     }
     else {
-	while (*s && strchr("iogcmsx", *s))
+	while (*s && strchr("iogcmsxw", *s))
 	    pmflag(&pm->op_pmflags,*s++);
     }
     /* issue a warning if /c is specified,but /g is not */
--- perl/pp_hot.c.orig	Sun Mar  2 08:34:00 2003
+++ perl/pp_hot.c	Fri Mar 21 16:29:46 2003
@@ -1280,7 +1280,7 @@
 	I32 nparens, i, len;
 
 	nparens = rx->nparens;
-	if (global && !nparens)
+	if ((global && !nparens) || (pm->op_pmflags & PMf_WHOLE))
 	    i = 1;
 	else
 	    i = 0;
@@ -1324,7 +1324,7 @@
 	    r_flags |= REXEC_IGNOREPOS | REXEC_NOT_FIRST;
 	    goto play_it_again;
 	}
-	else if (!nparens)
+	else if (!nparens && (pm->op_pmflags & PMf_WHOLE) == 0)
 	    XPUSHs(&PL_sv_yes);
 	LEAVE_SCOPE(oldsave);
 	RETURN;
@@ -1374,38 +1374,20 @@
 	rx->sublen = strend - truebase;
 	goto gotcha;
     }
-    if (PL_sawampersand) {
-	I32 off;
-#ifdef PERL_COPY_ON_WRITE
-	if (SvIsCOW(TARG) || (SvFLAGS(TARG) & CAN_COW_MASK) == CAN_COW_FLAGS) {
-	    if (DEBUG_C_TEST) {
-		PerlIO_printf(Perl_debug_log,
-			      "Copy on write: pp_match $& capture, type %d, truebase=%p, t=%p, difference %d\n",
-			      (int) SvTYPE(TARG), truebase, t,
-			      (int)(t-truebase));
-	    }
-	    rx->saved_copy = sv_setsv_cow(rx->saved_copy, TARG);
-	    rx->subbeg = SvPVX(rx->saved_copy) + (t - truebase);
-	    assert (SvPOKp(rx->saved_copy));
-	} else
-#endif
-	{
+    rx->startp[0] = s - truebase;
+    rx->endp[0] = s - truebase + rx->minlen;
+    rx->nparens = rx->lastparen = 0;	/* used by @- and @+ */
 
-	    rx->subbeg = savepvn(t, strend - t);
-#ifdef PERL_COPY_ON_WRITE
-	    rx->saved_copy = Nullsv;
-#endif
-	}
-	rx->sublen = strend - t;
-	RX_MATCH_COPIED_on(rx);
-	off = rx->startp[0] = s - t;
-	rx->endp[0] = off + rx->minlen;
-    }
-    else {			/* startp/endp are used by @- @+. */
-	rx->startp[0] = s - truebase;
-	rx->endp[0] = s - truebase + rx->minlen;
+    if (GIMME == G_ARRAY && (pm->op_pmflags & PMf_WHOLE)) {
+	I32 len = rx->minlen;
+	PUSHs(sv_newmortal());
+	sv_setpvn(*SP, s, len);
+	if (DO_UTF8(TARG) && is_utf8_string((U8*)s, len))
+	    SvUTF8_on(*SP);
+	LEAVE_SCOPE(oldsave);
+	RETURN;
     }
-    rx->nparens = rx->lastparen = 0;	/* used by @- and @+ */
+
     LEAVE_SCOPE(oldsave);
     RETPUSHYES;
 
--- perl/t/op/pat.t.orig	Sun Mar  9 04:42:38 2003
+++ perl/t/op/pat.t	Wed Mar 26 18:31:14 2003
@@ -6,7 +6,7 @@
 
 $| = 1;
 
-print "1..996\n";
+print "1..1024\n";
 
 BEGIN {
     chdir 't' if -d 't';
@@ -3159,4 +3159,47 @@
     ok(join(":", /\b(.)\x{100}/g) eq "a:/", "re_intuit_start and PL_bostr");
 }
 
-# last test 996
+$x = "abc\nbc\n01";
+
+# scalar context should be uneffected by //w
+ok($x =~ /abc/w, "scalar context //w");
+ok($x =~ /ab./w, "scalar context //w");
+ok($x =~ /.../w, "scalar context //w");
+ok($x =~ /(?=b)/w, "scalar context //w");
+ok($x =~ /0/w, "scalar context //w");
+ok($x =~ /0(?=1)/w, "scalar context //w");
+ok($x =~ /\d/w, "scalar context //w");
+ok($x !~ /ghi/w, "scalar context //w");
+ok($x !~ /abc./w, "scalar context //w");
+ok($x !~ /bc./w, "scalar context //w");
+ok($x !~ /..../w, "scalar context //w");
+
+# list no paren //w
+ok(join(':',$x=~/bc/w) eq 'bc', "list context no paren //w");
+ok(join(':',$x=~/.bc/w) eq 'abc', "list context no paren //w");
+ok(join(':',$x=~/.../w) eq 'abc', "list context no paren //w");
+ok(join(':',$x=~/../w) eq 'ab', "list context no paren //w");
+
+# list no paren //wg
+ok(join(':',$x=~/bc/wg) eq 'bc:bc', "list context no paren //wg");
+ok(join(':',$x=~/.bc/wg) eq 'abc', "list context no paren //wg");
+ok(join(':',$x=~/.../wg) eq 'abc', "list context no paren //wg");
+ok(join(':',$x=~/../wg) eq 'ab:bc:01', "list context no paren //wg");
+
+# list paren //w
+ok(join(':',$x=~/(bc)/w) eq 'bc:bc', "list context with parens //w");
+ok(join(':',$x=~/()/w) eq ':', "list context with parens //w");
+ok(join(':',$x=~/(.?)(bc)/w) eq 'abc:a:bc', "list context with parens //w");
+ok(join(':',$x=~/(.)((.?).)/w) eq 'abc:a:bc:b', "list context with parens //w");
+ok(join(':',$x=~/.().()/w) eq 'ab::', "list context with parens //w");
+
+# list paren //wg
+ok(join(':',$x=~/(bc)/wg) eq 'bc:bc:bc:bc', "list context with parens //wg");
+ok(join(':',$x=~/(.?)(bc)/wg) eq 'abc:a:bc:bc::bc',
+   "list context with parens //wg");
+ok(join(':',$x=~/(.)((.?).)/wg) eq 'abc:a:bc:b:bc:b:c::01:0:1:',
+   "list context with parens //wg");
+ok(join(':',$x=~/.().()/wg) eq 'ab:::bc:::01::',
+   "list context with parens //wg");
+
+# last test 1024

Thread Next


nntp.perl.org: Perl Programming lists via nntp and http.
Comments to Ask Bjørn Hansen at ask@perl.org | Group listing | About