develooper Front page | perl.perl5.changes | Postings from March 2019

[perl.git] branch blead updated. v5.29.8-151-g765e6ecf32

From:
Karl Williamson
Date:
March 20, 2019 18:13
Subject:
[perl.git] branch blead updated. v5.29.8-151-g765e6ecf32
Message ID:
E1h6fho-00015p-2M@git.dc.perl.space
In perl.git, the branch blead has been updated

<https://perl5.git.perl.org/perl.git/commitdiff/765e6ecf32a570694dcff91c1c72f98306a9390e?hp=80e7c5414423d633f11ec93a7990915e97489502>

- Log -----------------------------------------------------------------
commit 765e6ecf32a570694dcff91c1c72f98306a9390e
Author: Karl Williamson <khw@cpan.org>
Date:   Wed Mar 20 11:47:15 2019 -0600

    Add common UTF-8 first byte to ANYOFH regnodes
    
    An ANYOFH regnode is generated instead of a plain ANYOF one when
    nothing it can match is in the bitmap used in ANYOF nodes.  It is
    therefore smaller as the 4 word (or more) bitmap is omitted.
    
    This means that for it to match a target string, that string must be
    UTF-8 (since the bitmap is for at least the lowest 256 code points).
    And only in rare circumstances are there any flags associated with it in
    the regnode flags field.
    
    This commit changes things so that the flags field in an ANYOFH node is
    repurposed to be the first UTF-8 encoded byte of every code point
    matched by the class if there is a common byte for all of them; or 0 if
    some have different first bytes.
    
    (That means that those rare cases where the flags field isn't otherwise
    empty can no longer be ANYOFH nodes.)
    
    The purpose of this is so that a future commit can take advantage of
    this, and more quickly scan the target string for places that this node
    can match.  A problem with ANYOF nodes is that they are code point
    oriented (U32 or U64), and the target string is UTF-8, so conversion has
    to be done.  By having the partial conversion compiled in, we can look
    for that at runtime instead of having to look at every character in the
    scan.

-----------------------------------------------------------------------

Summary of changes:
 pod/perldebguts.pod |  4 +++-
 regcomp.c           | 67 +++++++++++++++++++++++++++++++++++++++--------------
 regcomp.sym         |  2 +-
 regexec.c           |  2 +-
 regnodes.h          |  2 +-
 t/re/anyof.t        | 12 ++++++++--
 6 files changed, 66 insertions(+), 23 deletions(-)

diff --git a/pod/perldebguts.pod b/pod/perldebguts.pod
index c0bcc1570b..de413fd813 100644
--- a/pod/perldebguts.pod
+++ b/pod/perldebguts.pod
@@ -612,7 +612,9 @@ will be lost.
                   charclass_ classes
                   posixl
  ANYOFH           sv 1       Like ANYOF, but only has "High" matches,
-                             none in the bitmap
+                             none in the bitmap; non-zero flags "f"
+                             means "f" is the first UTF-8 byte shared in
+                             common by all code points matched
  ANYOFM           byte 1     Like ANYOF, but matches an invariant byte
                              as determined by the mask and arg
  NANYOFM          byte 1     complement of ANYOFM
diff --git a/regcomp.c b/regcomp.c
index f44ec79bd1..864f9a02f2 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -1713,6 +1713,7 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
      * another SSC or a regular ANYOF class.  Can create false positives. */
 
     SV* anded_cp_list;
+    U8  and_with_flags = (OP(and_with) == ANYOFH) ? 0 : ANYOF_FLAGS(and_with);
     U8  anded_flags;
 
     PERL_ARGS_ASSERT_SSC_AND;
@@ -1723,7 +1724,7 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
      * the code point inversion list and just the relevant flags */
     if (is_ANYOF_SYNTHETIC(and_with)) {
         anded_cp_list = ((regnode_ssc *)and_with)->invlist;
-        anded_flags = ANYOF_FLAGS(and_with);
+        anded_flags = and_with_flags;
 
         /* XXX This is a kludge around what appears to be deficiencies in the
          * optimizer.  If we make S_ssc_anything() add in the WARN_SUPER flag,
@@ -1747,14 +1748,14 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
     else {
         anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
         if (OP(and_with) == ANYOFD) {
-            anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
+            anded_flags = and_with_flags & ANYOF_COMMON_FLAGS;
         }
         else {
-            anded_flags = ANYOF_FLAGS(and_with)
+            anded_flags = and_with_flags
             &( ANYOF_COMMON_FLAGS
               |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
               |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
-            if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(and_with))) {
+            if (ANYOFL_UTF8_LOCALE_REQD(and_with_flags)) {
                 anded_flags &=
                     ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
             }
@@ -1794,7 +1795,7 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
      *                         <=  (C1 & ~C2) | (P1 & ~P2)
      * */
 
-    if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
+    if ((and_with_flags & ANYOF_INVERT)
         && ! is_ANYOF_SYNTHETIC(and_with))
     {
         unsigned int i;
@@ -1806,7 +1807,7 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
 
         /* If either P1 or P2 is empty, the intersection will be also; can skip
          * the loop */
-        if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
+        if (! (and_with_flags & ANYOF_MATCHES_POSIXL)) {
             ANYOF_POSIXL_ZERO(ssc);
         }
         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
@@ -1866,16 +1867,16 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
             else {
                 ssc->invlist = anded_cp_list;
                 ANYOF_POSIXL_ZERO(ssc);
-                if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
+                if (and_with_flags & ANYOF_MATCHES_POSIXL) {
                     ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
                 }
             }
         }
         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
-                 || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
+                 || (and_with_flags & ANYOF_MATCHES_POSIXL))
         {
             /* One or the other of P1, P2 is non-empty. */
-            if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
+            if (and_with_flags & ANYOF_MATCHES_POSIXL) {
                 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
             }
             ssc_union(ssc, anded_cp_list, FALSE);
@@ -1896,6 +1897,7 @@ S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
 
     SV* ored_cp_list;
     U8 ored_flags;
+    U8  or_with_flags = (OP(or_with) == ANYOFH) ? 0 : ANYOF_FLAGS(or_with);
 
     PERL_ARGS_ASSERT_SSC_OR;
 
@@ -1905,17 +1907,17 @@ S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
      * the code point inversion list and just the relevant flags */
     if (is_ANYOF_SYNTHETIC(or_with)) {
         ored_cp_list = ((regnode_ssc*) or_with)->invlist;
-        ored_flags = ANYOF_FLAGS(or_with);
+        ored_flags = or_with_flags;
     }
     else {
         ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
-        ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
+        ored_flags = or_with_flags & ANYOF_COMMON_FLAGS;
         if (OP(or_with) != ANYOFD) {
             ored_flags
-            |= ANYOF_FLAGS(or_with)
+            |= or_with_flags
              & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
                 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
-            if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(or_with))) {
+            if (ANYOFL_UTF8_LOCALE_REQD(or_with_flags)) {
                 ored_flags |=
                     ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
             }
@@ -1942,12 +1944,12 @@ S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
      * (which results in actually simpler code than the non-inverted case)
      * */
 
-    if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
+    if ((or_with_flags & ANYOF_INVERT)
         && ! is_ANYOF_SYNTHETIC(or_with))
     {
         /* We ignore P2, leaving P1 going forward */
     }   /* else  Not inverted */
-    else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
+    else if (or_with_flags & ANYOF_MATCHES_POSIXL) {
         ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
         if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
             unsigned int i;
@@ -18972,8 +18974,34 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
         * bitmap, optimize to indicate that */
         if (     start[0] >= NUM_ANYOF_CODE_POINTS
             && ! LOC
-            && ! upper_latin1_only_utf8_matches)
+            && ! upper_latin1_only_utf8_matches
+            &&   anyof_flags == 0)
         {
+            UV highest_cp = invlist_highest(cp_list);
+
+            /* If the lowest and highest code point in the class have the same
+             * UTF-8 first byte, then all do, and we can store that byte for
+             * regexec.c to use so that it can more quickly scan the target
+             * string for potential matches for this class.  We co-opt the the
+             * flags field for this.  Zero means, they don't have the same
+             * first byte.  We do accept here very large code points (for
+             * future use), but don't bother with this optimization for them,
+             * as it would cause other complications */
+            if (highest_cp > IV_MAX) {
+                anyof_flags = 0;
+            }
+            else {
+                U8 low_utf8[UTF8_MAXBYTES+1];
+                U8 high_utf8[UTF8_MAXBYTES+1];
+
+                (void) uvchr_to_utf8(low_utf8, start[0]);
+                (void) uvchr_to_utf8(high_utf8, invlist_highest(cp_list));
+
+                anyof_flags = (low_utf8[0] == high_utf8[0])
+                            ? low_utf8[0]
+                            : 0;
+            }
+
             op = ANYOFH;
         }
     }   /* End of seeing if can optimize it into a different node */
@@ -20273,7 +20301,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
         /* 2: embedded, otherwise 1 */
 	Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
     else if (k == ANYOF) {
-	const U8 flags = ANYOF_FLAGS(o);
+	const U8 flags = (OP(o) == ANYOFH) ? 0 : ANYOF_FLAGS(o);
         bool do_sep = FALSE;    /* Do we need to separate various components of
                                    the output? */
         /* Set if there is still an unresolved user-defined property */
@@ -20427,6 +20455,11 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
         /* And finally the matching, closing ']' */
 	Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
 
+        if (OP(o) == ANYOFH && FLAGS(o) != 0) {
+            Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=\\x%02x)", FLAGS(o));
+        }
+
+
         SvREFCNT_dec(unresolved);
     }
     else if (k == ANYOFM) {
diff --git a/regcomp.sym b/regcomp.sym
index 11cf43f429..a35beca063 100644
--- a/regcomp.sym
+++ b/regcomp.sym
@@ -62,7 +62,7 @@ ANYOF       ANYOF,      sv charclass S    ; Match character in (or not in) this
 ANYOFD      ANYOF,      sv charclass S    ; Like ANYOF, but /d is in effect
 ANYOFL      ANYOF,      sv charclass S    ; Like ANYOF, but /l is in effect
 ANYOFPOSIXL ANYOF,      sv charclass_posixl S    ; Like ANYOFL, but matches [[:posix:]] classes
-ANYOFH      ANYOF,      sv 1 S    ; Like ANYOF, but only has "High" matches, none in the bitmap
+ANYOFH      ANYOF,      sv 1 S    ; Like ANYOF, but only has "High" matches, none in the bitmap; non-zero flags "f" means "f" is the first UTF-8 byte shared in common by all code points matched
 ANYOFM      ANYOFM      byte 1 S  ; Like ANYOF, but matches an invariant byte as determined by the mask and arg
 NANYOFM     ANYOFM      byte 1 S  ; complement of ANYOFM
 
diff --git a/regexec.c b/regexec.c
index 87d02fbd37..45a817a7b2 100644
--- a/regexec.c
+++ b/regexec.c
@@ -9744,7 +9744,7 @@ STATIC bool
 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
 {
     dVAR;
-    const char flags = ANYOF_FLAGS(n);
+    const char flags = (OP(n) == ANYOFH) ? 0 : ANYOF_FLAGS(n);
     bool match = FALSE;
     UV c = *p;
 
diff --git a/regnodes.h b/regnodes.h
index 803938ac48..ba691a2c18 100644
--- a/regnodes.h
+++ b/regnodes.h
@@ -33,7 +33,7 @@
 #define	ANYOFD                	19	/* 0x13 Like ANYOF, but /d is in effect */
 #define	ANYOFL                	20	/* 0x14 Like ANYOF, but /l is in effect */
 #define	ANYOFPOSIXL           	21	/* 0x15 Like ANYOFL, but matches [[:posix:]] classes */
-#define	ANYOFH                	22	/* 0x16 Like ANYOF, but only has "High" matches, none in the bitmap */
+#define	ANYOFH                	22	/* 0x16 Like ANYOF, but only has "High" matches, none in the bitmap; non-zero flags "f" means "f" is the first UTF-8 byte shared in common by all code points matched */
 #define	ANYOFM                	23	/* 0x17 Like ANYOF, but matches an invariant byte as determined by the mask and arg */
 #define	NANYOFM               	24	/* 0x18 complement of ANYOFM */
 #define	POSIXD                	25	/* 0x19 Some [[:class:]] under /d; the FLAGS field gives which one */
diff --git a/t/re/anyof.t b/t/re/anyof.t
index f8be0eec31..d33cbb2abe 100644
--- a/t/re/anyof.t
+++ b/t/re/anyof.t
@@ -868,9 +868,17 @@ while (defined (my $test = shift @tests)) {
 
         my $display_expected = $expected
                                   =~ s/ INFTY_minus_1 /$next_highest_cp/xgr;
+        my $test_name = "Verify compilation of $test displays as"
+                      . " $display_expected";
 
         my $result = get_compiled($test);
-        is($result, $expected,
-               "Verify compilation of $test displays as $display_expected");
+        if ($expected =~ / ^ ANYOFH /x) {
+            like($result, qr/ ^ \Q$expected\E (?:\Q (First UTF-8 byte=\x\E
+                              [[:xdigit:]]{2}\) )? $ /x, $test_name);
+        }
+        else {
+            is($result, $expected,
+               "Verify compilation of $test displays as $test_name");
+        }
     }
 }

-- 
Perl5 Master Repository



nntp.perl.org: Perl Programming lists via nntp and http.
Comments to Ask Bjørn Hansen at ask@perl.org | Group listing | About