develooper Front page | perl.perl5.changes | Postings from March 2019

[perl.git] branch blead updated. v5.29.8-101-g99956112fb

From:
Karl Williamson
Date:
March 14, 2019 00:18
Subject:
[perl.git] branch blead updated. v5.29.8-101-g99956112fb
Message ID:
E1h4E4O-0001Nx-41@git.dc.perl.space
In perl.git, the branch blead has been updated

<https://perl5.git.perl.org/perl.git/commitdiff/99956112fb8debe619f1aa0f9a8c2a2887b1bd7a?hp=26b0dc0c52fbcb48a1f10935a8dd8f0b0d4c9209>

- Log -----------------------------------------------------------------
commit 99956112fb8debe619f1aa0f9a8c2a2887b1bd7a
Author: Karl Williamson <khw@cpan.org>
Date:   Tue Mar 12 15:56:05 2019 -0600

    t/re/pat.t: Outdent 2 lines

commit 945961fdfa1d070cc833955c8920cd33e36826b4
Author: Karl Williamson <khw@cpan.org>
Date:   Wed Mar 13 18:03:01 2019 -0600

    Allow qr'\N{...}'

commit f7a9988ec2e0db8a15b4d43dfe2735af47509ca7
Author: Karl Williamson <khw@cpan.org>
Date:   Wed Mar 13 17:58:00 2019 -0600

    regcomp.c: White-space, comments only

commit 25c7fb78daaad4f8ca1aaa9e19d6db99366388c7
Author: Karl Williamson <khw@cpan.org>
Date:   Thu Jan 12 14:50:26 2017 -0700

    toke.c: Allow \N{} handling fcn to be used elsewhere in core
    
    This function will be used in regcomp.c in a later commit.  This commit
    changes the function so that it is callable outside of toke.c.  It adds
    a parameter and moves some code in new_constant to the wrapper function
    so that these do not cause problems when called from outside toke.  And
    it adds some assertions

commit 2c43c309caa1db7b713cc5d1bed58a8f534d977b
Author: Karl Williamson <khw@cpan.org>
Date:   Thu Jan 12 14:46:21 2017 -0700

    toke.c: Add wrapper function
    
    This is in preparation for the underlying function to be called from
    elsewhere.  This adds a wrapper to be used internally in toke.c that
    keeps the other caller of the underlying function from having to know
    the changes to that function.  That function is changed to return any
    error message instead of raising it itself.

commit 164e423c70b80a4c3880f33561005d1fb870699c
Author: Karl Williamson <khw@cpan.org>
Date:   Mon Jan 16 17:43:06 2017 -0700

    toke.c: Change API of static function
    
    This will be useful in future commits.  new_constant() is changed so
    that if an extra parameter is not NULL, it sets it to point to an error
    message instead of raising the message itself.  Thus its caller can
    choose to handle errors itself.

-----------------------------------------------------------------------

Summary of changes:
 embed.fnc               |  13 ++--
 embed.h                 |   3 +-
 pod/perldelta.pod       |   6 ++
 pod/perldiag.pod        |  32 ---------
 pod/perlrebackslash.pod |   9 ++-
 proto.h                 |  11 +++-
 regcomp.c               | 171 ++++++++++++++++++++++++++++++++++++++++--------
 t/re/pat.t              |   8 ++-
 t/re/pat_advanced.t     |  30 +++++++++
 t/re/pat_rt_report.t    |   8 ++-
 t/re/re_tests           |   5 +-
 toke.c                  | 123 ++++++++++++++++++++--------------
 12 files changed, 298 insertions(+), 121 deletions(-)

diff --git a/embed.fnc b/embed.fnc
index 4b04389c20..517bcb577c 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -2736,6 +2736,10 @@ s	|void	|anonymise_cv_maybe	|NN GV *gv|NN CV *cv
 : Used in sv.c and hv.c
 po	|void *	|more_bodies	|const svtype sv_type|const size_t body_size \
 				|const size_t arena_size
+EXpR	|SV*	|get_and_check_backslash_N_name|NN const char* s	\
+				|NN const char* const e			\
+				|const bool is_utf8			\
+				|NN const char** error_msg
 
 #if defined(PERL_IN_TOKE_C)
 s	|void	|check_uni
@@ -2746,7 +2750,7 @@ s	|char*	|force_word	|NN char *start|int token|int check_keyword \
 				|int allow_pack
 s	|SV*	|tokeq		|NN SV *sv
 sR	|char*	|scan_const	|NN char *start
-sR	|SV*	|get_and_check_backslash_N_name|NN const char* s \
+sR	|SV*	|get_and_check_backslash_N_name_wrapper|NN const char* s \
 				|NN const char* const e
 sR	|char*	|scan_formline	|NN char *s
 sR	|char*	|scan_heredoc	|NN char *s
@@ -2786,10 +2790,11 @@ sR	|I32	|sublex_start
 sR	|char *	|filter_gets	|NN SV *sv|STRLEN append
 sR	|HV *	|find_in_my_stash|NN const char *pkgname|STRLEN len
 sR	|char *	|tokenize_use	|int is_use|NN char *s
-so	|SV*	|new_constant	|NULLOK const char *s|STRLEN len \
+so	|SV*	|new_constant	|NULLOK const char *s|STRLEN len	    \
 				|NN const char *key|STRLEN keylen|NN SV *sv \
-				|NULLOK SV *pv|NULLOK const char *type \
-				|STRLEN typelen
+				|NULLOK SV *pv|NULLOK const char *type	    \
+				|STRLEN typelen				    \
+				|NULLOK const char ** error_msg
 s	|int	|ao		|int toketype
 s	|void|parse_ident|NN char **s|NN char **d \
                      |NN char * const e|int allow_package \
diff --git a/embed.h b/embed.h
index 9439f4083b..5e87348293 100644
--- a/embed.h
+++ b/embed.h
@@ -1086,6 +1086,7 @@
 #define av_reify(a)		Perl_av_reify(aTHX_ a)
 #define current_re_engine()	Perl_current_re_engine(aTHX)
 #define cv_ckproto_len_flags(a,b,c,d,e)	Perl_cv_ckproto_len_flags(aTHX_ a,b,c,d,e)
+#define get_and_check_backslash_N_name(a,b,c,d)	Perl_get_and_check_backslash_N_name(aTHX_ a,b,c,d)
 #define grok_atoUV		Perl_grok_atoUV
 #define mg_find_mglob(a)	Perl_mg_find_mglob(aTHX_ a)
 #define multiconcat_stringify(a)	Perl_multiconcat_stringify(aTHX_ a)
@@ -2030,7 +2031,7 @@
 #define force_strict_version(a)	S_force_strict_version(aTHX_ a)
 #define force_version(a,b)	S_force_version(aTHX_ a,b)
 #define force_word(a,b,c,d)	S_force_word(aTHX_ a,b,c,d)
-#define get_and_check_backslash_N_name(a,b)	S_get_and_check_backslash_N_name(aTHX_ a,b)
+#define get_and_check_backslash_N_name_wrapper(a,b)	S_get_and_check_backslash_N_name_wrapper(aTHX_ a,b)
 #define incline(a,b)		S_incline(aTHX_ a,b)
 #define intuit_method(a,b,c)	S_intuit_method(aTHX_ a,b,c)
 #define intuit_more(a,b)	S_intuit_more(aTHX_ a,b)
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index 7518690081..06ae872679 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -43,6 +43,12 @@ the Unicode Consortium suggests.
 Most properties are supported, with the remainder planned for 5.32.
 Details are in L<perlunicode/Wildcards in Property Values>.
 
+=head2 qr'\N{name}' is now supported
+
+Previously it was an error to evaluate a named character C<\N{...}>
+within a single quoted regular expression pattern (whose evaluation is
+deferred from the normal place).  This restriction is now removed.
+
 =head2 Unicode 12.0 is supported
 
 For details, see L<https://www.unicode.org/versions/Unicode12.0.0/>.
diff --git a/pod/perldiag.pod b/pod/perldiag.pod
index eb24f14220..c1d776bb07 100644
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -4021,38 +4021,6 @@ C<\N{...}> is used as one of the end points of the range, such as in
 What is meant here is unclear, as the C<\N{...}> escape is a sequence
 of code points, so this is made an error.
 
-=item \N{NAME} must be resolved by the lexer in regex; marked by
-S<<-- HERE> in m/%s/
-
-(F) When compiling a regex pattern, an unresolved named character or
-sequence was encountered.  This can happen in any of several ways that
-bypass the lexer, such as using single-quotish context, or an extra
-backslash in double-quotish:
-
-    $re = '\N{SPACE}';	# Wrong!
-    $re = "\\N{SPACE}";	# Wrong!
-    /$re/;
-
-Instead, use double-quotes with a single backslash:
-
-    $re = "\N{SPACE}";	# ok
-    /$re/;
-
-The lexer can be bypassed as well by creating the pattern from smaller
-components:
-
-    $re = '\N';
-    /${re}{SPACE}/;	# Wrong!
-
-It's not a good idea to split a construct in the middle like this, and
-it doesn't work here.  Instead use the solution above.
-
-Finally, the message also can happen under the C</x> regex modifier when the
-C<\N> is separated by spaces from the C<{>, in which case, remove the spaces.
-
-    /\N {SPACE}/x;	# Wrong!
-    /\N{SPACE}/x;	# ok
-
 =item No %s allowed while running setuid
 
 (F) Certain operations are deemed to be too insecure for a setuid or
diff --git a/pod/perlrebackslash.pod b/pod/perlrebackslash.pod
index 01226e6a6e..cfd182a7e1 100644
--- a/pod/perlrebackslash.pod
+++ b/pod/perlrebackslash.pod
@@ -187,7 +187,14 @@ rarely see it written without the two leading zeros.  C<\N{U+0041}> means
 "A" even on EBCDIC machines (where the ordinal value of "A" is not 0x41).
 
 It is even possible to give your own names to characters and character
-sequences.  For details, see L<charnames>.
+sequences by using the L<charnames> module.  These custom names are
+lexically scoped, and so a given code point may have different names
+in different scopes.  The name used is what is in effect at the time the
+C<\N{}> is expanded.  For patterns in double-quotish context, that means
+at the time the pattern is parsed.  But for patterns that are delimitted
+by single quotes, the expansion is deferred until pattern compilation
+time, which may very well have a different C<charnames> translator in
+effect.
 
 (There is an expanded internal form that you may see in debug output:
 C<\N{U+I<code point>.I<code point>...}>.
diff --git a/proto.h b/proto.h
index 31d77c1e4e..936d344c32 100644
--- a/proto.h
+++ b/proto.h
@@ -972,6 +972,11 @@ PERL_CALLCONV char*	Perl_form(pTHX_ const char* pat, ...)
 
 PERL_CALLCONV void	Perl_free_tied_hv_pool(pTHX);
 PERL_CALLCONV void	Perl_free_tmps(pTHX);
+PERL_CALLCONV SV*	Perl_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e, const bool is_utf8, const char** error_msg)
+			__attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME	\
+	assert(s); assert(e); assert(error_msg)
+
 PERL_CALLCONV AV*	Perl_get_av(pTHX_ const char *name, I32 flags);
 #define PERL_ARGS_ASSERT_GET_AV	\
 	assert(name)
@@ -6054,9 +6059,9 @@ STATIC char*	S_force_version(pTHX_ char *s, int guessing);
 STATIC char*	S_force_word(pTHX_ char *start, int token, int check_keyword, int allow_pack);
 #define PERL_ARGS_ASSERT_FORCE_WORD	\
 	assert(start)
-STATIC SV*	S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
+STATIC SV*	S_get_and_check_backslash_N_name_wrapper(pTHX_ const char* s, const char* const e)
 			__attribute__warn_unused_result__;
-#define PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME	\
+#define PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME_WRAPPER	\
 	assert(s); assert(e)
 
 STATIC void	S_incline(pTHX_ const char *s, const char *end);
@@ -6074,7 +6079,7 @@ STATIC I32	S_lop(pTHX_ I32 f, U8 x, char *s);
 PERL_STATIC_NO_RET void	S_missingterm(pTHX_ char *s, STRLEN len)
 			__attribute__noreturn__;
 
-STATIC SV*	S_new_constant(pTHX_ const char *s, STRLEN len, const char *key, STRLEN keylen, SV *sv, SV *pv, const char *type, STRLEN typelen);
+STATIC SV*	S_new_constant(pTHX_ const char *s, STRLEN len, const char *key, STRLEN keylen, SV *sv, SV *pv, const char *type, STRLEN typelen, const char ** error_msg);
 #define PERL_ARGS_ASSERT_NEW_CONSTANT	\
 	assert(key); assert(sv)
 STATIC void	S_no_op(pTHX_ const char *const what, char *s);
diff --git a/regcomp.c b/regcomp.c
index 3b269466ee..d7c3d46fc8 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -193,6 +193,7 @@ struct RExC_state_t {
     scan_frame *frame_last;
     U32         frame_count;
     AV         *warn_text;
+    HV         *unlexed_names;
 #ifdef ADD_TO_REGEXEC
     char 	*starttry;		/* -Dr: where regtry was called. */
 #define RExC_starttry	(pRExC_state->starttry)
@@ -280,6 +281,7 @@ struct RExC_state_t {
 #define RExC_warn_text (pRExC_state->warn_text)
 #define RExC_in_script_run      (pRExC_state->in_script_run)
 #define RExC_use_BRANCHJ        (pRExC_state->use_BRANCHJ)
+#define RExC_unlexed_names (pRExC_state->unlexed_names)
 
 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
  * a flag to disable back-off on the fixed/floating substrings - if it's
@@ -7357,6 +7359,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
     }
 
     pRExC_state->warn_text = NULL;
+    pRExC_state->unlexed_names = NULL;
     pRExC_state->code_blocks = NULL;
 
     if (is_bare_re)
@@ -12610,20 +12613,23 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
   * points) that this \N sequence matches.  This is set, and the input is
   * parsed for errors, even if the function returns FALSE, as detailed below.
   *
-  * There are 5 possibilities here, as detailed in the next 5 paragraphs.
+  * There are 6 possibilities here, as detailed in the next 6 paragraphs.
   *
   * Probably the most common case is for the \N to specify a single code point.
   * *cp_count will be set to 1, and *code_point_p will be set to that code
   * point.
   *
-  * Another possibility is for the input to be an empty \N{}, which for
-  * backwards compatibility we accept.  *cp_count will be set to 0. *node_p
-  * will be set to a generated NOTHING node.
+  * Another possibility is for the input to be an empty \N{}.  This is no
+  * longer accepted, and will generate a fatal error.
+  *
+  * Another possibility is for a custom charnames handler to be in effect which
+  * translates the input name to an empty string.  *cp_count will be set to 0.
+  * *node_p will be set to a generated NOTHING node.
   *
   * Still another possibility is for the \N to mean [^\n]. *cp_count will be
   * set to 0. *node_p will be set to a generated REG_ANY node.
   *
-  * The fourth possibility is that \N resolves to a sequence of more than one
+  * The fifth possibility is that \N resolves to a sequence of more than one
   * code points.  *cp_count will be set to the number of code points in the
   * sequence. *node_p will be set to a generated node returned by this
   * function calling S_reg().
@@ -12631,7 +12637,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
   * The final possibility is that it is premature to be calling this function;
   * the parse needs to be restarted.  This can happen when this changes from
   * /d to /u rules, or when the pattern needs to be upgraded to UTF-8.  The
-  * latter occurs only when the fourth possibility would otherwise be in
+  * latter occurs only when the fifth possibility would otherwise be in
   * effect, and is because one of those code points requires the pattern to be
   * recompiled as UTF-8.  The function returns FALSE, and sets the
   * RESTART_PARSE and NEED_UTF8 flags in *flagp, as appropriate.  When this
@@ -12648,12 +12654,11 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
   * so we need a way to take a snapshot of what they resolve to at the time of
   * the original parse. [perl #56444].
   *
-  * That parsing is skipped for single-quoted regexes, so we may here get
-  * '\N{NAME}'.  This is a fatal error.  These names have to be resolved by the
-  * parser.  But if the single-quoted regex is something like '\N{U+41}', that
-  * is legal and handled here.  The code point is Unicode, and has to be
-  * translated into the native character set for non-ASCII platforms.
-  */
+  * That parsing is skipped for single-quoted regexes, so here we may get
+  * '\N{NAME}', which is parsed now.  If the single-quoted regex is something
+  * like '\N{U+41}', that code point is Unicode, and has to be translated into
+  * the native character set for non-ASCII platforms.  The other possibilities
+  * are already native, so no translation is done. */
 
     char * endbrace;    /* points to '}' following the name */
     char* p = RExC_parse; /* Temporary */
@@ -12661,8 +12666,8 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
     SV * substitute_parse = NULL;
     char *orig_end;
     char *save_start;
+    bool save_strict;
     I32 flags;
-    Size_t count = 0;   /* code point count kept internally by this function */
 
     GET_RE_DEBUG_FLAGS_DECL;
 
@@ -12685,7 +12690,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
 
     /* Disambiguate between \N meaning a named character versus \N meaning
      * [^\n].  The latter is assumed when the {...} following the \N is a legal
-     * quantifier, or there is no '{' at all */
+     * quantifier, or if there is no '{' at all */
     if (*p != '{' || regcurly(p)) {
         RExC_parse = p;
         if (cp_count) {
@@ -12718,15 +12723,19 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
         vFAIL2("Missing right brace on \\%c{}", 'N');
     }
 
-    /* Here, we have decided it should be a named character or sequence */
-    REQUIRE_UNI_RULES(flagp, FALSE); /* Unicode named chars imply Unicode
-                                        semantics */
+    /* Here, we have decided it should be a named character or sequence.  These
+     * imply Unicode semantics */
+    REQUIRE_UNI_RULES(flagp, FALSE);
 
-    if (endbrace == RExC_parse) {   /* empty: \N{} */
+    /* \N{_} is what toke.c returns to us to indicate a name that evaluates to
+     * nothing at all (not allowed under strict) */
+    if (endbrace - RExC_parse == 1 && *RExC_parse == '_') {
+        RExC_parse = endbrace;
         if (strict) {
             RExC_parse++;   /* Position after the "}" */
             vFAIL("Zero length \\N{}");
         }
+
         if (cp_count) {
             *cp_count = 0;
         }
@@ -12739,15 +12748,122 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
         return TRUE;
     }
 
-    /* If we haven't got something that begins with 'U+', then it didn't get lexed. */
-    if (   endbrace - RExC_parse < 2
-        || strnNE(RExC_parse, "U+", 2))
-    {
-        RExC_parse = endbrace;  /* position msg's '<--HERE' */
-        vFAIL("\\N{NAME} must be resolved by the lexer");
-    }
+    if (endbrace - RExC_parse < 2 || ! strBEGINs(RExC_parse, "U+")) {
+
+        /* Here, the name isn't of the form  U+....  This can happen if the
+         * pattern is single-quoted, so didn't get evaluated in toke.c.  Now
+         * is the time to find out what the name means */
+
+        const STRLEN name_len = endbrace - RExC_parse;
+        SV *  value_sv;     /* What does this name evaluate to */
+        SV ** value_svp;
+        const U8 * value;   /* string of name's value */
+        STRLEN value_len;   /* and its length */
+
+        /*  RExC_unlexed_names is a hash of names that weren't evaluated by
+         *  toke.c, and their values. Make sure is initialized */
+        if (! RExC_unlexed_names) {
+            RExC_unlexed_names = newHV();
+        }
+
+        /* If we have already seen this name in this pattern, use that.  This
+         * allows us to only call the charnames handler once per name per
+         * pattern.  A broken or malicious handler could return something
+         * different each time, which could cause the results to vary depending
+         * on if something gets added or subtracted from the pattern that
+         * causes the number of passes to change, for example */
+        if ((value_svp = hv_fetch(RExC_unlexed_names, RExC_parse,
+                                                      name_len, 0)))
+        {
+            value_sv = *value_svp;
+        }
+        else { /* Otherwise we have to go out and get the name */
+            const char * error_msg = NULL;
+            value_sv = get_and_check_backslash_N_name(RExC_parse, endbrace,
+                                                      UTF,
+                                                      &error_msg);
+            if (error_msg) {
+                RExC_parse = endbrace;
+                vFAIL(error_msg);
+            }
+
+            /* If no error message, should have gotten a valid return */
+            assert (value_sv);
+
+            /* Save the name's meaning for later use */
+            if (! hv_store(RExC_unlexed_names, RExC_parse, name_len,
+                           value_sv, 0))
+            {
+                Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
+            }
+        }
+
+        /* Here, we have the value the name evaluates to in 'value_sv' */
+        value = (U8 *) SvPV(value_sv, value_len);
+
+        /* See if the result is one code point vs 0 or multiple */
+        if (value_len > 0 && value_len <= ((SvUTF8(value_sv))
+                                           ? UTF8SKIP(value)
+                                           : 1))
+        {
+            /* Here, exactly one code point.  If that isn't what is wanted,
+             * fail */
+            if (! code_point_p) {
+                RExC_parse = p;
+                return FALSE;
+            }
 
-        /* This code purposely indented below because of future changes coming */
+            /* Convert from string to numeric code point */
+            *code_point_p = (SvUTF8(value_sv))
+                            ? valid_utf8_to_uvchr(value, NULL)
+                            : *value;
+
+            /* Have parsed this entire single code point \N{...}.  *cp_count
+             * has already been set to 1, so don't do it again. */
+            RExC_parse = endbrace;
+            nextchar(pRExC_state);
+            return TRUE;
+        } /* End of is a single code point */
+
+        /* Count the code points, if caller desires.  The API says to do this
+         * even if we will later return FALSE */
+        if (cp_count) {
+            *cp_count = 0;
+
+            *cp_count = (SvUTF8(value_sv))
+                        ? utf8_length(value, value + value_len)
+                        : value_len;
+        }
+
+        /* Fail if caller doesn't want to handle a multi-code-point sequence.
+         * But don't back the pointer up if the caller wants to know how many
+         * code points there are (they need to handle it themselves in this
+         * case).  */
+        if (! node_p) {
+            if (! cp_count) {
+                RExC_parse = p;
+            }
+            return FALSE;
+        }
+
+        /* Convert this to a sub-pattern of the form "(?: ... )", and then call
+         * reg recursively to parse it.  That way, it retains its atomicness,
+         * while not having to worry about any special handling that some code
+         * points may have. */
+
+        substitute_parse = newSVpvs("?:");
+        sv_catsv(substitute_parse, value_sv);
+        sv_catpv(substitute_parse, ")");
+
+#ifdef EBCDIC
+        /* The value should already be native, so no need to convert on EBCDIC
+         * platforms.*/
+        assert(! RExC_recode_x_to_native);
+#endif
+
+    }
+    else {   /* \N{U+...} */
+        Size_t count = 0;   /* code point count kept internally */
 
         /* We can get to here when the input is \N{U+...} or when toke.c has
          * converted a name to the \N{U+...} form.  This include changing a
@@ -12882,6 +12998,8 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
         RExC_recode_x_to_native = 1;
 #endif
 
+    }
+
     /* Here, we have the string the name evaluates to, ready to be parsed,
      * stored in 'substitute_parse' as a series of valid "\x{...}\x{...}"
      * constructs.  This can be called from within a substitute parse already.
@@ -13766,7 +13884,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                    || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
                    || UTF8_IS_START(UCHARAT(RExC_parse)));
 
-
             /* Here, we have a literal character.  Find the maximal string of
              * them in the input that we can fit into a single EXACTish node.
              * We quit at the first non-literal or when the node gets full, or
diff --git a/t/re/pat.t b/t/re/pat.t
index c3e4521131..e97031f2d9 100644
--- a/t/re/pat.t
+++ b/t/re/pat.t
@@ -20,10 +20,11 @@ BEGIN {
     require './loc_tools.pl';
     set_up_inc('../lib', '.', '../ext/re');
 }
-    skip_all('no re module') unless defined &DynaLoader::boot_DynaLoader;
-    skip_all_without_unicode_tables();
 
-plan tests => 853;  # Update this when adding/deleting tests.
+skip_all('no re module') unless defined &DynaLoader::boot_DynaLoader;
+skip_all_without_unicode_tables();
+
+plan tests => 854;  # Update this when adding/deleting tests.
 
 run_tests() unless caller;
 
@@ -1352,6 +1353,7 @@ EOP
         unlike "\x{100}", qr/(?i:\w)/, "(?i: shouldn't lose the passed in /a";
         use re '/aa';
         unlike 'k', qr/(?i:\N{KELVIN SIGN})/, "(?i: shouldn't lose the passed in /aa";
+        unlike 'k', qr'(?i:\N{KELVIN SIGN})', "(?i: shouldn't lose the passed in /aa";
     }
 
     {
diff --git a/t/re/pat_advanced.t b/t/re/pat_advanced.t
index 77befc14ba..21a43b8fe2 100644
--- a/t/re/pat_advanced.t
+++ b/t/re/pat_advanced.t
@@ -534,28 +534,46 @@ sub run_tests {
 
         like("\N{LATIN SMALL LETTER SHARP S}",
 	     qr/\N{LATIN SMALL LETTER SHARP S}/, $message);
+        like("\N{LATIN SMALL LETTER SHARP S}",
+	     qr'\N{LATIN SMALL LETTER SHARP S}', $message);
         like("\N{LATIN SMALL LETTER SHARP S}",
 	     qr/\N{LATIN SMALL LETTER SHARP S}/i, $message);
+        like("\N{LATIN SMALL LETTER SHARP S}",
+	     qr'\N{LATIN SMALL LETTER SHARP S}'i, $message);
         like("\N{LATIN SMALL LETTER SHARP S}",
 	     qr/[\N{LATIN SMALL LETTER SHARP S}]/, $message);
+        like("\N{LATIN SMALL LETTER SHARP S}",
+	     qr'[\N{LATIN SMALL LETTER SHARP S}]', $message);
         like("\N{LATIN SMALL LETTER SHARP S}",
 	     qr/[\N{LATIN SMALL LETTER SHARP S}]/i, $message);
+        like("\N{LATIN SMALL LETTER SHARP S}",
+	     qr'[\N{LATIN SMALL LETTER SHARP S}]'i, $message);
 
         like("ss", qr /\N{LATIN SMALL LETTER SHARP S}/i, $message);
+        like("ss", qr '\N{LATIN SMALL LETTER SHARP S}'i, $message);
         like("SS", qr /\N{LATIN SMALL LETTER SHARP S}/i, $message);
+        like("SS", qr '\N{LATIN SMALL LETTER SHARP S}'i, $message);
         like("ss", qr/[\N{LATIN SMALL LETTER SHARP S}]/i, $message);
+        like("ss", qr'[\N{LATIN SMALL LETTER SHARP S}]'i, $message);
         like("SS", qr/[\N{LATIN SMALL LETTER SHARP S}]/i, $message);
+        like("SS", qr'[\N{LATIN SMALL LETTER SHARP S}]'i, $message);
 
         like("\N{LATIN SMALL LETTER SHARP S}", qr/ss/i, $message);
         like("\N{LATIN SMALL LETTER SHARP S}", qr/SS/i, $message);
 
          $message = "Unoptimized named sequence in class";
         like("ss", qr/[\N{LATIN SMALL LETTER SHARP S}x]/i, $message);
+        like("ss", qr'[\N{LATIN SMALL LETTER SHARP S}x]'i, $message);
         like("SS", qr/[\N{LATIN SMALL LETTER SHARP S}x]/i, $message);
+        like("SS", qr'[\N{LATIN SMALL LETTER SHARP S}x]'i, $message);
         like("\N{LATIN SMALL LETTER SHARP S}",
 	     qr/[\N{LATIN SMALL LETTER SHARP S}x]/, $message);
+        like("\N{LATIN SMALL LETTER SHARP S}",
+	     qr'[\N{LATIN SMALL LETTER SHARP S}x]', $message);
         like("\N{LATIN SMALL LETTER SHARP S}",
 	     qr/[\N{LATIN SMALL LETTER SHARP S}x]/i, $message);
+        like("\N{LATIN SMALL LETTER SHARP S}",
+	     qr'[\N{LATIN SMALL LETTER SHARP S}x]'i, $message);
     }
 
     {
@@ -825,6 +843,8 @@ sub run_tests {
             for my $tail ('\N{SNOWFLAKE}') {
                 eval qq {use charnames ':full';
                          like("$head$tail", qr/$head$tail/, \$message)};
+                eval qq {use charnames ':full';
+                         like("$head$tail", qr'$head$tail', \$message)};
 		is($@, '', $message);
             }
         }
@@ -942,8 +962,12 @@ sub run_tests {
         # time: A AB ABC ABCD ...
         ok 'AB'  =~ /(\N{EVIL})/ && $1 eq 'A', 'Charname caching $1';
         like 'ABC', qr/(\N{EVIL})/,              'Charname caching $1';
+        ok 'ABCD'  =~ m'(\N{EVIL})' && $1 eq 'ABC', 'Charname caching $1';
+        ok 'ABCDE'  =~ m'(\N{EVIL})',          'Charname caching $1';
         like 'xy',  qr/x\N{EMPTY-STR}y/,
                     'Empty string charname produces NOTHING node';
+        ok 'xy'  =~ 'x\N{EMPTY-STR}y',
+                    'Empty string charname produces NOTHING node';
         like '', qr/\N{EMPTY-STR}/,
                     'Empty string charname produces NOTHING node';
         like "\N{LONG-STR}", qr/^\N{LONG-STR}$/, 'Verify that long string works';
@@ -951,9 +975,14 @@ sub run_tests {
 
         # perlhacktips points out that these work on both ASCII and EBCDIC
         like "\xfc", qr/\N{EMPTY-STR}\xdc/i, 'Empty \N{} should change /d to /u';
+        like "\xfc", qr'\N{EMPTY-STR}\xdc'i, 'Empty \N{} should change /d to /u';
 
         eval '/(?[[\N{EMPTY-STR}]])/';
         like $@, qr/Zero length \\N\{\}/, 'Verify zero-length return from \N{} correctly fails';
+        ok "\N{LONG-STR}" =~ /^\N{LONG-STR}$/, 'Verify that long string works';
+        ok "\N{LONG-STR}" =~ '^\N{LONG-STR}$', 'Verify that long string works';
+        ok "\N{LONG-STR}" =~ /^\N{LONG-STR}$/i, 'Verify under folding that long string works';
+        ok "\N{LONG-STR}" =~ m'^\N{LONG-STR}$'i, 'Verify under folding that long string works';
 
         undef $w;
         {
@@ -2436,6 +2465,7 @@ EOF
     {   # [perl #126606 crashed the interpreter
         use Cname;
         like("sS", qr/\N{EMPTY-STR}Ss|/i, '\N{} with empty branch alternation works');
+        like("sS", qr'\N{EMPTY-STR}Ss|'i, '\N{} with empty branch alternation works');
     }
 
     { # Regexp:Grammars was broken:
diff --git a/t/re/pat_rt_report.t b/t/re/pat_rt_report.t
index dd740e713b..de2590047d 100644
--- a/t/re/pat_rt_report.t
+++ b/t/re/pat_rt_report.t
@@ -20,7 +20,7 @@ use warnings;
 use 5.010;
 use Config;
 
-plan tests => 2504;  # Update this when adding/deleting tests.
+plan tests => 2509;  # Update this when adding/deleting tests.
 
 run_tests() unless caller;
 
@@ -1044,23 +1044,29 @@ sub run_tests {
         use charnames ":full";
         # Delayed interpolation of \N'
         my $r1 = qr/\N{THAI CHARACTER SARA I}/;
+        my $r2 = qr'\N{THAI CHARACTER SARA I}';
         my $s1 = "\x{E34}\x{E34}\x{E34}\x{E34}";
 
         # Bug #56444
         ok $s1 =~ /$r1+/, 'my $r1 = qr/\N{THAI CHARACTER SARA I}/; my $s1 = "\x{E34}\x{E34}\x{E34}\x{E34}; $s1 =~ /$r1+/';
+        ok $s1 =~ /$r2+/, 'my $r2 = qr\'\N{THAI CHARACTER SARA I}\'; my $s1 = "\x{E34}\x{E34}\x{E34}\x{E34}; $s1 =~ \'$r2+\'';
 
         # Bug #62056
         ok "${s1}A" =~ m/$s1\N{LATIN CAPITAL LETTER A}/, '"${s1}A" =~ m/$s1\N{LATIN CAPITAL LETTER A}/';
 
         ok "abbbbc" =~ m/\N{1}/ && $& eq "a", '"abbbbc" =~ m/\N{1}/ && $& eq "a"';
+        ok "abbbbc" =~ m'\N{1}' && $& eq "a", '"abbbbc" =~ m\'\N{1}\' && $& eq "a"';
         ok "abbbbc" =~ m/\N{3,4}/ && $& eq "abbb", '"abbbbc" =~ m/\N{3,4}/ && $& eq "abbb"';
+        ok "abbbbc" =~ m'\N{3,4}' && $& eq "abbb", '"abbbbc" =~ m\'\N{3,4}\' && $& eq "abbb"';
     }
 
     {
         use charnames ":full";
         my $message = '[perl #74982] Period coming after \N{}';
         ok("\x{ff08}." =~ m/\N{FULLWIDTH LEFT PARENTHESIS}./ && $& eq "\x{ff08}.", $message);
+        ok("\x{ff08}." =~ m'\N{FULLWIDTH LEFT PARENTHESIS}.' && $& eq "\x{ff08}.", $message);
         ok("\x{ff08}." =~ m/[\N{FULLWIDTH LEFT PARENTHESIS}]./ && $& eq "\x{ff08}.", $message);
+        ok("\x{ff08}." =~ m'[\N{FULLWIDTH LEFT PARENTHESIS}].' && $& eq "\x{ff08}.", $message);
     }
 
 SKIP: {
diff --git a/t/re/re_tests b/t/re/re_tests
index a8b6748c7e..9b615ea2c2 100644
--- a/t/re/re_tests
+++ b/t/re/re_tests
@@ -1446,7 +1446,9 @@ foo(\h)bar	foo\tbar	y	$1	\t
 
 # Verify that \N{U+...} forces Unicode rules
 /\N{U+41}\x{c1}/i	a\x{e1}	y	$&	a\x{e1}
+'\N{U+41}\x{c1}'i	a\x{e1}	y	$&	a\x{e1}
 /[\N{U+41}\x{c1}]/i	\x{e1}	y	$&	\x{e1}
+'[\N{U+41}\x{c1}]'i	\x{e1}	y	$&	\x{e1}
 '\N{U+41}'	A	y	$&	A		# Even for single quoted patterns
 
 [\s][\S]	\x{a0}\x{a0}	n	-	-	# Unicode complements should not match same character
@@ -1479,7 +1481,7 @@ abc\N	abc\n	n
 [\N{U+}]	-	c	-	Invalid hexadecimal number
 \N{U+4AG3}	-	c	-	Invalid hexadecimal number
 [\N{U+4AG3}]	-	c	-	Invalid hexadecimal number
-abc\N{def}	-	c	-	\\N{NAME} must be resolved by the lexer
+abc\N{def}	-	c	-	Unknown charname 'def' in regex
 abc\N{U+4AG3	-	c	-	Missing right brace on \\N{}
 abc\N{def	-	c	-	Missing right brace on \\N{}
 abc\N{	-	c	-	Missing right brace on \\N{}
@@ -1490,6 +1492,7 @@ abc\N{	-	c	-	Missing right brace on \\N{}
 
 # Verifies catches hex errors
 /\N{U+0xBEEF}/	-	c	-	Invalid hexadecimal number
+\N{U+0xBEEF}	-	c	-	Invalid hexadecimal number
 # Used to be an error, but not any more:
 /\N{U+BEEF.BEAD}/	-	c	-	
 
diff --git a/toke.c b/toke.c
index 04d851232f..f17bfe143e 100644
--- a/toke.c
+++ b/toke.c
@@ -41,8 +41,8 @@ Individual members of C<PL_parser> have their own documentation.
 #include "dquote_inline.h"
 #include "invlist_inline.h"
 
-#define new_constant(a,b,c,d,e,f,g)	\
-	S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
+#define new_constant(a,b,c,d,e,f,g, h)	\
+	S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g, h)
 
 #define pl_yylval	(PL_parser->yylval)
 
@@ -2332,7 +2332,7 @@ S_tokeq(pTHX_ SV *sv)
     SvCUR_set(sv, d - SvPVX_const(sv));
   finish:
     if ( PL_hints & HINT_NEW_STRING )
-       return new_constant(NULL, 0, "q", sv, pv, "q", 1);
+       return new_constant(NULL, 0, "q", sv, pv, "q", 1, NULL);
     return sv;
 }
 
@@ -2591,34 +2591,69 @@ S_sublex_done(pTHX)
 }
 
 STATIC SV*
-S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
+S_get_and_check_backslash_N_name_wrapper(pTHX_ const char* s, const char* const e)
+{
+    /* This justs wraps get_and_check_backslash_N_name() to output any error
+     * message it returns. */
+
+    const char * error_msg = NULL;
+    SV * result;
+
+    PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME_WRAPPER;
+
+    /* charnames doesn't work well if there have been errors found */
+    if (PL_error_count > 0) {
+	return NULL;
+    }
+
+    result = get_and_check_backslash_N_name(s, e, cBOOL(UTF), &error_msg);
+
+    if (error_msg) {
+        yyerror_pv(error_msg, UTF ? SVf_UTF8 : 0);
+    }
+
+    return result;
+}
+
+SV*
+Perl_get_and_check_backslash_N_name(pTHX_ const char* s,
+                                          const char* const e,
+                                          const bool is_utf8,
+                                          const char ** error_msg)
 {
     /* <s> points to first character of interior of \N{}, <e> to one beyond the
      * interior, hence to the "}".  Finds what the name resolves to, returning
-     * an SV* containing it; NULL if no valid one found */
-
-    dVAR;
-    SV* res = newSVpvn_flags(s, e - s, UTF ? SVf_UTF8 : 0);
+     * an SV* containing it; NULL if no valid one found.
+     *
+     * 'is_utf8' is TRUE if we know we want the result to be UTF-8 even if it
+     * doesn't have to be. */
 
+    SV* res;
     HV * table;
     SV **cvp;
     SV *cv;
     SV *rv;
     HV *stash;
     const char* backslash_ptr = s - 3; /* Points to the <\> of \N{... */
+    dVAR;
 
     PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME;
 
+    assert(e >= s);
+    assert(s > (char *) 3);
+
+    res = newSVpvn_flags(s, e - s, (is_utf8) ? SVf_UTF8 : 0);
+
     if (!SvCUR(res)) {
         SvREFCNT_dec_NN(res);
         /* diag_listed_as: Unknown charname '%s' */
-        yyerror("Unknown charname ''");
+        *error_msg = Perl_form(aTHX_ "Unknown charname ''");
         return NULL;
     }
 
     res = new_constant( NULL, 0, "charnames", res, NULL, backslash_ptr,
                         /* include the <}> */
-                        e - backslash_ptr + 1);
+                        e - backslash_ptr + 1, error_msg);
     if (! SvPOK(res)) {
         SvREFCNT_dec_NN(res);
         return NULL;
@@ -2647,7 +2682,7 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
      * characters that begin a character name alias are alphabetic, otherwise
      * would have to create a isCHARNAME_BEGIN macro */
 
-    if (! UTF) {
+    if (! is_utf8) {
         if (! isALPHAU(*s)) {
             goto bad_charname;
         }
@@ -2721,14 +2756,11 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
         /* diag_listed_as: charnames alias definitions may not contain
                            trailing white-space; marked by <-- HERE in %s
          */
-        yyerror_pv(
-            Perl_form(aTHX_
+        *error_msg = Perl_form(aTHX_
             "charnames alias definitions may not contain trailing "
             "white-space; marked by <-- HERE in %.*s<-- HERE %.*s",
             (int)(s - backslash_ptr + 1), backslash_ptr,
-            (int)(e - s + 1), s + 1
-            ),
-        UTF ? SVf_UTF8 : 0);
+            (int)(e - s + 1), s + 1);
         return NULL;
     }
 
@@ -2745,13 +2777,10 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
                                               0 /* 0 means don't die */ );
             /* diag_listed_as: Malformed UTF-8 returned by \N{%s}
                                immediately after '%s' */
-            yyerror_pv(
-              Perl_form(aTHX_
+            *error_msg = Perl_form(aTHX_
                 "Malformed UTF-8 returned by %.*s immediately after '%.*s'",
                  (int) (e - backslash_ptr + 1), backslash_ptr,
-                 (int) ((char *) first_bad_char_loc - str), str
-              ),
-              SVf_UTF8);
+                 (int) ((char *) first_bad_char_loc - str), str);
             return NULL;
         }
     }
@@ -2764,13 +2793,10 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
          * that this print won't run off the end of the string */
         /* diag_listed_as: Invalid character in \N{...}; marked by <-- HERE
                            in \N{%s} */
-        yyerror_pv(
-          Perl_form(aTHX_
+        *error_msg = Perl_form(aTHX_
             "Invalid character in \\N{...}; marked by <-- HERE in %.*s<-- HERE %.*s",
             (int)(s - backslash_ptr + 1), backslash_ptr,
-            (int)(e - s + 1), s + 1
-          ),
-          UTF ? SVf_UTF8 : 0);
+            (int)(e - s + 1), s + 1);
         return NULL;
     }
 
@@ -2778,14 +2804,11 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
         /* diag_listed_as: charnames alias definitions may not contain a
                            sequence of multiple spaces; marked by <-- HERE
                            in %s */
-        yyerror_pv(
-          Perl_form(aTHX_
+        *error_msg = Perl_form(aTHX_
             "charnames alias definitions may not contain a sequence of "
             "multiple spaces; marked by <-- HERE in %.*s<-- HERE %.*s",
             (int)(s - backslash_ptr + 1), backslash_ptr,
-            (int)(e - s + 1), s + 1
-          ),
-          UTF ? SVf_UTF8 : 0);
+            (int)(e - s + 1), s + 1);
         return NULL;
 }
 
@@ -3764,15 +3787,16 @@ S_scan_const(pTHX_ char *start)
 		    }
 		}
 		else /* Here is \N{NAME} but not \N{U+...}. */
-                     if ((res = get_and_check_backslash_N_name(s, e)))
+                     if ((res = get_and_check_backslash_N_name_wrapper(s, e)))
                 {
                     STRLEN len;
                     const char *str = SvPV_const(res, len);
                     if (PL_lex_inpat) {
 
 			if (! len) { /* The name resolved to an empty string */
-			    Copy("\\N{}", d, 4, char);
-			    d += 4;
+                            const char empty_N[] = "\\N{_}";
+                            Copy(empty_N, d, sizeof(empty_N) - 1, char);
+                            d += sizeof(empty_N) - 1;
 			}
 			else {
 			    /* In order to not lose information for the regex
@@ -4118,7 +4142,7 @@ S_scan_const(pTHX_ char *start)
 	    }
 
 	    sv = S_new_constant(aTHX_ start, s - start, key, keylen, sv, NULL,
-				type, typelen);
+				type, typelen, NULL);
 	}
         pl_yylval.opval = newSVOP(OP_CONST, 0, sv);
     }
@@ -9176,11 +9200,15 @@ S_checkcomma(pTHX_ const char *s, const char *name, const char *what)
    Best used as sv=new_constant(..., sv, ...).
    If s, pv are NULL, calls subroutine with one argument,
    and <type> is used with error messages only.
-   <type> is assumed to be well formed UTF-8 */
+   <type> is assumed to be well formed UTF-8.
+
+   If error_msg is not NULL, *error_msg will be set to any error encountered.
+   Otherwise yyerror() will be used to output it */
 
 STATIC SV *
 S_new_constant(pTHX_ const char *s, STRLEN len, const char *key, STRLEN keylen,
-	       SV *sv, SV *pv, const char *type, STRLEN typelen)
+	       SV *sv, SV *pv, const char *type, STRLEN typelen,
+               const char ** error_msg)
 {
     dSP;
     HV * table = GvHV(PL_hintgv);		 /* ^H */
@@ -9195,13 +9223,6 @@ S_new_constant(pTHX_ const char *s, STRLEN len, const char *key, STRLEN keylen,
     if (*key == 'c') { assert (strEQ(key, "charnames")); }
     assert(type || s);
 
-    /* charnames doesn't work well if there have been errors found */
-    if (PL_error_count > 0 && *key == 'c')
-    {
-	SvREFCNT_dec_NN(sv);
-	return &PL_sv_undef;
-    }
-
     sv_2mortal(sv);			/* Parent created it permanently */
     if (!table
 	|| ! (PL_hints & HINT_LOCALIZE_HH)
@@ -9256,7 +9277,12 @@ S_new_constant(pTHX_ const char *s, STRLEN len, const char *key, STRLEN keylen,
                                     (type ? type: s), why1, why2, why3);
             }
         }
-	yyerror_pv(msg, UTF ? SVf_UTF8 : 0);
+        if (error_msg) {
+            *error_msg = msg;
+        }
+        else {
+            yyerror_pv(msg, UTF ? SVf_UTF8 : 0);
+        }
   	return SvREFCNT_inc_simple_NN(sv);
     }
   now_ok:
@@ -11246,9 +11272,10 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp)
 	    }
 	    if (just_zero && (PL_hints & HINT_NEW_INTEGER))
 		sv = new_constant(start, s - start, "integer",
-				  sv, NULL, NULL, 0);
+				  sv, NULL, NULL, 0, NULL);
 	    else if (PL_hints & HINT_NEW_BINARY)
-		sv = new_constant(start, s - start, "binary", sv, NULL, NULL, 0);
+		sv = new_constant(start, s - start, "binary",
+                                  sv, NULL, NULL, 0, NULL);
 	}
 	break;
 
@@ -11453,7 +11480,7 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp)
 	    const char *const key = floatit ? "float" : "integer";
 	    const STRLEN keylen = floatit ? 5 : 7;
 	    sv = S_new_constant(aTHX_ PL_tokenbuf, d - PL_tokenbuf,
-				key, keylen, sv, NULL, NULL, 0);
+				key, keylen, sv, NULL, NULL, 0, NULL);
 	}
 	break;
 

-- 
Perl5 Master Repository



nntp.perl.org: Perl Programming lists via nntp and http.
Comments to Ask Bjørn Hansen at ask@perl.org | Group listing | About