develooper Front page | perl.perl5.changes | Postings from March 2019

[perl.git] branch blead updated. v5.29.8-93-g14f657d436

From:
Karl Williamson
Date:
March 13, 2019 21:43
Subject:
[perl.git] branch blead updated. v5.29.8-93-g14f657d436
Message ID:
E1h4BeI-0007hh-7i@git.dc.perl.space
In perl.git, the branch blead has been updated

<https://perl5.git.perl.org/perl.git/commitdiff/14f657d436dd5738712c1d294e7d5f7898336ba4?hp=823c3b2daca3409863f10ec5e1c6d416d2614a5a>

- Log -----------------------------------------------------------------
commit 14f657d436dd5738712c1d294e7d5f7898336ba4
Author: Karl Williamson <khw@cpan.org>
Date:   Wed Mar 13 15:21:39 2019 -0600

    regexec.c: We know the end ptr; don't need to recalc

commit 2892a27e931e4ba534dd20dc9c94542eda19afbf
Author: Karl Williamson <khw@cpan.org>
Date:   Wed Mar 13 15:20:50 2019 -0600

    regexec.c: Add assertion

commit 1f25ceb1dbba05ad62ee0a371c12863528fc4a9f
Author: Karl Williamson <khw@cpan.org>
Date:   Wed Mar 13 14:23:03 2019 -0600

    regcomp.c: Add assertion

commit ee2223a54bee07e3f5e8a63ce5bd71c29f9a2e85
Author: Karl Williamson <khw@cpan.org>
Date:   Wed Mar 13 13:36:00 2019 -0600

    regcomp.c: Rmv unnecessary branch
    
    The function memchr() seems to get inlined so it is very fast, and it's
    legal to call it with a 0 length, so let it figure out that it's zero.

commit 135226faaef3671e917c2e1d253e89a47c2b64f0
Author: Karl Williamson <khw@cpan.org>
Date:   Wed Mar 13 13:23:24 2019 -0600

    perlvar: Fix broken link

commit 67d5c462b11923ef2f2c3b6dc5834d982347e17b
Author: Karl Williamson <khw@cpan.org>
Date:   Wed Mar 13 13:18:28 2019 -0600

    perlrecharclass: Minor wording improvements

commit 447fcf49ce69125df8ec9c1b46a7e6f24df98683
Author: Karl Williamson <khw@cpan.org>
Date:   Wed Mar 13 13:17:39 2019 -0600

    perlre: Minor wording improvements

commit 407fecf1ecbd5b45621badd1485c91ddf95256e1
Author: Karl Williamson <khw@cpan.org>
Date:   Wed Mar 13 13:16:43 2019 -0600

    perlre: Italicize a bunch of stuff
    
    These are not meant to be written literally.

commit 6dd641e14cd2675068749eeea8c8aabee158595e
Author: Karl Williamson <khw@cpan.org>
Date:   Wed Mar 13 11:42:15 2019 -0600

    dquote.c: Use UTF8_SAFE_SKIP
    
    Otherwise malformed input could cause this to return a pointer outside
    its buffer

commit 85fcc8f2234ce65ebd31480efc38dc4a3ec8ad13
Author: Karl Williamson <khw@cpan.org>
Date:   Wed Mar 13 11:41:09 2019 -0600

    Add UTF8_SAFE_SKIP API macro
    
    This version of UTF8SKIP refuses to advance beyond the end pointer

-----------------------------------------------------------------------

Summary of changes:
 dquote.c                |   6 +-
 pod/perlre.pod          | 196 ++++++++++++++++++++++++------------------------
 pod/perlrecharclass.pod |   7 +-
 pod/perlvar.pod         |   3 +-
 regcomp.c               |   5 +-
 regexec.c               |   4 +-
 utf8.h                  |  11 +++
 7 files changed, 124 insertions(+), 108 deletions(-)

diff --git a/dquote.c b/dquote.c
index 6913ca5ce4..10fb2b5df0 100644
--- a/dquote.c
+++ b/dquote.c
@@ -141,7 +141,7 @@ Perl_grok_bslash_o(pTHX_ char **s, const char * const send, UV *uv,
     if (numbers_len != (STRLEN) (e - *s)) {
         if (strict) {
             *s += numbers_len;
-            *s += (UTF) ? UTF8SKIP(*s) : (STRLEN) 1;
+            *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1;
             *error_msg = "Non-octal character";
             return FALSE;
         }
@@ -223,7 +223,7 @@ Perl_grok_bslash_x(pTHX_ char **s, const char * const send, UV *uv,
 	*s += len;
         if (strict && len != 2) {
             if (len < 2) {
-                *s += (UTF) ? UTF8SKIP(*s) : 1;
+                *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1;
                 *error_msg = "Non-hex character";
             }
             else {
@@ -272,7 +272,7 @@ Perl_grok_bslash_x(pTHX_ char **s, const char * const send, UV *uv,
 
     if (strict && numbers_len != (STRLEN) (e - *s)) {
         *s += numbers_len;
-        *s += (UTF) ? UTF8SKIP(*s) : 1;
+        *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1;
         *error_msg = "Non-hex character";
         return FALSE;
     }
diff --git a/pod/perlre.pod b/pod/perlre.pod
index 900c28497a..209cac7f8d 100644
--- a/pod/perlre.pod
+++ b/pod/perlre.pod
@@ -563,7 +563,8 @@ At any given time, exactly one of these modifiers is in effect.  Their
 existence allows Perl to keep the originally compiled behavior of a
 regular expression, regardless of what rules are in effect when it is
 actually executed.  And if it is interpolated into a larger regex, the
-original's rules continue to apply to it, and only it.
+original's rules continue to apply to it, and don't affect the other
+parts.
 
 The C</l> and C</u> modifiers are automatically selected for
 regular expressions compiled within the scope of various pragmas,
@@ -720,8 +721,8 @@ the pattern uses L<C<(*script_run: ...)>|/Script Runs>
 Another mnemonic for this modifier is "Depends", as the rules actually
 used depend on various things, and as a result you can get unexpected
 results.  See L<perlunicode/The "Unicode Bug">.  The Unicode Bug has
-become rather infamous, leading to yet another (printable) name for this
-modifier, "Dodgy".
+become rather infamous, leading to yet another (without swearing) name
+for this modifier, "Dodgy".
 
 Unless the pattern or string are encoded in UTF-8, only ASCII characters
 can match positively.
@@ -925,7 +926,7 @@ string" problem can be most efficiently performed when written as:
 
 as we know that if the final quote does not match, backtracking will not
 help. See the independent subexpression
-L</C<< (?>pattern) >>> for more details;
+L</C<< (?>I<pattern>) >>> for more details;
 possessive quantifiers are just syntactic sugar for that construct. For
 instance the above example could also be written as follows:
 
@@ -1035,8 +1036,9 @@ See L</Extended Patterns> below for details.
 
 =item [7]
 
-Note that C<\N> has two meanings.  When of the form C<\N{NAME}>, it matches the
-character or character sequence whose name is C<NAME>; and similarly
+Note that C<\N> has two meanings.  When of the form C<\N{I<NAME>}>, it
+matches the character or character sequence whose name is I<NAME>; and
+similarly
 when of the form C<\N{U+I<hex>}>, it matches the character whose Unicode
 code point is I<hex>.  Otherwise it matches any character but C<\n>.
 
@@ -1337,10 +1339,10 @@ expressions, and 2) whenever you see one, you should stop and
 
 =over 4
 
-=item C<(?#text)>
+=item C<(?#I<text>)>
 X<(?#)>
 
-A comment.  The text is ignored.
+A comment.  The I<text> is ignored.
 Note that Perl closes
 the comment as soon as it sees a C<")">, so there is no way to put a literal
 C<")"> in the comment.  The pattern's closing delimiter must be escaped by
@@ -1402,8 +1404,8 @@ repetition of the previous word, assuming the C</x> modifier, and no C</i>
 modifier outside this group.
 
 These modifiers do not carry over into named subpatterns called in the
-enclosing group. In other words, a pattern such as C<((?i)(?&NAME))> does not
-change the case-sensitivity of the C<"NAME"> pattern.
+enclosing group. In other words, a pattern such as C<((?i)(?&I<NAME>))> does not
+change the case-sensitivity of the I<NAME> pattern.
 
 A modifier is overridden by later occurrences of this construct in the
 same scope containing the same modifier, so that
@@ -1448,12 +1450,12 @@ C<(?-d:...)> and C<(?dl:...)> are fatal errors.
 Note also that the C<"p"> modifier is special in that its presence
 anywhere in a pattern has a global effect.
 
-=item C<(?:pattern)>
+=item C<(?:I<pattern>)>
 X<(?:)>
 
-=item C<(?adluimnsx-imnsx:pattern)>
+=item C<(?adluimnsx-imnsx:I<pattern>)>
 
-=item C<(?^aluimnsx:pattern)>
+=item C<(?^aluimnsx:I<pattern>)>
 X<(?^:)>
 
 This is for clustering, not capturing; it groups subexpressions like
@@ -1518,7 +1520,7 @@ redundant.
 Mnemonic for C<(?^...)>:  A fresh beginning since the usual use of a caret is
 to match at the beginning.
 
-=item C<(?|pattern)>
+=item C<(?|I<pattern>)>
 X<(?|)> X<Branch reset>
 
 This is the "branch reset" pattern, which has the special property
@@ -1574,11 +1576,11 @@ lookahead matches text following the current match position.
 
 =over 4
 
-=item C<(?=pattern)>
+=item C<(?=I<pattern>)>
 
-=item C<(*pla:pattern)>
+=item C<(*pla:I<pattern>)>
 
-=item C<(*positive_lookahead:pattern)>
+=item C<(*positive_lookahead:I<pattern>)>
 X<(?=)>
 X<(*pla>
 X<(*positive_lookahead>
@@ -1590,11 +1592,11 @@ matches a word followed by a tab, without including the tab in C<$&>.
 The alphabetic forms are experimental; using them yields a warning in the
 C<experimental::alpha_assertions> category.
 
-=item C<(?!pattern)>
+=item C<(?!I<pattern>)>
 
-=item C<(*nla:pattern)>
+=item C<(*nla:I<pattern>)>
 
-=item C<(*negative_lookahead:pattern)>
+=item C<(*negative_lookahead:I<pattern>)>
 X<(?!)>
 X<(*nla>
 X<(*negative_lookahead>
@@ -1613,13 +1615,13 @@ match.  Use lookbehind instead (see below).
 The alphabetic forms are experimental; using them yields a warning in the
 C<experimental::alpha_assertions> category.
 
-=item C<(?<=pattern)>
+=item C<(?<=I<pattern>)>
 
 =item C<\K>
 
-=item C<(*plb:pattern)>
+=item C<(*plb:I<pattern>)>
 
-=item C<(*positive_lookbehind:pattern)>
+=item C<(*positive_lookbehind:I<pattern>)>
 X<(?<=)>
 X<(*plb>
 X<(*positive_lookbehind>
@@ -1654,11 +1656,11 @@ can be rewritten as the much more efficient
 The alphabetic forms (not including C<\K>) are experimental; using them
 yields a warning in the C<experimental::alpha_assertions> category.
 
-=item C<(?<!pattern)>
+=item C<(?<!I<pattern>)>
 
-=item C<(*nlb:pattern)>
+=item C<(*nlb:I<pattern>)>
 
-=item C<(*negative_lookbehind:pattern)>
+=item C<(*negative_lookbehind:I<pattern>)>
 X<(?<!)>
 X<(*nlb>
 X<(*negative_lookbehind>
@@ -1677,22 +1679,23 @@ C<experimental::alpha_assertions> category.
 
 =back
 
-=item C<< (?<NAME>pattern) >>
+=item C<< (?<I<NAME>>I<pattern>) >>
 
-=item C<(?'NAME'pattern)>
+=item C<(?'I<NAME>'I<pattern>)>
 X<< (?<NAME>) >> X<(?'NAME')> X<named capture> X<capture>
 
 A named capture group. Identical in every respect to normal capturing
 parentheses C<()> but for the additional fact that the group
 can be referred to by name in various regular expression
-constructs (like C<\g{NAME}>) and can be accessed by name
+constructs (like C<\g{I<NAME>}>) and can be accessed by name
 after a successful match via C<%+> or C<%->. See L<perlvar>
 for more details on the C<%+> and C<%-> hashes.
 
 If multiple distinct capture groups have the same name, then
-C<$+{NAME}> will refer to the leftmost defined group in the match.
+C<$+{I<NAME>}> will refer to the leftmost defined group in the match.
 
-The forms C<(?'NAME'pattern)> and C<< (?<NAME>pattern) >> are equivalent.
+The forms C<(?'I<NAME>'I<pattern>)> and C<< (?<I<NAME>>I<pattern>) >>
+are equivalent.
 
 B<NOTE:> While the notation of this construct is the same as the similar
 function in .NET regexes, the behavior is not. In Perl the groups are
@@ -1701,7 +1704,7 @@ pattern
 
   /(x)(?<foo>y)(z)/
 
-C<$+{I<foo>}> will be the same as C<$2>, and C<$3> will contain 'z' instead of
+C<$+{foo}> will be the same as C<$2>, and C<$3> will contain 'z' instead of
 the opposite which is what a .NET regex hacker might expect.
 
 Currently I<NAME> is restricted to simple identifiers only.
@@ -1710,29 +1713,30 @@ its Unicode extension (see L<utf8>),
 though it isn't extended by the locale (see L<perllocale>).
 
 B<NOTE:> In order to make things easier for programmers with experience
-with the Python or PCRE regex engines, the pattern C<< (?PE<lt>NAMEE<gt>pattern) >>
-may be used instead of C<< (?<NAME>pattern) >>; however this form does not
+with the Python or PCRE regex engines, the pattern C<<
+(?PE<lt>I<NAME>E<gt>I<pattern>) >>
+may be used instead of C<< (?<I<NAME>>I<pattern>) >>; however this form does not
 support the use of single quotes as a delimiter for the name.
 
-=item C<< \k<NAME> >>
+=item C<< \k<I<NAME>> >>
 
-=item C<< \k'NAME' >>
+=item C<< \k'I<NAME>' >>
 
 Named backreference. Similar to numeric backreferences, except that
 the group is designated by name and not number. If multiple groups
 have the same name then it refers to the leftmost defined group in
 the current match.
 
-It is an error to refer to a name not defined by a C<< (?<NAME>) >>
+It is an error to refer to a name not defined by a C<< (?<I<NAME>>) >>
 earlier in the pattern.
 
 Both forms are equivalent.
 
 B<NOTE:> In order to make things easier for programmers with experience
-with the Python or PCRE regex engines, the pattern C<< (?P=NAME) >>
-may be used instead of C<< \k<NAME> >>.
+with the Python or PCRE regex engines, the pattern C<< (?P=I<NAME>) >>
+may be used instead of C<< \k<I<NAME>> >>.
 
-=item C<(?{ code })>
+=item C<(?{ I<code> })>
 X<(?{})> X<regex, code in> X<regexp, code in> X<regular expression, code in>
 
 B<WARNING>: Using this feature safely requires that you understand its
@@ -1836,9 +1840,9 @@ This assertion may be used as the condition in a
 
     (?(condition)yes-pattern|no-pattern)
 
-switch.  If I<not> used in this way, the result of evaluation of C<code>
+switch.  If I<not> used in this way, the result of evaluation of I<code>
 is put into the special variable C<$^R>.  This happens immediately, so
-C<$^R> can be used from other C<(?{ code })> assertions inside the same
+C<$^R> can be used from other C<(?{ I<code> })> assertions inside the same
 regular expression.
 
 The assignment to C<$^R> above is properly localized, so the old
@@ -1854,7 +1858,7 @@ keep track of the number of nested parentheses. For example:
   print "color = $color, animal = $animal\n";
 
 
-=item C<(??{ code })>
+=item C<(??{ I<code> })>
 X<(??{})>
 X<regex, postponed> X<regexp, postponed> X<regular expression, postponed>
 
@@ -1865,7 +1869,7 @@ optimisations in the regex engine.  For more information on this, see
 L</Embedded Code Execution Frequency>.
 
 This is a "postponed" regular subexpression.  It behaves in I<exactly> the
-same way as a C<(?{ code })> code block as described above, except that
+same way as a C<(?{ I<code> })> code block as described above, except that
 its return value, rather than being assigned to C<$^R>, is treated as a
 pattern, compiled if it's a string (or used as-is if it's a qr// object),
 then matched as if it were inserted instead of this construct.
@@ -1901,7 +1905,7 @@ The following pattern matches a parenthesized group:
          }x;
 
 See also
-L<C<(?I<PARNO>)>|/(?PARNO) (?-PARNO) (?+PARNO) (?R) (?0)>
+L<C<(?I<PARNO>)>|/(?I<PARNO>) (?-I<PARNO>) (?+I<PARNO>) (?R) (?0)>
 for a different, more efficient way to accomplish
 the same task.
 
@@ -1921,11 +1925,11 @@ the current position in the string. Information about capture state from
 the caller for things like backreferences is available to the subpattern,
 but capture buffers set by the subpattern are not visible to the caller.
 
-Similar to C<(??{ code })> except that it does not involve executing any
+Similar to C<(??{ I<code> })> except that it does not involve executing any
 code or potentially compiling a returned pattern string; instead it treats
 the part of the current pattern contained within a specified capture group
 as an independent pattern that must match at the current position. Also
-different is the treatment of capture buffers, unlike C<(??{ code })>
+different is the treatment of capture buffers, unlike C<(??{ I<code> })>
 recursive patterns have access to their caller's match state, so one can
 use backreferences safely.
 
@@ -1993,7 +1997,7 @@ as atomic. Also, modifiers are resolved at compile time, so constructs
 like C<(?i:(?1))> or C<(?:(?i)(?1))> do not affect how the sub-pattern will
 be processed.
 
-=item C<(?&NAME)>
+=item C<(?&I<NAME>)>
 X<(?&NAME)>
 
 Recurse to a named subpattern. Identical to C<(?I<PARNO>)> except that the
@@ -2004,19 +2008,19 @@ It is an error to refer to a name that is not declared somewhere in the
 pattern.
 
 B<NOTE:> In order to make things easier for programmers with experience
-with the Python or PCRE regex engines the pattern C<< (?P>NAME) >>
-may be used instead of C<< (?&NAME) >>.
+with the Python or PCRE regex engines the pattern C<< (?P>I<NAME>) >>
+may be used instead of C<< (?&I<NAME>) >>.
 
-=item C<(?(condition)yes-pattern|no-pattern)>
+=item C<(?(I<condition>)I<yes-pattern>|I<no-pattern>)>
 X<(?()>
 
-=item C<(?(condition)yes-pattern)>
+=item C<(?(I<condition>)I<yes-pattern>)>
 
-Conditional expression. Matches C<yes-pattern> if C<condition> yields
-a true value, matches C<no-pattern> otherwise. A missing pattern always
+Conditional expression. Matches I<yes-pattern> if I<condition> yields
+a true value, matches I<no-pattern> otherwise. A missing pattern always
 matches.
 
-C<(condition)> should be one of:
+C<(I<condition>)> should be one of:
 
 =over 4
 
@@ -2036,7 +2040,7 @@ matched);
 (true when evaluated inside of recursion or eval).  Additionally the
 C<"R"> may be
 followed by a number, (which will be true when evaluated when recursing
-inside of the appropriate group), or by C<&NAME>, in which case it will
+inside of the appropriate group), or by C<&I<NAME>>, in which case it will
 be true only when evaluated during recursion in the named group.
 
 =back
@@ -2064,12 +2068,12 @@ Full syntax: C<< (?(?=I<lookahead>)I<then>|I<else>) >>
 =item C<(?{ I<CODE> })>
 
 Treats the return value of the code block as the condition.
-Full syntax: C<< (?(?{ code })then|else) >>
+Full syntax: C<< (?(?{ I<code> })I<then>|I<else>) >>
 
 =item C<(R)>
 
 Checks if the expression has been evaluated inside of recursion.
-Full syntax: C<< (?(R)then|else) >>
+Full syntax: C<< (?(R)I<then>|I<else>) >>
 
 =item C<(R1)> C<(R2)> ...
 
@@ -2080,7 +2084,7 @@ inside of the n-th capture group. This check is the regex equivalent of
 
 In other words, it does not check the full recursion stack.
 
-Full syntax: C<< (?(R1)then|else) >>
+Full syntax: C<< (?(R1)I<then>|I<else>) >>
 
 =item C<(R&I<NAME>)>
 
@@ -2088,14 +2092,14 @@ Similar to C<(R1)>, this predicate checks to see if we're executing
 directly inside of the leftmost group with a given name (this is the same
 logic used by C<(?&I<NAME>)> to disambiguate). It does not check the full
 stack, but only the name of the innermost active recursion.
-Full syntax: C<< (?(R&name)then|else) >>
+Full syntax: C<< (?(R&I<name>)I<then>|I<else>) >>
 
 =item C<(DEFINE)>
 
 In this case, the yes-pattern is never directly executed, and no
 no-pattern is allowed. Similar in spirit to C<(?{0})> but more efficient.
 See below for details.
-Full syntax: C<< (?(DEFINE)definitions...) >>
+Full syntax: C<< (?(DEFINE)I<definitions>...) >>
 
 =back
 
@@ -2148,15 +2152,15 @@ Will output 2, not 1. This is particularly important if you intend to
 compile the definitions with the C<qr//> operator, and later
 interpolate them in another pattern.
 
-=item C<< (?>pattern) >>
+=item C<< (?>I<pattern>) >>
 
-=item C<< (*atomic:pattern) >>
+=item C<< (*atomic:I<pattern>) >>
 X<(?E<gt>pattern)>
 X<(*atomic>
 X<backtrack> X<backtracking> X<atomic> X<possessive>
 
 An "independent" subexpression, one which matches the substring
-that a I<standalone> C<pattern> would match if anchored at the given
+that a standalone I<pattern> would match if anchored at the given
 position, and it matches I<nothing other than this substring>.  This
 construct is useful for optimizations of what would otherwise be
 "eternal" matches, because it will not backtrack (see L</"Backtracking">).
@@ -2172,12 +2176,12 @@ group C<ab> (see L</"Backtracking">).  In particular, C<a*> inside
 C<a*ab> will match fewer characters than a standalone C<a*>, since
 this makes the tail match.
 
-C<< (?>pattern) >> does not disable backtracking altogether once it has
+C<< (?>I<pattern>) >> does not disable backtracking altogether once it has
 matched. It is still possible to backtrack past the construct, but not
 into it. So C<< ((?>a*)|(?>b*))ar >> will still match "bar".
 
-An effect similar to C<< (?>pattern) >> may be achieved by writing
-C<(?=(pattern))\g{-1}>.  This matches the same substring as a standalone
+An effect similar to C<< (?>I<pattern>) >> may be achieved by writing
+C<(?=(I<pattern>))\g{-1}>.  This matches the same substring as a standalone
 C<a+>, and the following C<\g{-1}> eats the matched string; it therefore
 makes a zero-length assertion into an analogue of C<< (?>...) >>.
 (The difference between these two constructs is that the second one
@@ -2536,7 +2540,7 @@ you can write either of these:
  (*atomic_script_run:pattern)
  (*asr:pattern)
 
-(See L</C<(?E<gt>pattern)>>.)
+(See L</C<(?E<gt>I<pattern>)>>.)
 
 In Taiwan, Japan, and Korea, it is common for text to have a mixture of
 characters from their native scripts and base Chinese.  Perl follows
@@ -2652,13 +2656,13 @@ rules apply:
 On failure, the C<$REGERROR> variable will be set to the I<ARG> value of the
 verb pattern, if the verb was involved in the failure of the match. If the
 I<ARG> part of the pattern was omitted, then C<$REGERROR> will be set to the
-name of the last C<(*MARK:NAME)> pattern executed, or to TRUE if there was
+name of the last C<(*MARK:I<NAME>)> pattern executed, or to TRUE if there was
 none. Also, the C<$REGMARK> variable will be set to FALSE.
 
 On a successful match, the C<$REGERROR> variable will be set to FALSE, and
 the C<$REGMARK> variable will be set to the name of the last
-C<(*MARK:NAME)> pattern executed.  See the explanation for the
-C<(*MARK:NAME)> verb below for more details.
+C<(*MARK:I<NAME>)> pattern executed.  See the explanation for the
+C<(*MARK:I<NAME>)> verb below for more details.
 
 B<NOTE:> C<$REGERROR> and C<$REGMARK> are not magic variables like C<$1>
 and most other regex-related variables. They are not local to a scope, nor
@@ -2677,7 +2681,7 @@ argument, then C<$REGERROR> and C<$REGMARK> are not touched at all.
 
 =over 4
 
-=item C<(*PRUNE)> C<(*PRUNE:NAME)>
+=item C<(*PRUNE)> C<(*PRUNE:I<NAME>)>
 X<(*PRUNE)> X<(*PRUNE:NAME)>
 
 This zero-width pattern prunes the backtracking tree at the current point
@@ -2722,14 +2726,14 @@ at each matching starting point like so:
 
 Any number of C<(*PRUNE)> assertions may be used in a pattern.
 
-See also C<<< L<< /(?>pattern) >> >>> and possessive quantifiers for
+See also C<<< L<< /(?>I<pattern>) >> >>> and possessive quantifiers for
 other ways to
 control backtracking. In some cases, the use of C<(*PRUNE)> can be
 replaced with a C<< (?>pattern) >> with no functional difference; however,
 C<(*PRUNE)> can be used to handle cases that cannot be expressed using a
 C<< (?>pattern) >> alone.
 
-=item C<(*SKIP)> C<(*SKIP:NAME)>
+=item C<(*SKIP)> C<(*SKIP:I<NAME>)>
 X<(*SKIP)>
 
 This zero-width pattern is similar to C<(*PRUNE)>, except that on
@@ -2739,8 +2743,8 @@ of this pattern. This effectively means that the regex engine "skips" forward
 to this position on failure and tries to match again, (assuming that
 there is sufficient room to match).
 
-The name of the C<(*SKIP:NAME)> pattern has special significance. If a
-C<(*MARK:NAME)> was encountered while matching, then it is that position
+The name of the C<(*SKIP:I<NAME>)> pattern has special significance. If a
+C<(*MARK:I<NAME>)> was encountered while matching, then it is that position
 which is used as the "skip point". If no C<(*MARK)> of that name was
 encountered, then the C<(*SKIP)> operator has no effect. When used
 without a name the "skip point" is where the match point was when
@@ -2762,7 +2766,7 @@ Once the 'aaab' at the start of the string has matched, and the C<(*SKIP)>
 executed, the next starting point will be where the cursor was when the
 C<(*SKIP)> was executed.
 
-=item C<(*MARK:NAME)> C<(*:NAME)>
+=item C<(*MARK:I<NAME>)> C<(*:I<NAME>)>
 X<(*MARK)> X<(*MARK:NAME)> X<(*:NAME)>
 
 This zero-width pattern can be used to mark the point reached in a string
@@ -2771,13 +2775,13 @@ mark may be given a name. A later C<(*SKIP)> pattern will then skip
 forward to that point if backtracked into on failure. Any number of
 C<(*MARK)> patterns are allowed, and the I<NAME> portion may be duplicated.
 
-In addition to interacting with the C<(*SKIP)> pattern, C<(*MARK:NAME)>
+In addition to interacting with the C<(*SKIP)> pattern, C<(*MARK:I<NAME>)>
 can be used to "label" a pattern branch, so that after matching, the
 program can determine which branches of the pattern were involved in the
 match.
 
 When a match is successful, the C<$REGMARK> variable will be set to the
-name of the most recently executed C<(*MARK:NAME)> that was involved
+name of the most recently executed C<(*MARK:I<NAME>)> that was involved
 in the match.
 
 This can be used to determine which branch of a pattern was matched
@@ -2789,19 +2793,19 @@ C</(?:x(*MARK:x)|y(*MARK:y)|z(*MARK:z))/>.
 When a match has failed, and unless another verb has been involved in
 failing the match and has provided its own name to use, the C<$REGERROR>
 variable will be set to the name of the most recently executed
-C<(*MARK:NAME)>.
+C<(*MARK:I<NAME>)>.
 
 See L</(*SKIP)> for more details.
 
-As a shortcut C<(*MARK:NAME)> can be written C<(*:NAME)>.
+As a shortcut C<(*MARK:I<NAME>)> can be written C<(*:I<NAME>)>.
 
-=item C<(*THEN)> C<(*THEN:NAME)>
+=item C<(*THEN)> C<(*THEN:I<NAME>)>
 
 This is similar to the "cut group" operator C<::> from Perl 6.  Like
 C<(*PRUNE)>, this verb always matches, and when backtracked into on
 failure, it causes the regex engine to try the next alternation in the
 innermost enclosing group (capturing or otherwise) that has alternations.
-The two branches of a C<(?(condition)yes-pattern|no-pattern)> do not
+The two branches of a C<(?(I<condition>)I<yes-pattern>|I<no-pattern>)> do not
 count as an alternation, as far as C<(*THEN)> is concerned.
 
 Its name comes from the observation that this operation combined with the
@@ -2830,7 +2834,7 @@ is not the same as
 as after matching the I<A> but failing on the I<B> the C<(*THEN)> verb will
 backtrack and try I<C>; but the C<(*PRUNE)> verb will simply fail.
 
-=item C<(*COMMIT)> C<(*COMMIT:args)>
+=item C<(*COMMIT)> C<(*COMMIT:I<args>)>
 X<(*COMMIT)>
 
 This is the Perl 6 "commit pattern" C<< <commit> >> or C<:::>. It's a
@@ -2851,7 +2855,7 @@ In other words, once the C<(*COMMIT)> has been entered, and if the pattern
 does not match, the regex engine will not try any further matching on the
 rest of the string.
 
-=item C<(*FAIL)> C<(*F)> C<(*FAIL:arg)>
+=item C<(*FAIL)> C<(*F)> C<(*FAIL:I<arg>)>
 X<(*FAIL)> X<(*F)>
 
 This pattern matches nothing and always fails. It can be used to force the
@@ -2862,7 +2866,7 @@ the argument can be obtained from C<$REGERROR>.
 
 It is probably useful only when combined with C<(?{})> or C<(??{})>.
 
-=item C<(*ACCEPT)> C<(*ACCEPT:arg)>
+=item C<(*ACCEPT)> C<(*ACCEPT:I<arg>)>
 X<(*ACCEPT)>
 
 This pattern matches nothing and causes the end of successful matching at
@@ -3095,14 +3099,14 @@ else in the whole regular expression.)
 For this grouping operator there is no need to describe the ordering, since
 only whether or not C<"S"> can match is important.
 
-=item C<(??{ EXPR })>, C<(?I<PARNO>)>
+=item C<(??{ I<EXPR> })>, C<(?I<PARNO>)>
 
 The ordering is the same as for the regular expression which is
-the result of EXPR, or the pattern contained by capture group I<PARNO>.
+the result of I<EXPR>, or the pattern contained by capture group I<PARNO>.
 
-=item C<(?(condition)yes-pattern|no-pattern)>
+=item C<(?(I<condition>)I<yes-pattern>|I<no-pattern>)>
 
-Recall that which of C<yes-pattern> or C<no-pattern> actually matches is
+Recall that which of I<yes-pattern> or I<no-pattern> actually matches is
 already determined.  The ordering of the matches is the same as for the
 chosen subexpression.
 
@@ -3210,17 +3214,17 @@ Perl-specific syntax, the following are also accepted:
 
 =over 4
 
-=item C<< (?PE<lt>NAMEE<gt>pattern) >>
+=item C<< (?PE<lt>I<NAME>E<gt>I<pattern>) >>
 
-Define a named capture group. Equivalent to C<< (?<NAME>pattern) >>.
+Define a named capture group. Equivalent to C<< (?<I<NAME>>I<pattern>) >>.
 
-=item C<< (?P=NAME) >>
+=item C<< (?P=I<NAME>) >>
 
-Backreference to a named capture group. Equivalent to C<< \g{NAME} >>.
+Backreference to a named capture group. Equivalent to C<< \g{I<NAME>} >>.
 
-=item C<< (?P>NAME) >>
+=item C<< (?P>I<NAME>) >>
 
-Subroutine call to a named capture group. Equivalent to C<< (?&NAME) >>.
+Subroutine call to a named capture group. Equivalent to C<< (?&I<NAME>) >>.
 
 =back
 
diff --git a/pod/perlrecharclass.pod b/pod/perlrecharclass.pod
index 0f6a624e85..bda60cd49e 100644
--- a/pod/perlrecharclass.pod
+++ b/pod/perlrecharclass.pod
@@ -359,9 +359,10 @@ C</\pLl/> is valid, but means something different.
 It matches a two character string: a letter (Unicode property C<\pL>),
 followed by a lowercase C<l>.
 
-If locale rules are not in effect, the use of
-a Unicode property will force the regular expression into using Unicode
-rules, if it isn't already.
+What a Unicode property matches is never subject to locale rules, and
+if locale rules are not otherwise in effect, the use of a Unicode
+property will force the regular expression into using Unicode rules, if
+it isn't already.
 
 Note that almost all properties are immune to case-insensitive matching.
 That is, adding a C</i> regular expression modifier does not change what
diff --git a/pod/perlvar.pod b/pod/perlvar.pod
index 03b2215b66..d67d4cd8b1 100644
--- a/pod/perlvar.pod
+++ b/pod/perlvar.pod
@@ -931,7 +931,8 @@ is equivalent to $2, etc.
 
 should output "f-o-a-l".
 
-See also L</$I<digits>>, L</%{^CAPTURE}> and L</%{^CAPTURE_ALL}>.
+See also L<<< /$<I<digits>> ($1, $2, ...) >>>, L</%{^CAPTURE}> and
+L</%{^CAPTURE_ALL}>.
 
 Note that unlike most other regex magic variables there is no single
 letter equivalent to C<@{^CAPTURE}>.
diff --git a/regcomp.c b/regcomp.c
index b5903bf8df..3b269466ee 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -13269,9 +13269,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                 char name = *RExC_parse;
                 char * endbrace = NULL;
                 RExC_parse += 2;
-                if (RExC_parse < RExC_end) {
-                    endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
-                }
+                endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
 
                 if (! endbrace) {
                     vFAIL2("Missing right brace on \\%c{}", name);
@@ -16796,6 +16794,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                               "Ignoring zero length \\N{} in character class");
                         }
                         else { /* cp_count > 1 */
+                            assert(cp_count > 1);
                             if (! RExC_in_multi_char_class) {
                                 if (invert || range || *RExC_parse == '-') {
                                     if (strict) {
diff --git a/regexec.c b/regexec.c
index e50145d449..64a65462b5 100644
--- a/regexec.c
+++ b/regexec.c
@@ -155,7 +155,7 @@ static const char* const non_utf8_target_but_utf8_required
 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
 #define NEXTCHR_IS_EOS (nextchr < 0)
 
-#define SET_nextchr \
+#define SET_nextchr __ASSERT_(locinput <= reginfo->strend)                     \
     nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
 
 #define SET_locinput(p) \
@@ -1760,7 +1760,7 @@ STMT_START {
     case trie_utf8l:                                                                \
         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
-            _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc));          \
+            _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end);                     \
         }                                                                           \
         /* FALLTHROUGH */                                                           \
     case trie_utf8:                                                                 \
diff --git a/utf8.h b/utf8.h
index 99e795d3a4..7773007e49 100644
--- a/utf8.h
+++ b/utf8.h
@@ -498,6 +498,17 @@ only) byte is pointed to by C<s>.
 #define UTF8SKIP(s)  PL_utf8skip[*(const U8*)(s)]
 #define UTF8_SKIP(s) UTF8SKIP(s)
 
+/*
+
+=for apidoc Am|STRLEN|UTF8_SAFE_SKIP|char* s|char* e
+returns the number of bytes in the UTF-8 encoded character whose first (perhaps
+only) byte is pointed to by C<s>, but never a count that reaches beyond C<e>.
+
+=cut
+ */
+#define UTF8_SAFE_SKIP(s, e)  (__ASSERT_((e) > (s))             \
+                               MIN(((e) - (s)), UTF8_SKIP(s)))
+
 /* Most code that says 'UNI_' really means the native value for code points up
  * through 255 */
 #define UNI_IS_INVARIANT(cp)   UVCHR_IS_INVARIANT(cp)

-- 
Perl5 Master Repository



nntp.perl.org: Perl Programming lists via nntp and http.
Comments to Ask Bjørn Hansen at ask@perl.org | Group listing | About