develooper Front page | perl.perl5.changes | Postings from February 2018

[perl.git] branch blead updated. v5.27.8-165-g7c4a22ed65

From:
Karl Williamson
Date:
February 7, 2018 18:19
Subject:
[perl.git] branch blead updated. v5.27.8-165-g7c4a22ed65
Message ID:
E1ejUJT-0002Ti-6z@git.dc.perl.space
In perl.git, the branch blead has been updated

<https://perl5.git.perl.org/perl.git/commitdiff/7c4a22ed65ae021292a3c6b7a2736204e26c55cf?hp=1848346ffa44e6cc26b51a9cc2ef878e44ae3dd8>

- Log -----------------------------------------------------------------
commit 7c4a22ed65ae021292a3c6b7a2736204e26c55cf
Author: Karl Williamson <khw@cpan.org>
Date:   Sat Feb 3 22:16:39 2018 -0700

    perlapi: Rmv nonapplicable text

commit 142ece992c577bd9e5e66a0f9cdce1c4813740ae
Author: Karl Williamson <khw@cpan.org>
Date:   Sat Feb 3 22:15:51 2018 -0700

    regcomp.c: Fix comment

commit 33f38593a187f963d7c347cd8125ddce79cccd5c
Author: Karl Williamson <khw@cpan.org>
Date:   Sat Feb 3 22:14:22 2018 -0700

    Add uvchr_to_utf8_flags_msgs()
    
    This is propmpted by Encode's needs.  When called with the proper
    parameter, it returns any warnings instead of displaying them directly.

commit 50f3d1063eaf7bc0b5db35dda5b1b3567b931a5b
Author: Karl Williamson <khw@cpan.org>
Date:   Sat Feb 3 22:09:28 2018 -0700

    APItest:t/utf8_warn_base.pl: Clarify some comments

commit b8e5e8ad135d8d9df5485d3ed4ec9f5bdaf633bb
Author: Karl Williamson <khw@cpan.org>
Date:   Sat Feb 3 22:06:35 2018 -0700

    APItest:t/utf8_warn_base.pl: Move a variable outside sub()
    
    This is in preparation for a future commit which will want to refer to
    this variable independently.

commit 1a35ea2303d28449c7a073ab65a828b9420059c6
Author: Karl Williamson <khw@cpan.org>
Date:   Fri Feb 2 11:38:29 2018 -0700

    APItest:t/utf8_warn_base.pl; Fix 'ok' tests
    
    This was putting the condition for the ok in a string, which always
    succeeds

commit bb07812ea6cbac9162a7e3f9537c709ca57d4e57
Author: Karl Williamson <khw@cpan.org>
Date:   Fri Feb 2 10:43:33 2018 -0700

    utf8.c: Extract code into separate function
    
    This is in preparation for the next commit which will use this code in
    multiple places

-----------------------------------------------------------------------

Summary of changes:
 embed.fnc                          |   7 +-
 embed.h                            |   3 +-
 ext/XS-APItest/APItest.xs          |  30 +++++++
 ext/XS-APItest/t/utf8_warn_base.pl | 132 ++++++++++++++++++---------
 proto.h                            |   9 ++
 regcomp.c                          |   2 +-
 utf8.c                             | 178 +++++++++++++++++++++++++++++--------
 utf8.h                             |  12 ++-
 8 files changed, 288 insertions(+), 85 deletions(-)

diff --git a/embed.fnc b/embed.fnc
index ea389e4155..ce876ebab4 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1878,7 +1878,9 @@ Ap	|UV	|utf8n_to_uvuni|NN const U8 *s|STRLEN curlen|NULLOK STRLEN *retlen|U32 fl
 Adm	|U8*	|uvchr_to_utf8	|NN U8 *d|UV uv
 Ap	|U8*	|uvuni_to_utf8	|NN U8 *d|UV uv
 Adm	|U8*	|uvchr_to_utf8_flags	|NN U8 *d|UV uv|UV flags
-Apd	|U8*	|uvoffuni_to_utf8_flags	|NN U8 *d|UV uv|const UV flags
+AdmM	|U8*	|uvchr_to_utf8_flags_msgs|NN U8 *d|UV uv|UV flags|NULLOK HV ** msgs
+Apod	|U8*	|uvoffuni_to_utf8_flags	|NN U8 *d|UV uv|const UV flags
+ApM	|U8*	|uvoffuni_to_utf8_flags_msgs|NN U8 *d|UV uv|const UV flags|NULLOK HV** msgs
 Ap	|U8*	|uvuni_to_utf8_flags	|NN U8 *d|UV uv|UV flags
 Apd	|char*	|pv_uni_display	|NN SV *dsv|NN const U8 *spv|STRLEN len|STRLEN pvlim|UV flags
 ApdR	|char*	|sv_uni_display	|NN SV *dsv|NN SV *ssv|STRLEN pvlim|UV flags
@@ -2847,6 +2849,9 @@ sn	|NV|mulexp10	|NV value|I32 exponent
 #endif
 
 #if defined(PERL_IN_UTF8_C)
+sR	|HV *	|new_msg_hv |NN const char * const message		    \
+			    |U32 categories				    \
+			    |U32 flag
 sRM	|UV	|check_locale_boundary_crossing				    \
 		|NN const U8* const p					    \
 		|const UV result					    \
diff --git a/embed.h b/embed.h
index d1fe34ab66..1405176255 100644
--- a/embed.h
+++ b/embed.h
@@ -739,7 +739,7 @@
 #define utf8_to_uvuni_buf(a,b,c)	Perl_utf8_to_uvuni_buf(aTHX_ a,b,c)
 #define utf8n_to_uvchr_msgs(a,b,c,d,e,f)	Perl_utf8n_to_uvchr_msgs(aTHX_ a,b,c,d,e,f)
 #define utf8n_to_uvuni(a,b,c,d)	Perl_utf8n_to_uvuni(aTHX_ a,b,c,d)
-#define uvoffuni_to_utf8_flags(a,b,c)	Perl_uvoffuni_to_utf8_flags(aTHX_ a,b,c)
+#define uvoffuni_to_utf8_flags_msgs(a,b,c,d)	Perl_uvoffuni_to_utf8_flags_msgs(aTHX_ a,b,c,d)
 #define uvuni_to_utf8(a,b)	Perl_uvuni_to_utf8(aTHX_ a,b)
 #define uvuni_to_utf8_flags(a,b,c)	Perl_uvuni_to_utf8_flags(aTHX_ a,b,c)
 #define valid_utf8_to_uvchr	Perl_valid_utf8_to_uvchr
@@ -1896,6 +1896,7 @@
 #define is_utf8_common(a,b,c,d)	S_is_utf8_common(aTHX_ a,b,c,d)
 #define is_utf8_common_with_len(a,b,c,d,e)	S_is_utf8_common_with_len(aTHX_ a,b,c,d,e)
 #define is_utf8_overlong_given_start_byte_ok	S_is_utf8_overlong_given_start_byte_ok
+#define new_msg_hv(a,b,c)	S_new_msg_hv(aTHX_ a,b,c)
 #define swash_scan_list_line(a,b,c,d,e,f,g)	S_swash_scan_list_line(aTHX_ a,b,c,d,e,f,g)
 #define swatch_get(a,b,c)	S_swatch_get(aTHX_ a,b,c)
 #define to_lower_latin1		S_to_lower_latin1
diff --git a/ext/XS-APItest/APItest.xs b/ext/XS-APItest/APItest.xs
index 5e67e7fa40..b12cc829e0 100644
--- a/ext/XS-APItest/APItest.xs
+++ b/ext/XS-APItest/APItest.xs
@@ -1523,6 +1523,36 @@ test_uvchr_to_utf8_flags(uv, flags)
     OUTPUT:
         RETVAL
 
+AV *
+test_uvchr_to_utf8_flags_msgs(uv, flags)
+
+        SV *uv
+        SV *flags
+    PREINIT:
+        U8 dest[UTF8_MAXBYTES + 1];
+        U8 *ret;
+
+    CODE:
+        HV *msgs = NULL;
+        RETVAL = newAV();
+        sv_2mortal((SV*)RETVAL);
+
+        ret = uvchr_to_utf8_flags_msgs(dest, SvUV(uv), SvUV(flags), &msgs);
+
+        if (ret) {
+            av_push(RETVAL, newSVpvn((char *) dest, ret - dest));
+        }
+        else {
+            av_push(RETVAL,  &PL_sv_undef);
+        }
+
+        if (msgs) {
+            av_push(RETVAL, newRV_noinc((SV*)msgs));
+        }
+
+    OUTPUT:
+        RETVAL
+
 MODULE = XS::APItest:Overload	PACKAGE = XS::APItest::Overload
 
 void
diff --git a/ext/XS-APItest/t/utf8_warn_base.pl b/ext/XS-APItest/t/utf8_warn_base.pl
index 6c3b04afeb..0c9e20b9ca 100644
--- a/ext/XS-APItest/t/utf8_warn_base.pl
+++ b/ext/XS-APItest/t/utf8_warn_base.pl
@@ -628,6 +628,17 @@ sub utf8n_display_call($)
          . ')';
 }
 
+my @uvchr_flags_to_text =  ( qw(
+        W_SURROGATE
+        W_NONCHAR
+        W_SUPER
+        W_PERL_EXTENDED
+        D_SURROGATE
+        D_NONCHAR
+        D_SUPER
+        D_PERL_EXTENDED
+) );
+
 sub uvchr_display_call($)
 {
     # Converts an eval string that calls test_uvchr_to_utf8 into a more human
@@ -635,23 +646,13 @@ sub uvchr_display_call($)
     #   test_uvchr_to_utf8n_flags($uv, $flags)
     #diag $_[0];
 
-    my @flags_to_text =  ( qw(
-            W_SURROGATE
-            W_NONCHAR
-            W_SUPER
-            W_PERL_EXTENDED
-            D_SURROGATE
-            D_NONCHAR
-            D_SUPER
-            D_PERL_EXTENDED
-       ) );
 
     $_[0] =~ / ^ ( [^(]* \( ) ( \d+ ) , \s* ( \d+ ) \) $ /x;
     my $text = $1;
     my $cp = sprintf "%X", $2;
     my $flags = $3;
 
-    return "${text}0x$cp, " . flags_to_text($flags, \@flags_to_text) . ')';
+    return "${text}0x$cp, " . flags_to_text($flags, \@uvchr_flags_to_text) . ')';
 }
 
 sub do_warnings_test(@)
@@ -1352,20 +1353,19 @@ foreach my $test (@tests) {
               next if $skip_most_tests;
             }
 
-            # This tests three functions.  utf8n_to_uvchr_error,
-            # utf8n_to_uvchr_msgs, and uvchr_to_utf8_flags.  But only the
-            # first two are variants of each other.  We use a loop
-            # 'which_func' to determine which of these.  uvchr_to_utf8_flags
-            # is done separately at the end of each iteration, only when
-            # which_func is 0.  which_func is numeric in part so we don't
-            # have to type in the function name and risk misspelling it
-            # somewhere, and also it sets whether we are expecting warnings
-            # or not in certain places.  The _msgs() version of the function
-            # expects warnings even if lexical ones are turned off, so by
-            # making its which_func == 1, we can say we want warnings;
-            # whereas the other one with the value 0, doesn't get them.
+            # This tests four functions: utf8n_to_uvchr_error,
+            # utf8n_to_uvchr_msgs, uvchr_to_utf8_flags, and
+            # uvchr_to_utf8_msgs.  The first two are variants of each other,
+            # and the final two also form a pair.  We use a loop 'which_func'
+            # to determine which of each pair is being tested.  The main loop
+            # tests either the first and third, or the 2nd and fourth.
+            # which_func is sets whether we are expecting warnings or not in
+            # certain places.  The _msgs() version of the functions expects
+            # warnings even if lexical ones are turned off, so by making its
+            # which_func == 1, we can say we want warnings; whereas the other
+            # one with the value 0, doesn't get them.
             for my $which_func (0, 1) {
-              my $func = ($which_func)
+              my $utf8_func = ($which_func)
                           ? 'utf8n_to_uvchr_msgs'
                           : 'utf8n_to_uvchr_error';
 
@@ -1377,10 +1377,10 @@ foreach my $test (@tests) {
                     if ($use_warn_flag) {
                         next if $initially_overlong || $initially_orphan;
 
-                        # Since utf8n_to_uvchr_msgs() expects warnings even
-                        # when lexical ones are turned off, we can skip
-                        # testing it when they are turned on, with little
-                        # likelihood of missing an error case.
+                        # Since foo_msgs() expects warnings even when lexical
+                        # ones are turned off, we can skip testing it when
+                        # they are turned on, with little likelihood of
+                        # missing an error case.
                         next if $which_func;
                     }
                     else {
@@ -1405,7 +1405,7 @@ foreach my $test (@tests) {
 
                         # We ordinarily expect overflow warnings here.  But it
                         # is somewhat more complicated, and the final
-                        # determination is deferred to one place in the filw
+                        # determination is deferred to one place in the file
                         # where we handle overflow.
                         $expect_warnings_for_overflow = 1;
 
@@ -1593,7 +1593,7 @@ foreach my $test (@tests) {
                         }
                     }
 
-                    my $this_name = "$func() $testname: ";
+                    my $this_name = "$utf8_func() $testname: ";
                     my @scratch_expected_return_flags = @expected_return_flags;
                     if (! $initially_malformed) {
                         $this_name .= ($disallowed)
@@ -1613,10 +1613,10 @@ foreach my $test (@tests) {
                     my $this_flags
                         = $allow_flags|$this_warning_flags|$this_disallow_flags;
                     my $eval_text =      "$eval_warn; \$ret_ref"
-                            . " = test_$func("
+                            . " = test_$utf8_func("
                             . "'$this_bytes', $this_length, $this_flags)";
                     eval "$eval_text";
-                    if (! ok ("$@ eq ''", "$this_name: eval succeeded"))
+                    if (! ok ($@ eq "", "$this_name: eval succeeded"))
                     {
                         diag "\$@='$@'; call was: "
                            . utf8n_display_call($eval_text);
@@ -1712,8 +1712,9 @@ foreach my $test (@tests) {
                                   pass("flag for returned msg is expected");
                               }
                               else {
-                                  fail("flag for returned msg is expected: "
-                                 . flags_to_text($flag, \@utf8n_flags_to_text));
+                                  fail("flag ("
+                                     . flags_to_text($flag, \@utf8n_flags_to_text)
+                                     . ") for returned msg is expected");
                               }
                             }
 
@@ -1722,7 +1723,7 @@ foreach my $test (@tests) {
                                           "returned category for msg isn't 0");
                         }
 
-                        ok(@warnings_gotten == 0, "$func raised no warnings;"
+                        ok(@warnings_gotten == 0, "$utf8_func raised no warnings;"
                               . " the next tests are for ones in the returned"
                               . " variable")
                             or diag join "\n", "The unexpected warnings were:",
@@ -1744,11 +1745,11 @@ foreach my $test (@tests) {
                         $tested_CHECK_ONLY = 1;
                         my $this_flags = $this_disallow_flags|$::UTF8_CHECK_ONLY;
                         my $eval_text = "use warnings; \$ret_ref ="
-                                      . " test_$func('"
+                                      . " test_$utf8_func('"
                                       . "$this_bytes', $this_length,"
                                       . " $this_flags)";
                         eval $eval_text;
-                        if (! ok ("$@ eq ''",
+                        if (! ok ($@ eq "",
                             "    And eval succeeded with CHECK_ONLY"))
                         {
                             diag "\$@='$@'; Call was: "
@@ -1773,7 +1774,10 @@ foreach my $test (@tests) {
                     # existing code point, it hasn't overflowed, and isn't
                     # malformed.
                     next if @malformation_names;
-                    next if $which_func;
+
+                    my $uvchr_func = ($which_func)
+                                     ? 'uvchr_to_utf8_flags_msgs'
+                                     : 'uvchr_to_utf8_flags';
 
                     $this_warning_flags = ($use_warn_flag)
                                           ? $this_uvchr_flag_to_warn
@@ -1789,10 +1793,10 @@ foreach my $test (@tests) {
                                           ? 'with flag for raising warnings'
                                           : 'no flag for raising warnings');
 
-                    $this_name = "uvchr_to_utf8_flags() $testname: "
-                                            . (($disallowed)
-                                                ? 'disallowed'
-                                                : 'allowed');
+                    $this_name = "$uvchr_func() $testname: "
+                                        . (($disallowed)
+                                           ? 'disallowed'
+                                           : 'allowed');
                     $this_name .= ", $eval_warn";
                     $this_name .= ", " . ((  $this_warning_flags
                                            & $this_uvchr_flag_to_warn)
@@ -1803,15 +1807,55 @@ foreach my $test (@tests) {
                     my $ret;
                     $this_flags = $this_warning_flags|$this_disallow_flags;
                     $eval_text = "$eval_warn; \$ret ="
-                            . " test_uvchr_to_utf8_flags("
+                            . " test_$uvchr_func("
                             . "$allowed_uv, $this_flags)";
                     eval "$eval_text";
-                    if (! ok ("$@ eq ''", "$this_name: eval succeeded"))
+                    if (! ok ($@ eq "", "$this_name: eval succeeded"))
                     {
                         diag "\$@='$@'; call was: "
                            . uvchr_display_call($eval_text);
                         next;
                     }
+
+                    if ($which_func) {
+                        if (defined $ret->[1]) {
+                            my @returned_warnings;
+                            push @returned_warnings, $ret->[1]{'text'};
+                            my $text = $ret->[1]{'text'};
+                            my $flag = $ret->[1]{'flag_bit'};
+                            my $category = $ret->[1]{'warning_category'};
+
+                            if (! ok(($flag & ($flag-1)) == 0,
+                                        "flag for returned msg is a single bit"))
+                            {
+                                diag sprintf("flags are %x; msg=%s", $flag, $text);
+                            }
+                            else {
+                                if ($flag & $this_uvchr_flag_to_disallow) {
+                                    pass("flag for returned msg is expected");
+                                }
+                                else {
+                                    fail("flag ("
+                                        . flags_to_text($flag, \@utf8n_flags_to_text)
+                                        . ") for returned msg is expected");
+                                }
+                            }
+
+                            # In perl space, don't know the category numbers
+                            isnt($category, 0,
+                                            "returned category for msg isn't 0");
+
+                            ok(@warnings_gotten == 0, "$uvchr_func raised no warnings;"
+                                . " the next tests are for ones in the returned"
+                                . " variable")
+                                or diag join "\n", "The unexpected warnings were:",
+                                                                @warnings_gotten;
+                            @warnings_gotten = @returned_warnings;
+                        }
+
+                        $ret = $ret->[0];
+                    }
+
                     if ($disallowed) {
                         is($ret, undef, "    And returns undef")
                           or diag "Call was: " . uvchr_display_call($eval_text);
diff --git a/proto.h b/proto.h
index 485211540b..9f6d0df7ff 100644
--- a/proto.h
+++ b/proto.h
@@ -3677,9 +3677,13 @@ PERL_CALLCONV void	Perl_utilize(pTHX_ int aver, I32 floor, OP* version, OP* idop
 	assert(idop)
 /* PERL_CALLCONV U8*	uvchr_to_utf8(pTHX_ U8 *d, UV uv); */
 /* PERL_CALLCONV U8*	uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags); */
+/* PERL_CALLCONV U8*	uvchr_to_utf8_flags_msgs(pTHX_ U8 *d, UV uv, UV flags, HV ** msgs); */
 PERL_CALLCONV U8*	Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags);
 #define PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS	\
 	assert(d)
+PERL_CALLCONV U8*	Perl_uvoffuni_to_utf8_flags_msgs(pTHX_ U8 *d, UV uv, const UV flags, HV** msgs);
+#define PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS_MSGS	\
+	assert(d)
 PERL_CALLCONV U8*	Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv);
 #define PERL_ARGS_ASSERT_UVUNI_TO_UTF8	\
 	assert(d)
@@ -5992,6 +5996,11 @@ PERL_STATIC_INLINE int	S_is_utf8_overlong_given_start_byte_ok(const U8 * const s
 	assert(s)
 #endif
 
+STATIC HV *	S_new_msg_hv(pTHX_ const char * const message, U32 categories, U32 flag)
+			__attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT_NEW_MSG_HV	\
+	assert(message)
+
 STATIC U8*	S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val, const bool wants_value, const U8* const typestr)
 			__attribute__warn_unused_result__;
 #define PERL_ARGS_ASSERT_SWASH_SCAN_LIST_LINE	\
diff --git a/regcomp.c b/regcomp.c
index 6f89a8ec30..fdc3c31c2e 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -345,7 +345,7 @@ struct RExC_state_t {
 /* Change from /d into /u rules, and restart the parse if we've already seen
  * something whose size would increase as a result, by setting *flagp and
  * returning 'restart_retval'.  RExC_uni_semantics is a flag that indicates
- * we've change to /u during the parse.  */
+ * we've changed to /u during the parse.  */
 #define REQUIRE_UNI_RULES(flagp, restart_retval)                            \
     STMT_START {                                                            \
             if (DEPENDS_SEMANTICS) {                                        \
diff --git a/utf8.c b/utf8.c
index 21664d5cf8..9cd4fa0927 100644
--- a/utf8.c
+++ b/utf8.c
@@ -101,6 +101,29 @@ Perl__force_out_malformed_utf8_message(pTHX_
     }
 }
 
+STATIC HV *
+S_new_msg_hv(pTHX_ const char * const message, /* The message text */
+                   U32 categories,  /* Packed warning categories */
+                   U32 flag)        /* Flag associated with this message */
+{
+    /* Creates, populates, and returns an HV* that describes an error message
+     * for the translators between UTF8 and code point */
+
+    SV* msg_sv = newSVpv(message, 0);
+    SV* category_sv = newSVuv(categories);
+    SV* flag_bit_sv = newSVuv(flag);
+
+    HV* msg_hv = newHV();
+
+    PERL_ARGS_ASSERT_NEW_MSG_HV;
+
+    hv_stores(msg_hv, "text", msg_sv);
+    hv_stores(msg_hv, "warn_categories",  category_sv);
+    hv_stores(msg_hv, "flag_bit", flag_bit_sv);
+
+    return msg_hv;
+}
+
 /*
 =for apidoc uvoffuni_to_utf8_flags
 
@@ -117,6 +140,14 @@ For details, see the description for L</uvchr_to_utf8_flags>.
 =cut
 */
 
+U8 *
+Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags)
+{
+    PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
+
+    return uvoffuni_to_utf8_flags_msgs(d, uv, flags, NULL);
+}
+
 /* All these formats take a single UV code point argument */
 const char surrogate_cp_format[] = "UTF-16 surrogate U+%04" UVXf;
 const char nonchar_cp_format[]   = "Unicode non-character U+%04" UVXf
@@ -127,22 +158,38 @@ const char perl_extended_cp_format[] = "Code point 0x%" UVXf " is not"        \
                                        " Unicode, requires a Perl extension," \
                                        " and so is not portable";
 
-#define HANDLE_UNICODE_SURROGATE(uv, flags)                         \
+#define HANDLE_UNICODE_SURROGATE(uv, flags, msgs)                   \
     STMT_START {                                                    \
         if (flags & UNICODE_WARN_SURROGATE) {                       \
-            Perl_ck_warner_d(aTHX_ packWARN(WARN_SURROGATE),        \
-                                   surrogate_cp_format, uv);        \
+            U32 category = packWARN(WARN_SURROGATE);                \
+            const char * format = surrogate_cp_format;              \
+            if (msgs) {                                             \
+                *msgs = new_msg_hv(Perl_form(aTHX_ format, uv),     \
+                                   category,                        \
+                                   UNICODE_GOT_SURROGATE);          \
+            }                                                       \
+            else {                                                  \
+                Perl_ck_warner_d(aTHX_ category, format, uv);       \
+            }                                                       \
         }                                                           \
         if (flags & UNICODE_DISALLOW_SURROGATE) {                   \
             return NULL;                                            \
         }                                                           \
     } STMT_END;
 
-#define HANDLE_UNICODE_NONCHAR(uv, flags)                           \
+#define HANDLE_UNICODE_NONCHAR(uv, flags, msgs)                     \
     STMT_START {                                                    \
         if (flags & UNICODE_WARN_NONCHAR) {                         \
-            Perl_ck_warner_d(aTHX_ packWARN(WARN_NONCHAR),          \
-		                   nonchar_cp_format, uv);          \
+            U32 category = packWARN(WARN_NONCHAR);                  \
+            const char * format = nonchar_cp_format;                \
+            if (msgs) {                                             \
+                *msgs = new_msg_hv(Perl_form(aTHX_ format, uv),     \
+                                   category,                        \
+                                   UNICODE_GOT_NONCHAR);            \
+            }                                                       \
+            else {                                                  \
+                Perl_ck_warner_d(aTHX_ category, format, uv);       \
+            }                                                       \
         }                                                           \
         if (flags & UNICODE_DISALLOW_NONCHAR) {                     \
             return NULL;                                            \
@@ -155,10 +202,62 @@ const char perl_extended_cp_format[] = "Code point 0x%" UVXf " is not"        \
 #define MARK    UTF_CONTINUATION_MARK
 #define MASK    UTF_CONTINUATION_MASK
 
+/*
+=for apidoc uvchr_to_utf8_flags_msgs
+
+THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
+
+Most code should use C<L</uvchr_to_utf8_flags>()> rather than call this directly.
+
+This function is for code that wants any warning and/or error messages to be
+returned to the caller rather than be displayed.  All messages that would have
+been displayed if all lexcial warnings are enabled will be returned.
+
+It is just like C<L</uvchr_to_utf8_flags>> but it takes an extra parameter
+placed after all the others, C<msgs>.  If this parameter is 0, this function
+behaves identically to C<L</uvchr_to_utf8_flags>>.  Otherwise, C<msgs> should
+be a pointer to an C<HV *> variable, in which this function creates a new HV to
+contain any appropriate messages.  The hash has three key-value pairs, as
+follows:
+
+=over 4
+
+=item C<text>
+
+The text of the message as a C<SVpv>.
+
+=item C<warn_categories>
+
+The warning category (or categories) packed into a C<SVuv>.
+
+=item C<flag>
+
+A single flag bit associated with this message, in a C<SVuv>.
+The bit corresponds to some bit in the C<*errors> return value,
+such as C<UNICODE_GOT_SURROGATE>.
+
+=back
+
+It's important to note that specifying this parameter as non-null will cause
+any warnings this function would otherwise generate to be suppressed, and
+instead be placed in C<*msgs>.  The caller can check the lexical warnings state
+(or not) when choosing what to do with the returned messages.
+
+The caller, of course, is responsible for freeing any returned HV.
+
+=cut
+*/
+
+/* Undocumented; we don't want people using this.  Instead they should use
+ * uvchr_to_utf8_flags_msgs() */
 U8 *
-Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags)
+Perl_uvoffuni_to_utf8_flags_msgs(pTHX_ U8 *d, UV uv, const UV flags, HV** msgs)
 {
-    PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
+    PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS_MSGS;
+
+    if (msgs) {
+        *msgs = NULL;
+    }
 
     if (OFFUNI_IS_INVARIANT(uv)) {
 	*d++ = LATIN1_TO_NATIVE(uv);
@@ -190,10 +289,10 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags)
             if (UNLIKELY(   UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv)
                          || UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)))
             {
-                HANDLE_UNICODE_NONCHAR(uv, flags);
+                HANDLE_UNICODE_NONCHAR(uv, flags, msgs);
             }
             else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
-                HANDLE_UNICODE_SURROGATE(uv, flags);
+                HANDLE_UNICODE_SURROGATE(uv, flags, msgs);
             }
         }
 #endif
@@ -211,17 +310,31 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags)
         if (UNLIKELY(uv > MAX_EXTERNALLY_LEGAL_CP)) {
             Perl_croak(aTHX_ cp_above_legal_max, uv, MAX_EXTERNALLY_LEGAL_CP);
         }
-        if (      (flags & UNICODE_WARN_SUPER)
-            || (  (flags & UNICODE_WARN_PERL_EXTENDED)
+        if (       (flags & UNICODE_WARN_SUPER)
+            || (   (flags & UNICODE_WARN_PERL_EXTENDED)
                 && UNICODE_IS_PERL_EXTENDED(uv)))
         {
-            Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE),
+            const char * format = super_cp_format;
+            U32 category = packWARN(WARN_NON_UNICODE);
+            U32 flag = UNICODE_GOT_SUPER;
+
+            /* Choose the more dire applicable warning */
+            if (UNICODE_IS_PERL_EXTENDED(uv)) {
+                format = perl_extended_cp_format;
+                if (flags & (UNICODE_WARN_PERL_EXTENDED
+                            |UNICODE_DISALLOW_PERL_EXTENDED))
+                {
+                    flag = UNICODE_GOT_PERL_EXTENDED;
+                }
+            }
 
-              /* Choose the more dire applicable warning */
-              (UNICODE_IS_PERL_EXTENDED(uv))
-              ? perl_extended_cp_format
-              : super_cp_format,
-             uv);
+            if (msgs) {
+                *msgs = new_msg_hv(Perl_form(aTHX_ format, uv),
+                                   category, flag);
+            }
+            else {
+                Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE), format, uv);
+            }
         }
         if (       (flags & UNICODE_DISALLOW_SUPER)
             || (   (flags & UNICODE_DISALLOW_PERL_EXTENDED)
@@ -231,7 +344,7 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags)
         }
     }
     else if (UNLIKELY(UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv))) {
-        HANDLE_UNICODE_NONCHAR(uv, flags);
+        HANDLE_UNICODE_NONCHAR(uv, flags, msgs);
     }
 
     /* Test for and handle 4-byte result.   In the test immediately below, the
@@ -250,10 +363,10 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags)
                    characters.  The end-plane non-characters for EBCDIC were
                    handled just above */
         if (UNLIKELY(UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv))) {
-            HANDLE_UNICODE_NONCHAR(uv, flags);
+            HANDLE_UNICODE_NONCHAR(uv, flags, msgs);
         }
         else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
-            HANDLE_UNICODE_SURROGATE(uv, flags);
+            HANDLE_UNICODE_SURROGATE(uv, flags, msgs);
         }
 #endif
 
@@ -375,11 +488,9 @@ there are standards predating it that allow up to 0x7FFF_FFFF (2**31 -1))
 A somewhat misleadingly named synonym for C<UNICODE_WARN_PERL_EXTENDED> is
 retained for backward compatibility: C<UNICODE_WARN_ABOVE_31_BIT>.  Similarly,
 C<UNICODE_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named
-C<UNICODE_DISALLOW_PERL_EXTENDED>.  The names are misleading because these
-flags can apply to code points that actually do fit in 31 bits.  This happens
-on EBCDIC platforms, and sometimes when the L<overlong
-malformation|/C<UTF8_GOT_LONG>> is also present.  The new names accurately
-describe the situation in all cases.
+C<UNICODE_DISALLOW_PERL_EXTENDED>.  The names are misleading because on EBCDIC
+platforms,these flags can apply to code points that actually do fit in 31 bits.
+The new names accurately describe the situation in all cases.
 
 =cut
 */
@@ -2142,22 +2253,15 @@ Perl_utf8n_to_uvchr_msgs(pTHX_ const U8 *s,
              * this iteration of the loop */
             if (message) {
                 if (msgs) {
-                    SV* msg_sv = newSVpv(message, 0);
-                    SV* category_sv = newSVuv(pack_warn);
-                    SV* flag_bit_sv = newSVuv(this_flag_bit);
-                    HV* msg_hv = newHV();
-
                     assert(this_flag_bit);
 
                     if (*msgs == NULL) {
                         *msgs = newAV();
                     }
 
-                    hv_stores(msg_hv, "text", msg_sv);
-                    hv_stores(msg_hv, "warn_categories",  category_sv);
-                    hv_stores(msg_hv, "flag_bit", flag_bit_sv);
-
-                    av_push(*msgs, newRV_noinc((SV*)msg_hv));
+                    av_push(*msgs, newRV_noinc((SV*) new_msg_hv(message,
+                                                                pack_warn,
+                                                                this_flag_bit)));
                 }
                 else if (PL_op)
                     Perl_warner(aTHX_ pack_warn, "%s in %s", message,
@@ -5921,7 +6025,7 @@ Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
 {
     PERL_ARGS_ASSERT_UVUNI_TO_UTF8;
 
-    return Perl_uvoffuni_to_utf8_flags(aTHX_ d, uv, 0);
+    return uvoffuni_to_utf8_flags(d, uv, 0);
 }
 
 /*
diff --git a/utf8.h b/utf8.h
index cfcdf8413e..828d1d169a 100644
--- a/utf8.h
+++ b/utf8.h
@@ -66,9 +66,13 @@ the string is invariant.
 #define is_ascii_string(s, len)     is_utf8_invariant_string(s, len)
 #define is_invariant_string(s, len) is_utf8_invariant_string(s, len)
 
+#define uvoffuni_to_utf8_flags(d,uv,flags)                                     \
+                               uvoffuni_to_utf8_flags_msgs(d, uv, flags, 0)
 #define uvchr_to_utf8(a,b)          uvchr_to_utf8_flags(a,b,0)
 #define uvchr_to_utf8_flags(d,uv,flags)                                        \
-                            uvoffuni_to_utf8_flags(d,NATIVE_TO_UNI(uv),flags)
+                                    uvchr_to_utf8_flags_msgs(d,uv,flags, 0)
+#define uvchr_to_utf8_flags_msgs(d,uv,flags,msgs)                              \
+                uvoffuni_to_utf8_flags_msgs(d,NATIVE_TO_UNI(uv),flags, msgs)
 #define utf8_to_uvchr_buf(s, e, lenp)                                          \
                      utf8n_to_uvchr(s, (U8*)(e) - (U8*)(s), lenp,              \
                                     ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY)
@@ -932,6 +936,12 @@ point's representation.
 #define UNICODE_DISALLOW_SUPER         0x0040
 #define UNICODE_DISALLOW_PERL_EXTENDED 0x0080
 #define UNICODE_DISALLOW_ABOVE_31_BIT  UNICODE_DISALLOW_PERL_EXTENDED
+
+#define UNICODE_GOT_SURROGATE       UNICODE_DISALLOW_SURROGATE
+#define UNICODE_GOT_NONCHAR         UNICODE_DISALLOW_NONCHAR
+#define UNICODE_GOT_SUPER           UNICODE_DISALLOW_SUPER
+#define UNICODE_GOT_PERL_EXTENDED   UNICODE_DISALLOW_PERL_EXTENDED
+
 #define UNICODE_WARN_ILLEGAL_C9_INTERCHANGE                                   \
                                   (UNICODE_WARN_SURROGATE|UNICODE_WARN_SUPER)
 #define UNICODE_WARN_ILLEGAL_INTERCHANGE                                      \

-- 
Perl5 Master Repository



nntp.perl.org: Perl Programming lists via nntp and http.
Comments to Ask Bjørn Hansen at ask@perl.org | Group listing | About