develooper Front page | perl.perl5.porters | Postings from February 2000

Re: [PATCH] Improved hibit text literals

From:
Gisle Aas
Date:
February 10, 2000 16:37
Subject:
Re: [PATCH] Improved hibit text literals
Message ID:
m3d7q43783.fsf@eik.g.aas.no
[Sorry.  Gnus appeared to do something stupid with the previous
message I sent.  Here it is again with less MIME (I hope).]


This patch relative to 5.5.650 makes perl do the right thing for
literals containing hibit charactets.  The follwing behaviour will
change if you apply this patch:

    - a \x{} escape will not force the UTF8 flag on, unless the value
      is acutally higher than \xFF.

    - the "\xff will produce malformed UTF-8 character; use \x{ff}"
      warning is gone, since we now always do the right thing :-)

    - under 'use utf8', hibit chars that are illegal utf8 are encoded
      using utf8; basically automatically turns latin1 into utf8.
      This ensure that there will never be illegal UTF8 sequences in
      a literal string that has the UTF8 flag set.

    - Octal escapes like \400 and \777 will actually do the right thing now.
      Previously you only got the low 8-bits.

But, it still looks like the \N{} support will not work as it is
now. It never sets the UTF8 flag on the string by itself.

Regards,
Gisle




Index: toke.c
===================================================================
RCS file: /local/perl/build/CVSROOT/perl5.6tobe/toke.c,v
retrieving revision 1.1.1.2
diff -u -p -u -p -r1.1.1.2 toke.c
--- toke.c	2000/02/09 22:25:12	1.1.1.2
+++ toke.c	2000/02/10 23:27:25
@@ -1159,6 +1159,8 @@ S_scan_const(pTHX_ char *start)
     bool dorange = FALSE;			/* are we in a translit range? */
     bool has_utf = FALSE;			/* embedded \x{} */
     I32 len;					/* ? */
+    UV uv;
+
     I32 utf = (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op)
 	? (PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF))
 	: UTF;
@@ -1280,18 +1282,20 @@ S_scan_const(pTHX_ char *start)
 	/* (now in tr/// code again) */
 
 	if (*s & 0x80 && thisutf) {
-	    dTHR;			/* only for ckWARN */
-	    if (ckWARN(WARN_UTF8)) {
-		(void)utf8_to_uv((U8*)s, &len);	/* could cvt latin-1 to utf8 here... */
-		if (len) {
-		    has_utf = TRUE;
-		    while (len--)
-			*d++ = *s++;
-		    continue;
-		}
-	    }
-	    else
-		has_utf = TRUE;		/* assume valid utf8 */
+	   (void)utf8_to_uv((U8*)s, &len);
+           if (len == 1) {
+	       /* illegal UTF8, make it valid */
+	       /* need to grow with 1 char to be safe */
+               char *old_pvx = SvPVX(sv);
+               d = SvGROW(sv, SvCUR(sv)+2) + (d - old_pvx);
+               d = (char*)uv_to_utf8((U8*)d, (U8)*s++);
+           }
+           else {
+	       while (len--)
+		   *d++ = *s++;
+	   }
+	   has_utf = TRUE;
+           continue;
 	}
 
 	/* backslashes */
@@ -1347,51 +1351,75 @@ S_scan_const(pTHX_ char *start)
 	    /* \132 indicates an octal constant */
 	    case '0': case '1': case '2': case '3':
 	    case '4': case '5': case '6': case '7':
-		*d++ = (char)scan_oct(s, 3, &len);
+	        uv = (UV)scan_oct(s, 3, &len);
 		s += len;
-		continue;
+		goto NUM_ESCAPE_INSERT;
 
 	    /* \x24 indicates a hex constant */
 	    case 'x':
 		++s;
 		if (*s == '{') {
 		    char* e = strchr(s, '}');
-		    UV uv;
-
 		    if (!e) {
-			yyerror("Missing right brace on \\x{}");
+		        yyerror("Missing right brace on \\x{}");
 			e = s;
-		    }
-		    /* note: utf always shorter than hex */
-		    uv = (UV)scan_hex(s + 1, e - s - 1, &len);
-		    if (uv > 127) {
-			d = (char*)uv_to_utf8((U8*)d, uv);
-			has_utf = TRUE;
 		    }
-		    else
-			*d++ = (char)uv;
-		    s = e + 1;
+                    uv = (UV)scan_hex(s + 1, e - s - 1, &len);
+                    s = e + 1;
 		}
 		else {
-		    /* XXX collapse this branch into the one above */
-		    UV uv = (UV)scan_hex(s, 2, &len);
-		    if (utf && PL_lex_inwhat == OP_TRANS &&
-			utf != (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF))
-		    {
-			d = (char*)uv_to_utf8((U8*)d, uv);	/* doing a CU or UC */
+		    uv = (UV)scan_hex(s, 2, &len);
+		    s += len;
+		}
+
+	      NUM_ESCAPE_INSERT:
+		/* Insert oct or hex escaped character.
+		 * There will always enough room in sv since such escapes will
+		 * be longer than any utf8 sequence they can end up as
+		 */
+		if (uv > 127) {
+		    if (!thisutf && !has_utf && uv > 256) {
+		        /* might need to recode whatever we have accumulated so far
+			 * if it contains any hibit chars
+			 */
+		        int hicount = 0;
+			char *c;
+			for (c = SvPVX(sv); c < d; c++) {
+			    if (*c & 0x80)
+			        hicount++;
+			}
+			if (hicount) {
+			    char *old_pvx = SvPVX(sv);
+			    char *src, *dst;
+			    d = SvGROW(sv, SvCUR(sv) + hicount + 1) + (d - old_pvx);
+
+			    src = d - 1;
+			    d += hicount;
+			    dst = d - 1;
+
+			    while (src < dst) {
+			        if (*src & 0x80) {
+				    dst--;
+				    uv_to_utf8((U8*)dst, (U8)*src--);
+				    dst--;
+			        }
+			        else {
+				    *dst-- = *src--;
+			        }
+			    }
+                        }
+                    }
+
+                    if (thisutf || uv > 256) {
+		        d = (char*)uv_to_utf8((U8*)d, uv);
 			has_utf = TRUE;
-		    }
+                    }
 		    else {
-			if (uv >= 127 && UTF) {
-			    dTHR;
-			    if (ckWARN(WARN_UTF8))
-				Perl_warner(aTHX_ WARN_UTF8,
-				    "\\x%.*s will produce malformed UTF-8 character; use \\x{%.*s} for that",
-				    (int)len,s,(int)len,s);
-			}
-			*d++ = (char)uv;
+		        *d++ = (char)uv;
 		    }
-		    s += len;
+		}
+		else {
+		    *d++ = (char)uv;
 		}
 		continue;
 
Index: t/pragma/warn/doop
===================================================================
RCS file: /local/perl/build/CVSROOT/perl5.6tobe/t/pragma/warn/doop,v
retrieving revision 1.1.1.1
diff -u -p -u -p -r1.1.1.1 doop
--- t/pragma/warn/doop	2000/02/05 20:06:21	1.1.1.1
+++ t/pragma/warn/doop	2000/02/10 23:53:17
@@ -1,29 +1,6 @@
-  doop.c	AOK
-
-  \x%s will produce malformed UTF-8 character; use \x{%s} for that
-
-
-__END__
 # doop.c
 use utf8 ;
 $_ = "\x80  \xff" ;
 chop ;
 EXPECT
 ########
-# doop.c
-BEGIN {
-    if (ord("\t") == 5) {
-        print "SKIPPED\n# Character codes differ on ebcdic machines.";
-        exit 0;
-    }
-}
-use warnings 'utf8'  ;
-use utf8 ;
-$_ = "\x80  \xff" ;
-chop ;
-no warnings 'utf8'  ;
-$_ = "\x80  \xff" ;
-chop ;
-EXPECT
-\x80 will produce malformed UTF-8 character; use \x{80} for that at - line 10.
-\xff will produce malformed UTF-8 character; use \x{ff} for that at - line 10.
Index: t/pragma/warn/pp
===================================================================
RCS file: /local/perl/build/CVSROOT/perl5.6tobe/t/pragma/warn/pp,v
retrieving revision 1.1.1.1
diff -u -p -u -p -r1.1.1.1 pp
--- t/pragma/warn/pp	2000/02/05 20:06:21	1.1.1.1
+++ t/pragma/warn/pp	2000/02/10 23:54:10
@@ -28,11 +28,6 @@
   Constant subroutine %s undefined			<<<TODO
   Constant subroutine (anonymous) undefined		<<<TODO
 
-  Mandatory Warnings
-  ------------------
-  Malformed UTF-8 character (not tested: difficult to produce with
-                             perl now)
-
 __END__
 # pp.c
 use warnings 'substr' ;
@@ -111,20 +106,3 @@ $_ = "\x80  \xff" ;
 reverse ;
 EXPECT
 ########
-# pp.c
-BEGIN {
-    if (ord("\t") == 5) {
-        print "SKIPPED\n# Character codes differ on ebcdic machines.";
-        exit 0;
-    }
-}
-use warnings 'utf8'  ;
-use utf8 ;
-$_ = "\x80  \xff" ;
-reverse ;
-no warnings 'utf8'  ;
-$_ = "\x80  \xff" ;
-reverse ;
-EXPECT
-\x80 will produce malformed UTF-8 character; use \x{80} for that at - line 10.
-\xff will produce malformed UTF-8 character; use \x{ff} for that at - line 10.
Index: t/pragma/warn/sv
===================================================================
RCS file: /local/perl/build/CVSROOT/perl5.6tobe/t/pragma/warn/sv,v
retrieving revision 1.1.1.1
diff -u -p -u -p -r1.1.1.1 sv
--- t/pragma/warn/sv	2000/02/05 20:06:22	1.1.1.1
+++ t/pragma/warn/sv	2000/02/10 23:54:48
@@ -269,25 +269,6 @@ EXPECT
 Undefined value assigned to typeglob at - line 3.
 ########
 # sv.c
-BEGIN {
-    if (ord("\t") == 5) {
-        print "SKIPPED\n# ebcdic \\x characters differ.";
-        exit 0;
-    }
-}
-use utf8 ;
-$^W =0 ;
-{
-  use warnings 'utf8' ;
-  my $a = rindex "a\xff bc ", "bc" ;
-  no warnings 'utf8' ;
-  $a = rindex "a\xff bc ", "bc" ;
-}
-my $a = rindex "a\xff bc ", "bc" ;
-EXPECT
-\xff will produce malformed UTF-8 character; use \x{ff} for that at - line 12.
-########
-# sv.c
 use warnings 'misc';
 use Config;
 BEGIN {
Index: t/pragma/warn/toke
===================================================================
RCS file: /local/perl/build/CVSROOT/perl5.6tobe/t/pragma/warn/toke,v
retrieving revision 1.1.1.2
diff -u -p -u -p -r1.1.1.2 toke
--- t/pragma/warn/toke	2000/02/09 22:25:33	1.1.1.2
+++ t/pragma/warn/toke	2000/02/10 23:55:18
@@ -89,10 +89,6 @@ toke.c	AOK
 	sub time {} 
 	my $a = time()
 
-    \x%.*s will produce malformed UTF-8 character; use \x{%.*s} for that
-        use utf8 ; 
-	$_ = "\xffe"
-
     Unrecognized escape \\%c passed through
         $a = "\m" ;
 
@@ -445,21 +441,6 @@ eval <<'EOE';
 EOE
 EXPECT
 
-########
-# toke.c
-BEGIN {
-    if (ord("\t") == 5) {
-        print "SKIPPED\n# Ebcdic platforms have different \\x constructs.";
-        exit 0;
-    }
-}
-use warnings 'utf8' ;
-use utf8 ;
-$_ = " \xffe " ;
-no warnings 'utf8' ;
-$_ = " \xffe " ;
-EXPECT
-\xff will produce malformed UTF-8 character; use \x{ff} for that at - line 10.
 ########
 # toke.c
 my $a = rand + 4 ;
Index: t/pragma/warn/utf8
===================================================================
RCS file: /local/perl/build/CVSROOT/perl5.6tobe/t/pragma/warn/utf8,v
retrieving revision 1.1.1.1
diff -u -p -u -p -r1.1.1.1 utf8
--- t/pragma/warn/utf8	2000/02/05 20:06:22	1.1.1.1
+++ t/pragma/warn/utf8	2000/02/10 23:45:59
@@ -14,48 +14,16 @@
      <<<<<< Add a test when somethig actually calls utf16_to_utf8
 
 __END__
-# utf8.c [utf8_to_uv]
+# utf8.c [utf8_to_uv] -W
 use utf8 ;
-my $a = ord "\x80" ;
-EXPECT
-########
-# utf8.c [utf8_to_uv]
-BEGIN {
-    if (ord("\t") == 5) {
-        print "SKIPPED\n# Ebcdic platforms have different \\x constructs.";
-        exit 0;
-    }
-}
-use utf8 ;
-my $a = ord "\x80" ;
+my $a = "snøstorm" ;
 {
-    use warnings 'utf8' ;
-    my $a = ord "\x80" ;
     no warnings 'utf8' ;
-    my $a = ord "\x80" ;
-}
-EXPECT
-\x80 will produce malformed UTF-8 character; use \x{80} for that at - line 12.
-########
-# utf8.c [utf8_to_uv]
-use utf8 ;
-my $a = ord "\xf080" ;
-EXPECT
-########
-# utf8.c [utf8_to_uv]
-BEGIN {
-    if (ord("\t") == 5) {
-        print "SKIPPED\n# Ebcdic platforms have different \\x constructs.";
-        exit 0;
-    }
-}
-use utf8 ;
-my $a = ord "\xf080" ;
-{
+    my $a = "snøstorm";
     use warnings 'utf8' ;
-    my $a = ord "\xf080" ;
-    no warnings 'utf8' ;
-    my $a = ord "\xf080" ;
+    my $a = "snøstorm";
 }
 EXPECT
-\xf0 will produce malformed UTF-8 character; use \x{f0} for that at - line 12.
+Malformed UTF-8 character at - line 3.
+Malformed UTF-8 character at - line 8.
+########



nntp.perl.org: Perl Programming lists via nntp and http.
Comments to Ask Bjørn Hansen at ask@perl.org | Group listing | About