Front page | perl.perl5.porters |
Postings from February 2000
Re: [PATCH] Improved hibit text literals
From:
Gisle Aas
Date:
February 10, 2000 16:37
Subject:
Re: [PATCH] Improved hibit text literals
Message ID:
m3d7q43783.fsf@eik.g.aas.no
[Sorry. Gnus appeared to do something stupid with the previous
message I sent. Here it is again with less MIME (I hope).]
This patch relative to 5.5.650 makes perl do the right thing for
literals containing hibit charactets. The follwing behaviour will
change if you apply this patch:
- a \x{} escape will not force the UTF8 flag on, unless the value
is acutally higher than \xFF.
- the "\xff will produce malformed UTF-8 character; use \x{ff}"
warning is gone, since we now always do the right thing :-)
- under 'use utf8', hibit chars that are illegal utf8 are encoded
using utf8; basically automatically turns latin1 into utf8.
This ensure that there will never be illegal UTF8 sequences in
a literal string that has the UTF8 flag set.
- Octal escapes like \400 and \777 will actually do the right thing now.
Previously you only got the low 8-bits.
But, it still looks like the \N{} support will not work as it is
now. It never sets the UTF8 flag on the string by itself.
Regards,
Gisle
Index: toke.c
===================================================================
RCS file: /local/perl/build/CVSROOT/perl5.6tobe/toke.c,v
retrieving revision 1.1.1.2
diff -u -p -u -p -r1.1.1.2 toke.c
--- toke.c 2000/02/09 22:25:12 1.1.1.2
+++ toke.c 2000/02/10 23:27:25
@@ -1159,6 +1159,8 @@ S_scan_const(pTHX_ char *start)
bool dorange = FALSE; /* are we in a translit range? */
bool has_utf = FALSE; /* embedded \x{} */
I32 len; /* ? */
+ UV uv;
+
I32 utf = (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op)
? (PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF))
: UTF;
@@ -1280,18 +1282,20 @@ S_scan_const(pTHX_ char *start)
/* (now in tr/// code again) */
if (*s & 0x80 && thisutf) {
- dTHR; /* only for ckWARN */
- if (ckWARN(WARN_UTF8)) {
- (void)utf8_to_uv((U8*)s, &len); /* could cvt latin-1 to utf8 here... */
- if (len) {
- has_utf = TRUE;
- while (len--)
- *d++ = *s++;
- continue;
- }
- }
- else
- has_utf = TRUE; /* assume valid utf8 */
+ (void)utf8_to_uv((U8*)s, &len);
+ if (len == 1) {
+ /* illegal UTF8, make it valid */
+ /* need to grow with 1 char to be safe */
+ char *old_pvx = SvPVX(sv);
+ d = SvGROW(sv, SvCUR(sv)+2) + (d - old_pvx);
+ d = (char*)uv_to_utf8((U8*)d, (U8)*s++);
+ }
+ else {
+ while (len--)
+ *d++ = *s++;
+ }
+ has_utf = TRUE;
+ continue;
}
/* backslashes */
@@ -1347,51 +1351,75 @@ S_scan_const(pTHX_ char *start)
/* \132 indicates an octal constant */
case '0': case '1': case '2': case '3':
case '4': case '5': case '6': case '7':
- *d++ = (char)scan_oct(s, 3, &len);
+ uv = (UV)scan_oct(s, 3, &len);
s += len;
- continue;
+ goto NUM_ESCAPE_INSERT;
/* \x24 indicates a hex constant */
case 'x':
++s;
if (*s == '{') {
char* e = strchr(s, '}');
- UV uv;
-
if (!e) {
- yyerror("Missing right brace on \\x{}");
+ yyerror("Missing right brace on \\x{}");
e = s;
- }
- /* note: utf always shorter than hex */
- uv = (UV)scan_hex(s + 1, e - s - 1, &len);
- if (uv > 127) {
- d = (char*)uv_to_utf8((U8*)d, uv);
- has_utf = TRUE;
}
- else
- *d++ = (char)uv;
- s = e + 1;
+ uv = (UV)scan_hex(s + 1, e - s - 1, &len);
+ s = e + 1;
}
else {
- /* XXX collapse this branch into the one above */
- UV uv = (UV)scan_hex(s, 2, &len);
- if (utf && PL_lex_inwhat == OP_TRANS &&
- utf != (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF))
- {
- d = (char*)uv_to_utf8((U8*)d, uv); /* doing a CU or UC */
+ uv = (UV)scan_hex(s, 2, &len);
+ s += len;
+ }
+
+ NUM_ESCAPE_INSERT:
+ /* Insert oct or hex escaped character.
+ * There will always enough room in sv since such escapes will
+ * be longer than any utf8 sequence they can end up as
+ */
+ if (uv > 127) {
+ if (!thisutf && !has_utf && uv > 256) {
+ /* might need to recode whatever we have accumulated so far
+ * if it contains any hibit chars
+ */
+ int hicount = 0;
+ char *c;
+ for (c = SvPVX(sv); c < d; c++) {
+ if (*c & 0x80)
+ hicount++;
+ }
+ if (hicount) {
+ char *old_pvx = SvPVX(sv);
+ char *src, *dst;
+ d = SvGROW(sv, SvCUR(sv) + hicount + 1) + (d - old_pvx);
+
+ src = d - 1;
+ d += hicount;
+ dst = d - 1;
+
+ while (src < dst) {
+ if (*src & 0x80) {
+ dst--;
+ uv_to_utf8((U8*)dst, (U8)*src--);
+ dst--;
+ }
+ else {
+ *dst-- = *src--;
+ }
+ }
+ }
+ }
+
+ if (thisutf || uv > 256) {
+ d = (char*)uv_to_utf8((U8*)d, uv);
has_utf = TRUE;
- }
+ }
else {
- if (uv >= 127 && UTF) {
- dTHR;
- if (ckWARN(WARN_UTF8))
- Perl_warner(aTHX_ WARN_UTF8,
- "\\x%.*s will produce malformed UTF-8 character; use \\x{%.*s} for that",
- (int)len,s,(int)len,s);
- }
- *d++ = (char)uv;
+ *d++ = (char)uv;
}
- s += len;
+ }
+ else {
+ *d++ = (char)uv;
}
continue;
Index: t/pragma/warn/doop
===================================================================
RCS file: /local/perl/build/CVSROOT/perl5.6tobe/t/pragma/warn/doop,v
retrieving revision 1.1.1.1
diff -u -p -u -p -r1.1.1.1 doop
--- t/pragma/warn/doop 2000/02/05 20:06:21 1.1.1.1
+++ t/pragma/warn/doop 2000/02/10 23:53:17
@@ -1,29 +1,6 @@
- doop.c AOK
-
- \x%s will produce malformed UTF-8 character; use \x{%s} for that
-
-
-__END__
# doop.c
use utf8 ;
$_ = "\x80 \xff" ;
chop ;
EXPECT
########
-# doop.c
-BEGIN {
- if (ord("\t") == 5) {
- print "SKIPPED\n# Character codes differ on ebcdic machines.";
- exit 0;
- }
-}
-use warnings 'utf8' ;
-use utf8 ;
-$_ = "\x80 \xff" ;
-chop ;
-no warnings 'utf8' ;
-$_ = "\x80 \xff" ;
-chop ;
-EXPECT
-\x80 will produce malformed UTF-8 character; use \x{80} for that at - line 10.
-\xff will produce malformed UTF-8 character; use \x{ff} for that at - line 10.
Index: t/pragma/warn/pp
===================================================================
RCS file: /local/perl/build/CVSROOT/perl5.6tobe/t/pragma/warn/pp,v
retrieving revision 1.1.1.1
diff -u -p -u -p -r1.1.1.1 pp
--- t/pragma/warn/pp 2000/02/05 20:06:21 1.1.1.1
+++ t/pragma/warn/pp 2000/02/10 23:54:10
@@ -28,11 +28,6 @@
Constant subroutine %s undefined <<<TODO
Constant subroutine (anonymous) undefined <<<TODO
- Mandatory Warnings
- ------------------
- Malformed UTF-8 character (not tested: difficult to produce with
- perl now)
-
__END__
# pp.c
use warnings 'substr' ;
@@ -111,20 +106,3 @@ $_ = "\x80 \xff" ;
reverse ;
EXPECT
########
-# pp.c
-BEGIN {
- if (ord("\t") == 5) {
- print "SKIPPED\n# Character codes differ on ebcdic machines.";
- exit 0;
- }
-}
-use warnings 'utf8' ;
-use utf8 ;
-$_ = "\x80 \xff" ;
-reverse ;
-no warnings 'utf8' ;
-$_ = "\x80 \xff" ;
-reverse ;
-EXPECT
-\x80 will produce malformed UTF-8 character; use \x{80} for that at - line 10.
-\xff will produce malformed UTF-8 character; use \x{ff} for that at - line 10.
Index: t/pragma/warn/sv
===================================================================
RCS file: /local/perl/build/CVSROOT/perl5.6tobe/t/pragma/warn/sv,v
retrieving revision 1.1.1.1
diff -u -p -u -p -r1.1.1.1 sv
--- t/pragma/warn/sv 2000/02/05 20:06:22 1.1.1.1
+++ t/pragma/warn/sv 2000/02/10 23:54:48
@@ -269,25 +269,6 @@ EXPECT
Undefined value assigned to typeglob at - line 3.
########
# sv.c
-BEGIN {
- if (ord("\t") == 5) {
- print "SKIPPED\n# ebcdic \\x characters differ.";
- exit 0;
- }
-}
-use utf8 ;
-$^W =0 ;
-{
- use warnings 'utf8' ;
- my $a = rindex "a\xff bc ", "bc" ;
- no warnings 'utf8' ;
- $a = rindex "a\xff bc ", "bc" ;
-}
-my $a = rindex "a\xff bc ", "bc" ;
-EXPECT
-\xff will produce malformed UTF-8 character; use \x{ff} for that at - line 12.
-########
-# sv.c
use warnings 'misc';
use Config;
BEGIN {
Index: t/pragma/warn/toke
===================================================================
RCS file: /local/perl/build/CVSROOT/perl5.6tobe/t/pragma/warn/toke,v
retrieving revision 1.1.1.2
diff -u -p -u -p -r1.1.1.2 toke
--- t/pragma/warn/toke 2000/02/09 22:25:33 1.1.1.2
+++ t/pragma/warn/toke 2000/02/10 23:55:18
@@ -89,10 +89,6 @@ toke.c AOK
sub time {}
my $a = time()
- \x%.*s will produce malformed UTF-8 character; use \x{%.*s} for that
- use utf8 ;
- $_ = "\xffe"
-
Unrecognized escape \\%c passed through
$a = "\m" ;
@@ -445,21 +441,6 @@ eval <<'EOE';
EOE
EXPECT
-########
-# toke.c
-BEGIN {
- if (ord("\t") == 5) {
- print "SKIPPED\n# Ebcdic platforms have different \\x constructs.";
- exit 0;
- }
-}
-use warnings 'utf8' ;
-use utf8 ;
-$_ = " \xffe " ;
-no warnings 'utf8' ;
-$_ = " \xffe " ;
-EXPECT
-\xff will produce malformed UTF-8 character; use \x{ff} for that at - line 10.
########
# toke.c
my $a = rand + 4 ;
Index: t/pragma/warn/utf8
===================================================================
RCS file: /local/perl/build/CVSROOT/perl5.6tobe/t/pragma/warn/utf8,v
retrieving revision 1.1.1.1
diff -u -p -u -p -r1.1.1.1 utf8
--- t/pragma/warn/utf8 2000/02/05 20:06:22 1.1.1.1
+++ t/pragma/warn/utf8 2000/02/10 23:45:59
@@ -14,48 +14,16 @@
<<<<<< Add a test when somethig actually calls utf16_to_utf8
__END__
-# utf8.c [utf8_to_uv]
+# utf8.c [utf8_to_uv] -W
use utf8 ;
-my $a = ord "\x80" ;
-EXPECT
-########
-# utf8.c [utf8_to_uv]
-BEGIN {
- if (ord("\t") == 5) {
- print "SKIPPED\n# Ebcdic platforms have different \\x constructs.";
- exit 0;
- }
-}
-use utf8 ;
-my $a = ord "\x80" ;
+my $a = "snøstorm" ;
{
- use warnings 'utf8' ;
- my $a = ord "\x80" ;
no warnings 'utf8' ;
- my $a = ord "\x80" ;
-}
-EXPECT
-\x80 will produce malformed UTF-8 character; use \x{80} for that at - line 12.
-########
-# utf8.c [utf8_to_uv]
-use utf8 ;
-my $a = ord "\xf080" ;
-EXPECT
-########
-# utf8.c [utf8_to_uv]
-BEGIN {
- if (ord("\t") == 5) {
- print "SKIPPED\n# Ebcdic platforms have different \\x constructs.";
- exit 0;
- }
-}
-use utf8 ;
-my $a = ord "\xf080" ;
-{
+ my $a = "snøstorm";
use warnings 'utf8' ;
- my $a = ord "\xf080" ;
- no warnings 'utf8' ;
- my $a = ord "\xf080" ;
+ my $a = "snøstorm";
}
EXPECT
-\xf0 will produce malformed UTF-8 character; use \x{f0} for that at - line 12.
+Malformed UTF-8 character at - line 3.
+Malformed UTF-8 character at - line 8.
+########