develooper Front page | perl.perl5.porters | Postings from September 2000

[PATCH] Fix vec() / utf8 (was Re: bitvec ops still broken with utf8 -- or not?)

Thread Next
From:
Mike Guy
Date:
September 1, 2000 09:43
Subject:
[PATCH] Fix vec() / utf8 (was Re: bitvec ops still broken with utf8 -- or not?)
Message ID:
E13Utuf-0004Bw-00@draco.cus.cam.ac.uk
Hugo <hv@crypt.compulink.co.uk> wrote
> In <E13Ppdr-00032K-00@libra.cus.cam.ac.uk>, Mike Guy writes:
> :Noone objected to that suggestion, or indeed supported it, apart from
> :a response from Ilya which I didn't (and still don't) understand.
>
> I think he was suggesting that only the characters in the bit-range
> specified by your vec() command should be required to be <= 255.

Ah, now I understand.    But that isn't really a practical course of
action.     When actually doing the vec() bit twiddling, you have
to have (the relevant part of) the string in bytes encoding.     And
the idea of a string which is part UTF8 and part bytes ...

And in any case, if you really want to do that sort of thing, you
can do

       vec(substr($x,0,1),3,1) = 1;   # set flag 3 in byte 0

I also don't think Spider's suggestion of operating on the UFT8 encoding
is a starter  -  it's just a mechanism for creating invalid UTF8
encodings.

So the attached patch (for bleedperl) follows my original proposal
to fault any characters > 255.


NOTE:   this patch changes the spec for the Perl_utf8_to_bytes
subroutine.    Firstly, it updates the length  -  otherwise there's
no way of finding this information.    Also it does a prepass to
check validity, so you don't get left with a mixed encoding string
on failure.

This is a newly introduced routine, not currently used by anyone.    
And without the above two changes, it's essentially unusable.


        WARNING     Patch needs   make regen_headers


Mike Guy


--- ./utf8.c.orig	Fri Sep  1 16:03:57 2000
+++ ./utf8.c	Fri Sep  1 17:22:03 2000
@@ -204,7 +204,8 @@
     return uv;
 }
 
-/* utf8_distance(a,b) is intended to be a - b in pointer arithmetic */
+/* utf8_distance(a,b) returns the number of UTF8 characters between
+   the pointers a and b							*/
 
 I32
 Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
@@ -247,40 +248,46 @@
 }
 
 /*
-=for apidoc Am|U8 *|utf8_to_bytes|U8 *s|STRLEN len
+=for apidoc Am|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
 
-Converts a string C<s> of length C<len> from UTF8 into ASCII encoding.
-Unlike C<bytes_to_utf8>, this over-writes the original string.
-Returns zero on failure after converting as much as possible.
+Converts a string C<s> of length C<len> from UTF8 into byte encoding.
+Unlike C<bytes_to_utf8>, this over-writes the original string, and
+updates len to contain the new length.
+Returns zero on failure leaving the string and len unchanged
 
 =cut
 */
 
 U8 *
-Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN len)
+Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len)
 {
     dTHR;
     U8 *send;
     U8 *d;
     U8 *save;
 
-    send = s + len;
+    send = s + *len;
     d = save = s;
+
+    /* ensure valid UTF8 and chars < 256 before updating string */
+    while (s < send) {
+	U8 c = *s++;
+        if (c >= 0x80 &&
+	    ( (s >= send) || ((*s++ & 0xc0) != 0x80) || ((c & 0xfe) != 0xc2)))
+	    return 0;    
+    }
+    s = save;
     while (s < send) {
         if (*s < 0x80)
             *d++ = *s++;
         else {
             I32 ulen;
-            UV uv = utf8_to_uv(s, &ulen);
-            if (uv > 255) {
-                *d = '\0';
-                return 0;
-            }
+            *d++ = (U8)utf8_to_uv(s, &ulen);
             s += ulen;
-            *d++ = (U8)uv;
         }
     }
     *d = '\0';
+    *len = d - save;
     return save;
 }
 
--- ./t/op/vec.t.orig	Fri Sep  1 16:03:57 2000
+++ ./t/op/vec.t	Fri Sep  1 15:41:50 2000
@@ -1,6 +1,6 @@
 #!./perl
 
-print "1..23\n";
+print "1..30\n";
 
 print vec($foo,0,1) == 0 ? "ok 1\n" : "not ok 1\n";
 print length($foo) == 0 ? "ok 2\n" : "not ok 2\n";
@@ -48,3 +48,32 @@
 print "ok 22\n";
 print "not " if vec('abcd', 7, 8);
 print "ok 23\n";
+
+# UTF8
+# N.B. currently curiously coded to circumvent bugs elswhere in UTF8 handling
+
+$foo = "\x{100}" . "\xff\xfe";
+$x = substr $foo, 1;
+print "not " if vec($x, 0, 8) != 255;
+print "ok 24\n";
+eval { vec($foo, 1, 8) };
+print "not " unless $@ =~ /^Character > 255 in vec\(\) /;
+print "ok 25\n";
+eval { vec($foo, 1, 8) = 13 };
+print "not " unless $@ =~ /^Character > 255 in vec\(\) /;
+print "ok 26\n";
+print "not " if $foo ne "\x{100}" . "\xff\xfe";
+print "ok 27\n";
+$x = substr $foo, 1;
+vec($x, 2, 4) = 7;
+print "not " if $x ne "\xff\xf7";
+print "ok 28\n";
+
+# mixed magic
+
+$foo = "\x61\x62\x63\x64\x65\x66";
+print "not " if vec(substr($foo, 2, 2), 0, 16) != 25444;
+print "ok 29\n";
+vec(substr($foo, 1,3), 5, 4) = 3;
+print "not " if $foo ne "\x61\x62\x63\x34\x65\x66";
+print "ok 30\n";
--- ./pod/perlfunc.pod.orig	Fri Sep  1 16:03:57 2000
+++ ./pod/perlfunc.pod	Fri Sep  1 15:41:50 2000
@@ -5510,6 +5510,9 @@
 extend the string with sufficiently many zero bytes.   It is an error
 to try to write off the beginning of the string (i.e. negative OFFSET).
 
+The string must not contain any character with value > 255 (which
+can only happen if you're using UTF8 encoding).
+
 Strings created with C<vec> can also be manipulated with the logical
 operators C<|>, C<&>, C<^>, and C<~>.  These operators will assume a bit
 vector operation is desired when both operands are strings.
--- ./pod/perldiag.pod.orig	Fri Sep  1 16:03:57 2000
+++ ./pod/perldiag.pod	Fri Sep  1 15:41:51 2000
@@ -1043,6 +1043,11 @@
 with an assignment operator, which implies modifying the value itself.
 Perhaps you need to copy the value to a temporary, and repeat that.
 
+=item Character > 255 in vec()
+
+(F) You applied the vec() function to a UTF8 string which contained
+a character > 255.   vec() currently only operates on characters < 256.
+
 =item chmod() mode argument is missing initial 0
 
 (W chmod) A novice will sometimes say
--- ./doop.c.orig	Fri Sep  1 16:03:57 2000
+++ ./doop.c	Fri Sep  1 17:22:34 2000
@@ -537,7 +537,8 @@
 	SvTAINTED_on(sv);
 }
 
-/* XXX SvUTF8 support missing! */
+/* currently converts input to bytes if needed and croaks if a character
+   > 255 is encountered							*/
 UV
 Perl_do_vecget(pTHX_ SV *sv, I32 offset, I32 size)
 {
@@ -549,6 +550,16 @@
 	return retnum;
     if (size < 1 || (size & (size-1))) /* size < 1 or not a power of two */ 
 	Perl_croak(aTHX_ "Illegal number of bits in vec");
+
+    if (SvUTF8(sv)) {
+	if (Perl_utf8_to_bytes(aTHX_ (U8*) s, &srclen)) {
+	    SvUTF8_off(sv);
+	    SvCUR_set(sv, srclen);
+	}
+	else
+	    Perl_croak(aTHX_ "Character > 255 in vec()");
+    }
+
     offset *= size;	/* turn into bit offset */
     len = (offset + size + 7) / 8;	/* required number of bytes */
     if (len > srclen) {
@@ -670,7 +681,8 @@
     return retnum;
 }
 
-/* XXX SvUTF8 support missing! */
+/* currently converts input to bytes if needed and croaks if a character
+   > 255 is encountered							*/
 void
 Perl_do_vecset(pTHX_ SV *sv)
 {
@@ -686,6 +698,15 @@
     if (!targ)
 	return;
     s = (unsigned char*)SvPV_force(targ, targlen);
+    if (SvUTF8(targ)) {
+	if (Perl_utf8_to_bytes(aTHX_ (U8*) s, &targlen)) {
+	/*  SvUTF8_off(targ);   SvPOK_only below ensures this  */
+	    SvCUR_set(targ, targlen);
+	}
+	else
+	    Perl_croak(aTHX_ "Character > 255 in vec()");
+    }
+
     (void)SvPOK_only(targ);
     lval = SvUV(sv);
     offset = LvTARGOFF(sv);
--- ./embed.pl.orig	Fri Sep  1 16:03:57 2000
+++ ./embed.pl	Fri Sep  1 15:41:51 2000
@@ -2071,7 +2071,7 @@
 Ap	|U8*	|utf16_to_utf8_reversed|U8* p|U8 *d|I32 bytelen|I32 *newlen
 Ap	|I32	|utf8_distance	|U8 *a|U8 *b
 Ap	|U8*	|utf8_hop	|U8 *s|I32 off
-ApM	|U8*	|utf8_to_bytes	|U8 *s|STRLEN len
+ApM	|U8*	|utf8_to_bytes	|U8 *s|STRLEN *len
 ApM	|U8*	|bytes_to_utf8	|U8 *s|STRLEN *len
 Ap	|UV	|utf8_to_uv	|U8 *s|I32* retlen
 Ap	|U8*	|uv_to_utf8	|U8 *d|UV uv

End of patch

Thread Next


nntp.perl.org: Perl Programming lists via nntp and http.
Comments to Ask Bjørn Hansen at ask@perl.org | Group listing | About