develooper Front page | perl.perl5.porters | Postings from November 2000

[ID 20001113.003] utf8_to_uv on malformed utf returns wrong values

Thread Next
From:
sthoenna
Date:
November 13, 2000 14:52
Subject:
[ID 20001113.003] utf8_to_uv on malformed utf returns wrong values
Message ID:
200011132249.eADMnek09679@garcia.efn.org
This is a bug report for perl from sthoenna@efn.org,
generated with the help of perlbug 1.33 running under perl v5.7.0.


-----------------------------------------------------------------
[Please enter your report here]

utf8_to_uv on malformed utf seems to be setting the length to -1 only
if *not* in CHECK_ONLY mode.  This seems to contradict the function
doc.  Also, most places that use utf8 don't check for -1, but just
add the returned length to their pointer, leading to endless loops
like the following:

[D:\perl-current].\perl -we "require v128.65535"
Malformed UTF-8 character (character 0xffff) at -e line 1.
Malformed UTF-8 character (unexpected continuation byte 0x80) at -e line 1.
Malformed UTF-8 character (character 0xffff) at -e line 1.
Malformed UTF-8 character (unexpected continuation byte 0x80) at -e line 1.
Malformed UTF-8 character (character 0xffff) at -e line 1.
Malformed UTF-8 character (unexpected continuation byte 0x80) at -e line 1.
Malformed UTF-8 character (character 0xffff) at -e line 1.
Malformed UTF-8 character (unexpected continuation byte 0x80) at -e line 1.
Malformed UTF-8 character (character 0xffff) at -e line 1.
Malformed UTF-8 character (unexpected continuation byte 0x80) at -e line 1.
Malformed UTF-8 character (character 0xffff) at -e line 1.
Malformed UTF-8 character (unexpected continuation byte 0x80) at -e line 1.
Malformed UTF-8 character (character 0xffff) at -e line 1.
Malformed UTF-8 character (unexpected continuation byte 0x80) at -e line 1.
Malformed UTF-8 character (character 0xffff) at -e line 1.
...ad nauseam.

Here the pointer (in str_to_version) is alternating between the first byte
of the 128, the first byte of the 65535, and the second byte of the 128.

Additionally, by the time we jump to the malformed label in utf8_to_uv,
the len may have been set to -1 (by the while (len--) loop).

A patch follows.  It makes CHECK_ONLY return a length of -1 on bad utf
and non-CHECK_ONLY return a length based on the first char (and thus
compatible with UTF8SKIP).  We may want to change this to return a shorter length
if, for example, we didn't get a continuation byte where we expected
one.

Another issue is that the doc for the function seems to imply that no
warning will be issued if a problem is found in CHECK_ONLY mode. This
doesn't seem to coincide with the code.

--- utf8.c.orig	Tue Nov  7 12:50:10 2000
+++ utf8.c	Sun Nov 12 22:22:46 2000
@@ -312,12 +312,12 @@
 
     if (flags & UTF8_CHECK_ONLY) {
 	if (retlen)
-	    *retlen = len;
+	    *retlen = -1;
 	return 0;
     }
 
     if (retlen)
-	*retlen = -1;
+	*retlen = expectlen ? expectlen : len;
 
     return UNICODE_REPLACEMENT_CHARACTER;
 }
End of Patch.
--- toke.c.orig	Wed Nov  8 16:08:32 2000
+++ toke.c	Mon Nov 13 12:23:48 2000
@@ -1332,7 +1332,7 @@
 	    UV uv;
 
 	    uv = utf8_to_uv((U8*)s, send - s, &len, UTF8_CHECK_ONLY);
-	    if (len == 1) {
+	    if (len == (STRLEN)-1) {
 		/* Illegal UTF8 (a high-bit byte), make it valid. */
 		char *old_pvx = SvPVX(sv);
 		/* need space for one extra char (NOTE: SvCUR() not set here) */
End of Patch.

As a bonus, here is a patch to restrict the number of allowable problems
for the bit-arithmetic ops:

--- utf8.h.orig	Wed Oct 25 12:20:46 2000
+++ utf8.h	Sun Nov 12 22:38:36 2000
@@ -41,6 +41,8 @@
 #define UTF8_ALLOW_BOM			0x0020
 #define UTF8_ALLOW_FFFF			0x0040
 #define UTF8_ALLOW_LONG			0x0080
+#define UTF8_ALLOW_ANYUV		(UTF8_ALLOW_FE_FF|UTF8_ALLOW_FFFF \
+					|UTF8_ALLOW_BOM|UTF8_ALLOW_SURROGATE)
 #define UTF8_ALLOW_ANY			0x00ff
 #define UTF8_CHECK_ONLY			0x0100
 
--- doop.c.orig	Tue Oct 31 07:20:22 2000
+++ doop.c	Sun Nov 12 22:50:44 2000
@@ -968,10 +968,10 @@
 	switch (optype) {
 	case OP_BIT_AND:
 	    while (lulen && rulen) {
-		luc = utf8_to_uv((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANY);
+		luc = utf8_to_uv((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANYUV);
 		lc += ulen;
 		lulen -= ulen;
-		ruc = utf8_to_uv((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANY);
+		ruc = utf8_to_uv((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANYUV);
 		rc += ulen;
 		rulen -= ulen;
 		duc = luc & ruc;
@@ -983,10 +983,10 @@
 	    break;
 	case OP_BIT_XOR:
 	    while (lulen && rulen) {
-		luc = utf8_to_uv((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANY);
+		luc = utf8_to_uv((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANYUV);
 		lc += ulen;
 		lulen -= ulen;
-		ruc = utf8_to_uv((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANY);
+		ruc = utf8_to_uv((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANYUV);
 		rc += ulen;
 		rulen -= ulen;
 		duc = luc ^ ruc;
@@ -995,10 +995,10 @@
 	    goto mop_up_utf;
 	case OP_BIT_OR:
 	    while (lulen && rulen) {
-		luc = utf8_to_uv((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANY);
+		luc = utf8_to_uv((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANYUV);
 		lc += ulen;
 		lulen -= ulen;
-		ruc = utf8_to_uv((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANY);
+		ruc = utf8_to_uv((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANYUV);
 		rc += ulen;
 		rulen -= ulen;
 		duc = luc | ruc;
--- pp.c.orig	Mon Nov 13 12:03:46 2000
+++ pp.c	Mon Nov 13 13:51:36 2000
@@ -1486,7 +1486,7 @@
 
 	  send = tmps + len;
 	  while (tmps < send) {
-	    UV c = utf8_to_uv(tmps, 0, &l, UTF8_ALLOW_ANY);
+	    UV c = utf8_to_uv(tmps, send-tmps, &l, UTF8_ALLOW_ANYUV);
 	    tmps += UTF8SKIP(tmps);
 	    targlen += UNISKIP(~c);
 	    nchar++;
@@ -1500,7 +1500,7 @@
 	  if (nwide) {
 	      Newz(0, result, targlen + 1, U8);
 	      while (tmps < send) {
-		  UV c = utf8_to_uv(tmps, 0, &l, UTF8_ALLOW_ANY);
+		  UV c = utf8_to_uv(tmps, send-tmps, &l, UTF8_ALLOW_ANYUV);
 		  tmps += UTF8SKIP(tmps);
 		  result = uv_to_utf8(result, ~c);
 	      }
End of Patch.

[Please do not change anything below this line]
-----------------------------------------------------------------
---
Flags:
    category=core
    severity=low
---
Site configuration information for perl v5.7.0:

Configured by sthoenna at Mon Nov 13 14:07:20 PST 2000.

Summary of my perl5 (revision 5.0 version 7 subversion 0) configuration:
  Platform:
    osname=os2, osvers=2.30, archname=os2-64int-ld
    uname='os2 efn.org 2 2.30 i386  '
    config_args='-de -Dprefix=d:/perl -Dusedevel -Duse64bitint -Duselongdouble -Aoptimize=-DDEBUGGING'
    hint=recommended, useposix=true, d_sigaction=define
    usethreads=undef use5005threads=undef useithreads=undef usemultiplicity=undef
    useperlio=undef d_sfio=undef uselargefiles=define usesocks=undef
    use64bitint=define use64bitall=undef uselongdouble=define
  Compiler:
    cc='gcc', ccflags ='-Zomf -Zmt -DDOSISH -DOS2=2 -DEMBED -I. -D_EMX_CRT_REV_=63',
    optimize='-O2 -fomit-frame-pointer -malign-loops=2 -malign-jumps=2 -malign-functions=2 -s -DDEBUGGING',
    cppflags='-Zomf -Zmt -DDOSISH -DOS2=2 -DEMBED -I. -D_EMX_CRT_REV_=63'
    ccversion='', gccversion='2.8.1', gccosandvers=''
    intsize=4, longsize=4, ptrsize=4, doublesize=8, byteorder=12345678
    d_longlong=define, longlongsize=8, d_longdbl=define, longdblsize=12
    ivtype='long long', ivsize=8, nvtype='long double', nvsize=12, Off_t='off_t', lseeksize=4
    alignbytes=4, usemymalloc=y, prototype=define
  Linker and Libraries:
    ld='gcc', ldflags ='-Zexe -Zomf -Zmt -Zcrtdll -Zstack 32000'
    libpth=d:/emx/lib d:/emx/lib/mt
    libs=-lsocket -lm -lbsd
    perllibs=-lsocket -lm -lbsd
    libc=d:/emx/lib/mt/c_import.lib, so=dll, useshrplib=true, libperl=libperl.lib
  Dynamic Linking:
    dlsrc=dl_dlopen.xs, dlext=dll, d_dlsymun=undef, ccdlflags=' '
    cccdlflags='-Zdll', lddlflags='-Zdll -Zomf -Zmt -Zcrtdll -s'

Locally applied patches:
    DEVEL7673

---
@INC for perl v5.7.0:
    lib
    d:/perl/lib/5.7.0/os2-64int-ld
    d:/perl/lib/5.7.0
    d:/perl/lib/site_perl/5.7.0/os2-64int-ld
    d:/perl/lib/site_perl/5.7.0
    d:/perl/lib/site_perl
    .

---
Environment for perl v5.7.0:
    HOME=d:\home\sthoenna
    LANG=en_us
    LANGUAGE (unset)
    LD_LIBRARY_PATH (unset)
    LOGDIR=sthoenna
    PATH=d:\bin;C:\OS2;d:\perl\bin;C:\OS2\SYSTEM;C:\OS2\INSTALL;C:\;C:\OS2\MDOS;C:\OS2\APPS;C:\MMOS2;d:\os2apps\util;d:\DOSAPPS\UTIL;c:\sio;D:\WINDOWS;d:\pdksh;d:\emx\bin;d:\emacs\19.33\bin;d:\ispell
    PERL_BADLANG (unset)
    PERL_SH_DIR=d:\BIN
    SHELL (unset)


Thread Next


nntp.perl.org: Perl Programming lists via nntp and http.
Comments to Ask Bjørn Hansen at ask@perl.org | Group listing | About