Front page | perl.perl5.porters |
Postings from November 2000
[ID 20001113.003] utf8_to_uv on malformed utf returns wrong values
Thread Next
From:
sthoenna
Date:
November 13, 2000 14:52
Subject:
[ID 20001113.003] utf8_to_uv on malformed utf returns wrong values
Message ID:
200011132249.eADMnek09679@garcia.efn.org
This is a bug report for perl from sthoenna@efn.org,
generated with the help of perlbug 1.33 running under perl v5.7.0.
-----------------------------------------------------------------
[Please enter your report here]
utf8_to_uv on malformed utf seems to be setting the length to -1 only
if *not* in CHECK_ONLY mode. This seems to contradict the function
doc. Also, most places that use utf8 don't check for -1, but just
add the returned length to their pointer, leading to endless loops
like the following:
[D:\perl-current].\perl -we "require v128.65535"
Malformed UTF-8 character (character 0xffff) at -e line 1.
Malformed UTF-8 character (unexpected continuation byte 0x80) at -e line 1.
Malformed UTF-8 character (character 0xffff) at -e line 1.
Malformed UTF-8 character (unexpected continuation byte 0x80) at -e line 1.
Malformed UTF-8 character (character 0xffff) at -e line 1.
Malformed UTF-8 character (unexpected continuation byte 0x80) at -e line 1.
Malformed UTF-8 character (character 0xffff) at -e line 1.
Malformed UTF-8 character (unexpected continuation byte 0x80) at -e line 1.
Malformed UTF-8 character (character 0xffff) at -e line 1.
Malformed UTF-8 character (unexpected continuation byte 0x80) at -e line 1.
Malformed UTF-8 character (character 0xffff) at -e line 1.
Malformed UTF-8 character (unexpected continuation byte 0x80) at -e line 1.
Malformed UTF-8 character (character 0xffff) at -e line 1.
Malformed UTF-8 character (unexpected continuation byte 0x80) at -e line 1.
Malformed UTF-8 character (character 0xffff) at -e line 1.
...ad nauseam.
Here the pointer (in str_to_version) is alternating between the first byte
of the 128, the first byte of the 65535, and the second byte of the 128.
Additionally, by the time we jump to the malformed label in utf8_to_uv,
the len may have been set to -1 (by the while (len--) loop).
A patch follows. It makes CHECK_ONLY return a length of -1 on bad utf
and non-CHECK_ONLY return a length based on the first char (compatible
with UTF8SKIP). We may want to change this to return a shorter length
if, for example, we didn't get a continuation byte where we expected
one.
Another issue is that the doc for the function seems to imply that no
warning will be issued if a problem is found in CHECK_ONLY mode. This
doesn't seem to match the code.
--- utf8.c.orig Tue Nov 7 12:50:10 2000
+++ utf8.c Sun Nov 12 22:22:46 2000
@@ -312,12 +312,12 @@
if (flags & UTF8_CHECK_ONLY) {
if (retlen)
- *retlen = len;
+ *retlen = -1;
return 0;
}
if (retlen)
- *retlen = -1;
+ *retlen = expectlen ? expectlen : len;
return UNICODE_REPLACEMENT_CHARACTER;
}
End of Patch.
--- toke.c.orig Wed Nov 8 16:08:32 2000
+++ toke.c Mon Nov 13 12:23:48 2000
@@ -1332,7 +1332,7 @@
UV uv;
uv = utf8_to_uv((U8*)s, send - s, &len, UTF8_CHECK_ONLY);
- if (len == 1) {
+ if (len == (STRLEN)-1) {
/* Illegal UTF8 (a high-bit byte), make it valid. */
char *old_pvx = SvPVX(sv);
/* need space for one extra char (NOTE: SvCUR() not set here) */
End of Patch.
As a bonus, here is a patch to restrict the number of allowable problems
for the bit-arithmetic ops:
--- utf8.h.orig Wed Oct 25 12:20:46 2000
+++ utf8.h Sun Nov 12 22:38:36 2000
@@ -41,6 +41,8 @@
#define UTF8_ALLOW_BOM 0x0020
#define UTF8_ALLOW_FFFF 0x0040
#define UTF8_ALLOW_LONG 0x0080
+#define UTF8_ALLOW_ANYUV (UTF8_ALLOW_FE_FF|UTF8_ALLOW_FFFF \
+ |UTF8_ALLOW_BOM|UTF8_ALLOW_SURROGATE)
#define UTF8_ALLOW_ANY 0x00ff
#define UTF8_CHECK_ONLY 0x0100
--- doop.c.orig Tue Oct 31 07:20:22 2000
+++ doop.c Sun Nov 12 22:50:44 2000
@@ -968,10 +968,10 @@
switch (optype) {
case OP_BIT_AND:
while (lulen && rulen) {
- luc = utf8_to_uv((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANY);
+ luc = utf8_to_uv((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANYUV);
lc += ulen;
lulen -= ulen;
- ruc = utf8_to_uv((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANY);
+ ruc = utf8_to_uv((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANYUV);
rc += ulen;
rulen -= ulen;
duc = luc & ruc;
@@ -983,10 +983,10 @@
break;
case OP_BIT_XOR:
while (lulen && rulen) {
- luc = utf8_to_uv((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANY);
+ luc = utf8_to_uv((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANYUV);
lc += ulen;
lulen -= ulen;
- ruc = utf8_to_uv((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANY);
+ ruc = utf8_to_uv((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANYUV);
rc += ulen;
rulen -= ulen;
duc = luc ^ ruc;
@@ -995,10 +995,10 @@
goto mop_up_utf;
case OP_BIT_OR:
while (lulen && rulen) {
- luc = utf8_to_uv((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANY);
+ luc = utf8_to_uv((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANYUV);
lc += ulen;
lulen -= ulen;
- ruc = utf8_to_uv((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANY);
+ ruc = utf8_to_uv((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANYUV);
rc += ulen;
rulen -= ulen;
duc = luc | ruc;
--- pp.c.orig Mon Nov 13 12:03:46 2000
+++ pp.c Mon Nov 13 13:51:36 2000
@@ -1486,7 +1486,7 @@
send = tmps + len;
while (tmps < send) {
- UV c = utf8_to_uv(tmps, 0, &l, UTF8_ALLOW_ANY);
+ UV c = utf8_to_uv(tmps, send-tmps, &l, UTF8_ALLOW_ANYUV);
tmps += UTF8SKIP(tmps);
targlen += UNISKIP(~c);
nchar++;
@@ -1500,7 +1500,7 @@
if (nwide) {
Newz(0, result, targlen + 1, U8);
while (tmps < send) {
- UV c = utf8_to_uv(tmps, 0, &l, UTF8_ALLOW_ANY);
+ UV c = utf8_to_uv(tmps, send-tmps, &l, UTF8_ALLOW_ANYUV);
tmps += UTF8SKIP(tmps);
result = uv_to_utf8(result, ~c);
}
End of Patch.
[Please do not change anything below this line]
-----------------------------------------------------------------
---
Flags:
category=core
severity=low
---
Site configuration information for perl v5.7.0:
Configured by sthoenna at Mon Nov 13 14:07:20 PST 2000.
Summary of my perl5 (revision 5.0 version 7 subversion 0) configuration:
Platform:
osname=os2, osvers=2.30, archname=os2-64int-ld
uname='os2 efn.org 2 2.30 i386 '
config_args='-de -Dprefix=d:/perl -Dusedevel -Duse64bitint -Duselongdouble -Aoptimize=-DDEBUGGING'
hint=recommended, useposix=true, d_sigaction=define
usethreads=undef use5005threads=undef useithreads=undef usemultiplicity=undef
useperlio=undef d_sfio=undef uselargefiles=define usesocks=undef
use64bitint=define use64bitall=undef uselongdouble=define
Compiler:
cc='gcc', ccflags ='-Zomf -Zmt -DDOSISH -DOS2=2 -DEMBED -I. -D_EMX_CRT_REV_=63',
optimize='-O2 -fomit-frame-pointer -malign-loops=2 -malign-jumps=2 -malign-functions=2 -s -DDEBUGGING',
cppflags='-Zomf -Zmt -DDOSISH -DOS2=2 -DEMBED -I. -D_EMX_CRT_REV_=63'
ccversion='', gccversion='2.8.1', gccosandvers=''
intsize=4, longsize=4, ptrsize=4, doublesize=8, byteorder=12345678
d_longlong=define, longlongsize=8, d_longdbl=define, longdblsize=12
ivtype='long long', ivsize=8, nvtype='long double', nvsize=12, Off_t='off_t', lseeksize=4
alignbytes=4, usemymalloc=y, prototype=define
Linker and Libraries:
ld='gcc', ldflags ='-Zexe -Zomf -Zmt -Zcrtdll -Zstack 32000'
libpth=d:/emx/lib d:/emx/lib/mt
libs=-lsocket -lm -lbsd
perllibs=-lsocket -lm -lbsd
libc=d:/emx/lib/mt/c_import.lib, so=dll, useshrplib=true, libperl=libperl.lib
Dynamic Linking:
dlsrc=dl_dlopen.xs, dlext=dll, d_dlsymun=undef, ccdlflags=' '
cccdlflags='-Zdll', lddlflags='-Zdll -Zomf -Zmt -Zcrtdll -s'
Locally applied patches:
DEVEL7673
---
@INC for perl v5.7.0:
lib
d:/perl/lib/5.7.0/os2-64int-ld
d:/perl/lib/5.7.0
d:/perl/lib/site_perl/5.7.0/os2-64int-ld
d:/perl/lib/site_perl/5.7.0
d:/perl/lib/site_perl
.
---
Environment for perl v5.7.0:
HOME=d:\home\sthoenna
LANG=en_us
LANGUAGE (unset)
LD_LIBRARY_PATH (unset)
LOGDIR=sthoenna
PATH=d:\bin;C:\OS2;d:\perl\bin;C:\OS2\SYSTEM;C:\OS2\INSTALL;C:\;C:\OS2\MDOS;C:\OS2\APPS;C:\MMOS2;d:\os2apps\util;d:\DOSAPPS\UTIL;c:\sio;D:\WINDOWS;d:\pdksh;d:\emx\bin;d:\emacs\19.33\bin;d:\ispell
PERL_BADLANG (unset)
PERL_SH_DIR=d:\BIN
SHELL (unset)
Thread Next
-
[ID 20001113.003] utf8_to_uv on malformed utf returns wrong values
by sthoenna