Front page | perl.perl5.porters |
Postings from March 2008
[perl #51710] utf8::valid rejects characters in \x14_FFFF - \x1F_FFFF
From:
Chris Hall
Date:
March 14, 2008 03:11
Subject:
[perl #51710] utf8::valid rejects characters in \x14_FFFF - \x1F_FFFF
Message ID:
rt-3.6.HEAD-25460-1205443112-394.51710-75-0@perl.org
# New Ticket Created by Chris Hall
# Please include the string: [perl #51710]
# in the subject line of all future correspondence about this issue.
# <URL: http://rt.perl.org/rt3/Ticket/Display.html?id=51710 >
This is a bug report for perl from chris.hall@highwayman.com,
generated with the help of perlbug 1.35 running under perl v5.8.8.
-----------------------------------------------------------------
It appears that utf8::valid() disagrees with Encode::encode('utf8', ...)
for characters \x{14_0000} - \x{1F_FFFF}.
I suggest utf8::valid() is broken.
The following:
use strict;
use Encode qw(FB_QUIET LEAVE_SRC);

# Report which perl and which Encode produced the results below.
printf "Perl v%vd & Encode %s\n", $^V, $Encode::VERSION;

# Probe the last code point of block 0 and then the first and last code
# point of every following 64K block:
#   0x0000_FFFF, 0x0001_0000, 0x0001_FFFF, 0x0002_0000, ..., 0x7FFF_FFFF.
# The step expression moves from 0xXXXX_FFFF to the next 0xYYYY_0000 (+1)
# and from 0xYYYY_0000 to 0xYYYY_FFFF (+0xFFFF).
for (my $code = 0xFFFF;
     $code <= 0x7FFF_FFFF;
     $code += ($code & 0xFFFF) ? 1 : 0xFFFF)
{
    my $char = chr($code);

    # Verdict of utf8::valid(), normalised to 0/1.
    my $valid_says = utf8::valid($char) ? 1 : 0;

    # Verdict of Encode: a true result from encode() counts as acceptance.
    # LEAVE_SRC keeps $char intact so it can be dumped afterwards.
    my $encoded     = Encode::encode('utf8', $char, FB_QUIET() | LEAVE_SRC());
    my $encode_says = $encoded ? 1 : 0;

    # Only disagreements between the two checks are of interest.
    next if $valid_says == $encode_says;

    printf "0x%04X_%04X: utf8::valid=%d but Encode::encode=%d ",
           ($code >> 16), $code & 0xFFFF, $valid_says, $encode_says;

    # Drop the UTF-8 flag so split/ord walk the string byte by byte,
    # then show each byte as a \xNN escape.
    Encode::_utf8_off($char);
    my @escaped_bytes = map { sprintf '\x%02X', ord($_) } split(//, $char);
    print @escaped_bytes;
    print "\n";
}
Produces:
Perl v5.8.8 & Encode 2.23
0x0014_0000: utf8::valid=0 but Encode::encode=1 \xF5\x80\x80\x80
0x0014_FFFF: utf8::valid=0 but Encode::encode=1 \xF5\x8F\xBF\xBF
0x0015_0000: utf8::valid=0 but Encode::encode=1 \xF5\x90\x80\x80
0x0015_FFFF: utf8::valid=0 but Encode::encode=1 \xF5\x9F\xBF\xBF
0x0016_0000: utf8::valid=0 but Encode::encode=1 \xF5\xA0\x80\x80
0x0016_FFFF: utf8::valid=0 but Encode::encode=1 \xF5\xAF\xBF\xBF
0x0017_0000: utf8::valid=0 but Encode::encode=1 \xF5\xB0\x80\x80
0x0017_FFFF: utf8::valid=0 but Encode::encode=1 \xF5\xBF\xBF\xBF
0x0018_0000: utf8::valid=0 but Encode::encode=1 \xF6\x80\x80\x80
0x0018_FFFF: utf8::valid=0 but Encode::encode=1 \xF6\x8F\xBF\xBF
0x0019_0000: utf8::valid=0 but Encode::encode=1 \xF6\x90\x80\x80
0x0019_FFFF: utf8::valid=0 but Encode::encode=1 \xF6\x9F\xBF\xBF
0x001A_0000: utf8::valid=0 but Encode::encode=1 \xF6\xA0\x80\x80
0x001A_FFFF: utf8::valid=0 but Encode::encode=1 \xF6\xAF\xBF\xBF
0x001B_0000: utf8::valid=0 but Encode::encode=1 \xF6\xB0\x80\x80
0x001B_FFFF: utf8::valid=0 but Encode::encode=1 \xF6\xBF\xBF\xBF
0x001C_0000: utf8::valid=0 but Encode::encode=1 \xF7\x80\x80\x80
0x001C_FFFF: utf8::valid=0 but Encode::encode=1 \xF7\x8F\xBF\xBF
0x001D_0000: utf8::valid=0 but Encode::encode=1 \xF7\x90\x80\x80
0x001D_FFFF: utf8::valid=0 but Encode::encode=1 \xF7\x9F\xBF\xBF
0x001E_0000: utf8::valid=0 but Encode::encode=1 \xF7\xA0\x80\x80
0x001E_FFFF: utf8::valid=0 but Encode::encode=1 \xF7\xAF\xBF\xBF
0x001F_0000: utf8::valid=0 but Encode::encode=1 \xF7\xB0\x80\x80
0x001F_FFFF: utf8::valid=0 but Encode::encode=1 \xF7\xBF\xBF\xBF
And the same for: Perl v5.10.0 & Encode 2.23
Chris
[Please do not change anything below this line]
-----------------------------------------------------------------
---
Flags:
category=core
severity=medium
---
This perlbug was built using Perl v5.8.8 in the Red Hat build system.
It is being executed now by Perl v5.8.8 - Mon Nov 26 14:25:50 EST 2007.
Site configuration information for perl v5.8.8:
Configured by Red Hat, Inc. at Mon Nov 26 14:25:50 EST 2007.
Summary of my perl5 (revision 5 version 8 subversion 8) configuration:
Platform:
osname=linux, osvers=2.6.20-1.3001.fc6xen, archname=x86_64-linux-thread-multi
uname='linux xenbuilder4.fedora.phx.redhat.com 2.6.20-1.3001.fc6xen #1 smp thu aug 9 16:18:42 edt 2007 x86_64 x86_64 x86_64 gnulinux '
config_args='-des -Doptimize=-O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -
mtune=generic -Dversion=5.8.8 -Dmyhostname=localhost -Dperladmin=root@localhost -Dcc=gcc -Dcf_by=Red Hat, Inc. -Dinstallprefix=/usr -
Dprefix=/usr -Dlibpth=/usr/local/lib64 /lib64 /usr/lib64 -Dprivlib=/usr/lib/perl5/5.8.8 -Dsitelib=/usr/lib/perl5/site_perl/5.8.8 -Dvendorlib=/us
r/lib/perl5/vendor_perl/5.8.8 -Darchlib=/usr/lib64/perl5/5.8.8/x86_64-linux-thread-multi -Dsitearch=/usr/lib64/perl5/site_perl/5.8.8/x86_64-linu
x-thread-multi -Dvendorarch=/usr/lib64/perl5/vendor_perl/5.8.8/x86_64-linux-thread-multi -Darchname=x86_64-linux -Dvendorprefix=/usr -
Dsiteprefix=/usr -Duseshrplib -Dusethreads -Duseithreads -Duselargefiles -Dd_dosuid -Dd_semctl_semun -Di_db -Ui_ndbm -Di_gdbm -Di_shadow -
Di_syslog -Dman3ext=3pm -Duseperlio -Dinstallusrbinperl=n -Ubincompat5005 -Uversiononly -Dpager=/usr/bin/less -isr -Dd_gethostent_r_proto -
Ud_endhostent_r_proto -Ud_sethostent_r_proto -Ud_endprotoent_r_proto -Ud_setprotoent_r_proto -Ud_endservent_r_proto -Ud_setservent_r_proto -
Dinc_version_list=5.8.7 5.8.6 5.8.5 -Dscriptdir=/usr/bin'
hint=recommended, useposix=true, d_sigaction=define
usethreads=define use5005threads=undef useithreads=define usemultiplicity=define
useperlio=define d_sfio=undef uselargefiles=define usesocks=undef
use64bitint=define use64bitall=define uselongdouble=undef
usemymalloc=n, bincompat5005=undef
Compiler:
cc='gcc', ccflags ='-D_REENTRANT -D_GNU_SOURCE -fno-strict-aliasing -pipe -Wdeclaration-after-statement -I/usr/local/include -
D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -I/usr/include/gdbm',
optimize='-O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic',
cppflags='-D_REENTRANT -D_GNU_SOURCE -fno-strict-aliasing -pipe -Wdeclaration-after-statement -I/usr/local/include -I/usr/include/gdbm'
ccversion='', gccversion='4.1.2 20070925 (Red Hat 4.1.2-33)', gccosandvers=''
intsize=4, longsize=8, ptrsize=8, doublesize=8, byteorder=12345678
d_longlong=define, longlongsize=8, d_longdbl=define, longdblsize=16
ivtype='long', ivsize=8, nvtype='double', nvsize=8, Off_t='off_t', lseeksize=8
alignbytes=8, prototype=define
Linker and Libraries:
ld='gcc', ldflags =''
libpth=/usr/local/lib64 /lib64 /usr/lib64
libs=-lresolv -lnsl -lgdbm -ldb -ldl -lm -lcrypt -lutil -lpthread -lc
perllibs=-lresolv -lnsl -ldl -lm -lcrypt -lutil -lpthread -lc
libc=, so=so, useshrplib=true, libperl=libperl.so
gnulibc_version='2.7'
Dynamic Linking:
dlsrc=dl_dlopen.xs, dlext=so, d_dlsymun=undef, ccdlflags='-Wl,-E -Wl,-rpath,/usr/lib64/perl5/5.8.8/x86_64-linux-thread-multi/CORE'
cccdlflags='-fPIC', lddlflags='-shared -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -
m64 -mtune=generic'
Locally applied patches:
---
@INC for perl v5.8.8:
/usr/lib64/perl5/site_perl/5.8.8/x86_64-linux-thread-multi
/usr/lib64/perl5/site_perl/5.8.7/x86_64-linux-thread-multi
/usr/lib64/perl5/site_perl/5.8.6/x86_64-linux-thread-multi
/usr/lib64/perl5/site_perl/5.8.5/x86_64-linux-thread-multi
/usr/lib/perl5/site_perl/5.8.8
/usr/lib/perl5/site_perl/5.8.7
/usr/lib/perl5/site_perl/5.8.6
/usr/lib/perl5/site_perl/5.8.5
/usr/lib/perl5/site_perl
/usr/lib64/perl5/vendor_perl/5.8.8/x86_64-linux-thread-multi
/usr/lib64/perl5/vendor_perl/5.8.7/x86_64-linux-thread-multi
/usr/lib64/perl5/vendor_perl/5.8.6/x86_64-linux-thread-multi
/usr/lib64/perl5/vendor_perl/5.8.5/x86_64-linux-thread-multi
/usr/lib/perl5/vendor_perl/5.8.8
/usr/lib/perl5/vendor_perl/5.8.7
/usr/lib/perl5/vendor_perl/5.8.6
/usr/lib/perl5/vendor_perl/5.8.5
/usr/lib/perl5/vendor_perl
/usr/lib64/perl5/5.8.8/x86_64-linux-thread-multi
/usr/lib/perl5/5.8.8
.
---
Environment for perl v5.8.8:
HOME=/home/GMCH
LANG=en_GB.UTF-8
LANGUAGE (unset)
LD_LIBRARY_PATH (unset)
LOGDIR (unset)
PATH=/usr/kerberos/sbin:/usr/kerberos/bin:/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin:/root/bin
PERL_BADLANG (unset)
SHELL=/bin/bash
--
Chris Hall highwayman.com
-
[perl #51710] utf8::valid rejects characters in \x14_FFFF - \x1F_FFFF
by Chris Hall