Front page | perl.perl5.porters |
Postings from October 2011
[perl #101382] seeking on bytes causes broken perl strings
Thread Next
From:
tchrist1
Date:
October 14, 2011 15:57
Subject:
[perl #101382] seeking on bytes causes broken perl strings
Message ID:
rt-3.6.HEAD-31297-1318633009-1163.101382-75-0@perl.org
# New Ticket Created by tchrist1
# Please include the string: [perl #101382]
# in the subject line of all future correspondence about this issue.
# <URL: https://rt.perl.org:443/rt3/Ticket/Display.html?id=101382 >
Perl's seek and sysseek take off_t arguments and retvals, which are in
bytes. But you can call them on a stream with an encoding. Now you're
doomed, because all read-like functions in perl (getc, readline, read,
sysread) go through the encoding layer. That means you can seek to partway
through a multibyte UTF-8 or UTF-16 character (for example), and when you
next read something, you have just produced a broken UTF-8 string in Perl.
I'm pretty sure that that is supposed to be Against The Rules.
% perl -CS -E 'say "E\x{301}" x 50 for 1..100' > sample.utf8
% cat sysseek-enc-test
#!/usr/bin/env perl
# sysseek-enc-test: sysseek to a byte offset on a handle opened with a utf8 layer, sysread $count characters, then print and Dump the buffer
use v5.14;
use strict;
use warnings;
use open qw(:std :utf8);
use Fcntl qw(:seek); # provides SEEK_SET
use Devel::Peek; # provides Dump()
my $encoding = "utf8"; # same results w/ "encoding(UTF-8)"
my $mode = "< :$encoding"; # read mode with the chosen layer applied
@ARGV == 3 || die "usage: $0 utf8filename offset count";
my($filename, $offset, $count) = @ARGV;
$offset =~ /^\d+$/aa || die "offset should be whole number"; # /aa restricts \d to ASCII digits
$count =~ /^\d+$/aa || die "count should be whole number";
open(my $fh, $mode, $filename) || die "$0: can't open $mode $filename: $!\n";
my $newpos = sysseek($fh, $offset, SEEK_SET) // die "$0: sysseek failed: $!\n"; # defined-or: sysseek yields undef on failure and "0 but true" for position 0
my $sysret = sysread($fh, my $buf, $count); # read goes through the layer even though the seek offset was in bytes
$sysret == $count || die "$0: only sysread $sysret not $count chars: $!";
print "sysread worked, trying print and dump...\n";
printf "%d chars from offset %d are length %d:",
$count, $offset, length($buf);
printf "<%s>, U+%v04X\n", $buf, $buf; # %vd-style flag: prints each character's code point, joined by "."
Dump($buf); # show the scalar's internal flags and raw bytes
% perl sysseek-enc sample.utf8 2 4
sysread worked, trying print and dump...
Malformed UTF-8 character (unexpected continuation byte 0x81, with no preceding start byte) in printf at sysseek-enc line 22.
4 chars from offset 2 are length 4:<#ÉE>, U+0000.0045.0301.0045
SV = PVMG(0x3c036640) at 0x3c0657b4
REFCNT = 1
FLAGS = (PADMY,SMG,POK,pPOK,UTF8)
IV = 0
NV = 0
PV = 0x3c026548 "\201E\314\201E"\0Malformed UTF-8 character (unexpected continuation byte 0x81, with no preceding start byte) in subroutine entry at sysseek-enc line 23.
[UTF8 "\x{0}E\x{301}E"]
CUR = 5
LEN = 12
MAGIC = 0x3c049be0
MG_VIRTUAL = &PL_vtbl_utf8
MG_TYPE = PERL_MAGIC_utf8(w)
MG_LEN = 4
Notice that the data produced claim to have an initial code point of
U+0000. But that isn't so:
% head -1 sample.utf8 | uniquote -x | chop -72
E\x{301}E\x{301}E\x{301}E\x{301}E\x{301}E\x{301}E\x{301}E\x{301}E\x{301
% head -1 sample.utf8 | uniquote -b | chop -72
E\xCC\x81E\xCC\x81E\xCC\x81E\xCC\x81E\xCC\x81E\xCC\x81E\xCC\x81E\xCC\x8
% head -1 sample.utf8 | uniquote -o | chop -72
E\314\201E\314\201E\314\201E\314\201E\314\201E\314\201E\314\201E\314\20
The problem doesn't change if you :%s/sys//g the previous program:
% cat seek-enc-test
#!/usr/bin/env perl
# seek-enc-test: seek to a byte offset on a handle opened with a utf8 layer, read $count characters, then print and Dump the buffer
use v5.14;
use strict;
use warnings;
use open qw(:std :utf8);
use Fcntl qw(:seek); # provides SEEK_SET
use Devel::Peek; # provides Dump()
my $encoding = "utf8"; # same results w/ "encoding(UTF-8)"
my $mode = "< :$encoding"; # read mode with the chosen layer applied
@ARGV == 3 || die "usage: $0 utf8filename offset count";
my($filename, $offset, $count) = @ARGV;
$offset =~ /^\d+$/aa || die "offset should be whole number"; # /aa restricts \d to ASCII digits
$count =~ /^\d+$/aa || die "count should be whole number";
open(my $fh, $mode, $filename) || die "$0: can't open $mode $filename: $!\n";
seek($fh, $offset, SEEK_SET) || die "$0: seek failed: $!\n"; # seek returns 1 or a defined false value (never undef, never a position), so test truth, not definedness
my $ret = read($fh, my $buf, $count); # read goes through the layer even though the seek offset was in bytes
$ret == $count || die "$0: only read $ret not $count chars: $!";
print "read worked, trying print and dump...\n";
printf "%d chars from offset %d are length %d:",
$count, $offset, length($buf);
printf "<%s>, U+%v04X\n", $buf, $buf; # %vd-style flag: prints each character's code point, joined by "."
Dump($buf); # show the scalar's internal flags and raw bytes
% perl seek-enc sample.utf8 2 4
read worked, trying print and dump...
Malformed UTF-8 character (unexpected continuation byte 0x81, with no preceding start byte) in printf at seek-enc line 22.
4 chars from offset 2 are length 4:<#ÉE>, U+0000.0045.0301.0045
SV = PVMG(0x3c036640) at 0x3c065fb4
REFCNT = 1
FLAGS = (PADMY,SMG,POK,pPOK,UTF8)
IV = 0
NV = 0
PV = 0x3c046880 "\201E\314\201E"\0Malformed UTF-8 character (unexpected continuation byte 0x81, with no preceding start byte) in subroutine entry at seek-enc line 23.
[UTF8 "\x{0}E\x{301}E"]
CUR = 5
LEN = 12
MAGIC = 0x3c0493e0
MG_VIRTUAL = &PL_vtbl_utf8
MG_TYPE = PERL_MAGIC_utf8(w)
MG_LEN = 4
Ok, now what?
--tom
Summary of my perl5 (revision 5 version 14 subversion 0) configuration:
Platform:
osname=openbsd, osvers=4.4, archname=OpenBSD.i386-openbsd
uname='openbsd chthon 4.4 generic#0 i386 '
config_args='-des'
hint=recommended, useposix=true, d_sigaction=define
useithreads=undef, usemultiplicity=undef
useperlio=define, d_sfio=undef, uselargefiles=define, usesocks=undef
use64bitint=undef, use64bitall=undef, uselongdouble=undef
usemymalloc=y, bincompat5005=undef
Compiler:
cc='cc', ccflags ='-fno-strict-aliasing -pipe -fstack-protector -I/usr/local/include',
optimize='-O2',
cppflags='-fno-strict-aliasing -pipe -fstack-protector -I/usr/local/include'
ccversion='', gccversion='3.3.5 (propolice)', gccosandvers='openbsd4.4'
intsize=4, longsize=4, ptrsize=4, doublesize=8, byteorder=1234
d_longlong=define, longlongsize=8, d_longdbl=define, longdblsize=12
ivtype='long', ivsize=4, nvtype='double', nvsize=8, Off_t='off_t', lseeksize=8
alignbytes=4, prototype=define
Linker and Libraries:
ld='cc', ldflags ='-Wl,-E -fstack-protector -L/usr/local/lib'
libpth=/usr/local/lib /usr/lib
libs=-lgdbm -lm -lutil -lc
perllibs=-lm -lutil -lc
libc=/usr/lib/libc.so.48.0, so=so, useshrplib=false, libperl=libperl.a
gnulibc_version=''
Dynamic Linking:
dlsrc=dl_dlopen.xs, dlext=so, d_dlsymun=undef, ccdlflags=' '
cccdlflags='-DPIC -fPIC ', lddlflags='-shared -fPIC -L/usr/local/lib -fstack-protector'
Characteristics of this binary (from libperl):
Compile-time options: MYMALLOC PERL_DONT_CREATE_GVSV PERL_MALLOC_WRAP
PERL_PRESERVE_IVUV USE_LARGE_FILES USE_PERLIO
USE_PERL_ATOF
Built under openbsd
Compiled at Jun 11 2011 11:48:28
%ENV:
PERL_UNICODE="SA"
@INC:
/usr/local/lib/perl5/site_perl/5.14.0/OpenBSD.i386-openbsd
/usr/local/lib/perl5/site_perl/5.14.0
/usr/local/lib/perl5/5.14.0/OpenBSD.i386-openbsd
/usr/local/lib/perl5/5.14.0
/usr/local/lib/perl5/site_perl/5.12.3
/usr/local/lib/perl5/site_perl/5.11.3
/usr/local/lib/perl5/site_perl/5.10.1
/usr/local/lib/perl5/site_perl/5.10.0
/usr/local/lib/perl5/site_perl/5.8.7
/usr/local/lib/perl5/site_perl/5.8.0
/usr/local/lib/perl5/site_perl/5.6.0
/usr/local/lib/perl5/site_perl/5.005
/usr/local/lib/perl5/site_perl
.
Thread Next
-
[perl #101382] seeking on bytes causes broken perl strings
by tchrist1