develooper Front page | perl.perl5.porters | Postings from October 2011

[perl #101382] seeking on bytes causes broken perl strings

Thread Next
From:
tchrist1
Date:
October 14, 2011 15:57
Subject:
[perl #101382] seeking on bytes causes broken perl strings
Message ID:
rt-3.6.HEAD-31297-1318633009-1163.101382-75-0@perl.org
# New Ticket Created by  tchrist1 
# Please include the string:  [perl #101382]
# in the subject line of all future correspondence about this issue. 
# <URL: https://rt.perl.org:443/rt3/Ticket/Display.html?id=101382 >


Perl's seek and sysseek take off_t arguments and retvals, which are in
bytes.  But you can call them on a stream with an encoding.  Now you're
doomed, because all read-like functions in perl (getc, readline, read,
sysread) go through the encoding layer.  That means you can seek to partway
through a multibyte UTF-8 or UTF-16 character (for example), and when you
next read something, you have just produced a broken UTF-8 string in Perl.
I'm pretty sure that that is supposed to be Against The Rules.

  % perl -CS -E 'say "E\x{301}" x 50 for 1..100' > sample.utf8                                        

  % cat sysseek-enc-test
    #!/usr/bin/env perl
    # sysseek-enc-test -- sysseek() to a byte offset on a handle opened with a decoding layer, sysread() from there, then print and Dump() the result
    use v5.14;
    use strict;
    use warnings;
    use open qw(:std :utf8);
    use Fcntl qw(:seek);
    use Devel::Peek;
    my $encoding = "utf8";   # same results w/ "encoding(UTF-8)"
    my $mode     = "< :$encoding";   # read-only open mode with the decoding layer appended
    @ARGV == 3 					|| die "usage: $0 utf8filename offset count";
    my($filename, $offset, $count) = @ARGV;
    $offset =~ /^\d+$/aa				|| die "offset should be whole number";   # /aa keeps \d ASCII-only
    $count =~ /^\d+$/aa				|| die "count should be whole number";
    open(my $fh, $mode, $filename) 			|| die "$0: can't open $mode $filename: $!\n";
    my $newpos = sysseek($fh, $offset, SEEK_SET) 	// die "$0: sysseek failed: $!\n";   # sysseek yields undef on failure ("0 but true" at position 0), hence //; the offset is in bytes
    my $sysret = sysread($fh, my $buf, $count);   # read $count units starting at whatever byte the seek landed on
    $sysret == $count 				|| die "$0: only sysread $sysret not $count chars: $!";
    print  "sysread worked, trying print and dump...\n";
    printf "%d chars from offset %d are length %d:",
		 $count, $offset, length($buf);
    printf "<%s>, U+%v04X\n", $buf, $buf;   # %v04X: the ordinal of each character in hex, joined by "."
    Dump($buf);   # Devel::Peek view of the scalar's flags and raw PV bytes

  % perl sysseek-enc sample.utf8 2 4
    sysread worked, trying print and dump...
    Malformed UTF-8 character (unexpected continuation byte 0x81, with no preceding start byte) in printf at sysseek-enc line 22.
    4 chars from offset 2 are length 4:<#ÉE>, U+0000.0045.0301.0045
    SV = PVMG(0x3c036640) at 0x3c0657b4
      REFCNT = 1
      FLAGS = (PADMY,SMG,POK,pPOK,UTF8)
      IV = 0
      NV = 0
      PV = 0x3c026548 "\201E\314\201E"\0Malformed UTF-8 character (unexpected continuation byte 0x81, with no preceding start byte) in subroutine entry at sysseek-enc line 23.
     [UTF8 "\x{0}E\x{301}E"]
      CUR = 5
      LEN = 12
      MAGIC = 0x3c049be0
	MG_VIRTUAL = &PL_vtbl_utf8
	MG_TYPE = PERL_MAGIC_utf8(w)
	MG_LEN = 4

Notice that the data produced claim to have an initial code point of
U+0000.  But that isn't so:

  % head -1 sample.utf8 | uniquote -x | chop -72
    E\x{301}E\x{301}E\x{301}E\x{301}E\x{301}E\x{301}E\x{301}E\x{301}E\x{301
  % head -1 sample.utf8 | uniquote -b | chop -72
    E\xCC\x81E\xCC\x81E\xCC\x81E\xCC\x81E\xCC\x81E\xCC\x81E\xCC\x81E\xCC\x8
  % head -1 sample.utf8 | uniquote -o | chop -72
    E\314\201E\314\201E\314\201E\314\201E\314\201E\314\201E\314\201E\314\20

The problem doesn't change if you :%s/sys//g the previous program:

  % cat seek-enc-test
    #!/usr/bin/env perl
    # seek-enc-test -- seek() to a byte offset on a handle opened with a decoding layer, read() from there, then print and Dump() the result
    use v5.14;
    use strict;
    use warnings;
    use open qw(:std :utf8);
    use Fcntl qw(:seek);
    use Devel::Peek;
    my $encoding = "utf8";   # same results w/ "encoding(UTF-8)"
    my $mode     = "< :$encoding";   # read-only open mode with the decoding layer appended
    @ARGV == 3 					|| die "usage: $0 utf8filename offset count";
    my($filename, $offset, $count) = @ARGV;
    $offset =~ /^\d+$/aa				|| die "offset should be whole number";   # /aa keeps \d ASCII-only
    $count =~ /^\d+$/aa				|| die "count should be whole number";
    open(my $fh, $mode, $filename) 			|| die "$0: can't open $mode $filename: $!\n";
    seek($fh, $offset, SEEK_SET) 			|| die "$0: seek failed: $!\n";   # unlike sysseek, seek returns 1 or a *defined* false value, so test truth (||), not definedness (//); the offset is in bytes
    my $ret = read($fh, my $buf, $count);   # read $count units starting at whatever byte the seek landed on
    $ret == $count 					|| die "$0: only read $ret not $count chars: $!";
    print  "read worked, trying print and dump...\n";
    printf "%d chars from offset %d are length %d:",
		 $count, $offset, length($buf);
    printf "<%s>, U+%v04X\n", $buf, $buf;   # %v04X: the ordinal of each character in hex, joined by "."
    Dump($buf);   # Devel::Peek view of the scalar's flags and raw PV bytes
    
  % perl seek-enc sample.utf8 2 4
    read worked, trying print and dump...
    Malformed UTF-8 character (unexpected continuation byte 0x81, with no preceding start byte) in printf at seek-enc line 22.
    4 chars from offset 2 are length 4:<#ÉE>, U+0000.0045.0301.0045
    SV = PVMG(0x3c036640) at 0x3c065fb4
      REFCNT = 1
      FLAGS = (PADMY,SMG,POK,pPOK,UTF8)
      IV = 0
      NV = 0
      PV = 0x3c046880 "\201E\314\201E"\0Malformed UTF-8 character (unexpected continuation byte 0x81, with no preceding start byte) in subroutine entry at seek-enc line 23.
     [UTF8 "\x{0}E\x{301}E"]
      CUR = 5
      LEN = 12
      MAGIC = 0x3c0493e0
	MG_VIRTUAL = &PL_vtbl_utf8
	MG_TYPE = PERL_MAGIC_utf8(w)
	MG_LEN = 4

Ok, now what?

--tom

Summary of my perl5 (revision 5 version 14 subversion 0) configuration:
   
  Platform:
    osname=openbsd, osvers=4.4, archname=OpenBSD.i386-openbsd
    uname='openbsd chthon 4.4 generic#0 i386 '
    config_args='-des'
    hint=recommended, useposix=true, d_sigaction=define
    useithreads=undef, usemultiplicity=undef
    useperlio=define, d_sfio=undef, uselargefiles=define, usesocks=undef
    use64bitint=undef, use64bitall=undef, uselongdouble=undef
    usemymalloc=y, bincompat5005=undef
  Compiler:
    cc='cc', ccflags ='-fno-strict-aliasing -pipe -fstack-protector -I/usr/local/include',
    optimize='-O2',
    cppflags='-fno-strict-aliasing -pipe -fstack-protector -I/usr/local/include'
    ccversion='', gccversion='3.3.5 (propolice)', gccosandvers='openbsd4.4'
    intsize=4, longsize=4, ptrsize=4, doublesize=8, byteorder=1234
    d_longlong=define, longlongsize=8, d_longdbl=define, longdblsize=12
    ivtype='long', ivsize=4, nvtype='double', nvsize=8, Off_t='off_t', lseeksize=8
    alignbytes=4, prototype=define
  Linker and Libraries:
    ld='cc', ldflags ='-Wl,-E  -fstack-protector -L/usr/local/lib'
    libpth=/usr/local/lib /usr/lib
    libs=-lgdbm -lm -lutil -lc
    perllibs=-lm -lutil -lc
    libc=/usr/lib/libc.so.48.0, so=so, useshrplib=false, libperl=libperl.a
    gnulibc_version=''
  Dynamic Linking:
    dlsrc=dl_dlopen.xs, dlext=so, d_dlsymun=undef, ccdlflags=' '
    cccdlflags='-DPIC -fPIC ', lddlflags='-shared -fPIC  -L/usr/local/lib -fstack-protector'


Characteristics of this binary (from libperl): 
  Compile-time options: MYMALLOC PERL_DONT_CREATE_GVSV PERL_MALLOC_WRAP
                        PERL_PRESERVE_IVUV USE_LARGE_FILES USE_PERLIO
                        USE_PERL_ATOF
  Built under openbsd
  Compiled at Jun 11 2011 11:48:28
  %ENV:
    PERL_UNICODE="SA"
  @INC:
    /usr/local/lib/perl5/site_perl/5.14.0/OpenBSD.i386-openbsd
    /usr/local/lib/perl5/site_perl/5.14.0
    /usr/local/lib/perl5/5.14.0/OpenBSD.i386-openbsd
    /usr/local/lib/perl5/5.14.0
    /usr/local/lib/perl5/site_perl/5.12.3
    /usr/local/lib/perl5/site_perl/5.11.3
    /usr/local/lib/perl5/site_perl/5.10.1
    /usr/local/lib/perl5/site_perl/5.10.0
    /usr/local/lib/perl5/site_perl/5.8.7
    /usr/local/lib/perl5/site_perl/5.8.0
    /usr/local/lib/perl5/site_perl/5.6.0
    /usr/local/lib/perl5/site_perl/5.005
    /usr/local/lib/perl5/site_perl
    .


Thread Next


nntp.perl.org: Perl Programming lists via nntp and http.
Comments to Ask Bjørn Hansen at ask@perl.org | Group listing | About