Front page | perl.perl5.porters |
Postings from August 2011
[perl #96592] 4 pod casemapping errors: s/(\w+)/\u\L$1/g is ALWAYS WRONG
From:
tchrist1
Date:
August 8, 2011 16:17
Subject:
[perl #96592] 4 pod casemapping errors: s/(\w+)/\u\L$1/g is ALWAYS WRONG
Message ID:
rt-3.6.HEAD-31297-1312845447-1036.96592-75-0@perl.org
# New Ticket Created by tchrist1
# Please include the string: [perl #96592]
# in the subject line of all future correspondence about this issue.
# <URL: https://rt.perl.org:443/rt3/Ticket/Display.html?id=96592 >
These are all in error:
perldata.pod: s/(\w+)/\u\L$1/g; # "titlecase" words
perlfaq4.pod: $string =~ s/([\w']+)/\u\L$1/g;
perlop.pod: substr($str, -30) =~ s/\b(\p{Alpha}+)\b/\u\L$1/g;
perlretut.pod:string. The regexps C<\L\u$word> or C<\u\L$word> convert the first
They don't work because you cannot guarantee a correct titlecase
mapping if you first send the string through lowercase. There are no
roundtrip guarantees with Unicode casemapping.
Here are two places where you get an error doing it the way
the pods erroneously suggest, but there are others:
orig => İ is 0130
lc => i̇ is 0069.0307
tc => İ is 0130
tc lc => İ is 0049.0307 (wrong answer)
orig => ẞ is 1E9E
lc => ß is 00DF
tc => ẞ is 1E9E
tc lc => Ss is 0053.0073 (wrong answer)
The correct approach requires something more like
s/\b(\w)(\w*)\b/\u$1\L$2/g; # "titlecase" "words"
Because casemapA(string) is never guaranteed to be the
same as casemapA(casemapB(string)).
--tom
#!/usr/bin/env perl
use utf8;
use v5.14;
use strict;
use warnings;
use open qw(:std :encoding(UTF-8));
use charnames qw(:full);
# Sample characters fed to the case-mapping report.  Each one is spelled
# by code point; the trailing comment carries its Unicode character name.
my @chars = (
    "\N{U+0130}",    # LATIN CAPITAL LETTER I WITH DOT ABOVE
    "\N{U+03F4}",    # GREEK CAPITAL THETA SYMBOL
    "\N{U+1E9E}",    # LATIN CAPITAL LETTER SHARP S
    "\N{U+2126}",    # OHM SIGN
    "\N{U+212A}",    # KELVIN SIGN
    "\N{U+212B}",    # ANGSTROM SIGN
    "\N{U+00DF}",    # LATIN SMALL LETTER SHARP S
    "\N{U+1F80}",    # GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI
    "\N{U+FB00}",    # LATIN SMALL LIGATURE FF
    "\N{U+FB03}",    # LATIN SMALL LIGATURE FFI
    "\N{U+FB05}",    # LATIN SMALL LIGATURE LONG S T
    "\N{U+FB06}",    # LATIN SMALL LIGATURE ST
);
# Print one labelled row describing a string and its code points.
#
# Arguments:
#   $what - row label, left-justified in a 5-column field
#   $str  - the string to show; printed once as text (left-justified in a
#           3-column field) and once through the %v flag as dot-separated,
#           zero-padded hexadecimal ordinals
#   $ok   - optional status text; when supplied it is appended to the row
#           after a tab
#
# The row is written with printf to the currently selected filehandle.
sub report($$;$) {
    my ($what, $str, @status) = @_;

    my $mask = "%-5s => %-3s is %v04X";
    $mask .= "\t%s" if @status;

    # Hand printf exactly as many arguments as the format consumes: a spare
    # trailing argument draws "Redundant argument in printf" under
    # "use warnings" on perl 5.22 and later.
    printf "$mask\n", $what, $str, $str, @status;
}
# For each sample character, print its original form, its lowercase, its
# direct titlecase (the reference answer), and the titlecase obtained after
# first lowercasing or uppercasing -- each of the latter two tagged RIGHT or
# WRONG according to whether it equals the reference.
foreach my $sample (@chars) {
    my $reference = ucfirst $sample;
    my $via_lower = ucfirst(lc $sample);
    my $via_upper = ucfirst(uc $sample);

    my $lower_verdict = "WRONG";
    $lower_verdict = "RIGHT" if $reference eq $via_lower;

    my $upper_verdict = "WRONG";
    $upper_verdict = "RIGHT" if $reference eq $via_upper;

    report("orig ", $sample);
    report(" lc",   lc $sample);
    report("tc ",   $reference, "real");
    report("tc lc", $via_lower, $lower_verdict);
    report("tc uc", $via_upper, $upper_verdict);

    # Blank line between characters.
    print "\n";
}
__END__
Summary of my perl5 (revision 5 version 14 subversion 0) configuration:
Platform:
osname=openbsd, osvers=4.4, archname=OpenBSD.i386-openbsd
uname='openbsd chthon 4.4 generic#0 i386 '
config_args='-des'
hint=recommended, useposix=true, d_sigaction=define
useithreads=undef, usemultiplicity=undef
useperlio=define, d_sfio=undef, uselargefiles=define, usesocks=undef
use64bitint=undef, use64bitall=undef, uselongdouble=undef
usemymalloc=y, bincompat5005=undef
Compiler:
cc='cc', ccflags ='-fno-strict-aliasing -pipe -fstack-protector -I/usr/local/include',
optimize='-O2',
cppflags='-fno-strict-aliasing -pipe -fstack-protector -I/usr/local/include'
ccversion='', gccversion='3.3.5 (propolice)', gccosandvers='openbsd4.4'
intsize=4, longsize=4, ptrsize=4, doublesize=8, byteorder=1234
d_longlong=define, longlongsize=8, d_longdbl=define, longdblsize=12
ivtype='long', ivsize=4, nvtype='double', nvsize=8, Off_t='off_t', lseeksize=8
alignbytes=4, prototype=define
Linker and Libraries:
ld='cc', ldflags ='-Wl,-E -fstack-protector -L/usr/local/lib'
libpth=/usr/local/lib /usr/lib
libs=-lgdbm -lm -lutil -lc
perllibs=-lm -lutil -lc
libc=/usr/lib/libc.so.48.0, so=so, useshrplib=false, libperl=libperl.a
gnulibc_version=''
Dynamic Linking:
dlsrc=dl_dlopen.xs, dlext=so, d_dlsymun=undef, ccdlflags=' '
cccdlflags='-DPIC -fPIC ', lddlflags='-shared -fPIC -L/usr/local/lib -fstack-protector'
Characteristics of this binary (from libperl):
Compile-time options: MYMALLOC PERL_DONT_CREATE_GVSV PERL_MALLOC_WRAP
PERL_PRESERVE_IVUV USE_LARGE_FILES USE_PERLIO
USE_PERL_ATOF
Built under openbsd
Compiled at Jun 11 2011 11:48:28
%ENV:
PERL_UNICODE="SA"
@INC:
/usr/local/lib/perl5/site_perl/5.14.0/OpenBSD.i386-openbsd
/usr/local/lib/perl5/site_perl/5.14.0
/usr/local/lib/perl5/5.14.0/OpenBSD.i386-openbsd
/usr/local/lib/perl5/5.14.0
/usr/local/lib/perl5/site_perl/5.12.3
/usr/local/lib/perl5/site_perl/5.11.3
/usr/local/lib/perl5/site_perl/5.10.1
/usr/local/lib/perl5/site_perl/5.10.0
/usr/local/lib/perl5/site_perl/5.8.7
/usr/local/lib/perl5/site_perl/5.8.0
/usr/local/lib/perl5/site_perl/5.6.0
/usr/local/lib/perl5/site_perl/5.005
/usr/local/lib/perl5/site_perl
.
-
[perl #96592] 4 pod casemapping errors: s/(\w+)/\u\L$1/g is ALWAYS WRONG
by tchrist1