develooper Front page | perl.perl5.porters | Postings from September 2000

how not to convert codepages for input to perl

From:
Prymmer/Kahn
Date:
September 24, 2000 11:02
Subject:
how not to convert codepages for input to perl
Message ID:
Pine.BSF.4.21.0009241100480.18956-100000@shell8.ba.best.com


In what turned out to be an unsuccessful attempt at interposing
EBCDIC <-> ASCII conversion tables into the suspiciously named
routines utf8.c:Perl_utf8_to_bytes() and utf8.c:Perl_bytes_to_utf8(),
I was curious about what impact there might be on performance 
(note that with this patch perl still won't build on OS/390 owing 
to the SWASHNEW method call failing during the first test of miniperl 
during the build).  Hence I inserted an identity map into those utf8.c 
routines for ASCIIish machines and ran ./perl TEST 12 times on linux 
with and without the modification.  The results looked something 
like this:

Initial test run with PL_a2a mod:

All tests successful.
u=0.64  s=0.34  cu=59.05  cs=16.25  scripts=256  tests=12660

Average of all twelve with PL_a2a mod:

0.340833333333332  wall clock hours
 u: mean = 0.83250  +/- 0.114743
 s: mean = 0.26750  +/- 0.073252
cu: mean = 60.00750  +/- 3.757722
cs: mean = 16.85417  +/- 2.287005


Initial test run without PL_a2a mod:

All tests successful.
u=0.7  s=0.35  cu=58.67  cs=16.32  scripts=256  tests=12660

Average of all twelve without PL_a2a mod:

0.341944444444444 wall clock hours
 u: mean = 0.82250  +/- 0.111447
 s: mean = 0.24917  +/- 0.038485
cu: mean = 60.30583  +/- 3.982606
cs: mean = 17.09083  +/- 2.662485

So there was no statistically significant difference in performance
with the identity map on an ASCII machine.  Apparently those routines 
are not called all that frequently.  Indeed, the appearance of calls to
those two routines seems somewhat asymmetric:

% grep utf8_to_bytes *.c
perlapi.c:#undef  Perl_utf8_to_bytes
perlapi.c:Perl_utf8_to_bytes(pTHXo_ U8 *s, STRLEN *len)
perlapi.c:    return ((CPerlObj*)pPerl)->Perl_utf8_to_bytes(s, len);
sv.c:        if (!utf8_to_bytes((U8*)c, &len)) {
utf8.c:=for apidoc Am|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
utf8.c:Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len)

% grep bytes_to_utf8 *.c
doop.c:	NeWsTr = bytes_to_utf8(start, &LeN);	\
doop.c:        s = bytes_to_utf8(s, &len);
perlapi.c:#undef  Perl_bytes_to_utf8
perlapi.c:Perl_bytes_to_utf8(pTHXo_ U8 *s, STRLEN *len)
perlapi.c:    return ((CPerlObj*)pPerl)->Perl_bytes_to_utf8(s, len);
sv.c:	SvPVX(sv) = (char*)bytes_to_utf8((U8*)s, &len);
sv.c:	    pv2 = (char*)bytes_to_utf8((U8*)pv2, &cur2);
sv.c:	    pv1 = (char*)bytes_to_utf8((U8*)pv1, &cur1);
sv.c:	    pv2 = (char*)bytes_to_utf8((U8*)pv2, &cur2);
sv.c:	    pv1 = (char*)bytes_to_utf8((U8*)pv1, &cur1);
utf8.c:Unlike C<bytes_to_utf8>, this over-writes the original string, and
utf8.c:=for apidoc Am|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
utf8.c:Perl_bytes_to_utf8(pTHX_ U8* s, STRLEN *len)


At any rate here was the conversion table diff that I had tried in this
test (this is *not* a patch):


diff -ru perl.7094/perl.h perl/perl.h
--- perl.7094/perl.h	Thu Sep 14 21:53:44 2000
+++ perl/perl.h	Sun Sep 24 10:26:18 2000
@@ -2340,6 +2340,74 @@
 
 #ifdef DOINIT
 #ifdef EBCDIC
+EXT unsigned char PL_a2e[] = { /* ASCII (ISO8859-1) to EBCDIC (IBM-1047) */
+    0,      1,      2,      3,      55,     45,     46,     47,
+    22,     5,      21,     11,     12,     13,     14,     15,
+    16,     17,     18,     19,     60,     61,     50,     38,
+    24,     25,     63,     39,     28,     29,     30,     31,
+    64,     90,     127,    123,    91,     108,    80,     125,
+    77,     93,     92,     78,     107,    96,     75,     97,
+    240,    241,    242,    243,    244,    245,    246,    247,
+    248,    249,    122,    94,     76,     126,    110,    111,
+    124,    193,    194,    195,    196,    197,    198,    199,
+    200,    201,    209,    210,    211,    212,    213,    214,
+    215,    216,    217,    226,    227,    228,    229,    230,
+    231,    232,    233,    173,    224,    189,    95,     109,
+    121,    129,    130,    131,    132,    133,    134,    135,
+    136,    137,    145,    146,    147,    148,    149,    150,
+    151,    152,    153,    162,    163,    164,    165,    166,
+    167,    168,    169,    192,    79,     208,    161,    7,
+    32,     33,     34,     35,     36,     37,     6,      23,
+    40,     41,     42,     43,     44,     9,      10,     27,
+    48,     49,     26,     51,     52,     53,     54,     8,
+    56,     57,     58,     59,     4,      20,     62,     255,
+    65,     170,    74,     177,    159,    178,    106,    181,
+    187,    180,    154,    138,    176,    202,    175,    188,
+    144,    143,    234,    250,    190,    160,    182,    179,
+    157,    218,    155,    139,    183,    184,    185,    171,
+    100,    101,    98,     102,    99,     103,    158,    104,
+    116,    113,    114,    115,    120,    117,    118,    119,
+    172,    105,    237,    238,    235,    239,    236,    191,
+    128,    253,    254,    251,    252,    186,    174,    89,
+    68,     69,     66,     70,     67,     71,     156,    72,
+    84,     81,     82,     83,     88,     85,     86,     87,
+    140,    73,     205,    206,    203,    207,    204,    225,
+    112,    221,    222,    219,    220,    141,    142,    223
+};
+EXT unsigned char PL_e2a[] = { /* EBCDIC (IBM-1047) to ASCII (ISO8859-1) */
+    0,      1,      2,      3,      156,    9,      134,    127,
+    151,    141,    142,    11,     12,     13,     14,     15,
+    16,     17,     18,     19,     157,    10,     8,      135,
+    24,     25,     146,    143,    28,     29,     30,     31,
+    128,    129,    130,    131,    132,    133,    23,     27,
+    136,    137,    138,    139,    140,    5,      6,      7,
+    144,    145,    22,     147,    148,    149,    150,    4,
+    152,    153,    154,    155,    20,     21,     158,    26,
+    32,     160,    226,    228,    224,    225,    227,    229,
+    231,    241,    162,    46,     60,     40,     43,     124,
+    38,     233,    234,    235,    232,    237,    238,    239,
+    236,    223,    33,     36,     42,     41,     59,     94,
+    45,     47,     194,    196,    192,    193,    195,    197,
+    199,    209,    166,    44,     37,     95,     62,     63,
+    248,    201,    202,    203,    200,    205,    206,    207,
+    204,    96,     58,     35,     64,     39,     61,     34,
+    216,    97,     98,     99,     100,    101,    102,    103,
+    104,    105,    171,    187,    240,    253,    254,    177,
+    176,    106,    107,    108,    109,    110,    111,    112,
+    113,    114,    170,    186,    230,    184,    198,    164,
+    181,    126,    115,    116,    117,    118,    119,    120,
+    121,    122,    161,    191,    208,    91,     222,    174,
+    172,    163,    165,    183,    169,    167,    182,    188,
+    189,    190,    221,    168,    175,    93,     180,    215,
+    123,    65,     66,     67,     68,     69,     70,     71,
+    72,     73,     173,    244,    246,    242,    243,    245,
+    125,    74,     75,     76,     77,     78,     79,     80,
+    81,     82,     185,    251,    252,    249,    250,    255,
+    92,     247,    83,     84,     85,     86,     87,     88,
+    89,     90,     178,    212,    214,    210,    211,    213,
+    48,     49,    50,      51,     52,     53,     54,     55,
+    56,     57,    179,     219,    220,    217,    218,    159
+};
 EXT unsigned char PL_fold[] = { /* fast EBCDIC case folding table */
     0,      1,      2,      3,      4,      5,      6,      7,
     8,      9,      10,     11,     12,     13,     14,     15,
@@ -2375,6 +2443,40 @@
     248,    249,    250,    251,    252,    253,    254,    255
 };
 #else   /* ascii rather than ebcdic */
+EXT unsigned char PL_a2a[] = { /* ASCII to ASCII (ISO8859-1) identity map */
+        0,      1,      2,      3,      4,      5,      6,      7,
+        8,      9,      10,     11,     12,     13,     14,     15,
+        16,     17,     18,     19,     20,     21,     22,     23,
+        24,     25,     26,     27,     28,     29,     30,     31,
+        32,     33,     34,     35,     36,     37,     38,     39,
+        40,     41,     42,     43,     44,     45,     46,     47,
+        48,     49,     50,     51,     52,     53,     54,     55,
+        56,     57,     58,     59,     60,     61,     62,     63,
+        64,     65,     66,     67,     68,     69,     70,     71,
+        72,     73,     74,     75,     76,     77,     78,     79,
+        80,     81,     82,     83,     84,     85,     86,     87,
+        88,     89,     90,     91,     92,     93,     94,     95,
+        96,     97,     98,     99,     100,    101,    102,    103,
+        104,    105,    106,    107,    108,    109,    110,    111,
+        112,    113,    114,    115,    116,    117,    118,    119,
+        120,    121,    122,    123,    124,    125,    126,    127,
+        128,    129,    130,    131,    132,    133,    134,    135,
+        136,    137,    138,    139,    140,    141,    142,    143,
+        144,    145,    146,    147,    148,    149,    150,    151,
+        152,    153,    154,    155,    156,    157,    158,    159,
+        160,    161,    162,    163,    164,    165,    166,    167,
+        168,    169,    170,    171,    172,    173,    174,    175,
+        176,    177,    178,    179,    180,    181,    182,    183,
+        184,    185,    186,    187,    188,    189,    190,    191,
+        192,    193,    194,    195,    196,    197,    198,    199,
+        200,    201,    202,    203,    204,    205,    206,    207,
+        208,    209,    210,    211,    212,    213,    214,    215,
+        216,    217,    218,    219,    220,    221,    222,    223,
+        224,    225,    226,    227,    228,    229,    230,    231,
+        232,    233,    234,    235,    236,    237,    238,    239,
+        240,    241,    242,    243,    244,    245,    246,    247,
+        248,    249,    250,    251,    252,    253,    254,    255,
+};
 EXTCONST  unsigned char PL_fold[] = {
 	0,	1,	2,	3,	4,	5,	6,	7,
 	8,	9,	10,	11,	12,	13,	14,	15,
@@ -2412,6 +2514,7 @@
 #endif  /* !EBCDIC */
 #else
 EXTCONST unsigned char PL_fold[];
+EXT unsigned char PL_a2a[];
 #endif
 
 #ifdef DOINIT
diff -ru perl.7094/utf8.c perl/utf8.c
--- perl.7094/utf8.c	Thu Sep 14 21:55:22 2000
+++ perl/utf8.c	Sun Sep 24 10:26:11 2000
@@ -298,7 +298,13 @@
 
     /* ensure valid UTF8 and chars < 256 before updating string */
     while (s < send) {
-	U8 c = *s++;
+	U8 c;
+#ifdef EBCDIC
+        *s = PL_e2a[*s];
+#else
+        *s = PL_a2a[*s];
+#endif
+	c = *s++;
         if (c >= 0x80 &&
 	    ( (s >= send) || ((*s++ & 0xc0) != 0x80) || ((c & 0xfe) != 0xc2))) {
 	    *len = -1;
@@ -343,6 +349,11 @@
     dst = d;
 
     while (s < send) {
+#ifdef EBCDIC
+        *s = PL_a2e[*s];
+#else
+        *s = PL_a2a[*s];
+#endif
         if (*s < 0x80)
             *d++ = *s++;
         else {
End of diff (not a patch!)

Does anyone have any suggestions on where I might insert conversion tables
such as PL_a2e and PL_e2a a bit more profitably?  Thanks.

Peter Prymmer








nntp.perl.org: Perl Programming lists via nntp and http.
Comments to Ask Bjørn Hansen at ask@perl.org | Group listing | About