develooper Front page | perl.perl5.porters | Postings from June 2003

[Encode] pre-1.97 patches

Thread Next
From:
Dan Kogai
Date:
June 27, 2003 09:21
Subject:
[Encode] pre-1.97 patches
Message ID:
56D5BFEE-A8BB-11D7-9092-000393AE4244@dan.co.jp
Porters,

   The following is the pre-1.97 patch for Encode.  It fixes one bug and 
adds one feature.

$Revision: 1.96 $ $Date: 2003/06/18 09:29:02 $
! lib/Encode/Guess.pm
   $Encode::Guess::NoUTFAutoGuess is added so you can turn off
   automatic  utf(8|16|32) guessing -- originally by Autrijus
   Message-Id: <20030626162731.GA2077@not.autrijus.org>
! Encode.pm
   Addressed the following;
   Subject: [perl #22835] FB_QUIET doesn't work with Encode::encode
   Message-Id: <rt-22835-59975.6.8650775354304@rt.perl.org>


Dan the Encode Maintainer
===================================================================
RCS file: Changes,v
retrieving revision 1.96
diff -u -r1.96 Changes
--- Changes     2003/06/18 09:29:02     1.96
+++ Changes     2003/06/27 16:17:27
@@ -1,8 +1,18 @@
  # Revision history for Perl extension Encode.
  #
-# $Id: Changes,v 1.96 2003/06/18 09:29:02 dankogai Exp dankogai $
+# $Id: Changes,v 1.96 2003/06/18 09:29:02 dankogai Exp $
  #
  $Revision: 1.96 $ $Date: 2003/06/18 09:29:02 $
+! lib/Encode/Guess.pm
+  $Encode::Guess::NoUTFAutoGuess is added so you can turn off
+  automatic  utf(8|16|32) guessing -- originally by Autrijus
+  Message-Id: <20030626162731.GA2077@not.autrijus.org>
+! Encode.pm
+  Addressed the following;
+  Subject: [perl #22835] FB_QUIET doesn't work with Encode::encode
+  Message-Id: <rt-22835-59975.6.8650775354304@rt.perl.org>
+
+1.96 2003/06/18 09:29:02
  ! lib/Encode/JP/JP.pm t/guess.t
    m/(...)/ in void context then $1 is considered a Bad Thing
    Message-Id: <B5AB34D0-A019-11D7-AF03-000393AE4244@dan.co.jp>
===================================================================
RCS file: Encode.pm,v
retrieving revision 1.96
diff -u -r1.96 Encode.pm
--- Encode.pm   2003/06/18 09:29:02     1.96
+++ Encode.pm   2003/06/27 15:40:48
@@ -147,7 +147,7 @@
         Carp::croak("Unknown encoding '$name'");
      }
      my $octets = $enc->encode($string,$check);
-    return undef if ($check && length($string));
+    $_[1] = $string if $check;
      return $octets;
  }

===================================================================
RCS file: lib/Encode/Guess.pm,v
retrieving revision 1.8
diff -u -r1.8 lib/Encode/Guess.pm
--- lib/Encode/Guess.pm 2003/04/24 17:43:16     1.8
+++ lib/Encode/Guess.pm 2003/06/27 16:13:12
@@ -18,6 +18,7 @@
  sub perlio_ok { 0 }

  our @EXPORT = qw(guess_encoding);
+our $NoUTFAutoGuess = 0;

  sub import { # Exporter not used so we do it on our own
      my $callpkg = caller;
@@ -70,75 +71,80 @@
      return unless defined $octet and length $octet;

      # cheat 0: utf8 flag;
-    Encode::is_utf8($octet) and return find_encoding('utf8');
+    if ( Encode::is_utf8($octet) ) {
+       return find_encoding('utf8') unless $NoUTFAutoGuess;
+       Encode::_utf8_off($octet);
+    }
      # cheat 1: BOM
      use Encode::Unicode;
-    my $BOM = unpack('n', $octet);
-    return find_encoding('UTF-16')
-       if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe));
-    $BOM = unpack('N', $octet);
-    return find_encoding('UTF-32')
-       if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe0000));
+    unless ($NoUTFAutoGuess) {
+       my $BOM = unpack('n', $octet);
+       return find_encoding('UTF-16')
+           if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe));
+       $BOM = unpack('N', $octet);
+       return find_encoding('UTF-32')
+           if (defined $BOM and ($BOM == 0xFeFF or $BOM == 
0xFFFe0000));
+       if ($octet =~ /\x00/o){ # if \x00 found, we assume 
UTF-(16|32)(BE|LE)
+           my $utf;
+           my ($be, $le) = (0, 0);
+           if ($octet =~ /\x00\x00/o){ # UTF-32(BE|LE) assumed
+               $utf = "UTF-32";
+               for my $char (unpack('N*', $octet)){
+                   $char & 0x0000ffff and $be++;
+                   $char & 0xffff0000 and $le++;
+               }
+           }else{ # UTF-16(BE|LE) assumed
+               $utf = "UTF-16";
+               for my $char (unpack('n*', $octet)){
+                   $char & 0x00ff and $be++;
+                   $char & 0xff00 and $le++;
+               }
+           }
+           $DEBUG and warn "$utf, be == $be, le == $le";
+           $be == $le
+               and return
+                   "Encodings ambiguous between $utf BE and LE ($be, 
$le)";
+           $utf .= ($be > $le) ? 'BE' : 'LE';
+           return find_encoding($utf);
+       }
+    }
      my %try =  %{$obj->{Suspects}};
      for my $c (@_){
         my $e = find_encoding($c) or die "Unknown encoding: $c";
         $try{$e->name} = $e;
         $DEBUG and warn "Added: ", $e->name;
      }
-    if ($octet =~ /\x00/o){ # if \x00 found, we assume 
UTF-(16|32)(BE|LE)
-       my $utf;
-       my ($be, $le) = (0, 0);
-       if ($octet =~ /\x00\x00/o){ # UTF-32(BE|LE) assumed
-           $utf = "UTF-32";
-           for my $char (unpack('N*', $octet)){
-               $char & 0x0000ffff and $be++;
-               $char & 0xffff0000 and $le++;
-           }
-       }else{ # UTF-16(BE|LE) assumed
-           $utf = "UTF-16";
-           for my $char (unpack('n*', $octet)){
-               $char & 0x00ff and $be++;
-               $char & 0xff00 and $le++;
+    my $nline = 1;
+    for my $line (split /\r\n?|\n/, $octet){
+       # cheat 2 -- \e in the string
+       if ($line =~ /\e/o){
+           my @keys = keys %try;
+           delete @try{qw/utf8 ascii/};
+           for my $k (@keys){
+               ref($try{$k}) eq 'Encode::XS' and delete $try{$k};
             }
         }
-       $DEBUG and warn "$utf, be == $be, le == $le";
-       $be == $le
-           and return "Encodings ambiguous between $utf BE and LE 
($be, $le)";
-       $utf .= ($be > $le) ? 'BE' : 'LE';
-       return find_encoding($utf);
-    }else{
-       my $nline = 1;
-       for my $line (split /\r\n?|\n/, $octet){
-           # cheat 2 -- \e in the string
-           if ($line =~ /\e/o){
-               my @keys = keys %try;
-               delete @try{qw/utf8 ascii/};
-               for my $k (@keys){
-                   ref($try{$k}) eq 'Encode::XS' and delete $try{$k};
-               }
-           }
-           my %ok = %try;
-           # warn join(",", keys %try);
-           for my $k (keys %try){
-               my $scratch = $line;
-               $try{$k}->decode($scratch, FB_QUIET);
-               if ($scratch eq ''){
-                   $DEBUG and warn sprintf("%4d:%-24s ok\n", $nline, 
$k);
-               }else{
-                   use bytes ();
-                   $DEBUG and
-                       warn sprintf("%4d:%-24s not ok; %d bytes 
left\n",
-                                    $nline, $k, 
bytes::length($scratch));
-                   delete $ok{$k};
-               }
+       my %ok = %try;
+       # warn join(",", keys %try);
+       for my $k (keys %try){
+           my $scratch = $line;
+           $try{$k}->decode($scratch, FB_QUIET);
+           if ($scratch eq ''){
+               $DEBUG and warn sprintf("%4d:%-24s ok\n", $nline, $k);
+           }else{
+               use bytes ();
+               $DEBUG and
+                   warn sprintf("%4d:%-24s not ok; %d bytes left\n",
+                                $nline, $k, bytes::length($scratch));
+               delete $ok{$k};
             }
-           %ok or return "No appropriate encodings found!";
-           if (scalar(keys(%ok)) == 1){
-               my ($retval) = values(%ok);
-               return $retval;
-           }
-           %try = %ok; $nline++;
         }
+       %ok or return "No appropriate encodings found!";
+       if (scalar(keys(%ok)) == 1){
+           my ($retval) = values(%ok);
+           return $retval;
+       }
+       %try = %ok; $nline++;
      }
      $try{ascii} or
         return  "Encodings too ambiguous: ", join(" or ", keys %try);
@@ -188,6 +194,10 @@

   # tries all major Japanese Encodings as well
    use Encode::Guess qw/euc-jp shiftjis 7bit-jis/;
+
+If the C<$Encode::Guess::NoUTFAutoGuess> variable is set to a true
+value, no heuristics will be applied to UTF8/16/32, and the result
+will be limited to the suspects and C<ascii>.

  =over 4


Thread Next


nntp.perl.org: Perl Programming lists via nntp and http.
Comments to Ask Bjørn Hansen at ask@perl.org | Group listing | About