develooper Front page | perl.perl5.porters | Postings from May 2003

[Encode] UTF-7 Support

Thread Next
From:
Dan Kogai
Date:
May 17, 2003 08:46
Subject:
[Encode] UTF-7 Support
Message ID:
99C4504E-887E-11D7-840A-000393AE4244@dan.co.jp
Pumpkings,

The following patch adds UTF-7 support to Encode.  I decided to release 
the patch before 1.95 so
Marks and others can enjoy it.  Beware of MANIFEST since it adds 
lib/Encode/Unicode/UTF7.pm.

Enjoy!

Dan the Encode Maintainer

diff -ruNBb ext/Encode.old/Changes ext/Encode/Changes
--- ext/Encode.old/Changes      Sun May 11 05:15:30 2003
+++ ext/Encode/Changes  Sun May 18 00:12:36 2003
@@ -3,6 +3,13 @@
  # $Id: Changes,v 1.94 2003/05/10 18:13:59 dankogai Exp $
  #
  $Revision: 1.94 $ $Date: 2003/05/10 18:13:59 $
++ lib/Encode/Unicode/UTF7.pm
+! lib/Encode/Config.pm lib/Encode/Alias.pm Unicode/Unicode.pm 
t/Unicode.t
+  lib/Encode/Supported.pod
+  UTF-7 support is now added.  With this Encode now has all transcoding
+  methods in Unicode::String.
+
+1.94 2003/05/10 18:13:59
  ! lib/Encode/MIME/Header.pm
    A more sophisticated solution for double-encoding by dankogai
  ! lib/Encode/MIME/Header.pm AUTHORS
diff -ruNBb ext/Encode.old/MANIFEST ext/Encode/MANIFEST
--- ext/Encode.old/MANIFEST     Sun May 11 05:15:30 2003
+++ ext/Encode/MANIFEST Sun May 18 00:02:50 2003
@@ -51,6 +51,7 @@
  lib/Encode/MIME/Header.pm      Encode extension
  lib/Encode/PerlIO.pod  Documents for Encode & PerlIO
  lib/Encode/Supported.pod       Documents for supported encodings
+lib/Encode/Unicode/UTF7.pm Encode Extension
  t/Aliases.t    test script
  t/CJKT.t       test script
  t/Encode.t     test script
diff -ruNBb ext/Encode.old/Unicode/Unicode.pm 
ext/Encode/Unicode/Unicode.pm
--- ext/Encode.old/Unicode/Unicode.pm   Thu Feb  6 14:13:36 2003
+++ ext/Encode/Unicode/Unicode.pm       Sun May 18 00:14:47 2003
@@ -287,9 +287,13 @@
  =item L<http://www.unicode.org/glossary/> says:

  I<Character Encoding Scheme> A character encoding form plus byte
-serialization. There are seven character encoding schemes in Unicode:
+serialization. There are Seven character encoding schemes in Unicode:
  UTF-8, UTF-16, UTF-16BE, UTF-16LE, UTF-32 (UCS-4), UTF-32BE (UCS-4BE) 
and
-UTF-32LE (UCS-4LE).
+UTF-32LE (UCS-4LE), and UTF-7.
+
+Since UTF-7 is a 7-bit (re)encoded version of UTF-16BE, It is not part 
of
+Unicode's Character Encoding Scheme.  It is separately implemented in
+Encode::Unicode::UTF7.  For details see L<Encode::Unicode::UTF7>.

  =item Quick Reference

@@ -434,7 +438,7 @@

  =head1 SEE ALSO

-L<Encode>, L<http://www.unicode.org/glossary/>,
+L<Encode>, L<Encode::Unicode::UTF7>, 
L<http://www.unicode.org/glossary/>,
  L<http://www.unicode.org/unicode/faq/utf_bom.html>,

  RFC 2781 L<http://rfc.net/rfc2781.html>,
diff -ruNBb ext/Encode.old/lib/Encode/Alias.pm 
ext/Encode/lib/Encode/Alias.pm
--- ext/Encode.old/lib/Encode/Alias.pm  Thu Feb  6 14:13:36 2003
+++ ext/Encode/lib/Encode/Alias.pm      Sat May 17 23:07:38 2003
@@ -1,5 +1,6 @@
  package Encode::Alias;
  use strict;
+no warnings 'redefine';
  use Encode;
  our $VERSION = do { my @r = (q$Revision: 1.35 $ =~ /\d+/g); sprintf 
"%d."."%02d" x $#r, @r };
  our $DEBUG = 0;
@@ -128,6 +129,7 @@
      define_alias( qr/^(.*)$/ => '"\L$1"' );

      # UTF/UCS stuff
+    define_alias( qr/^UTF-?7$/i           => '"UTF-7"');
      define_alias( qr/^UCS-?2-?LE$/i       => '"UCS-2LE"' );
      define_alias( qr/^UCS-?2-?(BE)?$/i    => '"UCS-2BE"',
                    qr/^UCS-?4-?(BE|LE)?$/i => 'uc("UTF-32$1")',
diff -ruNBb ext/Encode.old/lib/Encode/Config.pm 
ext/Encode/lib/Encode/Config.pm
--- ext/Encode.old/lib/Encode/Config.pm Thu Apr 25 06:27:02 2002
+++ ext/Encode/lib/Encode/Config.pm     Sat May 17 00:49:02 2003
@@ -98,6 +98,7 @@
       'UTF-32'                 => 'Encode::Unicode',
       'UTF-32BE'               => 'Encode::Unicode',
       'UTF-32LE'               => 'Encode::Unicode',
+     'UTF-7'                  => 'Encode::Unicode::UTF7',
      );

  unless (ord("A") == 193){
diff -ruNBb ext/Encode.old/lib/Encode/Supported.pod 
ext/Encode/lib/Encode/Supported.pod
--- ext/Encode.old/lib/Encode/Supported.pod     Fri Oct 25 09:28:45 2002
+++ ext/Encode/lib/Encode/Supported.pod Sun May 18 00:11:19 2003
@@ -87,10 +87,14 @@
    UTF-32                                                      [UC]
    UTF-32BE     UCS-4                                         [UC]
    UTF-32LE                                                    [UC]
+  UTF-7                                                  [RFC2152]
    ----------------------------------------------------------------

  To find how (UCS-2|UTF-(16|32))(LE|BE)? differ from one another,
  see L<Encode::Unicode>.
+
+UTF-7 is a special encoding which "re-encodes" UTF-16BE into a 7-bit
+encoding.  It is implemeneted seperately by Encode::Unicode::UTF7.

  =head2 Encode::Byte -- Extended ASCII

diff -ruNBb ext/Encode.old/lib/Encode/Unicode/UTF7.pm 
ext/Encode/lib/Encode/Unicode/UTF7.pm
--- ext/Encode.old/lib/Encode/Unicode/UTF7.pm   Thu Jan  1 09:00:00 1970
+++ ext/Encode/lib/Encode/Unicode/UTF7.pm       Sun May 18 00:37:36 2003
@@ -0,0 +1,117 @@
+#
+# $Id: UTF7.pm,v 0.1 2003/05/16 18:06:24 dankogai Exp dankogai $
+#
+package Encode::Unicode::UTF7;
+use strict;
+no warnings 'redefine';
+use base qw(Encode::Encoding);
+__PACKAGE__->Define('UTF-7');
+our $VERSION = do { my @r = (q$Revision: 0.1 $ =~ /\d+/g); sprintf 
"%d."."%02d" x $#r, @r };
+use MIME::Base64;
+use Encode;
+
+#
+# Algorithms taken from Unicode::String by Gisle Aas
+#
+
+our $OPTIONAL_DIRECT_CHARS = 1;
+my $specials =   quotemeta "\'(),-.:?";
+$OPTIONAL_DIRECT_CHARS and
+    $specials .= quotemeta "!\"#$%&*;<=>@[]^_`{|}";
+# \s will not work because it matches U+3000 DEOGRAPHIC SPACE
+# We use \x00-\x20 instead (controls + space)
+my $re_asis =     qr/(?:[\x00-\x20A-Za-z0-9$specials])/;
+my $re_encoded = qr/(?:[^\x00-\x20A-Za-z0-9$specials])/;
+my $e_utf16 = find_encoding("UTF-16BE");
+
+sub needs_lines { 1 };
+
+sub encode($$;$){
+    my ($obj, $str, $chk) = @_;
+    my $len = length($str);
+    pos($str) = 0;
+    my $bytes = '';
+    while (pos($str) < $len){
+       if    ($str =~ /\G($re_asis+)/ogc){
+           $bytes .= $1;
+       }elsif($str =~ /\G($re_encoded+)/ogsc){
+           if ($1 eq "+"){
+               $bytes .= "+-";
+           }else{
+               my $base64 = encode_base64($e_utf16->encode($1), '');
+               $base64 =~ s/=+$//;
+               $bytes .= "+$base64-";
+           }
+       }else{
+           die "This should not happen! (pos=" . pos($str) . ")";
+       }
+    }
+    $_[1] = '' if $chk;
+    return $bytes;
+}
+
+sub decode{
+    my ($obj, $bytes, $chk) = @_;
+    my $len = length($bytes);
+    my $str = "";
+    while (pos($bytes) < $len) {
+       if    ($bytes =~ /\G([^+]+)/ogc) {
+           $str .= $1;
+       }elsif($bytes =~ /\G\+-/ogc) {
+           $str .= "+";
+       }elsif($bytes =~ /\G\+([A-Za-z0-9+\/]+)-?/ogsc) {
+           my $base64 = $1;
+           my $pad = length($base64) % 4;
+           $base64 .= "=" x (4 - $pad) if $pad;
+           $str .= $e_utf16->decode(decode_base64($base64));
+       }elsif($bytes =~ /\G\+/ogc) {
+           $^W and warn "Bad UTF7 data escape";
+           $str .= "+";
+       }else{
+           die "This should not happen " . pos($bytes);
+       }
+    }
+    $_[1] = '' if $chk;
+    return $str;
+}
+1;
+__END__
+
+=head1 NAME
+
+Encode::Unicode::UTF7 -- UTF-7 encoding
+
+=head1 SYNOPSIS
+
+    use Encode qw/encode decode/;
+    $utf7 = encode("UTF-7", $utf8);
+    $utf8 = decode("UTF-7", $ucs2);
+
+=head1 ABSTRACT
+
+This module implements UTF-7 encoding documented in RFC 2152.  UTF-7,
+as its name suggests, is a 7-bit re-encoded version of UTF-16BE.  It
+is designed to be MTA-safe and expected to be a standard way to
+exchange Unicoded mails via mails.  But with the advent of UTF-8 and
+8-bit compliant MTAs, UTF-7 is hardly ever used.
+
+UTF-7 was not supported by Encode until version 1.95 because of that.
+But Unicode::String, a module by Gisle Aas which adds Unicode supports
+to non-utf8-savvy perl did support UTF-7, the UTF-7 support was added
+so Encode can supersede Unicode::String 100%.
+
+=head1 In Practice
+
+When you want to encode Unicode for mails and web pages, however, do
+not use UTF-7 unless you are sure your recipients and readers can
+handle it.  Very few MUAs and WWW Browsers support these days (only
+Mozilla seems to support one).  For general cases, use UTF-8 for
+message body and MIME-Header for header instead.
+
+=head1 SEE ALSO
+
+L<Encode>, L<Encode::Unicode>, L<Unicode::String>
+
+RFC 2781 L<http://www.ietf.org/rfc/rfc2152.txt>
+
+=cut
diff -ruNBb ext/Encode.old/t/Unicode.t ext/Encode/t/Unicode.t
--- ext/Encode.old/t/Unicode.t  Wed May  8 05:22:50 2002
+++ ext/Encode/t/Unicode.t      Sat May 17 23:54:29 2003
@@ -18,7 +18,7 @@

  use strict;
  #use Test::More 'no_plan';
-use Test::More tests => 30;
+use Test::More tests => 37;
  use Encode qw(encode decode);

  #
@@ -103,6 +103,24 @@
      }
  };

+#
+# CJKT vs. UTF-7
+#
+
+use File::Spec;
+use File::Basename;

+my $dir =  dirname(__FILE__);
+opendir my $dh, $dir or die "$dir:$!";
+my @file = sort grep {/\.utf$/o} readdir $dh;
+closedir $dh;
+for my $file (@file){
+    my $path = File::Spec->catfile($dir, $file);
+    open my $fh, '<:utf8', $path or die "$path:$!";
+    my $content = join('' => <$fh>);
+    close $fh;
+    is(decode("UTF-7", encode("UTF-7", $content)), $content,
+       "UTF-7 RT:$file");
+}
  1;
  __END__


Thread Next


nntp.perl.org: Perl Programming lists via nntp and http.
Comments to Ask Bjørn Hansen at ask@perl.org | Group listing | About