develooper Front page | perl.perl5.porters | Postings from July 2001

Re: [PATCH @11446] UnicodeCD::charinfo

Thread Previous | Thread Next
From:
Jarkko Hietaniemi
Date:
July 23, 2001 11:45
Subject:
Re: [PATCH @11446] UnicodeCD::charinfo
Message ID:
20010723134330.A14651@chaos.wustl.edu
On Tue, Jul 24, 2001 at 01:51:32AM +0900, SADAHIRO Tomoyuki wrote:
> 
> Hello, this is a patch for /UnicodeCD\.(?:pm|t)/.
> 
> Since Unicode.txt is not sorted in dictionary order,
>  e.g
>   FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
>   10300;OLD ITALIC LETTER A;Lo;0;L;;;;;N;;;;;
> 
> then, a sorted file is necessary, isn't it?
> 
>   !lib/UnicodeCD.pm
>   !lib/UnicodeCD.t
>   +Unicode.sort
> 
> (but Unicode.sort is not attached,
>  considering its hugeness in size 
>  and easiness to prepare from Unicode.txt)

Darn.  Got me there, I am the one always warning people about the fact
that Unicode is not 16 bit anymore :-)

I think we should solve this somehow differently, different, I don't
want to introduce a new huge-ish file (that is just a differently sorted
version of an existing file) to just to do the binary search.

> ##### BEGIN PATCH #####
> diff -urN orig/lib/UnicodeCD.pm lib/UnicodeCD.pm
> --- orig/lib/UnicodeCD.pm	Sun Jul 22 08:02:50 2001
> +++ lib/UnicodeCD.pm	Tue Jul 24 00:11:02 2001
> @@ -119,14 +119,129 @@
>      return;
>  }
>  
> +sub han_charname {
> +    my $arg  = shift;
> +    my $code = _getcode($arg);
> +    croak __PACKAGE__, "::charinfo: unknown code '$arg'"
> +	unless defined $code;
> +    croak __PACKAGE__, "::han_charname: outside CJK Unified Ideographs '$arg'"
> +        unless 0x3400  <= $code && $code <= 0x4DB5  
> +            || 0x4E00  <= $code && $code <= 0x9FA5  
> +            || 0x20000 <= $code && $code <= 0x2A6D6;
> +    sprintf "CJK UNIFIED IDEOGRAPH-%04X", $code;
> +}
> +
> +my @JamoL = ( # Leading Consonant (HANGUL CHOSEONG)
> +    "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
> +    "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H",
> +  );
> +
> +my @JamoV = ( # Medium Vowel (HANGUL JUNGSEONG)
> +    "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O",
> +    "WA", "WAE", "OE", "YO", "U", "WEO", "WE", "WI",
> +    "YU", "EU", "YI", "I",
> +  );
> +
> +my @JamoT = ( # Trailing Consonant (HANGUL JONGSEONG)
> +    "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM",
> +    "LB", "LS", "LT", "LP", "LH", "M", "B", "BS",
> +    "S", "SS", "NG", "J", "C", "K", "T", "P", "H",
> +  );
> +
> +my %HangulConst = (
> +   SBase  => 0xAC00,
> +   LBase  => 0x1100,
> +   VBase  => 0x1161,
> +   TBase  => 0x11A7,
> +   LCount => 19,     # scalar @JamoL
> +   VCount => 21,     # scalar @JamoV
> +   TCount => 28,     # scalar @JamoT
> +   NCount => 588,    # VCount * TCount
> +   SCount => 11172,  # LCount * NCount
> +   Final  => 0xD7A3, # SBase -1 + SCount
> +  );
> +
> +sub hangul_charname {
> +    my $arg  = shift;
> +    my $code = _getcode($arg);
> +    croak __PACKAGE__, "::charinfo: unknown code '$arg'"
> +	unless defined $code;
> +    croak __PACKAGE__, "::hangul_charname: outside Hangul Syllables '$arg'"
> +        unless $HangulConst{SBase} <= $code && $code <= $HangulConst{Final};
> +    my $SIndex = $code - $HangulConst{SBase};
> +    my $LIndex = int( $SIndex / $HangulConst{NCount});
> +    my $VIndex = int(($SIndex % $HangulConst{NCount}) / $HangulConst{TCount});
> +    my $TIndex =      $SIndex % $HangulConst{TCount};
> +    return join('',
> +        "HANGUL SYLLABLE ",
> +        $JamoL[$LIndex],
> +        $JamoV[$VIndex],
> +        $JamoT[$TIndex],
> +      );
> +}
> +
> +sub hangul_decomp {
> +    my $arg  = shift;
> +    my $code = _getcode($arg);
> +    croak __PACKAGE__, "::charinfo: unknown code '$arg'"
> +	unless defined $code;
> +    croak __PACKAGE__, "::hangul_decomp: outside Hangul Syllables '$arg'"
> +        unless $HangulConst{SBase} <= $code && $code <= $HangulConst{Final};
> +    my $SIndex = $code - $HangulConst{SBase};
> +    my $LIndex = int( $SIndex / $HangulConst{NCount});
> +    my $VIndex = int(($SIndex % $HangulConst{NCount}) / $HangulConst{TCount});
> +    my $TIndex =      $SIndex % $HangulConst{TCount};
> +
> +    return join(" ",
> +        sprintf("%04X", $HangulConst{LBase} + $LIndex),
> +        sprintf("%04X", $HangulConst{VBase} + $VIndex),
> +      $TIndex ?
> +        sprintf("%04X", $HangulConst{TBase} + $TIndex) : (),
> +    );
> +}
> +
> +my @CharinfoRanges = (
> +# block name
> +# [ first, last, coderef to name, coderef to decompose ],
> +# CJK Ideographs Extension A
> +  [ 0x3400,   0x4DB5,   \&han_charname,   undef  ],
> +# CJK Ideographs
> +  [ 0x4E00,   0x9FA5,   \&han_charname,   undef  ],
> +# Hangul Syllables
> +  [ 0xAC00,   0xD7A3,   \&hangul_charname, \&hangul_decomp  ],
> +# Non-Private Use High Surrogates
> +  [ 0xD800,   0xDB7F,   undef,   undef  ],
> +# Private Use High Surrogates
> +  [ 0xDB80,   0xDBFF,   undef,   undef  ],
> +# Low Surrogates
> +  [ 0xDC00,   0xDFFF,   undef,   undef  ],
> +# The Private Use Area
> +  [ 0xE000,   0xF8FF,   undef,   undef  ],
> +# CJK Ideographs Extension B
> +  [ 0x20000,  0x2A6D6,  \&han_charname,   undef  ],
> +# Plane 15 Private Use Area
> +  [ 0xF0000,  0xFFFFD,  undef,   undef  ],
> +# Plane 16 Private Use Area
> +  [ 0x100000, 0x10FFFD, undef,   undef  ],
> +);
> +
>  sub charinfo {
>      my $arg  = shift;
>      my $code = _getcode($arg);
>      croak __PACKAGE__, "::charinfo: unknown code '$arg'"
>  	unless defined $code;
>      my $hexk = sprintf("%04X", $code);
> -
> -    openunicode(\$UNICODEFH, "Unicode.txt");
> +    my($rcode,$rname,$rdec);
> +    foreach my $range (@CharinfoRanges){
> +      if($range->[0] <= $code && $code <= $range->[1]){
> +        $rcode = $hexk;
> +        $rname = $range->[2] ? $range->[2]->($code) : '';
> +        $rdec  = $range->[3] ? $range->[3]->($code) : '';
> +        $hexk  = sprintf("%04X",$range->[0]); # replace by the first
> +        last;
> +      }
> +    }
> +    openunicode(\$UNICODEFH, "Unicode.sort"); # sorted
>      if (defined $UNICODEFH) {
>  	use Search::Dict;
>  	if (look($UNICODEFH, "$hexk;") >= 0) {
> @@ -143,6 +258,11 @@
>  	    if ($prop{code} eq $hexk) {
>  		$prop{block}  = charblock($code);
>  		$prop{script} = charscript($code);
> +		if(defined $rname){
> +                    $prop{code} = $rcode;
> +                    $prop{name} = $rname;
> +                    $prop{decomposition} = $rdec;
> +                }
>  		return \%prop;
>  	    }
>  	}
> diff -urN orig/lib/UnicodeCD.t lib/UnicodeCD.t
> --- orig/lib/UnicodeCD.t	Fri Jul 13 00:22:26 2001
> +++ lib/UnicodeCD.t	Tue Jul 24 01:37:04 2001
> @@ -3,7 +3,7 @@
>  use Test;
>  use strict;
>  
> -BEGIN { plan tests => 111 };
> +BEGIN { plan tests => 111 + 17 * 3};
>  
>  use UnicodeCD 'charinfo';
>  
> @@ -92,6 +92,70 @@
>  ok($charinfo->{title},          '');
>  ok($charinfo->{block},          'Hebrew');
>  ok($charinfo->{script},         'Hebrew');
> +
> +# an open syllable in Hangul
> +
> +$charinfo = charinfo(0xAC00);
> +
> +ok($charinfo->{code},           'AC00');
> +ok($charinfo->{name},           'HANGUL SYLLABLE GA');
> +ok($charinfo->{category},       'Lo');
> +ok($charinfo->{combining},      '0');
> +ok($charinfo->{bidi},           'L');
> +ok($charinfo->{decomposition},  '1100 1161');
> +ok($charinfo->{decimal},        '');
> +ok($charinfo->{digit},          '');
> +ok($charinfo->{numeric},        '');
> +ok($charinfo->{mirrored},       'N');
> +ok($charinfo->{unicode10},      '');
> +ok($charinfo->{comment},        '');
> +ok($charinfo->{upper},          '');
> +ok($charinfo->{lower},          '');
> +ok($charinfo->{title},          '');
> +ok($charinfo->{block},          'Hangul Syllables');
> +ok($charinfo->{script},         'Hangul');
> +
> +# a close syllable in Hangul
> +
> +$charinfo = charinfo(0xAE00);
> +
> +ok($charinfo->{code},           'AE00');
> +ok($charinfo->{name},           'HANGUL SYLLABLE GEUL');
> +ok($charinfo->{category},       'Lo');
> +ok($charinfo->{combining},      '0');
> +ok($charinfo->{bidi},           'L');
> +ok($charinfo->{decomposition},  '1100 1173 11AF');
> +ok($charinfo->{decimal},        '');
> +ok($charinfo->{digit},          '');
> +ok($charinfo->{numeric},        '');
> +ok($charinfo->{mirrored},       'N');
> +ok($charinfo->{unicode10},      '');
> +ok($charinfo->{comment},        '');
> +ok($charinfo->{upper},          '');
> +ok($charinfo->{lower},          '');
> +ok($charinfo->{title},          '');
> +ok($charinfo->{block},          'Hangul Syllables');
> +ok($charinfo->{script},         'Hangul');
> +
> +$charinfo = charinfo(0x1D400);
> +
> +ok($charinfo->{code},           '1D400');
> +ok($charinfo->{name},           'MATHEMATICAL BOLD CAPITAL A');
> +ok($charinfo->{category},       'Lu');
> +ok($charinfo->{combining},      '0');
> +ok($charinfo->{bidi},           'L');
> +ok($charinfo->{decomposition},  '<font> 0041');
> +ok($charinfo->{decimal},        '');
> +ok($charinfo->{digit},          '');
> +ok($charinfo->{numeric},        '');
> +ok($charinfo->{mirrored},       'N');
> +ok($charinfo->{unicode10},      '');
> +ok($charinfo->{comment},        '');
> +ok($charinfo->{upper},          '');
> +ok($charinfo->{lower},          '');
> +ok($charinfo->{title},          '');
> +ok($charinfo->{block},          'Mathematical Alphanumeric Symbols');
> +ok($charinfo->{script},         undef);
>  
>  use UnicodeCD qw(charblock charscript);
>  
> ##### END OF PATCH #####
> 
> -----
> regards,
> SADAHIRO Tomoyuki
> E-mail: bqw10602@nifty.com

-- 
$jhi++; # http://www.iki.fi/jhi/
        # There is this special biologist word we use for 'stable'.
        # It is 'dead'. -- Jack Cohen

Thread Previous | Thread Next


nntp.perl.org: Perl Programming lists via nntp and http.
Comments to Ask Bjørn Hansen at ask@perl.org | Group listing | About