Front page | perl.perl5.porters |
Postings from July 2001
Re: [PATCH @11446] UnicodeCD::charinfo
Thread Previous
|
Thread Next
From:
Jarkko Hietaniemi
Date:
July 23, 2001 11:45
Subject:
Re: [PATCH @11446] UnicodeCD::charinfo
Message ID:
20010723134330.A14651@chaos.wustl.edu
On Tue, Jul 24, 2001 at 01:51:32AM +0900, SADAHIRO Tomoyuki wrote:
>
> Hello, this is a patch for /UnicodeCD\.(?:pm|t)/.
>
> Since Unicode.txt is not sorted in dictionary order,
> e.g
> FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
> 10300;OLD ITALIC LETTER A;Lo;0;L;;;;;N;;;;;
>
> then, a sorted file is necessary, isn't it?
>
> !lib/UnicodeCD.pm
> !lib/UnicodeCD.t
> +Unicode.sort
>
> (but Unicode.sort is not attached,
> considering its hugeness in size
> and easiness to prepare from Unicode.txt)
Darn. Got me there, I am the one always warning people about the fact
that Unicode is not 16 bit anymore :-)
I think we should solve this somehow differently, different, I don't
want to introduce a new huge-ish file (that is just a differently sorted
version of an existing file) to just to do the binary search.
> ##### BEGIN PATCH #####
> diff -urN orig/lib/UnicodeCD.pm lib/UnicodeCD.pm
> --- orig/lib/UnicodeCD.pm Sun Jul 22 08:02:50 2001
> +++ lib/UnicodeCD.pm Tue Jul 24 00:11:02 2001
> @@ -119,14 +119,129 @@
> return;
> }
>
> +sub han_charname {
> + my $arg = shift;
> + my $code = _getcode($arg);
> + croak __PACKAGE__, "::charinfo: unknown code '$arg'"
> + unless defined $code;
> + croak __PACKAGE__, "::han_charname: outside CJK Unified Ideographs '$arg'"
> + unless 0x3400 <= $code && $code <= 0x4DB5
> + || 0x4E00 <= $code && $code <= 0x9FA5
> + || 0x20000 <= $code && $code <= 0x2A6D6;
> + sprintf "CJK UNIFIED IDEOGRAPH-%04X", $code;
> +}
> +
> +my @JamoL = ( # Leading Consonant (HANGUL CHOSEONG)
> + "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
> + "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H",
> + );
> +
> +my @JamoV = ( # Medium Vowel (HANGUL JUNGSEONG)
> + "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O",
> + "WA", "WAE", "OE", "YO", "U", "WEO", "WE", "WI",
> + "YU", "EU", "YI", "I",
> + );
> +
> +my @JamoT = ( # Trailing Consonant (HANGUL JONGSEONG)
> + "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM",
> + "LB", "LS", "LT", "LP", "LH", "M", "B", "BS",
> + "S", "SS", "NG", "J", "C", "K", "T", "P", "H",
> + );
> +
> +my %HangulConst = (
> + SBase => 0xAC00,
> + LBase => 0x1100,
> + VBase => 0x1161,
> + TBase => 0x11A7,
> + LCount => 19, # scalar @JamoL
> + VCount => 21, # scalar @JamoV
> + TCount => 28, # scalar @JamoT
> + NCount => 588, # VCount * TCount
> + SCount => 11172, # LCount * NCount
> + Final => 0xD7A3, # SBase -1 + SCount
> + );
> +
> +sub hangul_charname {
> + my $arg = shift;
> + my $code = _getcode($arg);
> + croak __PACKAGE__, "::charinfo: unknown code '$arg'"
> + unless defined $code;
> + croak __PACKAGE__, "::hangul_charname: outside Hangul Syllables '$arg'"
> + unless $HangulConst{SBase} <= $code && $code <= $HangulConst{Final};
> + my $SIndex = $code - $HangulConst{SBase};
> + my $LIndex = int( $SIndex / $HangulConst{NCount});
> + my $VIndex = int(($SIndex % $HangulConst{NCount}) / $HangulConst{TCount});
> + my $TIndex = $SIndex % $HangulConst{TCount};
> + return join('',
> + "HANGUL SYLLABLE ",
> + $JamoL[$LIndex],
> + $JamoV[$VIndex],
> + $JamoT[$TIndex],
> + );
> +}
> +
> +sub hangul_decomp {
> + my $arg = shift;
> + my $code = _getcode($arg);
> + croak __PACKAGE__, "::charinfo: unknown code '$arg'"
> + unless defined $code;
> + croak __PACKAGE__, "::hangul_decomp: outside Hangul Syllables '$arg'"
> + unless $HangulConst{SBase} <= $code && $code <= $HangulConst{Final};
> + my $SIndex = $code - $HangulConst{SBase};
> + my $LIndex = int( $SIndex / $HangulConst{NCount});
> + my $VIndex = int(($SIndex % $HangulConst{NCount}) / $HangulConst{TCount});
> + my $TIndex = $SIndex % $HangulConst{TCount};
> +
> + return join(" ",
> + sprintf("%04X", $HangulConst{LBase} + $LIndex),
> + sprintf("%04X", $HangulConst{VBase} + $VIndex),
> + $TIndex ?
> + sprintf("%04X", $HangulConst{TBase} + $TIndex) : (),
> + );
> +}
> +
> +my @CharinfoRanges = (
> +# block name
> +# [ first, last, coderef to name, coderef to decompose ],
> +# CJK Ideographs Extension A
> + [ 0x3400, 0x4DB5, \&han_charname, undef ],
> +# CJK Ideographs
> + [ 0x4E00, 0x9FA5, \&han_charname, undef ],
> +# Hangul Syllables
> + [ 0xAC00, 0xD7A3, \&hangul_charname, \&hangul_decomp ],
> +# Non-Private Use High Surrogates
> + [ 0xD800, 0xDB7F, undef, undef ],
> +# Private Use High Surrogates
> + [ 0xDB80, 0xDBFF, undef, undef ],
> +# Low Surrogates
> + [ 0xDC00, 0xDFFF, undef, undef ],
> +# The Private Use Area
> + [ 0xE000, 0xF8FF, undef, undef ],
> +# CJK Ideographs Extension B
> + [ 0x20000, 0x2A6D6, \&han_charname, undef ],
> +# Plane 15 Private Use Area
> + [ 0xF0000, 0xFFFFD, undef, undef ],
> +# Plane 16 Private Use Area
> + [ 0x100000, 0x10FFFD, undef, undef ],
> +);
> +
> sub charinfo {
> my $arg = shift;
> my $code = _getcode($arg);
> croak __PACKAGE__, "::charinfo: unknown code '$arg'"
> unless defined $code;
> my $hexk = sprintf("%04X", $code);
> -
> - openunicode(\$UNICODEFH, "Unicode.txt");
> + my($rcode,$rname,$rdec);
> + foreach my $range (@CharinfoRanges){
> + if($range->[0] <= $code && $code <= $range->[1]){
> + $rcode = $hexk;
> + $rname = $range->[2] ? $range->[2]->($code) : '';
> + $rdec = $range->[3] ? $range->[3]->($code) : '';
> + $hexk = sprintf("%04X",$range->[0]); # replace by the first
> + last;
> + }
> + }
> + openunicode(\$UNICODEFH, "Unicode.sort"); # sorted
> if (defined $UNICODEFH) {
> use Search::Dict;
> if (look($UNICODEFH, "$hexk;") >= 0) {
> @@ -143,6 +258,11 @@
> if ($prop{code} eq $hexk) {
> $prop{block} = charblock($code);
> $prop{script} = charscript($code);
> + if(defined $rname){
> + $prop{code} = $rcode;
> + $prop{name} = $rname;
> + $prop{decomposition} = $rdec;
> + }
> return \%prop;
> }
> }
> diff -urN orig/lib/UnicodeCD.t lib/UnicodeCD.t
> --- orig/lib/UnicodeCD.t Fri Jul 13 00:22:26 2001
> +++ lib/UnicodeCD.t Tue Jul 24 01:37:04 2001
> @@ -3,7 +3,7 @@
> use Test;
> use strict;
>
> -BEGIN { plan tests => 111 };
> +BEGIN { plan tests => 111 + 17 * 3};
>
> use UnicodeCD 'charinfo';
>
> @@ -92,6 +92,70 @@
> ok($charinfo->{title}, '');
> ok($charinfo->{block}, 'Hebrew');
> ok($charinfo->{script}, 'Hebrew');
> +
> +# an open syllable in Hangul
> +
> +$charinfo = charinfo(0xAC00);
> +
> +ok($charinfo->{code}, 'AC00');
> +ok($charinfo->{name}, 'HANGUL SYLLABLE GA');
> +ok($charinfo->{category}, 'Lo');
> +ok($charinfo->{combining}, '0');
> +ok($charinfo->{bidi}, 'L');
> +ok($charinfo->{decomposition}, '1100 1161');
> +ok($charinfo->{decimal}, '');
> +ok($charinfo->{digit}, '');
> +ok($charinfo->{numeric}, '');
> +ok($charinfo->{mirrored}, 'N');
> +ok($charinfo->{unicode10}, '');
> +ok($charinfo->{comment}, '');
> +ok($charinfo->{upper}, '');
> +ok($charinfo->{lower}, '');
> +ok($charinfo->{title}, '');
> +ok($charinfo->{block}, 'Hangul Syllables');
> +ok($charinfo->{script}, 'Hangul');
> +
> +# a close syllable in Hangul
> +
> +$charinfo = charinfo(0xAE00);
> +
> +ok($charinfo->{code}, 'AE00');
> +ok($charinfo->{name}, 'HANGUL SYLLABLE GEUL');
> +ok($charinfo->{category}, 'Lo');
> +ok($charinfo->{combining}, '0');
> +ok($charinfo->{bidi}, 'L');
> +ok($charinfo->{decomposition}, '1100 1173 11AF');
> +ok($charinfo->{decimal}, '');
> +ok($charinfo->{digit}, '');
> +ok($charinfo->{numeric}, '');
> +ok($charinfo->{mirrored}, 'N');
> +ok($charinfo->{unicode10}, '');
> +ok($charinfo->{comment}, '');
> +ok($charinfo->{upper}, '');
> +ok($charinfo->{lower}, '');
> +ok($charinfo->{title}, '');
> +ok($charinfo->{block}, 'Hangul Syllables');
> +ok($charinfo->{script}, 'Hangul');
> +
> +$charinfo = charinfo(0x1D400);
> +
> +ok($charinfo->{code}, '1D400');
> +ok($charinfo->{name}, 'MATHEMATICAL BOLD CAPITAL A');
> +ok($charinfo->{category}, 'Lu');
> +ok($charinfo->{combining}, '0');
> +ok($charinfo->{bidi}, 'L');
> +ok($charinfo->{decomposition}, '<font> 0041');
> +ok($charinfo->{decimal}, '');
> +ok($charinfo->{digit}, '');
> +ok($charinfo->{numeric}, '');
> +ok($charinfo->{mirrored}, 'N');
> +ok($charinfo->{unicode10}, '');
> +ok($charinfo->{comment}, '');
> +ok($charinfo->{upper}, '');
> +ok($charinfo->{lower}, '');
> +ok($charinfo->{title}, '');
> +ok($charinfo->{block}, 'Mathematical Alphanumeric Symbols');
> +ok($charinfo->{script}, undef);
>
> use UnicodeCD qw(charblock charscript);
>
> ##### END OF PATCH #####
>
> -----
> regards,
> SADAHIRO Tomoyuki
> E-mail: bqw10602@nifty.com
--
$jhi++; # http://www.iki.fi/jhi/
# There is this special biologist word we use for 'stable'.
# It is 'dead'. -- Jack Cohen
Thread Previous
|
Thread Next