develooper Front page | perl.perl5.porters | Postings from November 2000

[PATCH 5.7.0] Charnames :terse

Thread Next
Ilya Zakharevich
November 22, 2000 18:57
[PATCH 5.7.0] Charnames :terse
Message ID:
This is the update to which I promised a long time ago.
As usual, a lot of additional work may be needed: code the rules for
transliteration of the upper latin-1, of greek, IPA, hebrew, and the
other scripts with widespread transliteration schemes.


Unfortunately, my hope to find a transliteration-to-cyrillic scheme
which is more-or-less "standard" and "works good" turned out to be too
optimistic.  The schemes are either designed to be either not-human
readable, or work only in cyrillic-to-transliteration direction.

[Standards exists, but are unreachable.  The URLs I found are at the
 beginning of Terse/]

The scheme I concocted differs in a handful of position both from the
Lynx scheme, and from the MathReviews transliteration scheme.  I think
this scheme is the lest fragile mixture of these two, which allows
both input of all the characters, and makes the transliteration
phonetically acceptable (except for 'yi' for the "b|" letter ?).

Yours \N{Il'ya Zakharevich}

P.S.  This patch introduces a new file, some extra bookkeeping may be
      needed (I updated MANIFEST).

P.P.S.  It is hard to write the test code without a screen feedback on
	the translation.  Running in tkcon is possible, but ugly...

--- ./lib/	Mon Oct  9 15:20:18 2000
+++ ./lib/	Wed Nov 22 21:31:37 2000
@@ -1,15 +1,72 @@
 package charnames;
+# use strict;			# will just eat memory...
 use bytes ();		# for $bytes::hint_bits
 use warnings();
 $charnames::hint_bits = 0x20000;
 my $txt;
+my %terse;
+my %terse_rex;
+my %terse_short;
+my %terse_long;
 # This is not optimized in any way yet
 sub charnames {
-  $name = shift;
+  my $name = shift;
   $txt = do "unicode/" unless $txt;
   my @off;
+  if ($^H{charnames_terse}) {
+    unless (exists $terse{$^H{charnames_terse}}) {
+      my ($script, $leader, $trailer) = $^H{charnames_terse};
+      ($leader, $script) = ($1, $2) if $script =~ /^(\W*)(.*)\z/;
+      ($script, $trailer) = ($1, $2) if $script =~ /^(.*?)(\W*)\z/;
+      my $separator;
+      if ($leader eq '' and $trailer ne '') {
+	$separator = $trailer;
+	$trailer = '';
+      }
+      $leader = quotemeta $leader;
+      $trailer = quotemeta $trailer;
+      my @short_keys;
+      my @long_keys;
+      if (exists $terse_long{$script}) {
+	@short_keys = keys %{$terse_short{$script}};
+	@long_keys  = keys %{$terse_long{$script}};
+      } else {
+	my $h = require "Terse/$";
+	@short_keys = grep  /^.\z/, keys %$h;
+	@long_keys  = grep !/^.\z/, keys %$h;
+	@{$terse_long{$script}}{@long_keys}   = @$h{@long_keys};
+	@{$terse_short{$script}}{@short_keys} = @$h{@short_keys};
+      }
+      @{$terse{$^H{charnames_terse}}}{@short_keys}
+	= @{$terse_short{$script}}{@short_keys};
+      @{$terse{$^H{charnames_terse}}}{map "$leader$_$trailer", @long_keys}
+	= @{$terse_long{$script}}{@long_keys};
+      $terse{$^H{charnames_terse}}{$separator} = '' if length $separator;
+      my $char_class = '';
+      $char_class = '[' . quotemeta(join '', @short_keys, 
+				    (length($separator) == 1
+				     ? $separator : ())) . ']'
+	if @short_keys;
+      my $rex = join '|', map quotemeta $_,
+	sort {length $b <=> length $a or $a cmp $b}
+	  @long_keys, (length($separator) > 1 ? $separator : ());
+      $rex = "$leader(?:$rex)$trailer" 
+	if length($rex) + length($leader) + length($trailer);
+      $rex = "$rex|$char_class"	if $char_class and length $rex;
+      $rex = $char_class unless length $rex;
+      $rex = qr/\z./ unless length $rex; # Never match!
+      #print "`REx': `$rex'.\n`leader': `$leader',\n`trailer': `$trailer',\n";
+      $terse_rex{$^H{charnames_terse}} = qr/$rex/;
+    }
+    my $rex = $terse_rex{$^H{charnames_terse}};
+    $name =~ s/($rex)/$terse{$^H{charnames_terse}}{$1}/g;
+    return $name;
+  }
   if ($^H{charnames_full} and $txt =~ /\t\t$name$/m) {
     @off = ($-[0], $+[0]);
@@ -30,7 +87,7 @@ sub charnames {
   die "Unknown charname '$name'" unless @off;
   my $ord = hex substr $txt, $off[0] - 4, 4;
   if ($^H & $bytes::hint_bits) {	# "use bytes" in effect?
     use bytes;
@@ -47,6 +104,15 @@ sub import {
   die "`use charnames' needs explicit imports list" unless @_;
   $^H |= $charnames::hint_bits;
   $^H{charnames} = \&charnames ;
+  if (@_ and $_[0] eq ':terse') {
+    die "Cannot enable :terse in presence of other charnames flags"
+      if defined $^H{charnames_full} or defined $^H{charnames_short}
+	or defined $^H{charnames_scripts};
+    die ":terse used with other charnes flags" unless @_ <= 2;
+    die ":terse used without terse style" unless @_ == 2;
+    $^H{charnames_terse} = $_[1];
+    return;
+  }
   my %h;
   @h{@_} = (1) x @_;
   $^H{charnames_full} = delete $h{':full'};
@@ -61,6 +127,15 @@ sub import {
+sub unimport {
+  die "`no charnames' with arguments not supported" if @_ > 1;
+  delete $^H{charnames_full};
+  delete $^H{charnames_short};
+  delete $^H{charnames_scripts};
+  delete $^H{charnames};
+  delete $^H{charnames_terse};
+  $^H &= ~$charnames::hint_bits;
@@ -75,38 +150,76 @@ charnames - define character names for C
   print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n";
   use charnames ':short';
-  print "\N{greek:Sigma} is an upper-case sigma.\n";
+  print "\N{greek:Sigma} is the upper-case sigma.\n";
   use charnames qw(cyrillic greek);
-  print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n";
+  print "\N{sigma} is the Greek sigma, and \N{be} is the Cyrillic b.\n";
-Pragma C<use charnames> supports arguments C<:full>, C<:short> and
-script names.  If C<:full> is present, for expansion of
-C<\N{CHARNAME}}> string C<CHARNAME> is first looked in the list of
-standard Unicode names of chars.  If C<:short> is present, and
-C<CHARNAME> has the form C<SCRIPT:CNAME>, then C<CNAME> is looked up
-as a letter in script C<SCRIPT>.  If pragma C<use charnames> is used
-with script name arguments, then for C<\N{CHARNAME}}> the name
-C<CHARNAME> is looked up as a letter in the given scripts (in the
-specified order).
+Pragma C<use charnames> supports tags C<:full>, C<:short>, C<:terse>
+(followed by a transliteration scheme); other arguments are taken as
+script names.  When this pragma is active, C<\N{CHARNAME}}> in string literals
+is replaced governed by the following rules.
+In the presense of the tag C<:full>, the string C<CHARNAME> is first
+looked in the list of standard Unicode names of chars.  In the presense of
+the tag C<:short>, C<CHARNAME> of the form C<SCRIPT:CNAME> is
+looked up in the standard Unicode tables as a letter in script
+C<SCRIPT>.  In presence of script name arguments, the string
+C<CHARNAME> is looked up as a letter in one of the given scripts (in
+the specified order).
-For lookup of C<CHARNAME> inside a given script C<SCRIPTNAME>
-this pragma looks for the names
+When looking for C<CharName> in a script C<SCRIPTNAME> the following
+names are checked
-in the table of standard Unicode names.  If C<CHARNAME> is lowercase,
+in the table of standard Unicode names.  If C<CharName> is lowercase,
 then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant is
+In the presence of the arguments C<:terse transliteration> no other
+arguments are allowed.  Transliterable substrings are found in
+C<CHARNAME> (using a greedy algorithm), and substituted by their values.
+The C<transliteration> argument can have one of the following forms:
+(with I<LEADER>, I<SEPARATOR> and I<TRAILER> consisting of non-word
+characters).  I<WORD> is used to choose the table of transliteration
+tokens: it should be returned by the module F<Terse::WORD>.
+I<LEADER>, I<SEPARATOR>, and I<TRAILER> determine the rules for the
+substitution of multi-character tokens.  For the forms I<LEADER WORD>
+and C<LEADER WORD TRAILER> a minimal approach is taken:
+multi-character tokens are recognized only if they are preceeded by
+I<LEADER> (or surrounded by I<LEADER> and I<TRAILER> correspondingly).
+For the form I<WORD> the most greedy approach is taken: the tokens are
+recognized everywhere.  The form I<WORD SEPARATOR> defines an
+additional token for I<SEPARATOR> with 0-length expansion.
+For example, if loading a module Terse::ae_to_xe6 returns C<{ ae => chr
+0xe6}>, then
+  use charnames qw(:terse ae_to_xe6);
+  ok() if "\N{braekaer}" eq "br\x{e6}k\x{e6}r";
+  use charnames qw(:terse <ae_to_xe6>);
+  ok() if "\N{braekaer}" eq "braekaer";
+  ok() if "\N{br<ae>k<ae>r}" eq "br\x{e6}k\x{e6}r";
+  ok() if "\N{br<ae>kaer}" eq "br\x{e6}kaer";
+  use charnames qw(:terse ae_to_xe6!);
+  ok() if "\N{braekaer}" eq "br\x{e6}k\x{e6}r";
+  ok() if "\N{bra!eka!er}" eq "braekaer";
 The mechanism of translation of C<\N{...}> escapes is general and not
-hardwired into F<>.  A module can install custom
+hardwired into F<>.  Any module can install custom
 translations (inside the scope which C<use>s the module) with the
 following magic incantation:
@@ -118,8 +231,8 @@ following magic incantation:
 Here translator() is a subroutine which takes C<CHARNAME> as an
-argument, and returns text to insert into the string instead of the
-C<\N{CHARNAME}> escape.  Since the text to insert should be different
+argument, and returns the text to insert into the string instead of the
+C<\N{CHARNAME}> escape.  WHAT FOLLOWS IS OBSOLETE!  THE MODULE ITSELF SHOULD BE CHANGED TOO!  Since the text to insert should be different
 in C<bytes> mode and out of it, the function should check the current
 state of C<bytes>-flag as in:
--- ./lib/Terse/	Wed Nov 22 21:32:46 2000
+++ ./lib/Terse/	Wed Nov 22 20:52:10 2000
@@ -0,0 +1,171 @@
+# Should be covered by
+# ANSI Z39.24-1976. System for the Romanization of Slavic Cyrillic Characters.
+# I did not check the conformance yet.
+# Also:
+# ISO 9:1986.
+# Transliteration of Slavic Cyrillic characters into Latin characters.
+# Also:
+# ISO/DIS 9:1992,  ISO/TC46/SC2
+# Some practical hints:
+# Some info:
+# With qw(:terse <cyrillic>): by \N{Il'<ya> Za<kh>arevi<ch>}
+# With qw(:terse cyrillic): by \N{Il'ya Zakharevich}
+# 0x410 .. 0x42F Uppercase
+# 0x430 .. 0x44F lowercase
+@russian = qw(a b v g d e zh z i y k l m n o p
+	      r s t u f kh ts ch sh shch " yi ' `e yu ya);
+# This scheme is not universally accepted.  We merge "reasonably" ;-)
+# the schemes at hand.
+my %h;
+# uc variants first, to map ' into lc soft-sign
+$russian[$_] eq '?' or $h{"\U$russian[$_]"} = uc chr (0x410 + $_) for 0..0x1f;
+$russian[$_] eq '?' 
+  or "\u$russian[$_]" eq $russian[$_]
+  		    or $h{"\u$russian[$_]"} = uc chr (0x410 + $_) for 0..0x1f;
+$russian[$_] eq '?' or $h{$russian[$_]}     =    chr (0x430 + $_) for 0..0x1f;
+# 0x401 .. 0x40F Uppercase (with one hole)
+# 0x451 .. 0x45F lowercase (with one hole)
+@cyrilli = qw(? yo d% g% ie ds ii yy j% lj nj tsh kj ? v% dz);
+$cyrilli[$_] eq '?' or $h{"\U$cyrilli[$_]"} = uc chr (0x450 + $_) for 0..0xf;
+$cyrilli[$_] eq '?' 
+  or "\u$russian[$_]" eq $cyrilli[$_]
+  		    or $h{"\u$cyrilli[$_]"} = uc chr (0x450 + $_) for 0..0xf;
+$cyrilli[$_] eq '?' or $h{$cyrilli[$_]}     =    chr (0x400 + $_) for 0..0xf;
+# This is 100% Lynx-compatible.
+@h{qw(Y3 y3 O3 o3 F3 f3 V3 v3 C3 c3 G3 g3 AE ae)} = map chr(0x400 + hex $_),
+  qw( 62 63 6a 6b 72 73 74 75 80 81 90 91 d4 d5 );
+# We modify the following positions: our: kh ts shch yi yo tsh yy
+my $table = <<EOT;		# Stolen from lynx's src/chrtrans/def7_uni.tbl
--- ./MANIFEST~	Tue Nov 14 13:56:07 2000
+++ ./MANIFEST	Wed Nov 22 21:49:48 2000
@@ -719,6 +719,7 @@ lib/Term/	Perl module suppor
 lib/Term/		Perl module supporting termcap usage
 lib/Term/	A command completion subroutine
 lib/Term/	Stub readline library
+lib/Terse/	ASCII-2-Cyrillic transliteration rules.
 lib/		A simple framework for writing test scripts
 lib/Test/	A test harness
 lib/Text/	An abbreviation table builder

Thread Next Perl Programming lists via nntp and http.
Comments to Ask Bjørn Hansen at | Group listing | About