develooper Front page | perl.cvs.parrot | Postings from January 2009

[svn:parrot] r35708 - branches/strings/pseudocode

From:
simon
Date:
January 17, 2009 14:53
Subject:
[svn:parrot] r35708 - branches/strings/pseudocode
Message ID:
20090117225249.9E452CB9AE@x12.develooper.com
Author: simon
Date: Sat Jan 17 14:52:48 2009
New Revision: 35708

Modified:
   branches/strings/pseudocode/Encodings.pm
   branches/strings/pseudocode/ParrotString.pm

Log:
Half of grapheme_copy, plus variable-width grapheme iteration.


Modified: branches/strings/pseudocode/Encodings.pm
==============================================================================
--- branches/strings/pseudocode/Encodings.pm	(original)
+++ branches/strings/pseudocode/Encodings.pm	Sat Jan 17 14:52:48 2009
@@ -59,6 +59,38 @@
             $callback(char_at_byteoffset($str, $index), $parameter);
         }
     }
+    method string_grapheme_iterate ($str, $callback, $parameter) {
+        if ($str.charset !~~ ParrotCharset::Unicode) {
+            # Although why you'd store non-Unicode in UTF8 is beyond me
+            my $to_unicode = sub ($c, $subcallback) {
+                $subcallback.[0].( [ $str.charset.to_unicode($c) ], $subcallback.[1]);
+            };
+            self.string_char_iterate($str, $to_unicode, [ $callback, $parameter ]);
+        }
+        # Collect characters into graphemes in a roughly O(n) way...
+        my $index = 0;
+        while ($index < $str.bufused-1) {
+            my $c = char_at_byteoffset($str, $index);
+
+            # If we're the last character, do the callback and give up
+            if ($index >= $str.bufused) { $callback([$c], $parameter); return; }
+
+            # At this point we know there is at least one following character.
+            # How many of them are combining?
+            my @grapheme = ( $c );
+            my $next_char;
+            my $nc_index = $index;
+            my $end_of_grapheme_sequence = $index;
+            while ($nc_index <= $str.bufused and
+                   $next_char = char_at_byteoffset($str, $nc_index)
+                   and ParrotCharset::Unicode::is_combining($next_char)) {
+                   $end_of_grapheme_sequence = $nc_index;
+                   push @grapheme, @([ $next_char ]); # Work around horrible push/copy bug
+           }
+           $callback([@grapheme], $parameter);
+           $index = $end_of_grapheme_sequence;
+        }
+    }
 
     # We're not going to cache this because if it's worth caching it's
     # worth converting to a Parrot native string rather than keeping

Modified: branches/strings/pseudocode/ParrotString.pm
==============================================================================
--- branches/strings/pseudocode/ParrotString.pm	(original)
+++ branches/strings/pseudocode/ParrotString.pm	Sat Jan 17 14:52:48 2009
@@ -78,6 +78,12 @@
 
 sub Parrot_string_copy($src, $dst) { ... }
 sub Parrot_string_grapheme_copy ($src, $dst) { 
+     if ($src.encoding ~~ $dst.encoding and $src.charset ~~ $dst.charset) {
+        return Parrot_string_append($src, $dst);
+     }
+     my $append_to = sub ($g, $dst) { $dst.encoding.append_grapheme($src, $g) };
+     $src.encoding.string_grapheme_iterate($src, $append_to, $dst);
+     return $src;
 }
 sub Parrot_string_repeat($src, $reps) { ... }
 sub Parrot_string_substr($src, $offset, $len) { ... }



nntp.perl.org: Perl Programming lists via nntp and http.
Comments to Ask Bjørn Hansen at ask@perl.org | Group listing | About