develooper Front page | perl.cvs.parrot | Postings from January 2009

[svn:parrot] r35582 - in branches/strings/pseudocode: . t

From:
simon
Date:
January 15, 2009 03:25
Subject:
[svn:parrot] r35582 - in branches/strings/pseudocode: . t
Message ID:
20090115112538.D24FACB9AE@x12.develooper.com
Author: simon
Date: Thu Jan 15 03:25:38 2009
New Revision: 35582

Modified:
   branches/strings/pseudocode/ParrotString.pm
   branches/strings/pseudocode/t/create.t

Log:
Another function or two done, plus the start of UTF8 support.


Modified: branches/strings/pseudocode/ParrotString.pm
==============================================================================
--- branches/strings/pseudocode/ParrotString.pm	(original)
+++ branches/strings/pseudocode/ParrotString.pm	Thu Jan 15 03:25:38 2009
@@ -17,16 +17,55 @@
 class ParrotCharset::SJIS    {  };
 class ParrotCharset::EUCJP   {  };
 
-class ParrotEncoding::UTF8   {  };
+class ParrotEncoding::UTF8   {  
+    sub _skip($c) {
+        if $c <= 191 { return 1 }
+        if 191 < $c < 224 { return 2 }
+        return 3
+    }
+    sub char_at_byteoffset ($str, $offset is rw) { # Private helper
+        my $c = $str.buffer.[$offset++];
+        if 191 < $c < 224 {
+            # XXX Guard
+            $c = (($c +& 31) +< 6) +| ( $str.buffer.[$offset++] +& 63 );
+        } elsif $c >= 224 {
+            # XXX Guard
+            $c = (($c +& 15) +< 12) 
+                +| (( $str.buffer.[$offset++] +& 63 ) +< 6);
+            $c  +|= $str.buffer.[$offset++] +& 63;
+        }
+        return $c;
+    }
+    method string_char_iterate ($str, $callback, $parameter) {
+        my $index = 0;
+        while ($index < $str.bufused-1) {
+            $callback(char_at_byteoffset($str, $index), $parameter);
+        }
+    }
+
+    # We're not going to cache this because if it's worth caching it's
+    # worth converting to a Parrot native string rather than keeping
+    # UTF8. We'll keep it dumb and working and people can optimise later
+    method char_at_index($str, $index) { 
+        my $i = $index + 0; # work around Rakudo assignment weirdness
+        my $offset = 0;
+        while $i-- > 0 { $offset += _skip($str.buffer[$offset])  }
+        return char_at_byteoffset($str, $offset);
+    }
+};
 class ParrotEncoding::UTF16  {  };
 class ParrotEncoding::UTF32  {  };
 class ParrotEncoding::EBCDIC {  };
 class ParrotEncoding::Byte   {
-    method string_grapheme_iterate ($str, $callback, $parameter) {
-        for (0..$str.bufused-1) {
-            $callback($str.buffer.[$_], $parameter);
-        }
+    method string_char_iterate ($str, $callback, $parameter) {
+        for (0..$str.bufused-1) { $callback($str.buffer.[$_], $parameter); }
     }
+
+    method string_grapheme_iterate($str, $callback, $parameter) {
+        for (0..$str.bufused-1) { $callback($str.buffer.[$_], $parameter); }
+    }
+
+    method char_at_index($str, $index) { return $str.buffer.[$index] }
 };
 
 
@@ -58,7 +97,7 @@
     my $news = ParrotString.new();
     $news.charset  = $charset;
     $news.encoding = $encoding;
-    $news.buffer   = split("", $s);
+    $news.buffer   = map { ord $_ }, split("", $s);
     $news.bufused = $news.strlen = $len || length($s);
     return $news;
 }
@@ -75,6 +114,13 @@
     }
 }
 
+sub Parrot_string_length($str) {
+    # This code written funny to be a bit more C-like
+    my $data = 0; my $callback = sub ($char, $data is rw) { $data++ };
+    $str.encoding.string_char_iterate($str, $callback, $data);
+    return $data;
+}
+
 sub Parrot_string_grapheme_length($str) {
     # This code written funny to be a bit more C-like
     my $data = 0; my $callback = sub ($char, $data is rw) { $data++ };
@@ -84,7 +130,7 @@
 
 sub Parrot_string_byte_length($str) { return $str.strlen }
 
-sub Parrot_string_index($str, $index) { ... }
+sub Parrot_string_index($str, $index) { return $str.encoding.char_at_index($str, $index) }
 sub Parrot_string_grapheme_index($str, $index) { ... }
 sub Parrot_string_find_substr($str, $substr) { ... }
 
@@ -97,4 +143,7 @@
 sub Parrot_string_grapheme_replace($src, $offset, $len, $replacement) { ... }
 sub Parrot_string_chopn($src, $count) { ... }
 sub Parrot_string_chopn_inplace($src, $count) { ... }
-sub Parrot_string_grapheme_chopn($src, $count) { ... }
+
+sub Parrot_string_grapheme_chopn($src, $count) { 
+    return Parrot_string_replace($src, Parrot_string_grapheme_length($src) - $count, $count, undef);
+}

Modified: branches/strings/pseudocode/t/create.t
==============================================================================
--- branches/strings/pseudocode/t/create.t	(original)
+++ branches/strings/pseudocode/t/create.t	Thu Jan 15 03:25:38 2009
@@ -1,8 +1,15 @@
 use Test;
 use ParrotString;
-plan 3;
+plan 8;
 
-my $str = Parrot_string_new_init("abcdef", 4, ParrotCharset::ASCII, ParrotEncoding::Byte);
+my $str = Parrot_string_new_init("flurble", 4, ParrotCharset::ASCII, ParrotEncoding::Byte);
 ok($str.charset ~~ ParrotCharset::ASCII, "Charset set properly");
 is(Parrot_string_grapheme_length($str), 4, "String length correct");
 is(Parrot_string_byte_length($str), 4, "String length correct");
+is(Parrot_string_index($str, 1), ord("l"), "String indexing");
+
+$str = Parrot_string_new_init("\xce\xb3\xce\xb5\xce\xb9\xce\xac \xcf\x83\xce\xbf\xcf\x85 \xce\xba\xcf\x8c\xcf\x83\xce\xbc\xce\xbf\xcf\x82", 28, ParrotCharset::Unicode, ParrotEncoding::UTF8);
+ok($str.charset ~~ ParrotCharset::Unicode, "We're unicode");
+is(Parrot_string_byte_length($str), 28, "String byte length correct");
+is(Parrot_string_length($str), 15, "UTF8 char length correct");
+is(Parrot_string_index($str, 3), 0x3ac, "UTF8 string indexing");



nntp.perl.org: Perl Programming lists via nntp and http.
Comments to Ask Bjørn Hansen at ask@perl.org | Group listing | About