develooper Front page | perl.perl5.porters | Postings from January 2005

Re: [perl #33734] unpack fails on utf-8 strings

Thread Previous | Thread Next
From:
perl5-porters
Date:
January 22, 2005 11:23
Subject:
Re: [perl #33734] unpack fails on utf-8 strings
Message ID:
cstqri$a6j$1@post.home.lunix
Some points I forgot to make in the previous mail:

 - If the first character in the unpackstring is U it now behaves like
   there is an U0 before (this is consistent with the documentation
   and behaviour of pack, and needed for reversibility)
 - Only literal C0 and U0 cause a mode switch, not ones implied by
   something like: unpack("C/U", "\x00")
   (I consider that a bugfix that should also be applied to maint)
 - C0 and U0 modes are scoped to (), so in format "s(nU0v)2S", the U0 mode
   only applies to the v, NOT to the S or the n (in the second round)
   (I also consider the old behaviour here a bug. It made multiround
   groups and C/() style groups too unpredictable)

A few bugfixes and improvements relative to the previous patch:

--- pp_pack.c.1	Sat Jan 22 14:43:22 2005
+++ pp_pack.c	Sat Jan 22 16:06:04 2005
@@ -668,21 +668,32 @@
 STATIC bool 
 need_utf8(const char *pat, const char *patend)
 {
-    if (pat >= patend) return FALSE;
-    if (*pat == 'U') return TRUE;
+    bool first = TRUE;
     while (pat < patend) {
 	if (pat[0] == '#') {
 	    pat++;
 	    pat = memchr(pat, '\n', patend-pat);
 	    if (!pat) return FALSE;
 	} else if (pat[0] == 'U') {
-	    if (pat[1] == '0') return TRUE;
-	}
+	    if (first || pat[1] == '0') return TRUE;
+	} else first = FALSE;
 	pat++;
     }
     return FALSE;
 }
 
+STATIC char
+first_symbol(const char *pat, const char *patend) {
+    while (pat < patend) {
+	if (pat[0] != '#') return pat[0];
+	pat++;
+	pat = memchr(pat, '\n', patend-pat);
+	if (!pat) return 0;
+	pat++;
+    }
+    return 0;
+}
+
 /*
 =for apidoc unpack_str
 
@@ -707,7 +718,7 @@
 	flags |= FLAG_UNPACK_DO_UTF8;
     }
 
-    if (pat < patend && *pat == 'U') {
+    if (first_symbol(pat, patend) == 'U') {
 	/*
 	  if (!(flags & FLAG_UNPACK_DO_UTF8)) 
 	      Perl_croak(aTHX_ "U0 mode on a byte string");
@@ -747,7 +758,7 @@
 	flags |= FLAG_UNPACK_DO_UTF8;
     }
 
-    if (pat < patend && *pat == 'U') {
+    if (first_symbol(pat, patend) == 'U') {
 	/*
 	  if (!(flags & FLAG_UNPACK_DO_UTF8)) 
 	      Perl_croak(aTHX_ "U0 mode on a byte string");
@@ -774,7 +785,7 @@
        warnings), but these two mean we make no progress in the string and
        might enter an infinite loop */
     if (retlen == (STRLEN) -1 || retlen == 0)
-	Perl_croak(aTHX_ "malformed UTF-8 string in unpack");
+	Perl_croak(aTHX_ "Malformed UTF-8 string in unpack");
     if (val >= 0x100) Perl_croak(aTHX_ "'%c' applied to character value %"UVf,
 				 (int) datumtype, val);
     *s += retlen;
@@ -782,37 +793,58 @@
 }
 
 STATIC bool
-next_uni_uu(pTHX_ char **s, const char *end, I32 *out)
+next_uni_bytes(pTHX_ char **s, const char *end, char *buf, int buf_len)
 {
     UV val;
     STRLEN retlen;
     char *from = *s;
-    val = UNI_TO_NATIVE(utf8n_to_uvuni(*s, end-*s, &retlen, UTF8_CHECK_ONLY));
-    if (val >= 0x100 || !ISUUCHAR(val) || 
-	retlen == (STRLEN) -1 || retlen == 0) {
-	*out = 0;
-	return FALSE;
+    int bad = 0;
+    U32 flags = ckWARN(WARN_UTF8) ? 
+	UTF8_CHECK_ONLY : (UTF8_CHECK_ONLY | UTF8_ALLOW_ANY);
+    for (;buf_len > 0; buf_len--) {
+	if (from >= end) return FALSE;
+	val = UNI_TO_NATIVE(utf8n_to_uvuni(from, end-from, &retlen, flags));
+	if (retlen == (STRLEN) -1 || retlen == 0) {
+	    from += UTF8SKIP(from);
+	    bad |= 1;
+	} else from += retlen;
+	if (val >= 0x100) {
+	    bad |= 2;
+	    val &= 0xff;
+	}
+	*(unsigned char *)buf++ = val;
+    }
+    /* We have enough characters for the buffer. Did we have problems ? */
+    if (bad) {
+	if (bad & 1) {
+	    /* Rewalk the string fragment while warning */
+	    char *ptr;
+	    flags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
+	    for (ptr = *s; ptr < from; ptr += UTF8SKIP(ptr))
+		utf8n_to_uvuni(ptr, end-ptr, &retlen, flags);
+	    if (from > end) from = end;
+	}
+	if ((bad & 2) && ckWARN(WARN_UNPACK))
+	    Perl_warner(aTHX_ packWARN(WARN_UNPACK),
+			"Character(s) wrapped in unpack");
     }
-    *out = PL_uudmap[val] & 077;
     *s = from;
     return TRUE;
 }
 
 STATIC bool
-next_uni_bytes(pTHX_ char **s, const char *end, char *buf, int buf_len)
+next_uni_uu(pTHX_ char **s, const char *end, I32 *out)
 {
     UV val;
     STRLEN retlen;
     char *from = *s;
-    while (buf_len > 0) {
-	if (from >= end) return 1;
-	val = UNI_TO_NATIVE(utf8n_to_uvuni(from, end-from, &retlen,
-					   UTF8_CHECK_ONLY));
-	if (val >= 0x100 || retlen == (STRLEN) -1 || retlen == 0) return FALSE;
-	from += retlen;
-	*(unsigned char *)buf++ = val;
-	buf_len--;
+    val = UNI_TO_NATIVE(utf8n_to_uvuni(*s, end-*s, &retlen, UTF8_CHECK_ONLY));
+    if (val >= 0x100 || !ISUUCHAR(val) || 
+	retlen == (STRLEN) -1 || retlen == 0) {
+	*out = 0;
+	return FALSE;
     }
+    *out = PL_uudmap[val] & 077;
     *s = from;
     return TRUE;
 }
@@ -884,6 +916,8 @@
 	      symptr->level++;
 	      PUTBACK;
 	      while (len--) {
+		  if (utf8) symptr->flags |=  FLAG_UNPACK_PARSE_UTF8;
+		  else      symptr->flags &= ~FLAG_UNPACK_PARSE_UTF8;
 		  symptr->patptr = savsym.grpbeg;
 		  unpack_rec(symptr, s, strbeg, strend, &s);
 		  if (s == strend && savsym.howlen == e_star)
@@ -952,7 +986,7 @@
 		I32 l = 0;
 		for (hop = strbeg; hop < s; hop += UTF8SKIP(hop)) l++;
 		if (s != hop)
-		    Perl_croak(aTHX_ "malformed UTF-8 string in unpack");
+		    Perl_croak(aTHX_ "Malformed UTF-8 string in unpack");
 		ai32 = l % len;
 	    } else ai32 = (s - strbeg) % len;
 	    if (ai32 == 0) break;
@@ -985,12 +1019,12 @@
 		for (l=len, hop=s; l>0; l--, hop += UTF8SKIP(hop)) {
 		    if (hop >= strend) {
 			if (hop > strend)
-			    Perl_croak(aTHX_ "malformed UTF-8 string in unpack");
+			    Perl_croak(aTHX_ "Malformed UTF-8 string in unpack");
 			break;
 		    }
 		}
 		if (hop > strend)
-		    Perl_croak(aTHX_ "malformed UTF-8 string in unpack");
+		    Perl_croak(aTHX_ "Malformed UTF-8 string in unpack");
 		len = hop - s;
 	    } else if (len > strend - s)
 		len = strend - s;
@@ -1168,16 +1202,9 @@
 	    break;
 	  case 'C':
 	    if (len == 0) {
-		if (literal) {
+		if (literal)
 		    /* Switch to "natural" mode */
-		    if (symptr->flags & FLAG_UNPACK_DO_UTF8) {
-			symptr->flags |= FLAG_UNPACK_PARSE_UTF8;
-			utf8 = 1;
-		    } else {
-			symptr->flags &= ~FLAG_UNPACK_PARSE_UTF8;
-			utf8 = 0;
-		    }
-		}
+		    utf8 = (symptr->flags & FLAG_UNPACK_DO_UTF8) ? 1 : 0;
 		break;
 	    }
 	  uchar_checksum:
@@ -1215,14 +1242,11 @@
 	    if (len == 0) {
 		if (literal) {
 		    /* Switch to "bytes in utf-8" mode */
-		    if (symptr->flags & FLAG_UNPACK_DO_UTF8) {
-			utf8 = 0;
-			symptr->flags &= ~FLAG_UNPACK_PARSE_UTF8;
-		    } else {
+		    if (symptr->flags & FLAG_UNPACK_DO_UTF8) utf8 = 0;
+		    else
 			/* Should be impossible due to the need_utf8() test */
 			Perl_croak(aTHX_ "U0 mode on a byte string");
 		    }
-		}
 		break;
 	    }
 	    if (len > strend - s) len = strend - s;
@@ -1235,7 +1259,7 @@
 		STRLEN retlen;
 		UV auv = utf8n_to_uvuni((U8*)s, strend - s, &retlen, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANYUV);
 		if (retlen == (STRLEN) -1 || retlen == 0)
-		    Perl_croak(aTHX_ "malformed UTF-8 string in unpack");
+		    Perl_croak(aTHX_ "Malformed UTF-8 string in unpack");
 		s += retlen;
 		if (!checksum)
 		    PUSHs(sv_2mortal(newSVuv((UV) auv)));

Thread Previous | Thread Next


nntp.perl.org: Perl Programming lists via nntp and http.
Comments to Ask Bjørn Hansen at ask@perl.org | Group listing | About