develooper Front page | perl.perl5.porters | Postings from February 2007

Re: unicode regex performance

From:
demerphq
Date:
February 10, 2007 08:06
Subject:
Re: unicode regex performance
Message ID:
9b18b3110702100806v4e1e4ffcr729c78b8429652b0@mail.gmail.com
On 2/10/07, Gerard Goossen <gerard@tty.nl> wrote:
> On Thu, Feb 08, 2007 at 09:06:31AM +0100, Dr.Ruud wrote:
> > Gerard Goossen schreef:
> >
> > > But for example when doing a regex for a
> > > fixed string like m/aap/ in UTF-8 you just have to do a memory search
> > > for the bytes representing 'aap' in UTF-32 you can do the same, but
> > > you have much more memory to search through.
> >
> > A string encoded such that it uses the word size of the platform, will
> > actually search faster. It is an extra step to get to the bytes, which
> > on most hardware takes considerable time.
>
> Nonsence. benchmark** of the most basic search:
> NONE*:  1.79s
> UTF-8:  1.86s
> UTF-16: 2.10s
> UTF-32: 2.90s
>
> Which is worse then lineair in memory usage (probably due to cache
> misses).
> And this is without optimalizations, like if you are searching for a
> longer patterns, you can do match multiple codepoints using one integer
> comparison.

Er, I do not understand what you think this demonstrates.

If i understand it correctly, you are checking to see how fast your
computer is doing reads of a certain size. BUT you arent mutliplying
the timing by the amount of data processed to normalize the results

So, if it takes 1.86s to read 1 byte, then it takes 7.44 to read 4 bytes.
If it takes 2.10s to read 2 bytes then it takes 4.20s to read 4 bytes.
Yet it takes only 2.9 seconds to read 4 bytes.

How do you reckon that this benchmark shows byte reads are faster than
U32 reads?

Furthermore this type of benchmark is totally simplistic. We ARENT
talking about reading one byte versus reading one word. We are talking
about reading up to 6 bytes, dynamically, versus reading one word
statically. So lets multiple the byte lookup time by the worst case,
1.86 * 6 == 11.6, so thats 8.7 seconds more than reading the same unit
of data out of utf32. Thats 4 times slower, without considering the
effect of the logic required to determine how much data need be read.

Currently in Perl in order to read a utf8 to get U32 codepoint you
have to use the following logic:

UV
Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
{
    return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
			  ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
}

UV
Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
U32 flags)
{
    const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
    return UNI_TO_NATIVE(uv);
}

UV
Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
{
    dVAR;
    const U8 * const s0 = s;
    UV uv = *s, ouv = 0;
    STRLEN len = 1;
    const bool dowarn = ckWARN_d(WARN_UTF8);
    const UV startbyte = *s;
    STRLEN expectlen = 0;
    U32 warning = 0;

/* This list is a superset of the UTF8_ALLOW_XXX. */

#define UTF8_WARN_EMPTY				 1
#define UTF8_WARN_CONTINUATION			 2
#define UTF8_WARN_NON_CONTINUATION	 	 3
#define UTF8_WARN_FE_FF				 4
#define UTF8_WARN_SHORT				 5
#define UTF8_WARN_OVERFLOW			 6
#define UTF8_WARN_SURROGATE			 7
#define UTF8_WARN_LONG				 8
#define UTF8_WARN_FFFF				 9 /* Also FFFE. */

    if (curlen == 0 &&
	!(flags & UTF8_ALLOW_EMPTY)) {
	warning = UTF8_WARN_EMPTY;
	goto malformed;
    }

    if (UTF8_IS_INVARIANT(uv)) {
	if (retlen)
	    *retlen = 1;
	return (UV) (NATIVE_TO_UTF(*s));
    }

    if (UTF8_IS_CONTINUATION(uv) &&
	!(flags & UTF8_ALLOW_CONTINUATION)) {
	warning = UTF8_WARN_CONTINUATION;
	goto malformed;
    }

    if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
	!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
	warning = UTF8_WARN_NON_CONTINUATION;
	goto malformed;
    }

#ifdef EBCDIC
    uv = NATIVE_TO_UTF(uv);
#else
    if ((uv == 0xfe || uv == 0xff) &&
	!(flags & UTF8_ALLOW_FE_FF)) {
	warning = UTF8_WARN_FE_FF;
	goto malformed;
    }
#endif

    if      (!(uv & 0x20))	{ len =  2; uv &= 0x1f; }
    else if (!(uv & 0x10))	{ len =  3; uv &= 0x0f; }
    else if (!(uv & 0x08))	{ len =  4; uv &= 0x07; }
    else if (!(uv & 0x04))	{ len =  5; uv &= 0x03; }
#ifdef EBCDIC
    else if (!(uv & 0x02))	{ len =  6; uv &= 0x01; }
    else			{ len =  7; uv &= 0x01; }
#else
    else if (!(uv & 0x02))	{ len =  6; uv &= 0x01; }
    else if (!(uv & 0x01))	{ len =  7; uv = 0; }
    else			{ len = 13; uv = 0; } /* whoa! */
#endif

    if (retlen)
	*retlen = len;

    expectlen = len;

    if ((curlen < expectlen) &&
	!(flags & UTF8_ALLOW_SHORT)) {
	warning = UTF8_WARN_SHORT;
	goto malformed;
    }

    len--;
    s++;
    ouv = uv;

    while (len--) {
	if (!UTF8_IS_CONTINUATION(*s) &&
	    !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
	    s--;
	    warning = UTF8_WARN_NON_CONTINUATION;
	    goto malformed;
	}
	else
	    uv = UTF8_ACCUMULATE(uv, *s);
	if (!(uv > ouv)) {
	    /* These cannot be allowed. */
	    if (uv == ouv) {
		if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
		    warning = UTF8_WARN_LONG;
		    goto malformed;
		}
	    }
	    else { /* uv < ouv */
		/* This cannot be allowed. */
		warning = UTF8_WARN_OVERFLOW;
		goto malformed;
	    }
	}
	s++;
	ouv = uv;
    }

    if (UNICODE_IS_SURROGATE(uv) &&
	!(flags & UTF8_ALLOW_SURROGATE)) {
	warning = UTF8_WARN_SURROGATE;
	goto malformed;
    } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
	       !(flags & UTF8_ALLOW_LONG)) {
	warning = UTF8_WARN_LONG;
	goto malformed;
    } else if (UNICODE_IS_ILLEGAL(uv) &&
	       !(flags & UTF8_ALLOW_FFFF)) {
	warning = UTF8_WARN_FFFF;
	goto malformed;
    }

    return uv;

malformed:

    if (flags & UTF8_CHECK_ONLY) {
	if (retlen)
	    *retlen = ((STRLEN) -1);
	return 0;
    }

    if (dowarn) {
	SV* const sv = sv_2mortal(newSVpvs("Malformed UTF-8 character "));

	switch (warning) {
	case 0: /* Intentionally empty. */ break;
	case UTF8_WARN_EMPTY:
	    sv_catpvs(sv, "(empty string)");
	    break;
	case UTF8_WARN_CONTINUATION:
	    Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte
0x%02"UVxf", with no preceding start byte)", uv);
	    break;
	case UTF8_WARN_NON_CONTINUATION:
	    if (s == s0)
	        Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte
0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
                           (UV)s[1], startbyte);
	    else {
		const int len = (int)(s-s0);
	        Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte
0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d
bytes)",
                           (UV)s[1], len, len > 1 ? "s" : "",
startbyte, (int)expectlen);
	    }

	    break;
	case UTF8_WARN_FE_FF:
	    Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
	    break;
	case UTF8_WARN_SHORT:
	    Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte
0x%02"UVxf")",
                           (int)curlen, curlen == 1 ? "" : "s",
(int)expectlen, startbyte);
	    expectlen = curlen;		/* distance for caller to skip */
	    break;
	case UTF8_WARN_OVERFLOW:
	    Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x,
after start byte 0x%02"UVxf")",
                           ouv, *s, startbyte);
	    break;
	case UTF8_WARN_SURROGATE:
	    Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
	    break;
	case UTF8_WARN_LONG:
	    Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte
0x%02"UVxf")",
			   (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
	    break;
	case UTF8_WARN_FFFF:
	    Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
	    break;
	default:
	    sv_catpvs(sv, "(unknown reason)");
	    break;
	}
	
	if (warning) {
	    const char * const s = SvPVX_const(sv);

	    if (PL_op)
		Perl_warner(aTHX_ packWARN(WARN_UTF8),
			    "%s in %s", s,  OP_DESC(PL_op));
	    else
		Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
	}
    }

    if (retlen)
	*retlen = expectlen ? expectlen : len;

    return 0;
}

Im sorry, but i see no way that utf8 could be as fast as utf32.

Yves

-- 
perl -Mre=debug -e "/just|another|perl|hacker/"



nntp.perl.org: Perl Programming lists via nntp and http.
Comments to Ask Bjørn Hansen at ask@perl.org | Group listing | About