develooper Front page | perl.perl5.porters | Postings from February 2021

Unicode::Normalize NFKD

Thread Next
From:
H.Merijn Brand
Date:
February 18, 2021 10:15
Subject:
Unicode::Normalize NFKD
Message ID:
20210218111500.7253d435@pc09.procura.nl
I am *not* a Unicode expert, so maybe this boils down to "WHY?" instead
of "THIS MIGHT BE A BUG".

--8<---
use 5.18.2;
use warnings;

use Data::Peek;
use Unicode::Normalize qw( normalize );
use Encode             qw( encode decode );
use charnames          qw(:full);

# dp ($tag, $dta)
#
# Print one report line for $dta, labelled with $tag:
#   - the DPeek () dump of the value, with its first whitespace-delimited
#     token padded out to 26 columns so the following part lines up,
#   - and, only when $dta carries perl's internal UTF8 flag, the Unicode
#     name of every character in it, joined by " + ".
# The line is always terminated with a newline.
sub dp {
    my ($tag, $dta) = @_;

    # Work on a copy of the dump; \K keeps the matched token and the
    # replacement only appends the padding behind it.
    (my $peek = DPeek ($dta)) =~ s{^(\S+)\K}{" " x (26 - length $1)}e;
    printf "%-6s: %-52s", $tag, $peek;

    if (utf8::is_utf8 ($dta)) {
        my @names = map { charnames::viacode (ord $_) } split // => $dta;
        print join " + " => @names;
        }

    say "";
    } # dp

# Unbuffered output on the currently selected handle.
$| = 1;

# UTF-8 encoded byte sequences to examine.
my @samples = (
    "\xe1\xb8\xaf",             # U+1E2F
    "\xc3\xaf\xcc\x81",         # U+00EF U+0301
    "\xc3\xad\xcc\x88",         # U+00ED U+0308
    "\x69\xcc\x81\xcc\x88",     # "i"    U+0301 U+0308
    "\x69\xcc\x88\xcc\x81",     # "i"    U+0308 U+0301
    );

# Normalization forms to apply, in reporting order.
my @forms = qw( D C KD KC );

# For every sample: show the raw bytes, the decoded string, and the
# result of each normalization form, followed by a blank separator line.
for my $bytes (@samples) {
    my $u = decode ("utf-8", $bytes);

    dp ("Bytes", $bytes);
    dp ("UTF-8", $u);
    foreach my $form (@forms) {
        dp ("NF$form", normalize ($form, $u));
        }

    say "";
    }
-->8---

->

--8<---
Bytes : PV("\341\270\257"\0)
UTF-8 : PV("\341\270\257"\0)       [UTF8 "\x{1e2f}"]        LATIN SMALL LETTER I WITH DIAERESIS AND ACUTE
NFD   : PV("i\314\210\314\201"\0)  [UTF8 "i\x{308}\x{301}"] LATIN SMALL LETTER I + COMBINING DIAERESIS + COMBINING ACUTE ACCENT
NFC   : PV("\341\270\257"\0)       [UTF8 "\x{1e2f}"]        LATIN SMALL LETTER I WITH DIAERESIS AND ACUTE
NFKD  : PV("i\314\210\314\201"\0)  [UTF8 "i\x{308}\x{301}"] LATIN SMALL LETTER I + COMBINING DIAERESIS + COMBINING ACUTE ACCENT
NFKC  : PV("\341\270\257"\0)       [UTF8 "\x{1e2f}"]        LATIN SMALL LETTER I WITH DIAERESIS AND ACUTE

Bytes : PV("\303\257\314\201"\0)
UTF-8 : PV("\303\257\314\201"\0)   [UTF8 "\x{ef}\x{301}"]   LATIN SMALL LETTER I WITH DIAERESIS + COMBINING ACUTE ACCENT
NFD   : PV("i\314\210\314\201"\0)  [UTF8 "i\x{308}\x{301}"] LATIN SMALL LETTER I + COMBINING DIAERESIS + COMBINING ACUTE ACCENT
NFC   : PV("\341\270\257"\0)       [UTF8 "\x{1e2f}"]        LATIN SMALL LETTER I WITH DIAERESIS AND ACUTE
NFKD  : PV("i\314\210\314\201"\0)  [UTF8 "i\x{308}\x{301}"] LATIN SMALL LETTER I + COMBINING DIAERESIS + COMBINING ACUTE ACCENT
NFKC  : PV("\341\270\257"\0)       [UTF8 "\x{1e2f}"]        LATIN SMALL LETTER I WITH DIAERESIS AND ACUTE

Bytes : PV("\303\255\314\210"\0)
UTF-8 : PV("\303\255\314\210"\0)   [UTF8 "\x{ed}\x{308}"]   LATIN SMALL LETTER I WITH ACUTE + COMBINING DIAERESIS
NFD   : PV("i\314\201\314\210"\0)  [UTF8 "i\x{301}\x{308}"] LATIN SMALL LETTER I + COMBINING ACUTE ACCENT + COMBINING DIAERESIS
NFC   : PV("\303\255\314\210"\0)   [UTF8 "\x{ed}\x{308}"]   LATIN SMALL LETTER I WITH ACUTE + COMBINING DIAERESIS
NFKD  : PV("i\314\201\314\210"\0)  [UTF8 "i\x{301}\x{308}"] LATIN SMALL LETTER I + COMBINING ACUTE ACCENT + COMBINING DIAERESIS
NFKC  : PV("\303\255\314\210"\0)   [UTF8 "\x{ed}\x{308}"]   LATIN SMALL LETTER I WITH ACUTE + COMBINING DIAERESIS

Bytes : PV("i\314\201\314\210"\0)
UTF-8 : PV("i\314\201\314\210"\0)  [UTF8 "i\x{301}\x{308}"] LATIN SMALL LETTER I + COMBINING ACUTE ACCENT + COMBINING DIAERESIS
NFD   : PV("i\314\201\314\210"\0)  [UTF8 "i\x{301}\x{308}"] LATIN SMALL LETTER I + COMBINING ACUTE ACCENT + COMBINING DIAERESIS
NFC   : PV("\303\255\314\210"\0)   [UTF8 "\x{ed}\x{308}"]   LATIN SMALL LETTER I WITH ACUTE + COMBINING DIAERESIS
NFKD  : PV("i\314\201\314\210"\0)  [UTF8 "i\x{301}\x{308}"] LATIN SMALL LETTER I + COMBINING ACUTE ACCENT + COMBINING DIAERESIS
NFKC  : PV("\303\255\314\210"\0)   [UTF8 "\x{ed}\x{308}"]   LATIN SMALL LETTER I WITH ACUTE + COMBINING DIAERESIS

Bytes : PV("i\314\210\314\201"\0)
UTF-8 : PV("i\314\210\314\201"\0)  [UTF8 "i\x{308}\x{301}"] LATIN SMALL LETTER I + COMBINING DIAERESIS + COMBINING ACUTE ACCENT
NFD   : PV("i\314\210\314\201"\0)  [UTF8 "i\x{308}\x{301}"] LATIN SMALL LETTER I + COMBINING DIAERESIS + COMBINING ACUTE ACCENT
NFC   : PV("\341\270\257"\0)       [UTF8 "\x{1e2f}"]        LATIN SMALL LETTER I WITH DIAERESIS AND ACUTE
NFKD  : PV("i\314\210\314\201"\0)  [UTF8 "i\x{308}\x{301}"] LATIN SMALL LETTER I + COMBINING DIAERESIS + COMBINING ACUTE ACCENT
NFKC  : PV("\341\270\257"\0)       [UTF8 "\x{1e2f}"]        LATIN SMALL LETTER I WITH DIAERESIS AND ACUTE
-->8---

I do expect NFKD to return "\x{1e2f}" for all 5.

-- 
H.Merijn Brand  https://tux.nl   Perl Monger   http://amsterdam.pm.org/
using perl5.00307 .. 5.33        porting perl5 on HP-UX, AIX, and Linux
https://tux.nl/email.html http://qa.perl.org https://www.test-smoke.org
                           

Thread Next


nntp.perl.org: Perl Programming lists via nntp and http.
Comments to Ask Bjørn Hansen at ask@perl.org | Group listing | About