Unicode character decomposition Unix library? - Unix

This is a discussion on Unicode character decomposition Unix library? - Unix ; Is there a library/whatever available on Unices to do Unicode character decomposition normalisation? -- SM Ryan http://www.rawbw.com/~wyrmwif/ I'm not even supposed to be here today....

+ Reply to Thread
Results 1 to 3 of 3

Thread: Unicode character decomposition Unix library?

  1. Unicode character decomposition Unix library?

    Is there a library/whatever available on Unices to do
    Unicode character decomposition normalisation?

    --
    SM Ryan http://www.rawbw.com/~wyrmwif/
    I'm not even supposed to be here today.

  2. Re: Unicode character decomposition Unix library?

    SM Ryan wrote:
    > Is there a library/whatever available on Unices to do
    > Unicode character decomposition normalisation?


    http://icu-project.org/

    iconv, a library for character-set conversion that is likely already
    installed on your system, may be capable of this; I'm unsure.

  3. Re: Unicode character decomposition Unix library?

    William Ahern wrote:
    # SM Ryan wrote:
    # > Is there a library/whatever available on Unices to do
    # > Unicode character decomposition normalisation?
    #
    # http://icu-project.org/
    #
    # iconv, a library for characterset conversion likely already installed on
    # your system, may be capable of this. I'm unsure.

    I found out about the Unicode data file and wrote a Tcl script that generates
    a C function, NFD, that converts a UTF-8 string to an NFD UTF-8 string,
    in case anyone else is interested (this normalises for Mac OS X paths).

    ./nfd.tcl <.../UnicodeData.txt >nfd.c

    nfd.tcl:
    =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
    #!/usr/bin/tclsh

    # utf codes --
    #
    #   Encode a list of Unicode code points as UTF-8.
    #
    # Arguments:
    #   codes   List of integer code points (any form [expr] accepts).
    #
    # Results:
    #   A flat list of unsigned byte values (0..255, decimal) holding the
    #   UTF-8 encoding of every code point, in order.
    #
    # The bytes are computed arithmetically instead of going through
    # [format %c] and [encoding convertto utf-8]: on Tcl builds whose internal
    # character type is only 16 bits wide, code points above U+FFFF are
    # truncated or otherwise mis-encoded on that path, which would key the
    # generated table on the wrong characters.
    proc utf codes {
        set utf {}
        foreach code $codes {
            if {$code < 0x80} {
                # 0xxxxxxx
                lappend utf [expr {$code}]
            } elseif {$code < 0x800} {
                # 110xxxxx 10xxxxxx
                lappend utf \
                    [expr {0xC0 | ($code >> 6)}] \
                    [expr {0x80 | ($code & 0x3F)}]
            } elseif {$code < 0x10000} {
                # 1110xxxx 10xxxxxx 10xxxxxx
                lappend utf \
                    [expr {0xE0 | ($code >> 12)}] \
                    [expr {0x80 | (($code >> 6) & 0x3F)}] \
                    [expr {0x80 | ($code & 0x3F)}]
            } else {
                # 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                lappend utf \
                    [expr {0xF0 | ($code >> 18)}] \
                    [expr {0x80 | (($code >> 12) & 0x3F)}] \
                    [expr {0x80 | (($code >> 6) & 0x3F)}] \
                    [expr {0x80 | ($code & 0x3F)}]
            }
        }
        return $utf
    }

    # Build the byte trie used by the code generator from UnicodeData.txt
    # records read on stdin.
    #
    # The trie lives in the global array "choice".  Keys are paths: the list
    # "0" is the root and every further element is one UTF-8 byte of the
    # character being matched.  An interior node holds the list of bytes that
    # may follow that path; a leaf holds a C statement string that always
    # starts with "/" (a C comment), which is how a leaf is told apart from an
    # interior node.  The pre-seeded leaf {0 0} handles the terminating NUL.
    array set choice {0 0 {0 0} {/*end of string*/ POP; continue;}}
    while {[gets stdin record] >= 0} {
        set fields [split $record ";"]

        # Need at least the code point (0), name (1) and decomposition (5).
        if {[llength $fields] < 6} continue
        set mapping [lindex $fields 5]
        if {[llength $mapping] == 0} continue

        set codepoint 0x[lindex $fields 0]
        set codepoint [expr {$codepoint}]

        # Accept the mapping only when every token is a bare hex number;
        # a token such as "<compat>" makes the whole record be skipped.
        set targets {}
        set allHex true
        foreach token $mapping {
            if {![string is xdigit -strict $token]} {
                set allHex 0
                break
            }
            set allHex 1
            set hex 0x$token
            lappend targets [expr {$hex}]
        }
        if {!$allHex} continue

        set keyBytes [utf $codepoint]
        set valueBytes [utf $targets]

        # Walk/extend the trie along the UTF-8 bytes of the source character.
        set node 0
        foreach b $keyBytes {
            set child [concat $node $b]
            if {![info exists choice($child)]} {
                lappend choice($node) $b
            }
            set node $child
        }

        # The leaf pushes the decomposition as a C string of \xHH escapes so
        # that it is itself rescanned by the generated code.
        set escaped ""
        foreach b $valueBytes {
            append escaped [format {\x%02X} $b]
        }
        set choice($node) \
            [format {/*%s*/ PUSH("%s"); continue;} [lindex $fields 1] $escaped]
    }

    # choose space x --
    #
    #   Recursively print, on stdout, the C matching code for trie node $x.
    #
    # Arguments:
    #   space   Indentation prefix for the emitted C lines.
    #   x       Trie path (key into the global array "choice"); "0" is the
    #           root and each further element is one byte already matched.
    #
    # Invariant of the emitted code: when the code for node $x starts
    # executing, STRING has been advanced past [llength $x]-1 bytes of the
    # current character.  Every failure path therefore rewinds STRING to the
    # first byte of the character, emits that single byte and continues.
    #
    # Three shapes are generated:
    #   * leaf (value starts with "/"): the stored C statement verbatim;
    #   * run of single-child nodes: one "if (STRING[0]==.. && ..)" test;
    #   * node with several children: a "switch (*STRING++)".
    proc choose {space x} {
        global choice
        set n [llength $x]
        set space1 "$space "
        if {[string index $choice($x) 0] eq "/"} {
            puts ${space}$choice($x)
        } elseif {[llength $choice($x)]==1} {
            set sep "${space}if ("
            set i 0
            # Follow the chain only while each node has exactly one child.
            # Stopping at the first branching node (instead of blindly taking
            # its first child) keeps the sibling branches in the output; the
            # recursive call below emits a switch for them.
            while {[string index $choice($x) 0] ne "/"
                   && [llength $choice($x)]==1} {
                set c [lindex $choice($x) 0]
                puts -nonewline "${sep}STRING\[$i\]==0x[format %02X $c]"
                lappend x $c
                set sep { && }
                incr i
            }
            puts ") \{"
            puts "${space1}STRING += $i;"
            choose $space1 $x
            # Without this else-branch a failed test would fall through into
            # the next "case" label of the enclosing switch and decode the
            # character as a different one.  The test itself consumed nothing,
            # so only the n-1 bytes consumed by the ancestors are rewound.
            puts "${space}\} else \{"
            puts "${space1}STRING -= [expr {$n-1}]; EMIT(*STRING++); continue;"
            puts "${space}\}"
        } else {
            puts "${space}switch (*STRING++) \{"
            foreach c $choice($x) {
                puts "$space case 0x[format %02X $c]:"
                choose $space1 [concat $x $c]
            }
            # The switch consumed one more byte, hence rewind by n, not n-1.
            puts "$space default: STRING -= $n; EMIT(*STRING++); continue;"
            puts "${space}\}"
        }
    }
    # Emit the generated C source: a fixed prologue, the matching code
    # produced by [choose] for the whole trie, then a fixed epilogue.
    #
    # Shape of the generated function  char *NFD(char *utf):
    #   * "stack" is a linked stack of strings still to be scanned; PUSH adds
    #     one (the input, or a decomposition found in the table), POP drops
    #     the top one when its terminating NUL is reached.
    #   * "chain" is a reverse-linked list of output bytes built by EMIT;
    #     n counts them plus the terminating NUL.
    #   * The result is copied out of the chain back to front into a buffer
    #     obtained from MALLOC.
    #   * MALLOC/FREE default to malloc/free and may be predefined by whoever
    #     compiles the generated file.
    #
    # The prologue now includes <stdlib.h>: the default MALLOC/FREE expand to
    # malloc/free, and calling those without a declaration is an implicit
    # declaration (pointer truncation on LP64 targets, an error in C99 and
    # later).
    #
    # NOTE(review): PUSH sets "changed", and the input itself is entered with
    # PUSH(utf), so for non-NULL input the "!changed" early return below is
    # never taken and a freshly allocated string is always returned.  Left
    # as is because callers may rely on owning the result -- confirm intent.
    puts "
    #include <stdlib.h>
    char *NFD(char *utf) \{
    typedef struct Chain *Chain; struct Chain {char ch; Chain prev;};
    typedef struct Stack *Stack; struct Stack {unsigned char *st; Stack under;};
    Chain chain = 0; int changed = 0; int n = 1; char *result; Stack stack = 0;
    #ifndef MALLOC
    #define MALLOC malloc
    #endif
    #ifndef FREE
    #define FREE free
    #endif
    #define PUSH(s) {Stack t = MALLOC(sizeof(struct Stack)); \\
    t->st = (unsigned char*)(s); t->under = stack; stack = t; changed = 1;}
    #define POP {Stack t = stack->under; FREE(stack); stack = t;}
    #define EMIT(c) {Chain t = MALLOC(sizeof(struct Chain)); \\
    t->ch = (c); t->prev = chain; chain = t; n++;}
    #define STRING (stack->st)
    if (utf) PUSH(utf);
    while (stack) \{"
    # Body of the scan loop: the decision tree for the whole trie (root "0").
    choose " " 0
    puts " \}
    if (!changed) {
    while (chain) {Chain t = chain->prev; FREE(chain); chain = t;}
    return utf;
    }
    result = MALLOC(n); result\[--n\] = 0;
    while (chain) {
    Chain t = chain->prev; result\[--n\] = chain->ch;
    FREE(chain); chain = t;
    }
    return result;
    #undef MALLOC
    #undef FREE
    #undef PUSH
    #undef POP
    #undef EMIT
    #undef STRING
    \}
    "
    =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=

    --
    SM Ryan http://www.rawbw.com/~wyrmwif/
    Who's leading this mob?

+ Reply to Thread