121b7d2a9Sdan 221b7d2a9Sdan#-------------------------------------------------------------------------- 321b7d2a9Sdan# Parameter $zName must be a path to the file UnicodeData.txt. This command 421b7d2a9Sdan# reads the file and returns a list of mappings required to remove all 521b7d2a9Sdan# diacritical marks from a unicode string. Each mapping is itself a list 621b7d2a9Sdan# consisting of two elements - the unicode codepoint and the single ASCII 721b7d2a9Sdan# character that it should be replaced with, or an empty string if the 821b7d2a9Sdan# codepoint should simply be removed from the input. Examples: 921b7d2a9Sdan# 10*e89feee5Sdan# { 224 a 0 } (replace codepoint 224 to "a") 11*e89feee5Sdan# { 769 "" 0 } (remove codepoint 769 from input) 1221b7d2a9Sdan# 1321b7d2a9Sdan# Mappings are only returned for non-upper case codepoints. It is assumed 1421b7d2a9Sdan# that the input has already been folded to lower case. 1521b7d2a9Sdan# 16*e89feee5Sdan# The third value in the list is always either 0 or 1. 0 if the 17*e89feee5Sdan# UnicodeData.txt file maps the codepoint to a single ASCII character and 18*e89feee5Sdan# a diacritic, or 1 if the mapping is indirect. For example, consider the 19*e89feee5Sdan# two entries: 20*e89feee5Sdan# 21*e89feee5Sdan# 1ECD;LATIN SMALL LETTER O WITH DOT BELOW;Ll;0;L;006F 0323;;;;N;;;1ECC;;1ECC 22*e89feee5Sdan# 1ED9;LATIN SMALL LETTER O WITH CIRCUMFLEX AND DOT BELOW;Ll;0;L;1ECD 0302;;;;N;;;1ED8;;1ED8 23*e89feee5Sdan# 24*e89feee5Sdan# The first codepoint is a direct mapping (as 006F is ASCII and 0323 is a 25*e89feee5Sdan# diacritic). The second is an indirect mapping, as it maps to the 26*e89feee5Sdan# first codepoint plus 0302 (a diacritic). 27*e89feee5Sdan# 2821b7d2a9Sdanproc rd_load_unicodedata_text {zName} { 2921b7d2a9Sdan global tl_lookup_table 3021b7d2a9Sdan 3121b7d2a9Sdan set fd [open $zName] 3221b7d2a9Sdan set lField { 3321b7d2a9Sdan code 3421b7d2a9Sdan character_name 3521b7d2a9Sdan general_category 3621b7d2a9Sdan canonical_combining_classes 3721b7d2a9Sdan bidirectional_category 3821b7d2a9Sdan character_decomposition_mapping 3921b7d2a9Sdan decimal_digit_value 4021b7d2a9Sdan digit_value 4121b7d2a9Sdan numeric_value 4221b7d2a9Sdan mirrored 4321b7d2a9Sdan unicode_1_name 4421b7d2a9Sdan iso10646_comment_field 4521b7d2a9Sdan uppercase_mapping 4621b7d2a9Sdan lowercase_mapping 4721b7d2a9Sdan titlecase_mapping 4821b7d2a9Sdan } 4921b7d2a9Sdan set lRet [list] 5021b7d2a9Sdan 5121b7d2a9Sdan while { ![eof $fd] } { 5221b7d2a9Sdan set line [gets $fd] 5321b7d2a9Sdan if {$line == ""} continue 5421b7d2a9Sdan 5521b7d2a9Sdan set fields [split $line ";"] 5621b7d2a9Sdan if {[llength $fields] != [llength $lField]} { error "parse error: $line" } 5721b7d2a9Sdan foreach $lField $fields {} 5821b7d2a9Sdan if { [llength $character_decomposition_mapping]!=2 5921b7d2a9Sdan || [string is xdigit [lindex $character_decomposition_mapping 0]]==0 6021b7d2a9Sdan } { 6121b7d2a9Sdan continue 6221b7d2a9Sdan } 6321b7d2a9Sdan 6421b7d2a9Sdan set iCode [expr "0x$code"] 6521b7d2a9Sdan set iAscii [expr "0x[lindex $character_decomposition_mapping 0]"] 6621b7d2a9Sdan set iDia [expr "0x[lindex $character_decomposition_mapping 1]"] 6721b7d2a9Sdan 68*e89feee5Sdan # Filter out upper-case characters, as they will be mapped to their 69*e89feee5Sdan # lower-case equivalents before this data is used. 7021b7d2a9Sdan if {[info exists tl_lookup_table($iCode)]} continue 7121b7d2a9Sdan 72*e89feee5Sdan # Check if this is an indirect mapping. If so, set bIndirect to true 73*e89feee5Sdan # and change $iAscii to the indirectly mappped ASCII character. 74*e89feee5Sdan set bIndirect 0 75*e89feee5Sdan if {[info exists dia($iDia)] && [info exists mapping($iAscii)]} { 76*e89feee5Sdan set iAscii $mapping($iAscii) 77*e89feee5Sdan set bIndirect 1 78*e89feee5Sdan } 79*e89feee5Sdan 8021b7d2a9Sdan if { ($iAscii >= 97 && $iAscii <= 122) 8121b7d2a9Sdan || ($iAscii >= 65 && $iAscii <= 90) 8221b7d2a9Sdan } { 83*e89feee5Sdan lappend lRet [list $iCode [string tolower [format %c $iAscii]] $bIndirect] 84*e89feee5Sdan set mapping($iCode) $iAscii 8521b7d2a9Sdan set dia($iDia) 1 8621b7d2a9Sdan } 8721b7d2a9Sdan } 8821b7d2a9Sdan 8921b7d2a9Sdan foreach d [array names dia] { 90*e89feee5Sdan lappend lRet [list $d "" 0] 9121b7d2a9Sdan } 9221b7d2a9Sdan set lRet [lsort -integer -index 0 $lRet] 9321b7d2a9Sdan 9421b7d2a9Sdan close $fd 9521b7d2a9Sdan set lRet 9621b7d2a9Sdan} 9721b7d2a9Sdan 9821b7d2a9Sdan#------------------------------------------------------------------------- 9921b7d2a9Sdan# Parameter $zName must be a path to the file UnicodeData.txt. This command 10021b7d2a9Sdan# reads the file and returns a list of codepoints (integers). The list 10121b7d2a9Sdan# contains all codepoints in the UnicodeData.txt assigned to any "General 10221b7d2a9Sdan# Category" that is not a "Letter" or "Number". 10321b7d2a9Sdan# 10421b7d2a9Sdanproc an_load_unicodedata_text {zName} { 10521b7d2a9Sdan set fd [open $zName] 10621b7d2a9Sdan set lField { 10721b7d2a9Sdan code 10821b7d2a9Sdan character_name 10921b7d2a9Sdan general_category 11021b7d2a9Sdan canonical_combining_classes 11121b7d2a9Sdan bidirectional_category 11221b7d2a9Sdan character_decomposition_mapping 11321b7d2a9Sdan decimal_digit_value 11421b7d2a9Sdan digit_value 11521b7d2a9Sdan numeric_value 11621b7d2a9Sdan mirrored 11721b7d2a9Sdan unicode_1_name 11821b7d2a9Sdan iso10646_comment_field 11921b7d2a9Sdan uppercase_mapping 12021b7d2a9Sdan lowercase_mapping 12121b7d2a9Sdan titlecase_mapping 12221b7d2a9Sdan } 12321b7d2a9Sdan set lRet [list] 12421b7d2a9Sdan 12521b7d2a9Sdan while { ![eof $fd] } { 12621b7d2a9Sdan set line [gets $fd] 12721b7d2a9Sdan if {$line == ""} continue 12821b7d2a9Sdan 12921b7d2a9Sdan set fields [split $line ";"] 13021b7d2a9Sdan if {[llength $fields] != [llength $lField]} { error "parse error: $line" } 13121b7d2a9Sdan foreach $lField $fields {} 13221b7d2a9Sdan 13321b7d2a9Sdan set iCode [expr "0x$code"] 13421b7d2a9Sdan set bAlnum [expr { 13521b7d2a9Sdan [lsearch {L N} [string range $general_category 0 0]] >= 0 13621b7d2a9Sdan || $general_category=="Co" 13721b7d2a9Sdan }] 13821b7d2a9Sdan 13921b7d2a9Sdan if { !$bAlnum } { lappend lRet $iCode } 14021b7d2a9Sdan } 14121b7d2a9Sdan 14221b7d2a9Sdan close $fd 14321b7d2a9Sdan set lRet 14421b7d2a9Sdan} 14521b7d2a9Sdan 14621b7d2a9Sdanproc tl_load_casefolding_txt {zName} { 14721b7d2a9Sdan global tl_lookup_table 14821b7d2a9Sdan 14921b7d2a9Sdan set fd [open $zName] 15021b7d2a9Sdan while { ![eof $fd] } { 15121b7d2a9Sdan set line [gets $fd] 15221b7d2a9Sdan if {[string range $line 0 0] == "#"} continue 15321b7d2a9Sdan if {$line == ""} continue 15421b7d2a9Sdan 15521b7d2a9Sdan foreach x {a b c d} {unset -nocomplain $x} 15621b7d2a9Sdan foreach {a b c d} [split $line ";"] {} 15721b7d2a9Sdan 15821b7d2a9Sdan set a2 [list] 15921b7d2a9Sdan set c2 [list] 16021b7d2a9Sdan foreach elem $a { lappend a2 [expr "0x[string trim $elem]"] } 16121b7d2a9Sdan foreach elem $c { lappend c2 [expr "0x[string trim $elem]"] } 16221b7d2a9Sdan set b [string trim $b] 16321b7d2a9Sdan set d [string trim $d] 16421b7d2a9Sdan 16521b7d2a9Sdan if {$b=="C" || $b=="S"} { set tl_lookup_table($a2) $c2 } 16621b7d2a9Sdan } 16721b7d2a9Sdan} 16821b7d2a9Sdan 169b80bb6ceSdanproc cc_load_unicodedata_text {zName} { 170b80bb6ceSdan set fd [open $zName] 171b80bb6ceSdan set lField { 172b80bb6ceSdan code 173b80bb6ceSdan character_name 174b80bb6ceSdan general_category 175b80bb6ceSdan canonical_combining_classes 176b80bb6ceSdan bidirectional_category 177b80bb6ceSdan character_decomposition_mapping 178b80bb6ceSdan decimal_digit_value 179b80bb6ceSdan digit_value 180b80bb6ceSdan numeric_value 181b80bb6ceSdan mirrored 182b80bb6ceSdan unicode_1_name 183b80bb6ceSdan iso10646_comment_field 184b80bb6ceSdan uppercase_mapping 185b80bb6ceSdan lowercase_mapping 186b80bb6ceSdan titlecase_mapping 187b80bb6ceSdan } 188b80bb6ceSdan set lRet [list] 189b80bb6ceSdan 190b80bb6ceSdan while { ![eof $fd] } { 191b80bb6ceSdan set line [gets $fd] 192b80bb6ceSdan if {$line == ""} continue 193b80bb6ceSdan 194b80bb6ceSdan set fields [split $line ";"] 195b80bb6ceSdan if {[llength $fields] != [llength $lField]} { error "parse error: $line" } 196b80bb6ceSdan foreach $lField $fields {} 197b80bb6ceSdan 198b80bb6ceSdan lappend lRet [list $code $general_category] 199b80bb6ceSdan } 200b80bb6ceSdan 201b80bb6ceSdan close $fd 202b80bb6ceSdan set lRet 203b80bb6ceSdan} 204b80bb6ceSdan 20521b7d2a9Sdan 206