1 2#-------------------------------------------------------------------------- 3# Parameter $zName must be a path to the file UnicodeData.txt. This command 4# reads the file and returns a list of mappings required to remove all 5# diacritical marks from a unicode string. Each mapping is itself a list 6# consisting of two elements - the unicode codepoint and the single ASCII 7# character that it should be replaced with, or an empty string if the 8# codepoint should simply be removed from the input. Examples: 9# 10# { 224 a 0 } (replace codepoint 224 to "a") 11# { 769 "" 0 } (remove codepoint 769 from input) 12# 13# Mappings are only returned for non-upper case codepoints. It is assumed 14# that the input has already been folded to lower case. 15# 16# The third value in the list is always either 0 or 1. 0 if the 17# UnicodeData.txt file maps the codepoint to a single ASCII character and 18# a diacritic, or 1 if the mapping is indirect. For example, consider the 19# two entries: 20# 21# 1ECD;LATIN SMALL LETTER O WITH DOT BELOW;Ll;0;L;006F 0323;;;;N;;;1ECC;;1ECC 22# 1ED9;LATIN SMALL LETTER O WITH CIRCUMFLEX AND DOT BELOW;Ll;0;L;1ECD 0302;;;;N;;;1ED8;;1ED8 23# 24# The first codepoint is a direct mapping (as 006F is ASCII and 0323 is a 25# diacritic). The second is an indirect mapping, as it maps to the 26# first codepoint plus 0302 (a diacritic). 27# 28proc rd_load_unicodedata_text {zName} { 29 global tl_lookup_table 30 31 set fd [open $zName] 32 set lField { 33 code 34 character_name 35 general_category 36 canonical_combining_classes 37 bidirectional_category 38 character_decomposition_mapping 39 decimal_digit_value 40 digit_value 41 numeric_value 42 mirrored 43 unicode_1_name 44 iso10646_comment_field 45 uppercase_mapping 46 lowercase_mapping 47 titlecase_mapping 48 } 49 set lRet [list] 50 51 while { ![eof $fd] } { 52 set line [gets $fd] 53 if {$line == ""} continue 54 55 set fields [split $line ";"] 56 if {[llength $fields] != [llength $lField]} { error "parse error: $line" } 57 foreach $lField $fields {} 58 if { [llength $character_decomposition_mapping]!=2 59 || [string is xdigit [lindex $character_decomposition_mapping 0]]==0 60 } { 61 continue 62 } 63 64 set iCode [expr "0x$code"] 65 set iAscii [expr "0x[lindex $character_decomposition_mapping 0]"] 66 set iDia [expr "0x[lindex $character_decomposition_mapping 1]"] 67 68 # Filter out upper-case characters, as they will be mapped to their 69 # lower-case equivalents before this data is used. 70 if {[info exists tl_lookup_table($iCode)]} continue 71 72 # Check if this is an indirect mapping. If so, set bIndirect to true 73 # and change $iAscii to the indirectly mappped ASCII character. 74 set bIndirect 0 75 if {[info exists dia($iDia)] && [info exists mapping($iAscii)]} { 76 set iAscii $mapping($iAscii) 77 set bIndirect 1 78 } 79 80 if { ($iAscii >= 97 && $iAscii <= 122) 81 || ($iAscii >= 65 && $iAscii <= 90) 82 } { 83 lappend lRet [list $iCode [string tolower [format %c $iAscii]] $bIndirect] 84 set mapping($iCode) $iAscii 85 set dia($iDia) 1 86 } 87 } 88 89 foreach d [array names dia] { 90 lappend lRet [list $d "" 0] 91 } 92 set lRet [lsort -integer -index 0 $lRet] 93 94 close $fd 95 set lRet 96} 97 98#------------------------------------------------------------------------- 99# Parameter $zName must be a path to the file UnicodeData.txt. This command 100# reads the file and returns a list of codepoints (integers). The list 101# contains all codepoints in the UnicodeData.txt assigned to any "General 102# Category" that is not a "Letter" or "Number". 103# 104proc an_load_unicodedata_text {zName} { 105 set fd [open $zName] 106 set lField { 107 code 108 character_name 109 general_category 110 canonical_combining_classes 111 bidirectional_category 112 character_decomposition_mapping 113 decimal_digit_value 114 digit_value 115 numeric_value 116 mirrored 117 unicode_1_name 118 iso10646_comment_field 119 uppercase_mapping 120 lowercase_mapping 121 titlecase_mapping 122 } 123 set lRet [list] 124 125 while { ![eof $fd] } { 126 set line [gets $fd] 127 if {$line == ""} continue 128 129 set fields [split $line ";"] 130 if {[llength $fields] != [llength $lField]} { error "parse error: $line" } 131 foreach $lField $fields {} 132 133 set iCode [expr "0x$code"] 134 set bAlnum [expr { 135 [lsearch {L N} [string range $general_category 0 0]] >= 0 136 || $general_category=="Co" 137 }] 138 139 if { !$bAlnum } { lappend lRet $iCode } 140 } 141 142 close $fd 143 set lRet 144} 145 146proc tl_load_casefolding_txt {zName} { 147 global tl_lookup_table 148 149 set fd [open $zName] 150 while { ![eof $fd] } { 151 set line [gets $fd] 152 if {[string range $line 0 0] == "#"} continue 153 if {$line == ""} continue 154 155 foreach x {a b c d} {unset -nocomplain $x} 156 foreach {a b c d} [split $line ";"] {} 157 158 set a2 [list] 159 set c2 [list] 160 foreach elem $a { lappend a2 [expr "0x[string trim $elem]"] } 161 foreach elem $c { lappend c2 [expr "0x[string trim $elem]"] } 162 set b [string trim $b] 163 set d [string trim $d] 164 165 if {$b=="C" || $b=="S"} { set tl_lookup_table($a2) $c2 } 166 } 167} 168 169proc cc_load_unicodedata_text {zName} { 170 set fd [open $zName] 171 set lField { 172 code 173 character_name 174 general_category 175 canonical_combining_classes 176 bidirectional_category 177 character_decomposition_mapping 178 decimal_digit_value 179 digit_value 180 numeric_value 181 mirrored 182 unicode_1_name 183 iso10646_comment_field 184 uppercase_mapping 185 lowercase_mapping 186 titlecase_mapping 187 } 188 set lRet [list] 189 190 while { ![eof $fd] } { 191 set line [gets $fd] 192 if {$line == ""} continue 193 194 set fields [split $line ";"] 195 if {[llength $fields] != [llength $lField]} { error "parse error: $line" } 196 foreach $lField $fields {} 197 198 lappend lRet [list $code $general_category] 199 } 200 201 close $fd 202 set lRet 203} 204 205 206