1 2#-------------------------------------------------------------------------- 3# Parameter $zName must be a path to the file UnicodeData.txt. This command 4# reads the file and returns a list of mappings required to remove all 5# diacritical marks from a unicode string. Each mapping is itself a list 6# consisting of two elements - the unicode codepoint and the single ASCII 7# character that it should be replaced with, or an empty string if the 8# codepoint should simply be removed from the input. Examples: 9# 10# { 224 a } (replace codepoint 224 to "a") 11# { 769 "" } (remove codepoint 769 from input) 12# 13# Mappings are only returned for non-upper case codepoints. It is assumed 14# that the input has already been folded to lower case. 15# 16proc rd_load_unicodedata_text {zName} { 17 global tl_lookup_table 18 19 set fd [open $zName] 20 set lField { 21 code 22 character_name 23 general_category 24 canonical_combining_classes 25 bidirectional_category 26 character_decomposition_mapping 27 decimal_digit_value 28 digit_value 29 numeric_value 30 mirrored 31 unicode_1_name 32 iso10646_comment_field 33 uppercase_mapping 34 lowercase_mapping 35 titlecase_mapping 36 } 37 set lRet [list] 38 39 while { ![eof $fd] } { 40 set line [gets $fd] 41 if {$line == ""} continue 42 43 set fields [split $line ";"] 44 if {[llength $fields] != [llength $lField]} { error "parse error: $line" } 45 foreach $lField $fields {} 46 if { [llength $character_decomposition_mapping]!=2 47 || [string is xdigit [lindex $character_decomposition_mapping 0]]==0 48 } { 49 continue 50 } 51 52 set iCode [expr "0x$code"] 53 set iAscii [expr "0x[lindex $character_decomposition_mapping 0]"] 54 set iDia [expr "0x[lindex $character_decomposition_mapping 1]"] 55 56 if {[info exists tl_lookup_table($iCode)]} continue 57 58 if { ($iAscii >= 97 && $iAscii <= 122) 59 || ($iAscii >= 65 && $iAscii <= 90) 60 } { 61 lappend lRet [list $iCode [string tolower [format %c $iAscii]]] 62 set dia($iDia) 1 63 } 64 } 65 66 foreach d [array names dia] { 67 lappend lRet [list $d ""] 68 } 69 set lRet [lsort -integer -index 0 $lRet] 70 71 close $fd 72 set lRet 73} 74 75#------------------------------------------------------------------------- 76# Parameter $zName must be a path to the file UnicodeData.txt. This command 77# reads the file and returns a list of codepoints (integers). The list 78# contains all codepoints in the UnicodeData.txt assigned to any "General 79# Category" that is not a "Letter" or "Number". 80# 81proc an_load_unicodedata_text {zName} { 82 set fd [open $zName] 83 set lField { 84 code 85 character_name 86 general_category 87 canonical_combining_classes 88 bidirectional_category 89 character_decomposition_mapping 90 decimal_digit_value 91 digit_value 92 numeric_value 93 mirrored 94 unicode_1_name 95 iso10646_comment_field 96 uppercase_mapping 97 lowercase_mapping 98 titlecase_mapping 99 } 100 set lRet [list] 101 102 while { ![eof $fd] } { 103 set line [gets $fd] 104 if {$line == ""} continue 105 106 set fields [split $line ";"] 107 if {[llength $fields] != [llength $lField]} { error "parse error: $line" } 108 foreach $lField $fields {} 109 110 set iCode [expr "0x$code"] 111 set bAlnum [expr { 112 [lsearch {L N} [string range $general_category 0 0]] >= 0 113 || $general_category=="Co" 114 }] 115 116 if { !$bAlnum } { lappend lRet $iCode } 117 } 118 119 close $fd 120 set lRet 121} 122 123proc tl_load_casefolding_txt {zName} { 124 global tl_lookup_table 125 126 set fd [open $zName] 127 while { ![eof $fd] } { 128 set line [gets $fd] 129 if {[string range $line 0 0] == "#"} continue 130 if {$line == ""} continue 131 132 foreach x {a b c d} {unset -nocomplain $x} 133 foreach {a b c d} [split $line ";"] {} 134 135 set a2 [list] 136 set c2 [list] 137 foreach elem $a { lappend a2 [expr "0x[string trim $elem]"] } 138 foreach elem $c { lappend c2 [expr "0x[string trim $elem]"] } 139 set b [string trim $b] 140 set d [string trim $d] 141 142 if {$b=="C" || $b=="S"} { set tl_lookup_table($a2) $c2 } 143 } 144} 145 146 147