1
2#--------------------------------------------------------------------------
3# Parameter $zName must be a path to the file UnicodeData.txt. This command
4# reads the file and returns a list of mappings required to remove all
5# diacritical marks from a unicode string. Each mapping is itself a list
6# consisting of two elements - the unicode codepoint and the single ASCII
7# character that it should be replaced with, or an empty string if the
8# codepoint should simply be removed from the input. Examples:
9#
10#   { 224 a  }     (replace codepoint 224 to "a")
11#   { 769 "" }     (remove codepoint 769 from input)
12#
13# Mappings are only returned for non-upper case codepoints. It is assumed
14# that the input has already been folded to lower case.
15#
16proc rd_load_unicodedata_text {zName} {
17  global tl_lookup_table
18
19  set fd [open $zName]
20  set lField {
21    code
22    character_name
23    general_category
24    canonical_combining_classes
25    bidirectional_category
26    character_decomposition_mapping
27    decimal_digit_value
28    digit_value
29    numeric_value
30    mirrored
31    unicode_1_name
32    iso10646_comment_field
33    uppercase_mapping
34    lowercase_mapping
35    titlecase_mapping
36  }
37  set lRet [list]
38
39  while { ![eof $fd] } {
40    set line [gets $fd]
41    if {$line == ""} continue
42
43    set fields [split $line ";"]
44    if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
45    foreach $lField $fields {}
46    if { [llength $character_decomposition_mapping]!=2
47      || [string is xdigit [lindex $character_decomposition_mapping 0]]==0
48    } {
49      continue
50    }
51
52    set iCode  [expr "0x$code"]
53    set iAscii [expr "0x[lindex $character_decomposition_mapping 0]"]
54    set iDia   [expr "0x[lindex $character_decomposition_mapping 1]"]
55
56    if {[info exists tl_lookup_table($iCode)]} continue
57
58    if { ($iAscii >= 97 && $iAscii <= 122)
59      || ($iAscii >= 65 && $iAscii <= 90)
60    } {
61      lappend lRet [list $iCode [string tolower [format %c $iAscii]]]
62      set dia($iDia) 1
63    }
64  }
65
66  foreach d [array names dia] {
67    lappend lRet [list $d ""]
68  }
69  set lRet [lsort -integer -index 0 $lRet]
70
71  close $fd
72  set lRet
73}
74
75#-------------------------------------------------------------------------
76# Parameter $zName must be a path to the file UnicodeData.txt. This command
77# reads the file and returns a list of codepoints (integers). The list
78# contains all codepoints in the UnicodeData.txt assigned to any "General
79# Category" that is not a "Letter" or "Number".
80#
81proc an_load_unicodedata_text {zName} {
82  set fd [open $zName]
83  set lField {
84    code
85    character_name
86    general_category
87    canonical_combining_classes
88    bidirectional_category
89    character_decomposition_mapping
90    decimal_digit_value
91    digit_value
92    numeric_value
93    mirrored
94    unicode_1_name
95    iso10646_comment_field
96    uppercase_mapping
97    lowercase_mapping
98    titlecase_mapping
99  }
100  set lRet [list]
101
102  while { ![eof $fd] } {
103    set line [gets $fd]
104    if {$line == ""} continue
105
106    set fields [split $line ";"]
107    if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
108    foreach $lField $fields {}
109
110    set iCode [expr "0x$code"]
111    set bAlnum [expr {
112         [lsearch {L N} [string range $general_category 0 0]] >= 0
113      || $general_category=="Co"
114    }]
115
116    if { !$bAlnum } { lappend lRet $iCode }
117  }
118
119  close $fd
120  set lRet
121}
122
123proc tl_load_casefolding_txt {zName} {
124  global tl_lookup_table
125
126  set fd [open $zName]
127  while { ![eof $fd] } {
128    set line [gets $fd]
129    if {[string range $line 0 0] == "#"} continue
130    if {$line == ""} continue
131
132    foreach x {a b c d} {unset -nocomplain $x}
133    foreach {a b c d} [split $line ";"] {}
134
135    set a2 [list]
136    set c2 [list]
137    foreach elem $a { lappend a2 [expr "0x[string trim $elem]"] }
138    foreach elem $c { lappend c2 [expr "0x[string trim $elem]"] }
139    set b [string trim $b]
140    set d [string trim $d]
141
142    if {$b=="C" || $b=="S"} { set tl_lookup_table($a2) $c2 }
143  }
144}
145
146
147