1
2#--------------------------------------------------------------------------
3# Parameter $zName must be a path to the file UnicodeData.txt. This command
4# reads the file and returns a list of mappings required to remove all
5# diacritical marks from a unicode string. Each mapping is itself a list
6# consisting of two elements - the unicode codepoint and the single ASCII
7# character that it should be replaced with, or an empty string if the
8# codepoint should simply be removed from the input. Examples:
9#
10#   { 224 a  0 }     (replace codepoint 224 to "a")
11#   { 769 "" 0 }     (remove codepoint 769 from input)
12#
13# Mappings are only returned for non-upper case codepoints. It is assumed
14# that the input has already been folded to lower case.
15#
16# The third value in the list is always either 0 or 1. 0 if the
17# UnicodeData.txt file maps the codepoint to a single ASCII character and
18# a diacritic, or 1 if the mapping is indirect. For example, consider the
19# two entries:
20#
21# 1ECD;LATIN SMALL LETTER O WITH DOT BELOW;Ll;0;L;006F 0323;;;;N;;;1ECC;;1ECC
22# 1ED9;LATIN SMALL LETTER O WITH CIRCUMFLEX AND DOT BELOW;Ll;0;L;1ECD 0302;;;;N;;;1ED8;;1ED8
23#
24# The first codepoint is a direct mapping (as 006F is ASCII and 0323 is a
25# diacritic). The second is an indirect mapping, as it maps to the
26# first codepoint plus 0302 (a diacritic).
27#
28proc rd_load_unicodedata_text {zName} {
29  global tl_lookup_table
30
31  set fd [open $zName]
32  set lField {
33    code
34    character_name
35    general_category
36    canonical_combining_classes
37    bidirectional_category
38    character_decomposition_mapping
39    decimal_digit_value
40    digit_value
41    numeric_value
42    mirrored
43    unicode_1_name
44    iso10646_comment_field
45    uppercase_mapping
46    lowercase_mapping
47    titlecase_mapping
48  }
49  set lRet [list]
50
51  while { ![eof $fd] } {
52    set line [gets $fd]
53    if {$line == ""} continue
54
55    set fields [split $line ";"]
56    if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
57    foreach $lField $fields {}
58    if { [llength $character_decomposition_mapping]!=2
59      || [string is xdigit [lindex $character_decomposition_mapping 0]]==0
60    } {
61      continue
62    }
63
64    set iCode  [expr "0x$code"]
65    set iAscii [expr "0x[lindex $character_decomposition_mapping 0]"]
66    set iDia   [expr "0x[lindex $character_decomposition_mapping 1]"]
67
68    # Filter out upper-case characters, as they will be mapped to their
69    # lower-case equivalents before this data is used.
70    if {[info exists tl_lookup_table($iCode)]} continue
71
72    # Check if this is an indirect mapping. If so, set bIndirect to true
73    # and change $iAscii to the indirectly mappped ASCII character.
74    set bIndirect 0
75    if {[info exists dia($iDia)] && [info exists mapping($iAscii)]} {
76      set iAscii $mapping($iAscii)
77      set bIndirect 1
78    }
79
80    if { ($iAscii >= 97 && $iAscii <= 122)
81      || ($iAscii >= 65 && $iAscii <= 90)
82    } {
83      lappend lRet [list $iCode [string tolower [format %c $iAscii]] $bIndirect]
84      set mapping($iCode) $iAscii
85      set dia($iDia) 1
86    }
87  }
88
89  foreach d [array names dia] {
90    lappend lRet [list $d "" 0]
91  }
92  set lRet [lsort -integer -index 0 $lRet]
93
94  close $fd
95  set lRet
96}
97
98#-------------------------------------------------------------------------
99# Parameter $zName must be a path to the file UnicodeData.txt. This command
100# reads the file and returns a list of codepoints (integers). The list
101# contains all codepoints in the UnicodeData.txt assigned to any "General
102# Category" that is not a "Letter" or "Number".
103#
104proc an_load_unicodedata_text {zName} {
105  set fd [open $zName]
106  set lField {
107    code
108    character_name
109    general_category
110    canonical_combining_classes
111    bidirectional_category
112    character_decomposition_mapping
113    decimal_digit_value
114    digit_value
115    numeric_value
116    mirrored
117    unicode_1_name
118    iso10646_comment_field
119    uppercase_mapping
120    lowercase_mapping
121    titlecase_mapping
122  }
123  set lRet [list]
124
125  while { ![eof $fd] } {
126    set line [gets $fd]
127    if {$line == ""} continue
128
129    set fields [split $line ";"]
130    if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
131    foreach $lField $fields {}
132
133    set iCode [expr "0x$code"]
134    set bAlnum [expr {
135         [lsearch {L N} [string range $general_category 0 0]] >= 0
136      || $general_category=="Co"
137    }]
138
139    if { !$bAlnum } { lappend lRet $iCode }
140  }
141
142  close $fd
143  set lRet
144}
145
146proc tl_load_casefolding_txt {zName} {
147  global tl_lookup_table
148
149  set fd [open $zName]
150  while { ![eof $fd] } {
151    set line [gets $fd]
152    if {[string range $line 0 0] == "#"} continue
153    if {$line == ""} continue
154
155    foreach x {a b c d} {unset -nocomplain $x}
156    foreach {a b c d} [split $line ";"] {}
157
158    set a2 [list]
159    set c2 [list]
160    foreach elem $a { lappend a2 [expr "0x[string trim $elem]"] }
161    foreach elem $c { lappend c2 [expr "0x[string trim $elem]"] }
162    set b [string trim $b]
163    set d [string trim $d]
164
165    if {$b=="C" || $b=="S"} { set tl_lookup_table($a2) $c2 }
166  }
167}
168
169proc cc_load_unicodedata_text {zName} {
170  set fd [open $zName]
171  set lField {
172    code
173    character_name
174    general_category
175    canonical_combining_classes
176    bidirectional_category
177    character_decomposition_mapping
178    decimal_digit_value
179    digit_value
180    numeric_value
181    mirrored
182    unicode_1_name
183    iso10646_comment_field
184    uppercase_mapping
185    lowercase_mapping
186    titlecase_mapping
187  }
188  set lRet [list]
189
190  while { ![eof $fd] } {
191    set line [gets $fd]
192    if {$line == ""} continue
193
194    set fields [split $line ";"]
195    if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
196    foreach $lField $fields {}
197
198    lappend lRet [list $code $general_category]
199  }
200
201  close $fd
202  set lRet
203}
204
205
206