121b7d2a9Sdan
221b7d2a9Sdan#--------------------------------------------------------------------------
321b7d2a9Sdan# Parameter $zName must be a path to the file UnicodeData.txt. This command
421b7d2a9Sdan# reads the file and returns a list of mappings required to remove all
521b7d2a9Sdan# diacritical marks from a unicode string. Each mapping is itself a list
621b7d2a9Sdan# consisting of two elements - the unicode codepoint and the single ASCII
721b7d2a9Sdan# character that it should be replaced with, or an empty string if the
821b7d2a9Sdan# codepoint should simply be removed from the input. Examples:
921b7d2a9Sdan#
10*e89feee5Sdan#   { 224 a  0 }     (replace codepoint 224 to "a")
11*e89feee5Sdan#   { 769 "" 0 }     (remove codepoint 769 from input)
1221b7d2a9Sdan#
1321b7d2a9Sdan# Mappings are only returned for non-upper case codepoints. It is assumed
1421b7d2a9Sdan# that the input has already been folded to lower case.
1521b7d2a9Sdan#
16*e89feee5Sdan# The third value in the list is always either 0 or 1. 0 if the
17*e89feee5Sdan# UnicodeData.txt file maps the codepoint to a single ASCII character and
18*e89feee5Sdan# a diacritic, or 1 if the mapping is indirect. For example, consider the
19*e89feee5Sdan# two entries:
20*e89feee5Sdan#
21*e89feee5Sdan# 1ECD;LATIN SMALL LETTER O WITH DOT BELOW;Ll;0;L;006F 0323;;;;N;;;1ECC;;1ECC
22*e89feee5Sdan# 1ED9;LATIN SMALL LETTER O WITH CIRCUMFLEX AND DOT BELOW;Ll;0;L;1ECD 0302;;;;N;;;1ED8;;1ED8
23*e89feee5Sdan#
24*e89feee5Sdan# The first codepoint is a direct mapping (as 006F is ASCII and 0323 is a
25*e89feee5Sdan# diacritic). The second is an indirect mapping, as it maps to the
26*e89feee5Sdan# first codepoint plus 0302 (a diacritic).
27*e89feee5Sdan#
2821b7d2a9Sdanproc rd_load_unicodedata_text {zName} {
2921b7d2a9Sdan  global tl_lookup_table
3021b7d2a9Sdan
3121b7d2a9Sdan  set fd [open $zName]
3221b7d2a9Sdan  set lField {
3321b7d2a9Sdan    code
3421b7d2a9Sdan    character_name
3521b7d2a9Sdan    general_category
3621b7d2a9Sdan    canonical_combining_classes
3721b7d2a9Sdan    bidirectional_category
3821b7d2a9Sdan    character_decomposition_mapping
3921b7d2a9Sdan    decimal_digit_value
4021b7d2a9Sdan    digit_value
4121b7d2a9Sdan    numeric_value
4221b7d2a9Sdan    mirrored
4321b7d2a9Sdan    unicode_1_name
4421b7d2a9Sdan    iso10646_comment_field
4521b7d2a9Sdan    uppercase_mapping
4621b7d2a9Sdan    lowercase_mapping
4721b7d2a9Sdan    titlecase_mapping
4821b7d2a9Sdan  }
4921b7d2a9Sdan  set lRet [list]
5021b7d2a9Sdan
5121b7d2a9Sdan  while { ![eof $fd] } {
5221b7d2a9Sdan    set line [gets $fd]
5321b7d2a9Sdan    if {$line == ""} continue
5421b7d2a9Sdan
5521b7d2a9Sdan    set fields [split $line ";"]
5621b7d2a9Sdan    if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
5721b7d2a9Sdan    foreach $lField $fields {}
5821b7d2a9Sdan    if { [llength $character_decomposition_mapping]!=2
5921b7d2a9Sdan      || [string is xdigit [lindex $character_decomposition_mapping 0]]==0
6021b7d2a9Sdan    } {
6121b7d2a9Sdan      continue
6221b7d2a9Sdan    }
6321b7d2a9Sdan
6421b7d2a9Sdan    set iCode  [expr "0x$code"]
6521b7d2a9Sdan    set iAscii [expr "0x[lindex $character_decomposition_mapping 0]"]
6621b7d2a9Sdan    set iDia   [expr "0x[lindex $character_decomposition_mapping 1]"]
6721b7d2a9Sdan
68*e89feee5Sdan    # Filter out upper-case characters, as they will be mapped to their
69*e89feee5Sdan    # lower-case equivalents before this data is used.
7021b7d2a9Sdan    if {[info exists tl_lookup_table($iCode)]} continue
7121b7d2a9Sdan
72*e89feee5Sdan    # Check if this is an indirect mapping. If so, set bIndirect to true
73*e89feee5Sdan    # and change $iAscii to the indirectly mappped ASCII character.
74*e89feee5Sdan    set bIndirect 0
75*e89feee5Sdan    if {[info exists dia($iDia)] && [info exists mapping($iAscii)]} {
76*e89feee5Sdan      set iAscii $mapping($iAscii)
77*e89feee5Sdan      set bIndirect 1
78*e89feee5Sdan    }
79*e89feee5Sdan
8021b7d2a9Sdan    if { ($iAscii >= 97 && $iAscii <= 122)
8121b7d2a9Sdan      || ($iAscii >= 65 && $iAscii <= 90)
8221b7d2a9Sdan    } {
83*e89feee5Sdan      lappend lRet [list $iCode [string tolower [format %c $iAscii]] $bIndirect]
84*e89feee5Sdan      set mapping($iCode) $iAscii
8521b7d2a9Sdan      set dia($iDia) 1
8621b7d2a9Sdan    }
8721b7d2a9Sdan  }
8821b7d2a9Sdan
8921b7d2a9Sdan  foreach d [array names dia] {
90*e89feee5Sdan    lappend lRet [list $d "" 0]
9121b7d2a9Sdan  }
9221b7d2a9Sdan  set lRet [lsort -integer -index 0 $lRet]
9321b7d2a9Sdan
9421b7d2a9Sdan  close $fd
9521b7d2a9Sdan  set lRet
9621b7d2a9Sdan}
9721b7d2a9Sdan
9821b7d2a9Sdan#-------------------------------------------------------------------------
9921b7d2a9Sdan# Parameter $zName must be a path to the file UnicodeData.txt. This command
10021b7d2a9Sdan# reads the file and returns a list of codepoints (integers). The list
10121b7d2a9Sdan# contains all codepoints in the UnicodeData.txt assigned to any "General
10221b7d2a9Sdan# Category" that is not a "Letter" or "Number".
10321b7d2a9Sdan#
10421b7d2a9Sdanproc an_load_unicodedata_text {zName} {
10521b7d2a9Sdan  set fd [open $zName]
10621b7d2a9Sdan  set lField {
10721b7d2a9Sdan    code
10821b7d2a9Sdan    character_name
10921b7d2a9Sdan    general_category
11021b7d2a9Sdan    canonical_combining_classes
11121b7d2a9Sdan    bidirectional_category
11221b7d2a9Sdan    character_decomposition_mapping
11321b7d2a9Sdan    decimal_digit_value
11421b7d2a9Sdan    digit_value
11521b7d2a9Sdan    numeric_value
11621b7d2a9Sdan    mirrored
11721b7d2a9Sdan    unicode_1_name
11821b7d2a9Sdan    iso10646_comment_field
11921b7d2a9Sdan    uppercase_mapping
12021b7d2a9Sdan    lowercase_mapping
12121b7d2a9Sdan    titlecase_mapping
12221b7d2a9Sdan  }
12321b7d2a9Sdan  set lRet [list]
12421b7d2a9Sdan
12521b7d2a9Sdan  while { ![eof $fd] } {
12621b7d2a9Sdan    set line [gets $fd]
12721b7d2a9Sdan    if {$line == ""} continue
12821b7d2a9Sdan
12921b7d2a9Sdan    set fields [split $line ";"]
13021b7d2a9Sdan    if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
13121b7d2a9Sdan    foreach $lField $fields {}
13221b7d2a9Sdan
13321b7d2a9Sdan    set iCode [expr "0x$code"]
13421b7d2a9Sdan    set bAlnum [expr {
13521b7d2a9Sdan         [lsearch {L N} [string range $general_category 0 0]] >= 0
13621b7d2a9Sdan      || $general_category=="Co"
13721b7d2a9Sdan    }]
13821b7d2a9Sdan
13921b7d2a9Sdan    if { !$bAlnum } { lappend lRet $iCode }
14021b7d2a9Sdan  }
14121b7d2a9Sdan
14221b7d2a9Sdan  close $fd
14321b7d2a9Sdan  set lRet
14421b7d2a9Sdan}
14521b7d2a9Sdan
14621b7d2a9Sdanproc tl_load_casefolding_txt {zName} {
14721b7d2a9Sdan  global tl_lookup_table
14821b7d2a9Sdan
14921b7d2a9Sdan  set fd [open $zName]
15021b7d2a9Sdan  while { ![eof $fd] } {
15121b7d2a9Sdan    set line [gets $fd]
15221b7d2a9Sdan    if {[string range $line 0 0] == "#"} continue
15321b7d2a9Sdan    if {$line == ""} continue
15421b7d2a9Sdan
15521b7d2a9Sdan    foreach x {a b c d} {unset -nocomplain $x}
15621b7d2a9Sdan    foreach {a b c d} [split $line ";"] {}
15721b7d2a9Sdan
15821b7d2a9Sdan    set a2 [list]
15921b7d2a9Sdan    set c2 [list]
16021b7d2a9Sdan    foreach elem $a { lappend a2 [expr "0x[string trim $elem]"] }
16121b7d2a9Sdan    foreach elem $c { lappend c2 [expr "0x[string trim $elem]"] }
16221b7d2a9Sdan    set b [string trim $b]
16321b7d2a9Sdan    set d [string trim $d]
16421b7d2a9Sdan
16521b7d2a9Sdan    if {$b=="C" || $b=="S"} { set tl_lookup_table($a2) $c2 }
16621b7d2a9Sdan  }
16721b7d2a9Sdan}
16821b7d2a9Sdan
169b80bb6ceSdanproc cc_load_unicodedata_text {zName} {
170b80bb6ceSdan  set fd [open $zName]
171b80bb6ceSdan  set lField {
172b80bb6ceSdan    code
173b80bb6ceSdan    character_name
174b80bb6ceSdan    general_category
175b80bb6ceSdan    canonical_combining_classes
176b80bb6ceSdan    bidirectional_category
177b80bb6ceSdan    character_decomposition_mapping
178b80bb6ceSdan    decimal_digit_value
179b80bb6ceSdan    digit_value
180b80bb6ceSdan    numeric_value
181b80bb6ceSdan    mirrored
182b80bb6ceSdan    unicode_1_name
183b80bb6ceSdan    iso10646_comment_field
184b80bb6ceSdan    uppercase_mapping
185b80bb6ceSdan    lowercase_mapping
186b80bb6ceSdan    titlecase_mapping
187b80bb6ceSdan  }
188b80bb6ceSdan  set lRet [list]
189b80bb6ceSdan
190b80bb6ceSdan  while { ![eof $fd] } {
191b80bb6ceSdan    set line [gets $fd]
192b80bb6ceSdan    if {$line == ""} continue
193b80bb6ceSdan
194b80bb6ceSdan    set fields [split $line ";"]
195b80bb6ceSdan    if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
196b80bb6ceSdan    foreach $lField $fields {}
197b80bb6ceSdan
198b80bb6ceSdan    lappend lRet [list $code $general_category]
199b80bb6ceSdan  }
200b80bb6ceSdan
201b80bb6ceSdan  close $fd
202b80bb6ceSdan  set lRet
203b80bb6ceSdan}
204b80bb6ceSdan
20521b7d2a9Sdan
206