# 2012 May 25
#
# The author disclaims copyright to this source code.  In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#*************************************************************************
#
# The tests in this file focus on testing the "unicode" FTS tokenizer.
#

set testdir [file dirname $argv0]
source $testdir/tester.tcl
ifcapable !fts3_unicode { finish_test ; return }
set ::testprefix fts4unicode

# Tokenize $input using the unicode61 tokenizer with remove_diacritics=0
# and compare the result against $res, a flat list of
# {position folded-token original-token} triples.
proc do_unicode_token_test {tn input res} {
  set input [string map {' ''} $input]
  uplevel [list do_execsql_test $tn "
    SELECT fts3_tokenizer_test('unicode61', 'remove_diacritics=0', '$input');
  " [list [list {*}$res]]]
}

# As do_unicode_token_test, except that the tokenizer is invoked with its
# default options (i.e. diacritics are removed).
proc do_unicode_token_test2 {tn input res} {
  set input [string map {' ''} $input]
  uplevel [list do_execsql_test $tn "
    SELECT fts3_tokenizer_test('unicode61', '$input');
  " [list [list {*}$res]]]
}

# Tokenizer test with arbitrary tokenizer arguments. All elements of $args
# except the last are passed as arguments to the unicode61 tokenizer; the
# final element is the expected result list.
proc do_unicode_token_test3 {tn args} {
  set res [lindex $args end]
  set sql "SELECT fts3_tokenizer_test('unicode61'"
  foreach a [lrange $args 0 end-1] {
    append sql ", '"
    append sql [string map {' ''} $a]
    append sql "'"
  }
  append sql ")"
  uplevel [list do_execsql_test $tn $sql [list [list {*}$res]]]
}

do_unicode_token_test 1.0 {a B c D} {0 a a 1 b B 2 c c 3 d D}

# 0x00C4/0x00D6/0x00DC are A/O/U with diaeresis; with remove_diacritics=0
# the tokenizer case-folds to 0x00E4/0x00F6/0x00FC but keeps the diacritics.
# (The literals here were mojibake-damaged; rewritten as \u escapes.)
do_unicode_token_test 1.1 "\uC4 \uD6 \uDC" \
    "0 \uE4 \uC4 1 \uF6 \uD6 2 \uFC \uDC"
do_unicode_token_test 1.2 "x\uC4x x\uD6x x\uDCx" \
    "0 x\uE4x x\uC4x 1 x\uF6x x\uD6x 2 x\uFCx x\uDCx"

# 0x00DF is a small "sharp s". 0x1E9E is a capital sharp s.
do_unicode_token_test 1.3 "\uDF" "0 \uDF \uDF"
# The capital sharp s case-folds to the small sharp s (0x00DF).
# (The expected folded token here was mojibake-damaged; restored as \uDF.)
do_unicode_token_test 1.4 "\u1E9E" "0 \uDF \u1E9E"
do_unicode_token_test 1.5 "\u1E9E" "0 \uDF \u1E9E"

do_unicode_token_test 1.6 "The quick brown fox" {
  0 the The 1 quick quick 2 brown brown 3 fox fox
}
do_unicode_token_test 1.7 "The\u00bfquick\u224ebrown\u2263fox" {
  0 the The 1 quick quick 2 brown brown 3 fox fox
}

do_unicode_token_test2 1.8 {a B c D} {0 a a 1 b B 2 c c 3 d D}

# With default options the diacritics are stripped: A/O/U with diaeresis
# fold to plain a/o/u. (Literals restored from mojibake as \u escapes.)
do_unicode_token_test2 1.9 "\uC4 \uD6 \uDC" \
    "0 a \uC4 1 o \uD6 2 u \uDC"
do_unicode_token_test2 1.10 "x\uC4x x\uD6x x\uDCx" \
    "0 xax x\uC4x 1 xox x\uD6x 2 xux x\uDCx"

# Check that diacritics are removed if remove_diacritics=1 is specified.
# And that they do not break tokens.
# (Renumbered from a duplicate "1.10" so that test names stay unique.)
do_unicode_token_test2 1.11 "xx\u0301xx" "0 xxxx xx\u301xx"

#-------------------------------------------------------------------------
#
set docs [list {
  Enhance the INSERT syntax to allow multiple rows to be inserted via the
  VALUES clause.
} {
  Enhance the CREATE VIRTUAL TABLE command to support the IF NOT EXISTS clause.
} {
  Added the sqlite3_stricmp() interface as a counterpart to sqlite3_strnicmp().
} {
  Added the sqlite3_db_readonly() interface.
} {
  Added the SQLITE_FCNTL_PRAGMA file control, giving VFS implementations the
  ability to add new PRAGMA statements or to override built-in PRAGMAs.
} {
  Queries of the form: "SELECT max(x), y FROM table" returns the value of y on
  the same row that contains the maximum x value.
} {
  Added support for the FTS4 languageid option.
} {
  Documented support for the FTS4 content option. This feature has actually
  been in the code since version 3.7.9 but is only now considered to be
  officially supported.
} {
  Pending statements no longer block ROLLBACK. Instead, the pending statement
  will return SQLITE_ABORT upon next access after the ROLLBACK.
} {
  Improvements to the handling of CSV inputs in the command-line shell
} {
  Fix a bug introduced in version 3.7.10 that might cause a LEFT JOIN to be
  incorrectly converted into an INNER JOIN if the WHERE clause indexable terms
  connected by OR.
}]

set map(a) [list "\u00C4" "\u00E4"]   ;# LATIN LETTER A WITH DIAERESIS
set map(e) [list "\u00CB" "\u00EB"]   ;# LATIN LETTER E WITH DIAERESIS
set map(i) [list "\u00CF" "\u00EF"]   ;# LATIN LETTER I WITH DIAERESIS
set map(o) [list "\u00D6" "\u00F6"]   ;# LATIN LETTER O WITH DIAERESIS
set map(u) [list "\u00DC" "\u00FC"]   ;# LATIN LETTER U WITH DIAERESIS
set map(y) [list "\u0178" "\u00FF"]   ;# LATIN LETTER Y WITH DIAERESIS
set map(h) [list "\u1E26" "\u1E27"]   ;# LATIN LETTER H WITH DIAERESIS
set map(w) [list "\u1E84" "\u1E85"]   ;# LATIN LETTER W WITH DIAERESIS
set map(x) [list "\u1E8C" "\u1E8D"]   ;# LATIN LETTER X WITH DIAERESIS
foreach k [array names map] {
  lappend mappings [string toupper $k] [lindex $map($k) 0]
  lappend mappings $k [lindex $map($k) 1]
}

# Collapse runs of whitespace in $doc to single spaces, trim it, and
# replace selected ASCII letters with their diaeresis-equipped
# equivalents according to the $::mappings table built above.
proc mapdoc {doc} {
  set doc [regsub -all {[[:space:]]+} $doc " "]
  string map $::mappings [string trim $doc]
}

do_test 2.0 {
  execsql { CREATE VIRTUAL TABLE t2 USING fts4(tokenize=unicode61, x); }
  foreach doc $docs {
    set d [mapdoc $doc]
    execsql { INSERT INTO t2 VALUES($d) }
  }
} {}

do_test 2.1 {
  set q [mapdoc "row"]
  execsql { SELECT * FROM t2 WHERE t2 MATCH $q }
} [list [mapdoc {
  Queries of the form: "SELECT max(x), y FROM table" returns the value of y on
  the same row that contains the maximum x value.
}]]

foreach {tn query snippet} {
  2 "row" {
     ...returns the value of y on the same [row] that contains
     the maximum x value.
  }
  3 "ROW" {
     ...returns the value of y on the same [row] that contains
     the maximum x value.
  }
  4 "rollback" {
     ...[ROLLBACK]. Instead, the pending statement
     will return SQLITE_ABORT upon next access after the [ROLLBACK].
  }
  5 "rOllback" {
     ...[ROLLBACK]. Instead, the pending statement
     will return SQLITE_ABORT upon next access after the [ROLLBACK].
  }
  6 "lang*" {
     Added support for the FTS4 [languageid] option.
  }
} {
  do_test 2.$tn {
    set q [mapdoc $query]
    execsql { SELECT snippet(t2, '[', ']', '...') FROM t2 WHERE t2 MATCH $q }
  } [list [mapdoc $snippet]]
}

#-------------------------------------------------------------------------
# Make sure the unicode61 tokenizer does not crash if it is passed a
# NULL pointer.
reset_db
do_execsql_test 3.1 {
  CREATE VIRTUAL TABLE t1 USING fts4(tokenize=unicode61, x, y);
  INSERT INTO t1 VALUES(NULL, 'a b c');
}

do_execsql_test 3.2 {
  SELECT snippet(t1, '[', ']') FROM t1 WHERE t1 MATCH 'b'
} {{a [b] c}}

do_execsql_test 3.3 {
  BEGIN;
  DELETE FROM t1;
  INSERT INTO t1 VALUES('b b b b b b b b b b b', 'b b b b b b b b b b b b b');
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 VALUES('a b c', NULL);
  INSERT INTO t1 VALUES('a x c', NULL);
  COMMIT;
}

do_execsql_test 3.4 {
  SELECT * FROM t1 WHERE t1 MATCH 'a b';
} {{a b c} {}}

#-------------------------------------------------------------------------
# Feed the tokenizer ill-formed input: non-characters, unpaired surrogate
# codepoints, and over-long/invalid UTF-8 sequences. The tests pass if
# the INSERTs complete without error or crash.
reset_db

do_test 4.1 {
  set a "abc\uFFFEdef"
  set b "abc\uD800def"
  set c "\uFFFEdef"
  set d "\uD800def"
  execsql {
    CREATE VIRTUAL TABLE t1 USING fts4(tokenize=unicode61, x);
    INSERT INTO t1 VALUES($a);
    INSERT INTO t1 VALUES($b);
    INSERT INTO t1 VALUES($c);
    INSERT INTO t1 VALUES($d);
  }
} {}

do_test 4.2 {
  set a [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0x62}]
  set b [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0x62}]
  set c [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0x62}]
  set d [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0xBF 0x62}]
  execsql {
    INSERT INTO t1 VALUES($a);
    INSERT INTO t1 VALUES($b);
    INSERT INTO t1 VALUES($c);
    INSERT INTO t1 VALUES($d);
  }
} {}

do_test 4.3 {
  set a [binary format c* {0xF7 0xBF 0xBF 0xBF}]
  set b [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF}]
  set c [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF 0xBF}]
  set d [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0xBF}]
  execsql {
    INSERT INTO t1 VALUES($a);
    INSERT INTO t1 VALUES($b);
    INSERT INTO t1 VALUES($c);
    INSERT INTO t1 VALUES($d);
  }
} {}

#-------------------------------------------------------------------------
# Tests of the "tokenchars=" and "separators=" tokenizer options.

do_unicode_token_test3 5.1 {tokenchars=} {
  sqlite3_reset sqlite3_column_int
} {
  0 sqlite3 sqlite3
  1 reset reset
  2 sqlite3 sqlite3
  3 column column
  4 int int
}

do_unicode_token_test3 5.2 {tokenchars=_} {
  sqlite3_reset sqlite3_column_int
} {
  0 sqlite3_reset sqlite3_reset
  1 sqlite3_column_int sqlite3_column_int
}

do_unicode_token_test3 5.3 {separators=xyz} {
  Laotianxhorseyrunszfast
} {
  0 laotian Laotian
  1 horse horse
  2 runs runs
  3 fast fast
}

do_unicode_token_test3 5.4 {tokenchars=xyz} {
  Laotianxhorseyrunszfast
} {
  0 laotianxhorseyrunszfast Laotianxhorseyrunszfast
}

do_unicode_token_test3 5.5 {tokenchars=_} {separators=zyx} {
  sqlite3_resetxsqlite3_column_intyhonda_phantom
} {
  0 sqlite3_reset sqlite3_reset
  1 sqlite3_column_int sqlite3_column_int
  2 honda_phantom honda_phantom
}

do_unicode_token_test3 5.6 "separators=\u05D1" "abc\u05D1def" {
  0 abc abc 1 def def
}

do_unicode_token_test3 5.7 \
    "tokenchars=\u2444\u2445" \
    "separators=\u05D0\u05D1\u05D2" \
    "\u2444fre\u2445sh\u05D0water\u05D2fish.\u2445timer" \
    [list \
      0 \u2444fre\u2445sh \u2444fre\u2445sh \
      1 water water \
      2 fish fish \
      3 \u2445timer \u2445timer \
    ]

# Check that it is not possible to add a standalone diacritic codepoint
# to either separators or tokenchars.
do_unicode_token_test3 5.8 "separators=\u0301" \
    "hello\u0301world \u0301helloworld" \
    "0 helloworld hello\u0301world 1 helloworld helloworld"

do_unicode_token_test3 5.9 "tokenchars=\u0301" \
    "hello\u0301world \u0301helloworld" \
    "0 helloworld hello\u0301world 1 helloworld helloworld"

do_unicode_token_test3 5.10 "separators=\u0301" \
    "remove_diacritics=0" \
    "hello\u0301world \u0301helloworld" \
    "0 hello\u0301world hello\u0301world 1 helloworld helloworld"

do_unicode_token_test3 5.11 "tokenchars=\u0301" \
    "remove_diacritics=0" \
    "hello\u0301world \u0301helloworld" \
    "0 hello\u0301world hello\u0301world 1 helloworld helloworld"


#-------------------------------------------------------------------------

# Tokenize $txt using $tokenizer and return a list containing the text of
# each token only (the middle element of each {position token original}
# triple returned by fts3_tokenizer_test).
proc do_tokenize {tokenizer txt} {
  set res [list]
  foreach {a b c} [db one {SELECT fts3_tokenizer_test($tokenizer, $txt)}] {
    lappend res $b
  }
  set res
}

# Argument $lCp must be a list of codepoints (integers) that
# correspond to whitespace characters. This command creates a string
# $W from the codepoints, then tokenizes "${W}hello${W}world${W}"
# using tokenizer $tokenizer. The test passes if the tokenizer successfully
# extracts the two 5 character tokens.
#
proc do_isspace_test {tn tokenizer lCp} {
  # Build a whitespace string containing one character per codepoint in $lCp.
  set whitespace [format [string repeat %c [llength $lCp]] {*}$lCp]
  set txt "${whitespace}hello${whitespace}world${whitespace}"
  uplevel [list do_test $tn [list do_tokenize $tokenizer $txt] {hello world}]
}

set tokenizers [list unicode61]
ifcapable icu { lappend tokenizers icu }

# Some tests to check that the tokenizers can both identify white-space
# codepoints. All codepoints tested below are of type "Zs" in the
# UnicodeData.txt file.
foreach T $tokenizers {
  do_isspace_test 6.$T.1  $T 32
  do_isspace_test 6.$T.2  $T 160
  do_isspace_test 6.$T.3  $T 5760
  do_isspace_test 6.$T.4  $T 6158
  do_isspace_test 6.$T.5  $T 8192
  do_isspace_test 6.$T.6  $T 8193
  do_isspace_test 6.$T.7  $T 8194
  do_isspace_test 6.$T.8  $T 8195
  do_isspace_test 6.$T.9  $T 8196
  do_isspace_test 6.$T.10 $T 8197
  do_isspace_test 6.$T.11 $T 8198
  do_isspace_test 6.$T.12 $T 8199
  do_isspace_test 6.$T.13 $T 8200
  do_isspace_test 6.$T.14 $T 8201
  do_isspace_test 6.$T.15 $T 8202
  do_isspace_test 6.$T.16 $T 8239
  do_isspace_test 6.$T.17 $T 8287
  do_isspace_test 6.$T.18 $T 12288

  # Renumbered 6.$T.19..6.$T.23: the five multi-codepoint tests below all
  # reused the name "6.$T.19", which made any failure report ambiguous.
  do_isspace_test 6.$T.19 $T {32 160 5760 6158}
  do_isspace_test 6.$T.20 $T {8192 8193 8194 8195}
  do_isspace_test 6.$T.21 $T {8196 8197 8198 8199}
  do_isspace_test 6.$T.22 $T {8200 8201 8202 8239}
  do_isspace_test 6.$T.23 $T {8287 12288}
}


finish_test