# 2012 May 25
#
# The author disclaims copyright to this source code.  In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#*************************************************************************
#
# The tests in this file focus on testing the "unicode" FTS tokenizer.
#

set testdir [file dirname $argv0]
source $testdir/tester.tcl
ifcapable !fts3_unicode { finish_test ; return }
set ::testprefix fts4unicode

proc do_unicode_token_test {tn input res} {
  set input [string map {' ''} $input]
  uplevel [list do_execsql_test $tn "
    SELECT fts3_tokenizer_test('unicode61', 'remove_diacritics=0', '$input');
  " [list [list {*}$res]]]
}

proc do_unicode_token_test2 {tn input res} {
  set input [string map {' ''} $input]
  uplevel [list do_execsql_test $tn "
    SELECT fts3_tokenizer_test('unicode61', '$input');
  " [list [list {*}$res]]]
}

proc do_unicode_token_test3 {tn args} {
  set res [lindex $args end]
  set sql "SELECT fts3_tokenizer_test('unicode61'"
  foreach a [lrange $args 0 end-1] {
    append sql ", '"
    append sql [string map {' ''} $a]
    append sql "'"
  }
  append sql ")"
  uplevel [list do_execsql_test $tn $sql [list [list {*}$res]]]
}

do_unicode_token_test 1.0 {a B c D} {0 a a 1 b B 2 c c 3 d D}

do_unicode_token_test 1.1 "\uC4 \uD6 \uDC" \
    "0 \uE4 \uC4 1 \uF6 \uD6 2 \uFC \uDC"

do_unicode_token_test 1.2 "x\uC4x x\uD6x x\uDCx" \
    "0 x\uE4x x\uC4x 1 x\uF6x x\uD6x 2 x\uFCx x\uDCx"

# 0x00DF is a small "sharp s". 0x1E9E is a capital sharp s.
do_unicode_token_test 1.3 "\uDF" "0 \uDF \uDF"
do_unicode_token_test 1.4 "\u1E9E" "0 \uDF \u1E9E"

do_unicode_token_test 1.5 "The quick brown fox" {
  0 the The 1 quick quick 2 brown brown 3 fox fox
}
do_unicode_token_test 1.6 "The\u00bfquick\u224ebrown\u2263fox" {
  0 the The 1 quick quick 2 brown brown 3 fox fox
}

do_unicode_token_test2 1.7 {a B c D} {0 a a 1 b B 2 c c 3 d D}
do_unicode_token_test2 1.8 "\uC4 \uD6 \uDC" "0 a \uC4 1 o \uD6 2 u \uDC"

do_unicode_token_test2 1.9 "x\uC4x x\uD6x x\uDCx" \
    "0 xax x\uC4x 1 xox x\uD6x 2 xux x\uDCx"
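
# The helper procs above drive the fts3_tokenizer_test() test function,
# which reports each token as a (position, folded-token, original-text)
# triple. The following is a minimal illustrative sketch (test number
# 1.9.1 is an addition, not part of the original numbering): with the
# default options, unicode61 both case-folds and strips diacritics, so a
# mixed-case accented word collapses to plain ASCII.
do_unicode_token_test2 1.9.1 "\uC4hnlich" "0 ahnlich \uC4hnlich"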

# Check that diacritics are removed if remove_diacritics=1 is specified.
# And that they do not break tokens.
do_unicode_token_test2 1.10 "xx\u0301xx" "0 xxxx xx\u301xx"

# Title-case mappings work
do_unicode_token_test 1.11 "\u01c5" "0 \u01c6 \u01c5"

#-------------------------------------------------------------------------
#
set docs [list {
  Enhance the INSERT syntax to allow multiple rows to be inserted via the
  VALUES clause.
} {
  Enhance the CREATE VIRTUAL TABLE command to support the IF NOT EXISTS clause.
} {
  Added the sqlite3_stricmp() interface as a counterpart to sqlite3_strnicmp().
} {
  Added the sqlite3_db_readonly() interface.
} {
  Added the SQLITE_FCNTL_PRAGMA file control, giving VFS implementations the
  ability to add new PRAGMA statements or to override built-in PRAGMAs.
} {
  Queries of the form: "SELECT max(x), y FROM table" returns the value of y on
  the same row that contains the maximum x value.
} {
  Added support for the FTS4 languageid option.
} {
  Documented support for the FTS4 content option. This feature has actually
  been in the code since version 3.7.9 but is only now considered to be
  officially supported.
} {
  Pending statements no longer block ROLLBACK. Instead, the pending statement
  will return SQLITE_ABORT upon next access after the ROLLBACK.
} {
  Improvements to the handling of CSV inputs in the command-line shell
} {
  Fix a bug introduced in version 3.7.10 that might cause a LEFT JOIN to be
  incorrectly converted into an INNER JOIN if the WHERE clause indexable terms
  connected by OR.
}]

set map(a) [list "\u00C4" "\u00E4"]  ; # LATIN LETTER A WITH DIAERESIS
set map(e) [list "\u00CB" "\u00EB"]  ; # LATIN LETTER E WITH DIAERESIS
set map(i) [list "\u00CF" "\u00EF"]  ; # LATIN LETTER I WITH DIAERESIS
set map(o) [list "\u00D6" "\u00F6"]  ; # LATIN LETTER O WITH DIAERESIS
set map(u) [list "\u00DC" "\u00FC"]  ; # LATIN LETTER U WITH DIAERESIS
set map(y) [list "\u0178" "\u00FF"]  ; # LATIN LETTER Y WITH DIAERESIS
set map(h) [list "\u1E26" "\u1E27"]  ; # LATIN LETTER H WITH DIAERESIS
set map(w) [list "\u1E84" "\u1E85"]  ; # LATIN LETTER W WITH DIAERESIS
set map(x) [list "\u1E8C" "\u1E8D"]  ; # LATIN LETTER X WITH DIAERESIS
foreach k [array names map] {
  lappend mappings [string toupper $k] [lindex $map($k) 0]
  lappend mappings $k [lindex $map($k) 1]
}
proc mapdoc {doc} {
  set doc [regsub -all {[[:space:]]+} $doc " "]
  string map $::mappings [string trim $doc]
}

do_test 2.0 {
  execsql { CREATE VIRTUAL TABLE t2 USING fts4(tokenize=unicode61, x); }
  foreach doc $docs {
    set d [mapdoc $doc]
    execsql { INSERT INTO t2 VALUES($d) }
  }
} {}

do_test 2.1 {
  set q [mapdoc "row"]
  execsql { SELECT * FROM t2 WHERE t2 MATCH $q }
} [list [mapdoc {
  Queries of the form: "SELECT max(x), y FROM table" returns the value of y on
  the same row that contains the maximum x value.
}]]

foreach {tn query snippet} {
  2 "row" {
    ...returns the value of y on the same [row] that contains
    the maximum x value.
  }
  3 "ROW" {
    ...returns the value of y on the same [row] that contains
    the maximum x value.
  }
  4 "rollback" {
    ...[ROLLBACK]. Instead, the pending statement
    will return SQLITE_ABORT upon next access after the [ROLLBACK].
  }
  5 "rOllback" {
    ...[ROLLBACK]. Instead, the pending statement
    will return SQLITE_ABORT upon next access after the [ROLLBACK].
  }
  6 "lang*" {
    Added support for the FTS4 [languageid] option.
  }
} {
  do_test 2.$tn {
    set q [mapdoc $query]
    execsql { SELECT snippet(t2, '[', ']', '...') FROM t2 WHERE t2 MATCH $q }
  } [list [mapdoc $snippet]]
}
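
# An illustrative sketch (test number 2.7 is an addition): unicode61
# strips diacritics from both the indexed documents and the query terms,
# so the plain ASCII query 'row' is expected to match the same document
# as the accented query used in test 2.1 above.
do_execsql_test 2.7 {
  SELECT count(*) FROM t2 WHERE t2 MATCH 'row';
} {1}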

#-------------------------------------------------------------------------
# Make sure the unicode61 tokenizer does not crash if it is passed a
# NULL pointer.
reset_db
do_execsql_test 3.1 {
  CREATE VIRTUAL TABLE t1 USING fts4(tokenize=unicode61, x, y);
  INSERT INTO t1 VALUES(NULL, 'a b c');
}

do_execsql_test 3.2 {
  SELECT snippet(t1, '[', ']') FROM t1 WHERE t1 MATCH 'b'
} {{a [b] c}}

do_execsql_test 3.3 {
  BEGIN;
  DELETE FROM t1;
  INSERT INTO t1 VALUES('b b b b b b b b b b b', 'b b b b b b b b b b b b b');
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 VALUES('a b c', NULL);
  INSERT INTO t1 VALUES('a x c', NULL);
  COMMIT;
}

do_execsql_test 3.4 {
  SELECT * FROM t1 WHERE t1 MATCH 'a b';
} {{a b c} {}}

#-------------------------------------------------------------------------
#
reset_db

do_test 4.1 {
  set a "abc\uFFFEdef"
  set b "abc\uD800def"
  set c "\uFFFEdef"
  set d "\uD800def"
  execsql {
    CREATE VIRTUAL TABLE t1 USING fts4(tokenize=unicode61, x);
    INSERT INTO t1 VALUES($a);
    INSERT INTO t1 VALUES($b);
    INSERT INTO t1 VALUES($c);
    INSERT INTO t1 VALUES($d);
  }
} {}

do_test 4.2 {
  set a [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0x62}]
  set b [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0x62}]
  set c [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0x62}]
  set d [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0xBF 0x62}]
  execsql {
    INSERT INTO t1 VALUES($a);
    INSERT INTO t1 VALUES($b);
    INSERT INTO t1 VALUES($c);
    INSERT INTO t1 VALUES($d);
  }
} {}

do_test 4.3 {
  set a [binary format c* {0xF7 0xBF 0xBF 0xBF}]
  set b [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF}]
  set c [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF 0xBF}]
  set d [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0xBF}]
  execsql {
    INSERT INTO t1 VALUES($a);
    INSERT INTO t1 VALUES($b);
    INSERT INTO t1 VALUES($c);
    INSERT INTO t1 VALUES($d);
  }
} {}
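
# A minimal illustrative sketch (test number 4.4 is an addition): the rows
# above contain noncharacters, unpaired surrogates and malformed UTF-8,
# but the table should still hold all twelve rows and remain scannable
# after the tokenizer has processed them.
do_test 4.4 {
  execsql { SELECT count(*) FROM t1 }
} {12}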

#-------------------------------------------------------------------------

do_unicode_token_test3 5.1 {tokenchars=} {
  sqlite3_reset sqlite3_column_int
} {
  0 sqlite3 sqlite3
  1 reset reset
  2 sqlite3 sqlite3
  3 column column
  4 int int
}

do_unicode_token_test3 5.2 {tokenchars=_} {
  sqlite3_reset sqlite3_column_int
} {
  0 sqlite3_reset sqlite3_reset
  1 sqlite3_column_int sqlite3_column_int
}

do_unicode_token_test3 5.3 {separators=xyz} {
  Laotianxhorseyrunszfast
} {
  0 laotian Laotian
  1 horse horse
  2 runs runs
  3 fast fast
}

do_unicode_token_test3 5.4 {tokenchars=xyz} {
  Laotianxhorseyrunszfast
} {
  0 laotianxhorseyrunszfast Laotianxhorseyrunszfast
}

do_unicode_token_test3 5.5 {tokenchars=_} {separators=zyx} {
  sqlite3_resetxsqlite3_column_intyhonda_phantom
} {
  0 sqlite3_reset sqlite3_reset
  1 sqlite3_column_int sqlite3_column_int
  2 honda_phantom honda_phantom
}

do_unicode_token_test3 5.6 "separators=\u05D1" "abc\u05D1def" {
  0 abc abc 1 def def
}

do_unicode_token_test3 5.7 \
  "tokenchars=\u2444\u2445" \
  "separators=\u05D0\u05D1\u05D2" \
  "\u2444fre\u2445sh\u05D0water\u05D2fish.\u2445timer" \
  [list \
    0 \u2444fre\u2445sh \u2444fre\u2445sh \
    1 water water \
    2 fish fish \
    3 \u2445timer \u2445timer \
  ]
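
# An illustrative sketch (test number 5.7.1 is an addition): declaring "@"
# and "." as token characters keeps e-mail style strings together, so the
# input below is expected to split only on the space.
do_unicode_token_test3 5.7.1 {tokenchars=@.} {
  info@sqlite.org www.sqlite.org
} {
  0 info@sqlite.org info@sqlite.org
  1 www.sqlite.org www.sqlite.org
}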

# Check that it is not possible to add a standalone diacritic codepoint
# to either separators or tokenchars.
do_unicode_token_test3 5.8 "separators=\u0301" \
  "hello\u0301world \u0301helloworld" \
  "0 helloworld hello\u0301world 1 helloworld helloworld"

do_unicode_token_test3 5.9 "tokenchars=\u0301" \
  "hello\u0301world \u0301helloworld" \
  "0 helloworld hello\u0301world 1 helloworld helloworld"

do_unicode_token_test3 5.10 "separators=\u0301" \
  "remove_diacritics=0" \
  "hello\u0301world \u0301helloworld" \
  "0 hello\u0301world hello\u0301world 1 helloworld helloworld"

do_unicode_token_test3 5.11 "tokenchars=\u0301" \
  "remove_diacritics=0" \
  "hello\u0301world \u0301helloworld" \
  "0 hello\u0301world hello\u0301world 1 helloworld helloworld"

#-------------------------------------------------------------------------

proc do_tokenize {tokenizer txt} {
  set res [list]
  foreach {a b c} [db one {SELECT fts3_tokenizer_test($tokenizer, $txt)}] {
    lappend res $b
  }
  set res
}

# Argument $lCp must be a list of codepoints (integers) that correspond
# to whitespace characters. This command creates a string $W from the
# codepoints, then tokenizes "${W}hello${W}world${W}" using tokenizer
# $tokenizer. The test passes if the tokenizer successfully extracts the
# two 5-character tokens.
#
proc do_isspace_test {tn tokenizer lCp} {
  set whitespace [format [string repeat %c [llength $lCp]] {*}$lCp]
  set txt "${whitespace}hello${whitespace}world${whitespace}"
  uplevel [list do_test $tn [list do_tokenize $tokenizer $txt] {hello world}]
}

set tokenizers [list unicode61]
ifcapable icu { lappend tokenizers icu }
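
# A minimal illustrative sketch of the do_tokenize helper above (test
# number 6.0 is an addition): it returns only the case-folded tokens,
# discarding the position and original-text fields.
do_test 6.0 {
  do_tokenize unicode61 "One two THREE"
} {one two three}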

# Some tests to check that both tokenizers can identify white-space
# codepoints. All codepoints tested below are of type "Zs" in the
# UnicodeData.txt file.
#
# Note that codepoint 6158 has changed from Zs to Cf in recent versions
# of UnicodeData.txt. So take that into account for the "icu" tests.
#
foreach T $tokenizers {
  do_isspace_test 6.$T.1  $T 32
  do_isspace_test 6.$T.2  $T 160
  do_isspace_test 6.$T.3  $T 5760
  if {$T!="icu"} {
    do_isspace_test 6.$T.4  $T 6158
  }
  do_isspace_test 6.$T.5  $T 8192
  do_isspace_test 6.$T.6  $T 8193
  do_isspace_test 6.$T.7  $T 8194
  do_isspace_test 6.$T.8  $T 8195
  do_isspace_test 6.$T.9  $T 8196
  do_isspace_test 6.$T.10 $T 8197
  do_isspace_test 6.$T.11 $T 8198
  do_isspace_test 6.$T.12 $T 8199
  do_isspace_test 6.$T.13 $T 8200
  do_isspace_test 6.$T.14 $T 8201
  do_isspace_test 6.$T.15 $T 8202
  if {$T!="icu"} {
    do_isspace_test 6.$T.16 $T 8239
  }
  do_isspace_test 6.$T.17 $T 8287
  do_isspace_test 6.$T.18 $T 12288

  if {$T!="icu"} {
    do_isspace_test 6.$T.19 $T {32 160 5760 6158}
  } else {
    do_isspace_test 6.$T.19 $T {32 160 5760 8192}
  }
  do_isspace_test 6.$T.20 $T {8192 8193 8194 8195}
  do_isspace_test 6.$T.21 $T {8196 8197 8198 8199}
  do_isspace_test 6.$T.22 $T {8200 8201 8202 8239}
  do_isspace_test 6.$T.23 $T {8287 12288}
}

#-------------------------------------------------------------------------
# Test that the private use ranges are treated as alphanumeric.
#
foreach {tn1 c} {
  1 \ue000 2 \ue001 3 \uf000 4 \uf8fe 5 \uf8ff
} {
  foreach {tn2 config res} {
    1 ""             "0 hello*world hello*world"
    2 "separators=*" "0 hello hello 1 world world"
  } {
    set config [string map [list * $c] $config]
    set input  [string map [list * $c] "hello*world"]
    set output [string map [list * $c] $res]
    do_unicode_token_test3 7.$tn1.$tn2 {*}$config $input $output
  }
}
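
# An illustrative sketch of a single case from the loop above (test number
# 7.0 is an addition): with no extra options, the private use codepoint
# U+E000 is treated as alphanumeric and so does not split the token.
do_unicode_token_test3 7.0 "hello\ue000world" \
    "0 hello\ue000world hello\ue000world"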

#-------------------------------------------------------------------------
# Cursory test of remove_diacritics=0.
#
#   00C4;LATIN CAPITAL LETTER A WITH DIAERESIS
#   00D6;LATIN CAPITAL LETTER O WITH DIAERESIS
#   00E4;LATIN SMALL LETTER A WITH DIAERESIS
#   00F6;LATIN SMALL LETTER O WITH DIAERESIS
#
do_execsql_test 8.1.1 "
  CREATE VIRTUAL TABLE t3 USING fts4(tokenize=unicode61 'remove_diacritics=1');
  INSERT INTO t3 VALUES('o');
  INSERT INTO t3 VALUES('a');
  INSERT INTO t3 VALUES('O');
  INSERT INTO t3 VALUES('A');
  INSERT INTO t3 VALUES('\xD6');
  INSERT INTO t3 VALUES('\xC4');
  INSERT INTO t3 VALUES('\xF6');
  INSERT INTO t3 VALUES('\xE4');
"
do_execsql_test 8.1.2 {
  SELECT rowid FROM t3 WHERE t3 MATCH 'o';
} {1 3 5 7}
do_execsql_test 8.1.3 {
  SELECT rowid FROM t3 WHERE t3 MATCH 'a';
} {2 4 6 8}
do_execsql_test 8.2.1 {
  CREATE VIRTUAL TABLE t4 USING fts4(tokenize=unicode61 "remove_diacritics=0");
  INSERT INTO t4 SELECT * FROM t3;
}
do_execsql_test 8.2.2 {
  SELECT rowid FROM t4 WHERE t4 MATCH 'o';
} {1 3}
do_execsql_test 8.2.3 {
  SELECT rowid FROM t4 WHERE t4 MATCH 'a';
} {2 4}
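
# An illustrative sketch (test number 8.2.4 is an addition): because t4
# keeps diacritics, '\xF6' and 'o' are distinct index terms, so querying
# the accented form is expected to return only the rows containing
# '\xD6' and '\xF6'.
do_execsql_test 8.2.4 "
  SELECT rowid FROM t4 WHERE t4 MATCH '\xF6';
" {5 7}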

#-------------------------------------------------------------------------
#
foreach {tn sql} {
  1 {
    CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 [tokenchars= .]);
    CREATE VIRTUAL TABLE t6 USING fts4(
        tokenize=unicode61 [tokenchars=="] "tokenchars=[]");
    CREATE VIRTUAL TABLE t7 USING fts4(tokenize=unicode61 [separators=x\xC4]);
  }
  2 {
    CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 "tokenchars= .");
    CREATE VIRTUAL TABLE t6 USING fts4(tokenize=unicode61 "tokenchars=[=""]");
    CREATE VIRTUAL TABLE t7 USING fts4(tokenize=unicode61 "separators=x\xC4");
  }
  3 {
    CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 'tokenchars= .');
    CREATE VIRTUAL TABLE t6 USING fts4(tokenize=unicode61 'tokenchars=="[]');
    CREATE VIRTUAL TABLE t7 USING fts4(tokenize=unicode61 'separators=x\xC4');
  }
  4 {
    CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 `tokenchars= .`);
    CREATE VIRTUAL TABLE t6 USING fts4(tokenize=unicode61 `tokenchars=[="]`);
    CREATE VIRTUAL TABLE t7 USING fts4(tokenize=unicode61 `separators=x\xC4`);
  }
} {
  do_execsql_test 9.$tn.0 {
    DROP TABLE IF EXISTS t5;
    DROP TABLE IF EXISTS t5aux;
    DROP TABLE IF EXISTS t6;
    DROP TABLE IF EXISTS t6aux;
    DROP TABLE IF EXISTS t7;
    DROP TABLE IF EXISTS t7aux;
  }
  do_execsql_test 9.$tn.1 $sql

  do_execsql_test 9.$tn.2 {
    CREATE VIRTUAL TABLE t5aux USING fts4aux(t5);
    INSERT INTO t5 VALUES('one two three/four.five.six');
    SELECT * FROM t5aux;
  } {
    four.five.six   * 1 1 four.five.six   0 1 1
    {one two three} * 1 1 {one two three} 0 1 1
  }

  do_execsql_test 9.$tn.3 {
    CREATE VIRTUAL TABLE t6aux USING fts4aux(t6);
    INSERT INTO t6 VALUES('alpha=beta"gamma/delta[epsilon]zeta');
    SELECT * FROM t6aux;
  } {
    {alpha=beta"gamma}   * 1 1 {alpha=beta"gamma}   0 1 1
    {delta[epsilon]zeta} * 1 1 {delta[epsilon]zeta} 0 1 1
  }

  do_execsql_test 9.$tn.4 {
    CREATE VIRTUAL TABLE t7aux USING fts4aux(t7);
    INSERT INTO t7 VALUES('alephxbeth\xC4gimel');
    SELECT * FROM t7aux;
  } {
    aleph * 1 1 aleph 0 1 1
    beth  * 1 1 beth  0 1 1
    gimel * 1 1 gimel 0 1 1
  }
}

# Check that multiple options are handled correctly.
#
do_execsql_test 10.1 {
  DROP TABLE IF EXISTS t1;
  CREATE VIRTUAL TABLE t1 USING fts4(tokenize=unicode61
    "tokenchars=xyz" "tokenchars=.=" "separators=.=" "separators=xy"
    "separators=a" "separators=a" "tokenchars=a" "tokenchars=a"
  );

  INSERT INTO t1 VALUES('oneatwoxthreeyfour');
  INSERT INTO t1 VALUES('a.single=word');
  CREATE VIRTUAL TABLE t1aux USING fts4aux(t1);
  SELECT * FROM t1aux;
} {
  .single=word * 1 1 .single=word 0 1 1
  four         * 1 1 four         0 1 1
  one          * 1 1 one          0 1 1
  three        * 1 1 three        0 1 1
  two          * 1 1 two          0 1 1
}

# Test that case folding happens after tokenization, not before.
#
do_execsql_test 10.2 {
  DROP TABLE IF EXISTS t2;
  CREATE VIRTUAL TABLE t2 USING fts4(tokenize=unicode61 "separators=aB");
  INSERT INTO t2 VALUES('oneatwoBthree');
  INSERT INTO t2 VALUES('onebtwoAthree');
  CREATE VIRTUAL TABLE t2aux USING fts4aux(t2);
  SELECT * FROM t2aux;
} {
  one           * 1 1 one           0 1 1
  onebtwoathree * 1 1 onebtwoathree 0 1 1
  three         * 1 1 three         0 1 1
  two           * 1 1 two           0 1 1
}

# Test that the tokenchars and separators options work with the
# fts3tokenize table.
#
do_execsql_test 11.1 {
  CREATE VIRTUAL TABLE ft1 USING fts3tokenize(
    "unicode61", "tokenchars=@.", "separators=1234567890"
  );
  SELECT token FROM ft1 WHERE input = 'berlin@street123sydney.road';
} {
  berlin@street sydney.road
}
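
# An illustrative sketch (test number 11.2 is an addition; it assumes the
# documented fts3tokenize schema of input, token, start, end and position
# columns): the tokenizer table also reports the position of each token
# within its input.
do_execsql_test 11.2 {
  SELECT token, position FROM ft1 WHERE input = 'one two three';
} {one 0 two 1 three 2}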

# Test for embedded nul characters in the fts4 unicode index.
#
do_execsql_test 12.0 {
  CREATE VIRTUAL TABLE t12 USING fts4(tokenize=unicode61);
  INSERT INTO t12 VALUES('abc' || char(0) || 'def');
  SELECT hex(CAST(content AS blob)) FROM t12;
} {61626300646566}
do_execsql_test 12.1 {
  INSERT INTO t12(t12) VALUES('integrity-check');
} {}
do_execsql_test 12.2 {
  CREATE VIRTUAL TABLE t12aux USING fts4aux(t12);
  SELECT * FROM t12aux;
} {abc * 1 1 abc 0 1 1}
do_execsql_test 12.3 {
  SELECT hex(CAST(content AS blob)) FROM t12 WHERE t12 MATCH 'abc'
} {61626300646566}

finish_test