# xref: /sqlite-3.40.0/test/fts4unicode.test (revision 31afee93)
# 2012 May 25
#
# The author disclaims copyright to this source code.  In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#*************************************************************************
#
# The tests in this file focus on testing the "unicode" FTS tokenizer.
#

# Load the test harness (tester.tcl) from the directory holding this script.
set testdir [file dirname $argv0]
source $testdir/tester.tcl
# Nothing in this file can run unless the fts3_unicode capability is present.
ifcapable !fts3_unicode { finish_test ; return }
set ::testprefix fts4unicode

# do_unicode_token_test TN INPUT RES
#
# Run fts3_tokenizer_test() with the 'unicode61' tokenizer and the option
# 'remove_diacritics=0' over the text INPUT, as test case TN.
#
#   TN     - test case name handed to do_execsql_test.
#   INPUT  - text to tokenize; single quotes are doubled so that it can be
#            embedded in an SQL string literal.
#   RES    - expected token list. It is normalized with [list {*}$res] and
#            wrapped so that it is compared as one result value.
#
# The do_execsql_test command is evaluated in the caller's scope via uplevel.
#
proc do_unicode_token_test {tn input res} {
  set input [string map {' ''} $input]
  uplevel [list do_execsql_test $tn "
    SELECT fts3_tokenizer_test('unicode61', 'remove_diacritics=0', '$input');
  " [list [list {*}$res]]]
}

# do_unicode_token_test2 TN INPUT RES
#
# Same as do_unicode_token_test, except that no tokenizer option is passed:
# fts3_tokenizer_test() is invoked with just 'unicode61' and the text INPUT.
#
#   TN     - test case name handed to do_execsql_test.
#   INPUT  - text to tokenize; single quotes are doubled for SQL.
#   RES    - expected token list, compared as one result value.
#
proc do_unicode_token_test2 {tn input res} {
  set input [string map {' ''} $input]
  uplevel [list do_execsql_test $tn "
    SELECT fts3_tokenizer_test('unicode61', '$input');
  " [list [list {*}$res]]]
}

# do_unicode_token_test3 TN ?ARG ...? RES
#
# General form of the two helpers above. The final argument is the expected
# token list; every argument before it is passed, in order, as an SQL string
# literal to fts3_tokenizer_test() after the tokenizer name 'unicode61'.
# Single quotes inside the arguments are doubled for SQL.
#
proc do_unicode_token_test3 {tn args} {
  set res   [lindex $args end]
  set sql "SELECT fts3_tokenizer_test('unicode61'"
  foreach a [lrange $args 0 end-1] {
    append sql ", '"
    append sql [string map {' ''} $a]
    append sql "'"
  }
  append sql ")"
  uplevel [list do_execsql_test $tn $sql [list [list {*}$res]]]
}

# Basic case folding, with remove_diacritics=0 (do_unicode_token_test).
do_unicode_token_test 1.0 {a B c D} {0 a a 1 b B 2 c c 3 d D}

do_unicode_token_test 1.1 "\uC4 \uD6 \uDC" \
    "0 \uE4 \uC4 1 \uF6 \uD6 2 \uFC \uDC"

do_unicode_token_test 1.2 "x\uC4x x\uD6x x\uDCx" \
    "0 x\uE4x x\uC4x 1 x\uF6x x\uD6x 2 x\uFCx x\uDCx"

# 0x00DF is a small "sharp s". 0x1E9E is a capital sharp s.
do_unicode_token_test 1.3 "\uDF" "0 \uDF \uDF"
do_unicode_token_test 1.4 "\u1E9E" "0 \uDF \u1E9E"

do_unicode_token_test 1.5 "The quick brown fox" {
  0 the The 1 quick quick 2 brown brown 3 fox fox
}
do_unicode_token_test 1.6 "The\u00bfquick\u224ebrown\u2263fox" {
  0 the The 1 quick quick 2 brown brown 3 fox fox
}

# The same inputs without any tokenizer option (do_unicode_token_test2).
do_unicode_token_test2 1.7  {a B c D} {0 a a 1 b B 2 c c 3 d D}
do_unicode_token_test2 1.8  "\uC4 \uD6 \uDC" "0 a \uC4 1 o \uD6 2 u \uDC"

do_unicode_token_test2 1.9  "x\uC4x x\uD6x x\uDCx" \
    "0 xax x\uC4x 1 xox x\uD6x 2 xux x\uDCx"

# Check that diacritics are removed when the remove_diacritics option is
# left at its default, and that they do not break tokens.
do_unicode_token_test2 1.10 "xx\u0301xx" "0 xxxx xx\u301xx"

# Title-case mappings work
do_unicode_token_test 1.11 "\u01c5" "0 \u01c6 \u01c5"

#-------------------------------------------------------------------------
# Fixture data for the 2.* tests: a list of documents, plus a table that
# maps some ASCII letters to their "WITH DIAERESIS" counterparts.
#
set docs [list {
  Enhance the INSERT syntax to allow multiple rows to be inserted via the
  VALUES clause.
} {
  Enhance the CREATE VIRTUAL TABLE command to support the IF NOT EXISTS clause.
} {
  Added the sqlite3_stricmp() interface as a counterpart to sqlite3_strnicmp().
} {
  Added the sqlite3_db_readonly() interface.
} {
  Added the SQLITE_FCNTL_PRAGMA file control, giving VFS implementations the
  ability to add new PRAGMA statements or to override built-in PRAGMAs.
} {
  Queries of the form: "SELECT max(x), y FROM table" returns the value of y on
  the same row that contains the maximum x value.
} {
  Added support for the FTS4 languageid option.
} {
  Documented support for the FTS4 content option. This feature has actually
  been in the code since version 3.7.9 but is only now considered to be
  officially supported.
} {
  Pending statements no longer block ROLLBACK. Instead, the pending statement
  will return SQLITE_ABORT upon next access after the ROLLBACK.
} {
  Improvements to the handling of CSV inputs in the command-line shell
} {
  Fix a bug introduced in version 3.7.10 that might cause a LEFT JOIN to be
  incorrectly converted into an INNER JOIN if the WHERE clause indexable terms
  connected by OR.
}]

# map(<letter>) is {UPPERCASE-WITH-DIAERESIS lowercase-with-diaeresis}.
set map(a) [list "\u00C4" "\u00E4"]  ; # LATIN LETTER A WITH DIAERESIS
set map(e) [list "\u00CB" "\u00EB"]  ; # LATIN LETTER E WITH DIAERESIS
set map(i) [list "\u00CF" "\u00EF"]  ; # LATIN LETTER I WITH DIAERESIS
set map(o) [list "\u00D6" "\u00F6"]  ; # LATIN LETTER O WITH DIAERESIS
set map(u) [list "\u00DC" "\u00FC"]  ; # LATIN LETTER U WITH DIAERESIS
set map(y) [list "\u0178" "\u00FF"]  ; # LATIN LETTER Y WITH DIAERESIS
set map(h) [list "\u1E26" "\u1E27"]  ; # LATIN LETTER H WITH DIAERESIS
set map(w) [list "\u1E84" "\u1E85"]  ; # LATIN LETTER W WITH DIAERESIS
set map(x) [list "\u1E8C" "\u1E8D"]  ; # LATIN LETTER X WITH DIAERESIS

# Flatten the array into the {from to from to ...} list [string map] wants.
# Start from an empty list so that a pre-existing "mappings" variable cannot
# leak entries into the table.
set mappings [list]
foreach k [array names map] {
  lappend mappings [string toupper $k] [lindex $map($k) 0]
  lappend mappings $k [lindex $map($k) 1]
}
# mapdoc DOC
#
# Return DOC with every run of whitespace collapsed to a single space,
# leading/trailing whitespace trimmed, and the substitutions listed in the
# global $::mappings applied with [string map].
#
proc mapdoc {doc} {
  set doc [regsub -all {[[:space:]]+} $doc " "]
  string map $::mappings [string trim $doc]
}

# Populate an fts4 table that uses the unicode61 tokenizer with the mapped
# form of each document in $docs.
do_test 2.0 {
  execsql { CREATE VIRTUAL TABLE t2 USING fts4(tokenize=unicode61, x); }
  foreach doc $docs {
    set d [mapdoc $doc]
    execsql { INSERT INTO t2 VALUES($d) }
  }
} {}

do_test 2.1 {
  set q [mapdoc "row"]
  execsql { SELECT * FROM t2 WHERE t2 MATCH $q }
} [list [mapdoc {
  Queries of the form: "SELECT max(x), y FROM table" returns the value of y on
  the same row that contains the maximum x value.
}]]

# Each query is passed through mapdoc before being used as the MATCH
# argument; the expected snippet is passed through mapdoc as well.
foreach {tn query snippet} {
  2 "row" {
     ...returns the value of y on the same [row] that contains
     the maximum x value.
  }
  3 "ROW" {
     ...returns the value of y on the same [row] that contains
     the maximum x value.
  }
  4 "rollback" {
     ...[ROLLBACK]. Instead, the pending statement
     will return SQLITE_ABORT upon next access after the [ROLLBACK].
  }
  5 "rOllback" {
     ...[ROLLBACK]. Instead, the pending statement
     will return SQLITE_ABORT upon next access after the [ROLLBACK].
  }
  6 "lang*" {
     Added support for the FTS4 [languageid] option.
  }
} {
  do_test 2.$tn {
    set q [mapdoc $query]
    execsql { SELECT snippet(t2, '[', ']', '...') FROM t2 WHERE t2 MATCH $q }
  } [list [mapdoc $snippet]]
}

#-------------------------------------------------------------------------
# Make sure the unicode61 tokenizer does not crash if it is passed a
# NULL pointer.
reset_db
do_execsql_test 3.1 {
  CREATE VIRTUAL TABLE t1 USING fts4(tokenize=unicode61, x, y);
  INSERT INTO t1 VALUES(NULL, 'a b c');
}

do_execsql_test 3.2 {
  SELECT snippet(t1, '[', ']') FROM t1 WHERE t1 MATCH 'b'
} {{a [b] c}}

# The row count is doubled 16 times before two rows with a NULL second
# column are added.
do_execsql_test 3.3 {
  BEGIN;
  DELETE FROM t1;
  INSERT INTO t1 VALUES('b b b b b b b b b b b', 'b b b b b b b b b b b b b');
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 VALUES('a b c', NULL);
  INSERT INTO t1 VALUES('a x c', NULL);
  COMMIT;
}

do_execsql_test 3.4 {
  SELECT * FROM t1 WHERE t1 MATCH 'a b';
} {{a b c} {}}

#-------------------------------------------------------------------------
# Insert unusual input into a unicode61 table. Each test passes if the
# INSERT statements complete and return nothing.
#
reset_db

# Strings containing U+FFFE and U+D800, in the middle and at the start.
do_test 4.1 {
  set a "abc\uFFFEdef"
  set b "abc\uD800def"
  set c "\uFFFEdef"
  set d "\uD800def"
  execsql {
    CREATE VIRTUAL TABLE t1 USING fts4(tokenize=unicode61, x);
    INSERT INTO t1 VALUES($a);
    INSERT INTO t1 VALUES($b);
    INSERT INTO t1 VALUES($c);
    INSERT INTO t1 VALUES($d);
  }
} {}

# Byte 0xF7 followed by three to six 0xBF bytes, between 'a' and 'b'.
do_test 4.2 {
  set a [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0x62}]
  set b [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0x62}]
  set c [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0x62}]
  set d [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0xBF 0x62}]
  execsql {
    INSERT INTO t1 VALUES($a);
    INSERT INTO t1 VALUES($b);
    INSERT INTO t1 VALUES($c);
    INSERT INTO t1 VALUES($d);
  }
} {}

# The same byte sequences with no surrounding ASCII characters.
do_test 4.3 {
  set a [binary format c* {0xF7 0xBF 0xBF 0xBF}]
  set b [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF}]
  set c [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF 0xBF}]
  set d [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0xBF}]
  execsql {
    INSERT INTO t1 VALUES($a);
    INSERT INTO t1 VALUES($b);
    INSERT INTO t1 VALUES($c);
    INSERT INTO t1 VALUES($d);
  }
} {}

#-------------------------------------------------------------------------
# Tests for the "tokenchars=" and "separators=" tokenizer options.
#

do_unicode_token_test3 5.1 {tokenchars=} {
  sqlite3_reset sqlite3_column_int
} {
  0 sqlite3 sqlite3
  1 reset reset
  2 sqlite3 sqlite3
  3 column column
  4 int int
}

do_unicode_token_test3 5.2 {tokenchars=_} {
  sqlite3_reset sqlite3_column_int
} {
  0 sqlite3_reset sqlite3_reset
  1 sqlite3_column_int sqlite3_column_int
}

do_unicode_token_test3 5.3 {separators=xyz} {
  Laotianxhorseyrunszfast
} {
  0 laotian Laotian
  1 horse horse
  2 runs runs
  3 fast fast
}

do_unicode_token_test3 5.4 {tokenchars=xyz} {
  Laotianxhorseyrunszfast
} {
  0 laotianxhorseyrunszfast Laotianxhorseyrunszfast
}

do_unicode_token_test3 5.5 {tokenchars=_} {separators=zyx} {
  sqlite3_resetxsqlite3_column_intyhonda_phantom
} {
  0 sqlite3_reset sqlite3_reset
  1 sqlite3_column_int sqlite3_column_int
  2 honda_phantom honda_phantom
}

do_unicode_token_test3 5.6 "separators=\u05D1" "abc\u05D1def" {
  0 abc abc 1 def def
}

do_unicode_token_test3 5.7                             \
  "tokenchars=\u2444\u2445"                            \
  "separators=\u05D0\u05D1\u05D2"                      \
  "\u2444fre\u2445sh\u05D0water\u05D2fish.\u2445timer" \
  [list                                                \
    0 \u2444fre\u2445sh \u2444fre\u2445sh              \
    1 water water                                      \
    2 fish fish                                        \
    3 \u2445timer \u2445timer                          \
  ]

# Check that it is not possible to add a standalone diacritic codepoint
# to either separators or tokenchars.
do_unicode_token_test3 5.8 "separators=\u0301" \
  "hello\u0301world \u0301helloworld"          \
  "0 helloworld hello\u0301world 1 helloworld helloworld"

do_unicode_token_test3 5.9 "tokenchars=\u0301" \
  "hello\u0301world \u0301helloworld"          \
  "0 helloworld hello\u0301world 1 helloworld helloworld"

do_unicode_token_test3 5.10 "separators=\u0301" \
  "remove_diacritics=0"                        \
  "hello\u0301world \u0301helloworld"          \
  "0 hello\u0301world hello\u0301world 1 helloworld helloworld"

do_unicode_token_test3 5.11 "tokenchars=\u0301" \
  "remove_diacritics=0"                         \
  "hello\u0301world \u0301helloworld"           \
  "0 hello\u0301world hello\u0301world 1 helloworld helloworld"


#-------------------------------------------------------------------------

# do_tokenize TOKENIZER TXT
#
# Tokenize TXT with fts3_tokenizer_test(TOKENIZER, TXT) using database
# handle [db]. The SQL function's result is consumed three elements at a
# time and the second element of each triple is collected; the list of
# those elements is returned.
#
proc do_tokenize {tokenizer txt} {
  set res [list]
  foreach {a b c} [db one {SELECT fts3_tokenizer_test($tokenizer, $txt)}] {
    lappend res $b
  }
  return $res
}

# Argument $lCp must be a list of codepoints (integers) that
# correspond to whitespace characters. This command creates a string
# $W from the codepoints, then tokenizes "${W}hello${W}world${W}"
# using tokenizer $tokenizer. The test passes if the tokenizer successfully
# extracts the two 5 character tokens.
#
#   tn         - test case name passed to do_test.
#   tokenizer  - tokenizer name passed to do_tokenize.
#   lCp        - list of integer codepoints making up the whitespace run.
#
proc do_isspace_test {tn tokenizer lCp} {
  # One %c conversion per codepoint turns the integer list into a string.
  set whitespace [format [string repeat %c [llength $lCp]] {*}$lCp]
  set txt "${whitespace}hello${whitespace}world${whitespace}"
  uplevel [list do_test $tn [list do_tokenize $tokenizer $txt] {hello world}]
}

# The icu tokenizer is only exercised when the icu capability is present.
set tokenizers [list unicode61]
ifcapable icu { lappend tokenizers icu }

# Some tests to check that the tokenizers can both identify white-space
# codepoints. All codepoints tested below are of type "Zs" in the
# UnicodeData.txt file.
#
# Note that codepoint 6158 has changed from Zs to Cf in recent versions
# of UnicodeData.txt.  So take that into account for the "icu" tests.
#
foreach T $tokenizers {
  do_isspace_test 6.$T.1 $T    32
  do_isspace_test 6.$T.2 $T    160
  do_isspace_test 6.$T.3 $T    5760
  if {$T!="icu"} {
    do_isspace_test 6.$T.4 $T    6158
  }
  do_isspace_test 6.$T.5 $T    8192
  do_isspace_test 6.$T.6 $T    8193
  do_isspace_test 6.$T.7 $T    8194
  do_isspace_test 6.$T.8 $T    8195
  do_isspace_test 6.$T.9 $T    8196
  do_isspace_test 6.$T.10 $T    8197
  do_isspace_test 6.$T.11 $T    8198
  do_isspace_test 6.$T.12 $T    8199
  do_isspace_test 6.$T.13 $T    8200
  do_isspace_test 6.$T.14 $T    8201
  do_isspace_test 6.$T.15 $T    8202
  if {$T!="icu"} {
    do_isspace_test 6.$T.16 $T    8239
  }
  do_isspace_test 6.$T.17 $T    8287
  do_isspace_test 6.$T.18 $T   12288

  if {$T!="icu"} {
    do_isspace_test 6.$T.19 $T   {32 160 5760 6158}
  } else {
    do_isspace_test 6.$T.19 $T   {32 160 5760 8192}
  }
  do_isspace_test 6.$T.20 $T   {8192 8193 8194 8195}
  do_isspace_test 6.$T.21 $T   {8196 8197 8198 8199}
  do_isspace_test 6.$T.22 $T   {8200 8201 8202 8239}
  do_isspace_test 6.$T.23 $T   {8287 12288}
}

#-------------------------------------------------------------------------
# Test that the private use ranges are treated as alphanumeric.
#
# In the inner table "*" is a placeholder that is replaced by the codepoint
# under test ($c) in the tokenizer option, the input and the expected output.
#
foreach {tn1 c} {
  1 \ue000 2 \ue001 3 \uf000 4 \uf8fe 5 \uf8ff
} {
  foreach {tn2 config res} {
    1 ""             "0 hello*world hello*world"
    2 "separators=*" "0 hello hello 1 world world"
  } {
    set config [string map [list * $c] $config]
    set input  [string map [list * $c] "hello*world"]
    set output [string map [list * $c] $res]
    # {*}$config expands to nothing when no option is configured.
    do_unicode_token_test3 7.$tn1.$tn2 {*}$config $input $output
  }
}

#-------------------------------------------------------------------------
# Cursory test of remove_diacritics=1 (table t3) versus
# remove_diacritics=0 (table t4, filled with the same rows).
#
# 00C4;LATIN CAPITAL LETTER A WITH DIAERESIS
# 00D6;LATIN CAPITAL LETTER O WITH DIAERESIS
# 00E4;LATIN SMALL LETTER A WITH DIAERESIS
# 00F6;LATIN SMALL LETTER O WITH DIAERESIS
#
do_execsql_test 8.1.1 "
  CREATE VIRTUAL TABLE t3 USING fts4(tokenize=unicode61 'remove_diacritics=1');
  INSERT INTO t3 VALUES('o');
  INSERT INTO t3 VALUES('a');
  INSERT INTO t3 VALUES('O');
  INSERT INTO t3 VALUES('A');
  INSERT INTO t3 VALUES('\xD6');
  INSERT INTO t3 VALUES('\xC4');
  INSERT INTO t3 VALUES('\xF6');
  INSERT INTO t3 VALUES('\xE4');
"
do_execsql_test 8.1.2 {
  SELECT rowid FROM t3 WHERE t3 MATCH 'o';
} {1 3 5 7}
do_execsql_test 8.1.3 {
  SELECT rowid FROM t3 WHERE t3 MATCH 'a';
} {2 4 6 8}
do_execsql_test 8.2.1 {
  CREATE VIRTUAL TABLE t4 USING fts4(tokenize=unicode61 "remove_diacritics=0");
  INSERT INTO t4 SELECT * FROM t3;
}
do_execsql_test 8.2.2 {
  SELECT rowid FROM t4 WHERE t4 MATCH 'o';
} {1 3}
do_execsql_test 8.2.3 {
  SELECT rowid FROM t4 WHERE t4 MATCH 'a';
} {2 4}

#-------------------------------------------------------------------------
# Each of the four variants below creates tables t5, t6 and t7 with the
# tokenizer options quoted in a different way: [...], "...", '...' and
# `...`. The same expected results are then checked for every variant.
#
foreach {tn sql} {
  1 {
    CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 [tokenchars= .]);
    CREATE VIRTUAL TABLE t6 USING fts4(
        tokenize=unicode61 [tokenchars=="] "tokenchars=[]");
    CREATE VIRTUAL TABLE t7 USING fts4(tokenize=unicode61 [separators=x\xC4]);
  }
  2 {
    CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 "tokenchars= .");
    CREATE VIRTUAL TABLE t6 USING fts4(tokenize=unicode61 "tokenchars=[=""]");
    CREATE VIRTUAL TABLE t7 USING fts4(tokenize=unicode61 "separators=x\xC4");
  }
  3 {
    CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 'tokenchars= .');
    CREATE VIRTUAL TABLE t6 USING fts4(tokenize=unicode61 'tokenchars=="[]');
    CREATE VIRTUAL TABLE t7 USING fts4(tokenize=unicode61 'separators=x\xC4');
  }
  4 {
    CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 `tokenchars= .`);
    CREATE VIRTUAL TABLE t6 USING fts4(tokenize=unicode61 `tokenchars=[="]`);
    CREATE VIRTUAL TABLE t7 USING fts4(tokenize=unicode61 `separators=x\xC4`);
  }
} {
  # Drop whatever the previous variant left behind.
  do_execsql_test 9.$tn.0 {
    DROP TABLE IF EXISTS t5;
    DROP TABLE IF EXISTS t5aux;
    DROP TABLE IF EXISTS t6;
    DROP TABLE IF EXISTS t6aux;
    DROP TABLE IF EXISTS t7;
    DROP TABLE IF EXISTS t7aux;
  }
  do_execsql_test 9.$tn.1 $sql

  do_execsql_test 9.$tn.2 {
    CREATE VIRTUAL TABLE t5aux USING fts4aux(t5);
    INSERT INTO t5 VALUES('one two three/four.five.six');
    SELECT * FROM t5aux;
  } {
    four.five.six   * 1 1 four.five.six   0 1 1
    {one two three} * 1 1 {one two three} 0 1 1
  }

  do_execsql_test 9.$tn.3 {
    CREATE VIRTUAL TABLE t6aux USING fts4aux(t6);
    INSERT INTO t6 VALUES('alpha=beta"gamma/delta[epsilon]zeta');
    SELECT * FROM t6aux;
  } {
    {alpha=beta"gamma}   * 1 1 {alpha=beta"gamma} 0 1 1
    {delta[epsilon]zeta} * 1 1 {delta[epsilon]zeta} 0 1 1
  }

  do_execsql_test 9.$tn.4 {
    CREATE VIRTUAL TABLE t7aux USING fts4aux(t7);
    INSERT INTO t7 VALUES('alephxbeth\xC4gimel');
    SELECT * FROM t7aux;
  } {
    aleph * 1 1 aleph 0 1 1
    beth  * 1 1 beth  0 1 1
    gimel * 1 1 gimel 0 1 1
  }
}

# Check that multiple options are handled correctly.
#
do_execsql_test 10.1 {
  DROP TABLE IF EXISTS t1;
  CREATE VIRTUAL TABLE t1 USING fts4(tokenize=unicode61
    "tokenchars=xyz" "tokenchars=.=" "separators=.=" "separators=xy"
    "separators=a" "separators=a" "tokenchars=a" "tokenchars=a"
  );

  INSERT INTO t1 VALUES('oneatwoxthreeyfour');
  INSERT INTO t1 VALUES('a.single=word');
  CREATE VIRTUAL TABLE t1aux USING fts4aux(t1);
  SELECT * FROM t1aux;
} {
  .single=word * 1 1 .single=word 0 1 1
  four         * 1 1 four         0 1 1
  one          * 1 1 one          0 1 1
  three        * 1 1 three        0 1 1
  two          * 1 1 two          0 1 1
}

# Test that case folding happens after tokenization, not before.
#
do_execsql_test 10.2 {
  DROP TABLE IF EXISTS t2;
  CREATE VIRTUAL TABLE t2 USING fts4(tokenize=unicode61 "separators=aB");
  INSERT INTO t2 VALUES('oneatwoBthree');
  INSERT INTO t2 VALUES('onebtwoAthree');
  CREATE VIRTUAL TABLE t2aux USING fts4aux(t2);
  SELECT * FROM t2aux;
} {
  one           * 1 1 one           0 1 1
  onebtwoathree * 1 1 onebtwoathree 0 1 1
  three         * 1 1 three         0 1 1
  two           * 1 1 two           0 1 1
}

# Test that the tokenchars and separators options work with the
# fts3tokenize table.
#
# '@' and '.' are token characters and the digits are separators, so the
# input below splits at "123" into exactly two tokens.
#
do_execsql_test 11.1 {
  CREATE VIRTUAL TABLE ft1 USING fts3tokenize(
    "unicode61", "tokenchars=@.", "separators=1234567890"
  );
  SELECT token FROM ft1 WHERE input = 'berlin@street123sydney.road';
} {
  berlin@street sydney.road
}

# Test for embedded nul characters in fts4 unicode index.
#
# The stored content keeps the nul byte and the text after it (12.0, 12.3),
# while fts4aux reports a single indexed term, "abc" (12.2).
#
do_execsql_test 12.0 {
  CREATE VIRTUAL TABLE t12 USING fts4(tokenize=unicode61);
  INSERT INTO t12 VALUES('abc' || char(0) || 'def');
  SELECT hex(CAST(content AS blob)) FROM t12;
} {61626300646566}
do_execsql_test 12.1 {
  INSERT INTO t12(t12) VALUES('integrity-check');
} {}
do_execsql_test 12.2 {
  CREATE VIRTUAL TABLE t12aux USING fts4aux(t12);
  SELECT * FROM t12aux;
} {abc * 1 1 abc 0 1 1}
do_execsql_test 12.3 {
  SELECT hex(CAST(content AS blob)) FROM t12 WHERE t12 MATCH 'abc'
} {61626300646566}

finish_test