1# 2014 Dec 20
2#
3# The author disclaims copyright to this source code.  In place of
4# a legal notice, here is a blessing:
5#
6#    May you do good and not evil.
7#    May you find forgiveness for yourself and forgive others.
8#    May you share freely, never taking more than you give.
9#
10#***********************************************************************
11#
12# Tests focusing on the built-in fts5 tokenizers.
13#
14
# Load fts5_common.tcl from the directory that contains this script, then
# set the test-name prefix for this file.
source [file join [file dirname [info script]] fts5_common.tcl]
set testprefix fts5tokenizer

# If SQLITE_ENABLE_FTS5 is not defined, omit this file.
ifcapable !fts5 { finish_test ; return }
23
24
# 1.0 - 1.4: Every accepted spelling of the tokenize= option (bare or
# quoted tokenizer name, with or without spaces around '=', and with an
# argument passed to porter) must allow the table to be created and then
# dropped again.
foreach {tn directive} {
  0 {tokenize=porter}
  1 {tokenize='porter'}
  2 {tokenize = porter}
  3 {tokenize = 'porter'}
  4 {tokenize = 'porter ascii'}
} {
  do_execsql_test 1.$tn "
    CREATE VIRTUAL TABLE ft1 USING fts5(x, $directive);
    DROP TABLE ft1;
  "
}

# 1.5: An unknown tokenizer name is reported by name.
do_catchsql_test 1.5 {
  CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'nosuch');
} [list 1 {no such tokenizer: nosuch}]

# 1.6: An unknown name passed as an argument to porter is reported as a
# constructor failure instead.
do_catchsql_test 1.6 {
  CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'porter nosuch');
} [list 1 {error in tokenizer constructor}]
53
# 2.0: Index a single document using the porter tokenizer.
do_execsql_test 2.0 {
  CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize=porter);
  INSERT INTO ft1 VALUES('embedded databases');
}

# 2.1 - 2.3: Query terms that differ from the indexed words only by
# suffix ('embedding' vs 'embedded', 'database' vs 'databases') must still
# match row 1.
foreach {tn query} {
  1 {embedding}
  2 {database}
  3 {database embedding}
} {
  do_execsql_test 2.$tn "SELECT rowid FROM ft1 WHERE ft1 MATCH '$query'" 1
}
63
# Tokenizer constructor used by the 3.* tests. It records the arguments it
# was invoked with in the global variable ::targs and then always fails
# with the Tcl error message "failed".
proc tcl_create {args} {
  global targs
  set targs $args
  return -code error "failed"
}
# Register "tcl" as a tokenizer name whose constructor is the Tcl proc
# tcl_create.
sqlite3_fts5_create_tokenizer db tcl tcl_create

# 3.N.1: Creating a table with the given tokenize= directive must fail
#        with a constructor error.
# 3.N.2: ::targs must then hold the expected constructor arguments,
#        whichever quoting style the directive used.
foreach {tn directive expected} {
  1  {tokenize='tcl a b c'}
     {a b c}
  2  {tokenize='tcl ''d'' ''e'' ''f'''}
     {d e f}
  3  {tokenize="tcl 'g' 'h' 'i'"}
     {g h i}
  4  {tokenize = tcl}
     {}
} {
  do_catchsql_test 3.$tn.1 \
      "CREATE VIRTUAL TABLE ft2 USING fts5(x, $directive)" \
      [list 1 {error in tokenizer constructor}]
  do_test 3.$tn.2 {set ::targs} $expected
}
81
# 4.1: Unquoted tokenizer arguments after the tokenizer name are a parse
#      error in the tokenize= directive.
do_catchsql_test 4.1 {
  CREATE VIRTUAL TABLE ft2 USING fts5(x, tokenize = tcl abc);
} [list 1 {parse error in "tokenize = tcl abc"}]

# 4.2: A bare word following a column name is rejected as an unknown
#      column option.
do_catchsql_test 4.2 {
  CREATE VIRTUAL TABLE ft2 USING fts5(x y)
} [list 1 {unrecognized column option: y}]
88
#-------------------------------------------------------------------------
# Test the "separators" and "tokenchars" options a bit.
#
# The same checks run once for each of the ascii and unicode61 tokenizers.
# With ',', '.' and ':' configured as token characters and 'x', 'y' and
# 'z' as separators, the expected results below show "jkl.mno,pqr:stu"
# matching only as one token, and "vwx" being indexed as "vw".
#
foreach {tn tokenizer} {1 ascii 2 unicode61} {
  reset_db
  set T [format {%s tokenchars ',.:' separators 'xyz'} $tokenizer]
  execsql [format {CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = "%s")} $T]

  do_execsql_test 5.$tn.1 {
    INSERT INTO t1 VALUES('abcxdefyghizjkl.mno,pqr:stu/vwx+yz');
  }

  # Each token is queried as a quoted string; res is the expected list of
  # matching rowids (empty when the token must not match).
  foreach {tn2 token res} {
    1 abc 1
    2 def 1
    3 ghi 1
    4 jkl {}
    5 mno {}
    6 pqr {}
    7 stu {}
    8 jkl.mno,pqr:stu 1
    9 vw  1
  } {
    do_execsql_test 5.$tn.2.$tn2 \
        [format {SELECT rowid FROM t1 WHERE t1 MATCH '"%s"'} $token] \
        $res
  }
}
109
#-------------------------------------------------------------------------
# Miscellaneous tests for the ascii tokenizer.
#
# The names 5.1.* and 5.2.* are generated by the separators/tokenchars
# loop above (5.$tn.1 with tn=1 is "5.1.1"), so these tests are numbered
# 5.3.* and 5.4.* to keep every test name in this file unique.
#
# 5.3.*: Test that the ascii tokenizer ignores non-ASCII characters in the
#        'separators' option. But unicode61 does not.
#
# 5.4.*: An option without an argument, or an unrecognized option, is an
#        error.
#

# With the ascii tokenizer U+1234 does not split 'abc\u1234def', so there
# is no separate token 'def' to match.
do_test 5.3.1 {
  execsql "
    CREATE VIRTUAL TABLE a1 USING fts5(x, tokenize=`ascii separators '\u1234'`);
    INSERT INTO a1 VALUES('abc\u1234def');
  "
  execsql { SELECT rowid FROM a1 WHERE a1 MATCH 'def' }
} {}

# With unicode61 the same separator does split the text, so 'def' matches.
do_test 5.3.2 {
  execsql "
    CREATE VIRTUAL TABLE a2 USING fts5(
        x, tokenize=`unicode61 separators '\u1234'`);
    INSERT INTO a2 VALUES('abc\u1234def');
  "
  execsql { SELECT rowid FROM a2 WHERE a2 MATCH 'def' }
} {1}

# "tokenchars" with no value following it.
do_catchsql_test 5.4.1 {
  CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'ascii tokenchars');
} {1 {error in tokenizer constructor}}

# "opt" is not an option name used anywhere else in this file.
do_catchsql_test 5.4.2 {
  CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'ascii opt arg');
} {1 {error in tokenizer constructor}}
142
143#-------------------------------------------------------------------------
144# Test that the ASCII and unicode61 tokenizers both handle SQLITE_DONE
145# correctly.
146#
147
# Token callback used by [tokenize]. Appends $token to the list variable
# named $varname in the caller's scope. Returns "SQLITE_DONE" when that
# list holds exactly three tokens after the append, "SQLITE_OK" otherwise.
# iStart and iEnd are accepted but not used.
proc test_token_cb {varname token iStart iEnd} {
  upvar $varname tokens
  lappend tokens $token
  return [expr {[llength $tokens] == 3 ? "SQLITE_DONE" : "SQLITE_OK"}]
}
154
# Implementation of the tokenize() auxiliary function. $cmd is the API
# command handed to the function: the text of column 0 is fetched with
# xColumnText and passed to xTokenize, with test_token_cb collecting
# tokens into a local list. Returns the collected list.
proc tokenize {cmd} {
  set tokens [list]
  set text [$cmd xColumnText 0]
  $cmd xTokenize $text [list test_token_cb tokens]
  return $tokens
}
# Expose the Tcl proc [tokenize] to SQL as the fts5 function tokenize().
sqlite3_fts5_create_function db tokenize tokenize

# 6.0 (ascii) and 6.1 (unicode61): both rows match 'e AND r', and for each
# of them tokenize() must report only the first three tokens, because the
# callback returns SQLITE_DONE after the third.
foreach {tn tbl tokenizer} {
  0 x1 ascii
  1 x2 unicode61
} {
  do_execsql_test 6.$tn "
    CREATE VIRTUAL TABLE $tbl USING fts5(a, tokenize=$tokenizer);
    INSERT INTO $tbl VALUES('q w e r t y');
    INSERT INTO $tbl VALUES('y t r e w q');
    SELECT tokenize($tbl) FROM $tbl WHERE $tbl MATCH 'e AND r';
  " {{q w e} {y t r}}
}
179
180
#-------------------------------------------------------------------------
# Miscellaneous tests for the unicode tokenizer.
#
# These are numbered 6.2.* because 6.0 and 6.1 are already used by the
# SQLITE_DONE tests above; reusing 6.1 here made two different tests
# report under the same name.
#
# 6.2.1: "tokenchars" with no value following it.
# 6.2.2: "a" is not an option name used anywhere else in this file.
# 6.2.3: remove_diacritics value 3 is rejected.
# 6.2.4: remove_diacritics value 10 is rejected.
#
do_catchsql_test 6.2.1 {
  CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'unicode61 tokenchars');
} {1 {error in tokenizer constructor}}
do_catchsql_test 6.2.2 {
  CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'unicode61 a b');
} {1 {error in tokenizer constructor}}
do_catchsql_test 6.2.3 {
  CREATE VIRTUAL TABLE a3 USING fts5(
    x, y, tokenize = 'unicode61 remove_diacritics 3'
  );
} {1 {error in tokenizer constructor}}
do_catchsql_test 6.2.4 {
  CREATE VIRTUAL TABLE a3 USING fts5(
    x, y, tokenize = 'unicode61 remove_diacritics 10'
  );
} {1 {error in tokenizer constructor}}
200
#-------------------------------------------------------------------------
# Porter tokenizer with very large tokens.
#
# a, b and c are single-letter tokens of 100, 500 and 1000 characters.
# Each of the three rows holds two of them.
#
set a [string repeat a 100]
set b [string repeat b 500]
set c [string repeat c 1000]

do_execsql_test 7.0 {
  CREATE VIRTUAL TABLE e5 USING fts5(x, tokenize=porter);
  INSERT INTO e5 VALUES($a || ' ' || $b);
  INSERT INTO e5 VALUES($b || ' ' || $c);
  INSERT INTO e5 VALUES($c || ' ' || $a);
}

# 7.1 - 7.3: Each long token must match exactly the two rows containing it.
foreach {tn term rowids} [list  1 $a {1 3}  2 $b {1 2}  3 $c {2 3}] {
  do_execsql_test 7.$tn {SELECT rowid FROM e5 WHERE e5 MATCH $term} $rowids
}
217
#-------------------------------------------------------------------------
# Test the 'separators' option with the unicode61 tokenizer.
#
# 8.1: With every upper-case ASCII letter configured as a separator, the
# fts5vocab table must list the lower-case words on their own. The whole
# test runs inside a transaction that is rolled back, so e6 and e7 do not
# persist.
#
do_execsql_test 8.1 {
  BEGIN;
  CREATE VIRTUAL TABLE e6 USING fts5(x,
    tokenize="unicode61 separators ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  );
  INSERT INTO e6 VALUES('theAquickBbrownCfoxDjumpedWoverXtheYlazyZdog');
  CREATE VIRTUAL TABLE e7 USING fts5vocab(e6, 'row');
  SELECT term FROM e7;
  ROLLBACK;
} [list brown dog fox jumped lazy over quick the]
233
# 8.2: As 8.1, but with the non-ASCII separators U+0E01..U+0E07. [subst]
# expands the \u escapes before the SQL is run. In the second row U+0E07
# separates U+0E08 from U+0E09, so those two appear as separate terms.
do_execsql_test 8.2 [subst -nocommands -novariables {
  BEGIN;
  CREATE VIRTUAL TABLE e6 USING fts5(x,
    tokenize="unicode61 separators '\u0E01\u0E02\u0E03\u0E04\u0E05\u0E06\u0E07'"
  );
  INSERT INTO e6 VALUES('the\u0E01quick\u0E01brown\u0E01fox\u0E01'
                     || 'jumped\u0E01over\u0E01the\u0E01lazy\u0E01dog'
  );
  INSERT INTO e6 VALUES('\u0E08\u0E07\u0E09');
  CREATE VIRTUAL TABLE e7 USING fts5vocab(e6, 'row');
  SELECT term FROM e7;
  ROLLBACK;
}] [list brown dog fox jumped lazy over quick the \u0E08 \u0E09]
249
# 8.3: Test that the porter tokenizer correctly passes arguments through
# to its parent tokenizer. The separators still split the text, and the
# resulting terms are stemmed ('jumped' -> 'jump', 'lazy' -> 'lazi').
do_execsql_test 8.3 {
  BEGIN;
  CREATE VIRTUAL TABLE e6 USING fts5(x,
    tokenize="porter unicode61 separators ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  );
  INSERT INTO e6 VALUES('theAquickBbrownCfoxDjumpedWoverXtheYlazyZdog');
  CREATE VIRTUAL TABLE e7 USING fts5vocab(e6, 'row');
  SELECT term FROM e7;
  ROLLBACK;
} [list brown dog fox jump lazi over quick the]
264
#-------------------------------------------------------------------------
# Check that the FTS5_TOKENIZE_PREFIX flag is passed to the tokenizer
# implementation.
#
reset_db

# Replacement constructor for the "tcl" tokenizer: it ignores its
# arguments and returns the name of the proc that does the tokenizing.
proc tcl_create {args} {
  return "tcl_tokenize"
}
sqlite3_fts5_create_tokenizer db tcl tcl_create

# ::flags accumulates the flags argument of every tcl_tokenize call.
set ::flags {}
# Tokenizer implementation for the 9.* tests. Appends $tflags to the
# global list ::flags, then passes each {token start end} triple produced
# by [fts5_tokenize_split $text] to sqlite3_fts5_token.
proc tcl_tokenize {tflags text} {
  global flags
  lappend flags $tflags
  foreach {word first last} [fts5_tokenize_split $text] {
    sqlite3_fts5_token $word $first $last
  }
}
279
# 9.1.*: Each of the two inserts tokenizes its text with the "document"
# flag.
do_execsql_test 9.1.1 {
  CREATE VIRTUAL TABLE t1 USING fts5(a, tokenize=tcl);
  INSERT INTO t1 VALUES('abc');
  INSERT INTO t1 VALUES('xyz');
} {}
do_test 9.1.2 {set ::flags} [list document document]

# 9.2.* - 9.5.*: For each query, ::flags is cleared, the query is run
# (9.N.1 checks the rows returned) and the flags seen by the tokenizer are
# checked (9.N.2). The expected flag is "prefixquery" when the '*' follows
# the term or quoted phrase, and "query" when there is no '*' or it is
# inside the quotes.
foreach {tn query rows seen} {
  2  {abc}            {abc}  {query}
  3  {ab*}            {abc}  {prefixquery}
  4  {"abc xyz" *}    {}     {prefixquery}
  5  {"abc xyz*"}     {}     {query}
} {
  set ::flags [list]
  do_execsql_test 9.$tn.1 "SELECT * FROM t1('$query');" $rows
  do_test 9.$tn.2 {set ::flags} $seen
}


finish_test
305