1# 2014 Dec 20
2#
3# The author disclaims copyright to this source code.  In place of
4# a legal notice, here is a blessing:
5#
6#    May you do good and not evil.
7#    May you find forgiveness for yourself and forgive others.
8#    May you share freely, never taking more than you give.
9#
10#***********************************************************************
11#
12# Tests focusing on custom tokenizers that support synonyms.
13#
14
15source [file join [file dirname [info script]] fts5_common.tcl]
16set testprefix fts5synonym
17
18# If SQLITE_ENABLE_FTS5 is not defined, omit this file.
19ifcapable !fts5 {
20  finish_test
21  return
22}
23
# Callback passed to sqlite3_fts5_create_tokenizer in the sections below.
# Any arguments are accepted and ignored; the result is always the string
# tcl_tokenize, which is the name of the proc this file redefines for each
# group of tests.
proc tcl_create {args} {
  return tcl_tokenize
}
25
26foreach_detail_mode $testprefix {
27
28#-------------------------------------------------------------------------
29# Warm body test for the code in fts5_tcl.c.
30#
# Test 1.0 builds table ft with the "tclnum document" tokenizer spec, adds
# two rows, then runs one MATCH query per row. The second SELECT lists its
# columns as x, rowid (the reverse of the first), which is why the expected
# result ends with the row text followed by 2.
31fts5_tclnum_register db
32do_execsql_test 1.0 {
33  CREATE VIRTUAL TABLE ft USING fts5(x, tokenize = "tclnum document", detail=%DETAIL%);
34  INSERT INTO ft VALUES('abc def ghi');
35  INSERT INTO ft VALUES('jkl mno pqr');
36  SELECT rowid, x FROM ft WHERE ft MATCH 'def';
37  SELECT x, rowid FROM ft WHERE ft MATCH 'pqr';
38} {1 {abc def ghi} {jkl mno pqr} 2}
39
40#-------------------------------------------------------------------------
41# Test a tokenizer that supports synonyms by adding extra entries to the
42# FTS index.
43#
44reset_db
45fts5_tclnum_register db
46
47do_execsql_test 2.0 {
48  CREATE VIRTUAL TABLE ft USING fts5(
49      x, tokenize = "tclnum document", detail=%DETAIL%
50  );
51  INSERT INTO ft VALUES('one two three');
52  INSERT INTO ft VALUES('four five six');
53  INSERT INTO ft VALUES('eight nine ten');
54} {}
55
# Tests 2.1.*: each table entry is (test number, MATCH expression, expected
# rowids). Digit queries are expected to find rows holding the spelled-out
# word: "3" finds row 1 and "5" finds row 2. "10" is expected to find nothing
# even though row 3 contains "ten" (presumably tclnum has no synonym for it;
# the tokenizer is defined outside this file - TODO confirm). Expressions for
# which fts5_expr_ok returns false are skipped rather than failed.
56foreach {tn expr res} {
57  1 "3" 1
58  2 "eight OR 8 OR 5" {2 3}
59  3 "10" {}
60  4 "1*" {1}
61  5 "1 + 2" {1}
62} {
63  if {![fts5_expr_ok $expr ft]} continue
64  do_execsql_test 2.1.$tn {
65    SELECT rowid FROM ft WHERE ft MATCH $expr
66  } $res
67}
68
69#-------------------------------------------------------------------------
70# Test some broken tokenizers:
71#
72#   3.1.*: A tokenizer that declares the very first token to be colocated.
73#
74#   3.2.*: A tokenizer that reports two identical tokens at the same position.
75#          This is allowed.
76#
# After reset_db, register a tokenizer called "tcl" on db and pass
# tcl_create along with it (assumed to be the creation callback; the command
# is implemented outside this file). tcl_create returns "tcl_tokenize", so
# the tests below change tokenizer behaviour by redefining that proc.
77reset_db
78sqlite3_fts5_create_tokenizer db tcl tcl_create
# Deliberately broken tokenizer for the 3.1.* tests: the very first token
# produced for a piece of text is reported with the -colo flag, and every
# later token is reported normally. The tflags argument is not consulted.
proc tcl_tokenize {tflags text} {
  set atFirstToken 1
  foreach {term startOff endOff} [fts5_tokenize_split $text] {
    if {!$atFirstToken} {
      sqlite3_fts5_token $term $startOff $endOff
      continue
    }
    # Only reached once: flag the leading token as colocated.
    sqlite3_fts5_token -colo $term $startOff $endOff
    set atFirstToken 0
  }
}
# 3.1.0 indexes a single row with the tokenizer defined above and dumps the
# fts5vocab "row" table for it: every term is expected with the values 1 1,
# i.e. the colocated flag on the first token does not lose or merge any
# term. 3.1.1 then runs the FTS5 integrity-check command and expects it to
# produce no rows.
90do_execsql_test 3.1.0 {
91  CREATE VIRTUAL TABLE ft USING fts5(x, tokenize = tcl);
92  INSERT INTO ft VALUES('one two three');
93  CREATE VIRTUAL TABLE vv USING fts5vocab(ft, row);
94  SELECT * FROM vv;
95} {
96  one 1 1   three 1 1   two 1 1
97}
98
99do_execsql_test 3.1.1 {
100  INSERT INTO ft(ft) VALUES('integrity-check');
101} {}
102
# Well-behaved replacement tokenizer: every (token, start, end) triple
# returned by fts5_tokenize_split is reported exactly once, as an ordinary
# token. The tflags argument is not consulted.
proc tcl_tokenize {tflags text} {
  set pieces [fts5_tokenize_split $text]
  foreach {term startOff endOff} $pieces {
    sqlite3_fts5_token $term $startOff $endOff
  }
}
108
# 3.1.2 runs after tcl_tokenize has been redefined to emit ordinary tokens
# only: the index written in 3.1.0 must still be searchable for all three
# terms.
109do_execsql_test 3.1.2 {
110  SELECT rowid FROM ft WHERE ft MATCH 'one two three'
111} {1}
112
# Reset the database and register the "tcl" tokenizer again for the 3.2.*
# tests.
113reset_db
114sqlite3_fts5_create_tokenizer db tcl tcl_create
# Tokenizer for the 3.2.* tests: each piece of text is reported twice with
# identical text and offsets, first as an ordinary token and then again
# with the -colo flag. The tflags argument is not consulted.
proc tcl_tokenize {tflags text} {
  set pieces [fts5_tokenize_split $text]
  foreach {term startOff endOff} $pieces {
    # Ordinary token first, then its colocated duplicate.
    sqlite3_fts5_token       $term $startOff $endOff
    sqlite3_fts5_token -colo $term $startOff $endOff
  }
}
# 3.2.0: with every token reported twice at the same position, the fts5vocab
# counts double. 'one' occurs twice in the text and is expected with a count
# of 4; 'two' and 'three' occur once and are expected with a count of 2. All
# three terms are expected in 1 row.
121do_execsql_test 3.2.0 {
122  CREATE VIRTUAL TABLE ft USING fts5(x, tokenize = tcl);
123  INSERT INTO ft VALUES('one one two three');
124  CREATE VIRTUAL TABLE vv USING fts5vocab(ft, row);
125  SELECT * FROM vv;
126} {
127  one 1 4   three 1 2   two 1 2
128}
# 3.2.1 to 3.2.5: queries against the doubled index. The phrase
# 'one + one + two + three' (3.2.3) matches the text as written, whereas
# 'one + two + two + three' (3.2.5) must not match because the text holds
# only a single 'two'.
129do_execsql_test 3.2.1 {
130  SELECT rowid FROM ft WHERE ft MATCH 'one';
131} {1}
132do_execsql_test 3.2.2 {
133  SELECT rowid FROM ft WHERE ft MATCH 'one two three';
134} {1}
135do_execsql_test 3.2.3 {
136  SELECT rowid FROM ft WHERE ft MATCH 'one + one + two + three';
137} {1}
138do_execsql_test 3.2.4 {
139  SELECT rowid FROM ft WHERE ft MATCH 'one two two three';
140} {1}
141do_execsql_test 3.2.5 {
142  SELECT rowid FROM ft WHERE ft MATCH 'one + two + two + three';
143} {}
144
145#-------------------------------------------------------------------------
146# Check that expressions with synonyms can be parsed and executed.
147#
148reset_db
149fts5_tclnum_register db
150
# 4.1.*: each entry is (test number, query text, expected fts5_expr output).
# A term without synonyms is rendered alone; a number or number word is
# expected to be rendered as a group of its synonyms separated by a vertical
# bar; the prefix query 3* is expected to stay a single term.
151foreach {tn expr res} {
152  1  {abc}                           {"abc"}
153  2  {one}                           {"one"|"i"|"1"}
154  3  {3}                             {"3"|"iii"|"three"}
155  4  {3*}                            {"3" *}
156} {
157  do_execsql_test 4.1.$tn {
158    SELECT fts5_expr($expr, 'tokenize=tclnum')
159  } [list $res]
160}
161
# 4.2.*: digit queries must find the rows that contain the word form of the
# same number.
162do_execsql_test 4.2.1 {
163  CREATE VIRTUAL TABLE xx USING fts5(x, tokenize=tclnum, detail=%DETAIL%);
164  INSERT INTO xx VALUES('one two');
165  INSERT INTO xx VALUES('three four');
166}
167
168do_execsql_test 4.2.2 {
169  SELECT rowid FROM xx WHERE xx MATCH '2'
170} {1}
171
172do_execsql_test 4.2.3 {
173  SELECT rowid FROM xx WHERE xx MATCH '3'
174} {2}
175
# 5.0: create t1 (two columns, tclnum tokenizer) and load nine rows with
# explicit rowids. Every token is one of the numbers one to five, written
# as a digit, an English word or a lower-case Roman numeral. This table is
# the fixture for the 5.1.* and 5.2.* tests.
176do_test 5.0 {
177  execsql {
178    CREATE VIRTUAL TABLE t1 USING fts5(a, b, tokenize=tclnum, detail=%DETAIL%)
179  }
180  foreach {rowid a b} {
181    1 {four v 4 i three} {1 3 five five 4 one}
182    2 {5 1 3 4 i} {2 2 v two 4}
183    3 {5 i 5 2 four 4 1} {iii ii five two 1}
184    4 {ii four 4 one 5 three five} {one 5 1 iii 4 3}
185    5 {three i v i four 4 1} {ii five five five iii}
186    6 {4 2 ii two 2 iii} {three 1 four 4 iv 1 iv}
187    7 {ii ii two three 2 5} {iii i ii iii iii one one}
188    8 {2 ii i two 3 three 2} {two iv v iii 3 five}
189    9 {i 2 iv 3 five four v} {iii 4 three i three ii 1}
190  } {
191    execsql { INSERT INTO t1(rowid, a, b) VALUES($rowid, $a, $b) }
192  }
193} {}
194
195
# 5.1.*: each entry is (test number, query, expected rows). An expected row
# is the rowid followed by the highlight() output for columns 0 and 1, with
# every match wrapped in square brackets. Rows absent from a list are
# expected not to match. A phrase match is bracketed as one span covering
# all of its tokens, and adjacent matches of a quoted phrase merge into a
# single span (test 6, row 5). Queries rejected by fts5_expr_ok for t1 are
# skipped.
196foreach {tn q res} {
197  1 {one} {
198    1 {four v 4 [i] three} {[1] 3 five five 4 [one]}
199    2 {5 [1] 3 4 [i]} {2 2 v two 4}
200    3 {5 [i] 5 2 four 4 [1]} {iii ii five two [1]}
201    4 {ii four 4 [one] 5 three five} {[one] 5 [1] iii 4 3}
202    5 {three [i] v [i] four 4 [1]} {ii five five five iii}
203    6 {4 2 ii two 2 iii} {three [1] four 4 iv [1] iv}
204    7 {ii ii two three 2 5} {iii [i] ii iii iii [one] [one]}
205    8 {2 ii [i] two 3 three 2} {two iv v iii 3 five}
206    9 {[i] 2 iv 3 five four v} {iii 4 three [i] three ii [1]}
207  }
208  2 {five four} {
209    1 {[four] [v] [4] i three} {1 3 [five] [five] [4] one}
210    2 {[5] 1 3 [4] i} {2 2 [v] two [4]}
211    3 {[5] i [5] 2 [four] [4] 1} {iii ii [five] two 1}
212    4 {ii [four] [4] one [5] three [five]} {one [5] 1 iii [4] 3}
213    5 {three i [v] i [four] [4] 1} {ii [five] [five] [five] iii}
214    8 {2 ii i two 3 three 2} {two [iv] [v] iii 3 [five]}
215    9 {i 2 [iv] 3 [five] [four] [v]} {iii [4] three i three ii 1}
216  }
217  3 {one OR two OR iii OR 4 OR v} {
218    1 {[four] [v] [4] [i] [three]} {[1] [3] [five] [five] [4] [one]}
219    2 {[5] [1] [3] [4] [i]} {[2] [2] [v] [two] [4]}
220    3 {[5] [i] [5] [2] [four] [4] [1]} {[iii] [ii] [five] [two] [1]}
221    4 {[ii] [four] [4] [one] [5] [three] [five]} {[one] [5] [1] [iii] [4] [3]}
222    5 {[three] [i] [v] [i] [four] [4] [1]} {[ii] [five] [five] [five] [iii]}
223    6 {[4] [2] [ii] [two] [2] [iii]} {[three] [1] [four] [4] [iv] [1] [iv]}
224    7 {[ii] [ii] [two] [three] [2] [5]} {[iii] [i] [ii] [iii] [iii] [one] [one]}
225    8 {[2] [ii] [i] [two] [3] [three] [2]} {[two] [iv] [v] [iii] [3] [five]}
226    9 {[i] [2] [iv] [3] [five] [four] [v]} {[iii] [4] [three] [i] [three] [ii] [1]}
227  }
228
229  4 {5 + 1} {
230    2 {[5 1] 3 4 i} {2 2 v two 4}
231    3 {[5 i] 5 2 four 4 1} {iii ii five two 1}
232    4 {ii four 4 one 5 three five} {one [5 1] iii 4 3}
233    5 {three i [v i] four 4 1} {ii five five five iii}
234  }
235
236  5 {one + two + three} {
237    7 {ii ii two three 2 5} {iii [i ii iii] iii one one}
238    8 {2 ii [i two 3] three 2} {two iv v iii 3 five}
239  }
240
241  6 {"v v"} {
242    1 {four v 4 i three} {1 3 [five five] 4 one}
243    5 {three i v i four 4 1} {ii [five five five] iii}
244  }
245} {
246  if {![fts5_expr_ok $q t1]} continue
247  do_execsql_test 5.1.$tn {
248    SELECT rowid, highlight(t1, 0, '[', ']'), highlight(t1, 1, '[', ']')
249    FROM t1 WHERE t1 MATCH $q
250  } $res
251}
252
253# Test that the xQueryPhrase() API works with synonyms.
254#
# Decode a binary blob into a list of 32-bit signed integers, reading it in
# the byte order of the machine running the tests. The scan format is looked
# up from ::tcl_platform(byteOrder): i* for littleEndian, I* for bigEndian.
proc mit {blob} {
  array set fmtByOrder {littleEndian i* bigEndian I*}
  set fmt $fmtByOrder($::tcl_platform(byteOrder))
  binary scan $blob $fmt r
  return $r
}
# Expose the mit proc as an SQL function of the same name, then call
# sqlite3_fts5_register_matchinfo on db (implemented outside this file;
# presumably it provides the matchinfo() function used below).
261db func mit mit
262sqlite3_fts5_register_matchinfo db
263
# 5.2.*: for the query 'one', the decoded matchinfo 'x' array is expected to
# hold three integers for each of the two columns. The expected values agree
# with the data loaded in test 5.0: the first of each triple is the number
# of matches in that column of the current row, followed by the totals for
# the whole table - 11 matches in 7 rows for column a, 12 matches in 6 rows
# for column b. Synonyms of 'one' (i and 1) are counted as matches.
264foreach {tn q res} {
265  1 {one} {
266      1 {1 11 7 2 12 6}     2 {2 11 7 0 12 6}
267      3 {2 11 7 1 12 6}     4 {1 11 7 2 12 6}
268      5 {3 11 7 0 12 6}     6 {0 11 7 2 12 6}
269      7 {0 11 7 3 12 6}     8 {1 11 7 0 12 6}
270      9 {1 11 7 2 12 6}
271  }
272} {
273  do_execsql_test 5.2.$tn {
274    SELECT rowid, mit(matchinfo(t1, 'x')) FROM t1 WHERE t1 MATCH $q
275  } $res
276}
277
278#-------------------------------------------------------------------------
279# Test terms with more than 4 synonyms.
280#
# The tcl_tokenize proc defined next gives every one-character token nine
# colocated synonyms (the character repeated 2 to 10 times), but only when
# its tflags argument equals "query".
281reset_db
282sqlite3_fts5_create_tokenizer db tcl tcl_create
# Tokenizer for the 6.* tests. Every piece of text is reported as an
# ordinary token. In addition, when tflags equals "query" and the token is a
# single character, nine colocated synonyms are reported at the same
# offsets: that character repeated 2, 3, ... 10 times.
proc tcl_tokenize {tflags text} {
  foreach {term startOff endOff} [fts5_tokenize_split $text] {
    sqlite3_fts5_token $term $startOff $endOff

    # Synonyms are added for one-character query terms only.
    if {$tflags ne "query"} continue
    if {[string length $term] != 1} continue

    set reps 2
    while {$reps <= 10} {
      sqlite3_fts5_token -colo [string repeat $term $reps] $startOff $endOff
      incr reps
    }
  }
}
293
# 6.0.1 and 6.0.2: a one-letter query term must match the repeated-letter
# tokens stored in the table, including inside NEAR. Only the row that holds
# both yy and qq is expected. The NEAR test is skipped when fts5_expr_ok
# rejects the expression for t1.
294do_execsql_test 6.0.1 {
295  CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize=tcl, detail=%DETAIL%);
296  INSERT INTO t1 VALUES('yy xx qq');
297  INSERT INTO t1 VALUES('yy xx xx');
298}
299if {[fts5_expr_ok "NEAR(y q)" t1]} {
300  do_execsql_test 6.0.2 {
301    SELECT * FROM t1 WHERE t1 MATCH 'NEAR(y q)';
302  } {{yy xx qq}}
303}
304
# 6.0.3: build t2, the two-column fixture for the 6.1.* and 6.2.* tests, with
# explicit rowids 1 to 9. Every token is a single letter repeated some
# number of times.
305do_test 6.0.3 {
306  execsql {
307    CREATE VIRTUAL TABLE t2 USING fts5(a, b, tokenize=tcl, detail=%DETAIL%)
308  }
309  foreach {rowid a b} {
310    1 {yyyy vvvvv qq oo yyyyyy vvvv eee} {ffff uu r qq aaaa}
311    2 {ww oooooo bbbbb ssssss mm} {ffffff yy iiii rr s ccc qqqqq}
312    3 {zzzz llll gggggg cccc uu} {hhhhhh aaaa ppppp rr ee jjjj}
313    4 {r f i rrrrrr ww hhh} {aa yyy t x aaaaa ii}
314    5 {fffff mm vvvv ooo ffffff kkkk tttt} {cccccc bb e zzz d n}
315    6 {iii dddd hh qqqq ddd ooo} {ttt d c b aaaaaa qqqq}
316    7 {jjjj rrrr v zzzzz u tt t} {ppppp pp dddd mm hhh uuu}
317    8 {gggg rrrrrr kkkk vvvv gggg jjjjjj b} {dddddd jj r w cccc wwwwww ss}
318    9 {kkkkk qqq oooo e tttttt mmm} {e ss qqqqqq hhhh llllll gg}
319  } {
320    execsql { INSERT INTO t2(rowid, a, b) VALUES($rowid, $a, $b) }
321  }
322} {}
323
# 6.1.*: each entry is (test number, query, expected rows in ascending rowid
# order), where an expected row is rowid plus highlight() output for columns
# 0 and 1. Each query runs twice: once in default order against the list as
# written (.asc), and once with ORDER BY rowid DESC against res2 (.desc).
# Tokens are only highlighted where they take part in the match: in test 3,
# rows 2 and 9 match through o alone, so their q tokens stay unmarked.
# Queries rejected by fts5_expr_ok for t2 are skipped.
324foreach {tn q res} {
325  1 {a} {
326    1 {yyyy vvvvv qq oo yyyyyy vvvv eee} {ffff uu r qq [aaaa]}
327    3 {zzzz llll gggggg cccc uu} {hhhhhh [aaaa] ppppp rr ee jjjj}
328    4 {r f i rrrrrr ww hhh} {[aa] yyy t x [aaaaa] ii}
329    6 {iii dddd hh qqqq ddd ooo} {ttt d c b [aaaaaa] qqqq}
330  }
331
332  2 {a AND q} {
333    1 {yyyy vvvvv [qq] oo yyyyyy vvvv eee} {ffff uu r [qq] [aaaa]}
334    6 {iii dddd hh [qqqq] ddd ooo} {ttt d c b [aaaaaa] [qqqq]}
335  }
336
337  3 {o OR (q AND a)} {
338    1 {yyyy vvvvv [qq] [oo] yyyyyy vvvv eee} {ffff uu r [qq] [aaaa]}
339    2 {ww [oooooo] bbbbb ssssss mm} {ffffff yy iiii rr s ccc qqqqq}
340    5 {fffff mm vvvv [ooo] ffffff kkkk tttt} {cccccc bb e zzz d n}
341    6 {iii dddd hh [qqqq] ddd [ooo]} {ttt d c b [aaaaaa] [qqqq]}
342    9 {kkkkk qqq [oooo] e tttttt mmm} {e ss qqqqqq hhhh llllll gg}
343  }
344
345  4 {NEAR(q y, 20)} {
346    1 {[yyyy] vvvvv [qq] oo [yyyyyy] vvvv eee} {ffff uu r qq aaaa}
347    2 {ww oooooo bbbbb ssssss mm} {ffffff [yy] iiii rr s ccc [qqqqq]}
348  }
349} {
350  if {![fts5_expr_ok $q t2]} continue
351
352  do_execsql_test 6.1.$tn.asc {
353    SELECT rowid, highlight(t2, 0, '[', ']'), highlight(t2, 1, '[', ']')
354    FROM t2 WHERE t2 MATCH $q
355  } $res
356
  # Build res2 by prepending each rowid/a/b triple, which yields the same
  # rows in descending rowid order.
357  set res2 [list]
358  foreach {rowid a b} $res {
359    set res2 [concat [list $rowid $a $b] $res2]
360  }
361
362  do_execsql_test 6.1.$tn.desc {
363    SELECT rowid, highlight(t2, 0, '[', ']'), highlight(t2, 1, '[', ']')
364    FROM t2 WHERE t2 MATCH $q ORDER BY rowid DESC
365  } $res2
366}
367
# 6.2.1: add row 13, whose columns hold x and y repeated 1 to 7 times. For
# the query 'x OR y' every one of those tokens is expected to be
# highlighted, together with the y and x tokens already present in rows 1,
# 2 and 4. Column 0 is marked with angle brackets, column 1 with
# parentheses.
368do_execsql_test 6.2.1 {
369  INSERT INTO t2(rowid, a, b) VALUES(13,
370      'x xx xxx xxxx xxxxx xxxxxx xxxxxxx', 'y yy yyy yyyy yyyyy yyyyyy yyyyyyy'
371  );
372  SELECT rowid, highlight(t2, 0, '<', '>'), highlight(t2, 1, '(', ')')
373  FROM t2 WHERE t2 MATCH 'x OR y'
374} {
375  1 {<yyyy> vvvvv qq oo <yyyyyy> vvvv eee} {ffff uu r qq aaaa}
376  2 {ww oooooo bbbbb ssssss mm} {ffffff (yy) iiii rr s ccc qqqqq}
377  4 {r f i rrrrrr ww hhh} {aa (yyy) t (x) aaaaa ii}
378  13 {<x> <xx> <xxx> <xxxx> <xxxxx> <xxxxxx> <xxxxxxx>}
379     {(y) (yy) (yyy) (yyyy) (yyyyy) (yyyyyy) (yyyyyyy)}
380}
381
382#-------------------------------------------------------------------------
383# Test that the xColumnSize() API is not confused by colocated tokens.
384#
# fts5_aux_test_functions is implemented outside this file; presumably it
# registers the fts5_test_columnsize function used by tests 7.0.1 and 7.1.1
# (TODO confirm).
385reset_db
386sqlite3_fts5_create_tokenizer db tcl tcl_create
387fts5_aux_test_functions db
# Tokenizer for the 7.* tests. Every piece of text is reported as an
# ordinary token; a single-character token is additionally followed by nine
# colocated synonyms at the same offsets (that character repeated 2 to 10
# times). Unlike the 6.* tokenizer this applies whatever tflags is.
proc tcl_tokenize {tflags text} {
  foreach {term startOff endOff} [fts5_tokenize_split $text] {
    sqlite3_fts5_token $term $startOff $endOff

    # Multi-character tokens get no synonyms.
    if {[string length $term] != 1} continue

    set reps 2
    while {$reps <= 10} {
      sqlite3_fts5_token -colo [string repeat $term $reps] $startOff $endOff
      incr reps
    }
  }
}
398
# 7.0.*: table with columnsize=1. The tokenizer above adds nine colocated
# tokens to each one-character token, yet the expected sizes are the plain
# token counts per column: 3 and 4 for the first row, 2 and 10 for the
# second. Both rows are expected to match the query, whose terms are all
# synonyms of the token 0. 7.0.2 runs the integrity-check command.
399do_execsql_test 7.0.1 {
400  CREATE VIRTUAL TABLE t1 USING fts5(a, b, columnsize=1, tokenize=tcl, detail=%DETAIL%);
401  INSERT INTO t1 VALUES('0 2 3', '4 5 6 7');
402  INSERT INTO t1 VALUES('8 9', '0 0 0 0 0 0 0 0 0 0');
403  SELECT fts5_test_columnsize(t1) FROM t1 WHERE t1 MATCH '000 AND 00 AND 0';
404} {{3 4} {2 10}}
405
406do_execsql_test 7.0.2 {
407  INSERT INTO t1(t1) VALUES('integrity-check');
408}
409
# 7.1.*: the same data, query and expected sizes, this time on a table
# created with columnsize=0.
410do_execsql_test 7.1.1 {
411  CREATE VIRTUAL TABLE t2 USING fts5(a, b, columnsize=0, tokenize=tcl, detail=%DETAIL%);
412  INSERT INTO t2 VALUES('0 2 3', '4 5 6 7');
413  INSERT INTO t2 VALUES('8 9', '0 0 0 0 0 0 0 0 0 0');
414  SELECT fts5_test_columnsize(t2) FROM t2 WHERE t2 MATCH '000 AND 00 AND 0';
415} {{3 4} {2 10}}
416
417do_execsql_test 7.1.2 {
418  INSERT INTO t2(t2) VALUES('integrity-check');
419}
420
421} ;# foreach_detail_mode
422
423finish_test
424