xref: /sqlite-3.40.0/test/fts4langid.test (revision e88af017)
1# 2012 March 01
2#
3# The author disclaims copyright to this source code.  In place of
4# a legal notice, here is a blessing:
5#
6#    May you do good and not evil.
7#    May you find forgiveness for yourself and forgive others.
8#    May you share freely, never taking more than you give.
9#
10#*************************************************************************
11# This file implements regression tests for SQLite library.  The
12# focus of this script is testing the languageid=xxx FTS4 option.
13#
14
15set testdir [file dirname $argv0]
16source $testdir/tester.tcl
17
18# If SQLITE_ENABLE_FTS3 is defined, omit this file.
19ifcapable !fts3 {
20  finish_test
21  return
22}
23
24set ::testprefix fts4langid
25
26#---------------------------------------------------------------------------
27# Test plan:
28#
29#   1.* - Warm-body tests created for specific purposes during development.
30#         Passing these doesn't really prove much.
31#
32#   2.1.* - Test that FTS queries only ever return rows associated with
33#           the requested language.
34#
35#   2.2.* - Same as 2.1.*, after an 'optimize' command.
36#
37#   2.3.* - Same as 2.1.*, after a 'rebuild' command.
38#
39#   3.* - Tests with content= tables. Both where there is a real
40#         underlying content table and where there is not.
41#
42#   4.* - Test that if one is provided, the tokenizer xLanguage method
43#         is called to configure the tokenizer before tokenizing query
44#         or document text.
45#
46#   5.* - Test the fts4aux table when the associated FTS4 table contains
47#         multiple languages.
48#
49
50do_execsql_test 1.1 {
51  CREATE VIRTUAL TABLE t1 USING fts4(a, b, languageid=lang_id);
52}
53
54do_execsql_test 1.2 {
55  SELECT sql FROM sqlite_master WHERE name = 't1_content';
56} {{CREATE TABLE 't1_content'(docid INTEGER PRIMARY KEY, 'c0a', 'c1b', langid)}}
57
58do_execsql_test 1.3 {SELECT docid FROM t1} {}
59do_execsql_test 1.4 {SELECT lang_id FROM t1} {}
60
61do_execsql_test 1.5 {INSERT INTO t1(a, b) VALUES('aaa', 'bbb')}
62do_execsql_test 1.6 {SELECT lang_id FROM t1 } {0}
63
64do_execsql_test 1.7 {INSERT INTO t1(a, b, lang_id) VALUES('aaa', 'bbb', 4)}
65do_execsql_test 1.8 {SELECT lang_id FROM t1 } {0 4}
66
67do_execsql_test 1.9  {INSERT INTO t1(a, b, lang_id) VALUES('aaa', 'bbb', 'xyz')}
68do_execsql_test 1.10 {SELECT lang_id FROM t1} {0 4 0}
69
70do_execsql_test 1.11 {
71  CREATE VIRTUAL TABLE t2 USING fts4;
72  INSERT INTO t2 VALUES('abc');
73}
74do_execsql_test 1.12 { SELECT rowid FROM t2 WHERE content MATCH 'abc' } 1
75
76do_execsql_test 1.13 {
77  DROP TABLE t1;
78  CREATE VIRTUAL TABLE t1 USING fts4(languageid=lang_id);
79  INSERT INTO t1(content)          VALUES('a b c');
80  INSERT INTO t1(content, lang_id) VALUES('a b c', 1);
81}
82
83do_execsql_test 1.14 {
84  SELECT rowid FROM t1 WHERE t1 MATCH 'b';
85} {1}
86do_execsql_test 1.15 {
87  SELECT rowid FROM t1 WHERE t1 MATCH 'b' AND lang_id = 0;
88} {1}
89
90do_execsql_test 1.16 {
91  SELECT rowid FROM t1 WHERE t1 MATCH 'b' AND lang_id = 1;
92} {2}
93
94do_catchsql_test 1.17 {
95  INSERT INTO t1(content, lang_id) VALUES('123', -1);
96} {1 {constraint failed}}
97
98do_execsql_test 1.18 {
99  DROP TABLE t1;
100  CREATE VIRTUAL TABLE t1 USING fts4(languageid=lang_id);
101  INSERT INTO t1(content, lang_id) VALUES('A', 13);
102  INSERT INTO t1(content, lang_id) VALUES('B', 13);
103  INSERT INTO t1(content, lang_id) VALUES('C', 13);
104  INSERT INTO t1(content, lang_id) VALUES('D', 13);
105  INSERT INTO t1(content, lang_id) VALUES('E', 13);
106  INSERT INTO t1(content, lang_id) VALUES('F', 13);
107  INSERT INTO t1(content, lang_id) VALUES('G', 13);
108  INSERT INTO t1(content, lang_id) VALUES('H', 13);
109  INSERT INTO t1(content, lang_id) VALUES('I', 13);
110  INSERT INTO t1(content, lang_id) VALUES('J', 13);
111  INSERT INTO t1(content, lang_id) VALUES('K', 13);
112  INSERT INTO t1(content, lang_id) VALUES('L', 13);
113  INSERT INTO t1(content, lang_id) VALUES('M', 13);
114  INSERT INTO t1(content, lang_id) VALUES('N', 13);
115  INSERT INTO t1(content, lang_id) VALUES('O', 13);
116  INSERT INTO t1(content, lang_id) VALUES('P', 13);
117  INSERT INTO t1(content, lang_id) VALUES('Q', 13);
118  INSERT INTO t1(content, lang_id) VALUES('R', 13);
119  INSERT INTO t1(content, lang_id) VALUES('S', 13);
120  SELECT rowid FROM t1 WHERE t1 MATCH 'A';
121} {}
122
123
124#-------------------------------------------------------------------------
125# Test cases 2.*
126#
127proc build_multilingual_db_1 {db} {
128  $db eval { CREATE VIRTUAL TABLE t2 USING fts4(x, y, languageid=l) }
129
130  set xwords [list zero one two three four five six seven eight nine ten]
131  set ywords [list alpha beta gamma delta epsilon zeta eta theta iota kappa]
132
133  for {set i 0} {$i < 1000} {incr i} {
134    set iLangid [expr $i%9]
135    set x ""
136    set y ""
137
138    set x [list]
139    lappend x [lindex $xwords [expr ($i / 1000) % 10]]
140    lappend x [lindex $xwords [expr ($i / 100)  % 10]]
141    lappend x [lindex $xwords [expr ($i / 10)   % 10]]
142    lappend x [lindex $xwords [expr ($i / 1)   % 10]]
143
144    set y [list]
145    lappend y [lindex $ywords [expr ($i / 1000) % 10]]
146    lappend y [lindex $ywords [expr ($i / 100)  % 10]]
147    lappend y [lindex $ywords [expr ($i / 10)   % 10]]
148    lappend y [lindex $ywords [expr ($i / 1)   % 10]]
149
150    $db eval { INSERT INTO t2(docid, x, y, l) VALUES($i, $x, $y, $iLangid) }
151  }
152
153  $db eval {
154    CREATE TABLE data(x, y, l);
155    INSERT INTO data(rowid, x, y, l) SELECT docid, x, y, l FROM t2;
156  }
157}
158
159proc rowid_list_set_langid {langid} {
160  set ::rowid_list_langid $langid
161}
162proc rowid_list {pattern} {
163  set langid $::rowid_list_langid
164  set res [list]
165  db eval {SELECT rowid, x, y FROM data WHERE l = $langid ORDER BY rowid ASC} {
166    if {[string match "*$pattern*" $x] || [string match "*$pattern*" $y]} {
167      lappend res $rowid
168    }
169  }
170  return $res
171}
172
173proc or_merge_list {list1 list2} {
174  set res [list]
175
176  set i1 0
177  set i2 0
178
179  set n1 [llength $list1]
180  set n2 [llength $list2]
181
182  while {$i1 < $n1 && $i2 < $n2} {
183    set e1 [lindex $list1 $i1]
184    set e2 [lindex $list2 $i2]
185
186    if {$e1==$e2} {
187      lappend res $e1
188      incr i1
189      incr i2
190    } elseif {$e1 < $e2} {
191      lappend res $e1
192      incr i1
193    } else {
194      lappend res $e2
195      incr i2
196    }
197  }
198
199  concat $res [lrange $list1 $i1 end] [lrange $list2 $i2 end]
200}
201
202proc or_merge_lists {args} {
203  set res [lindex $args 0]
204  for {set i 1} {$i < [llength $args]} {incr i} {
205    set res [or_merge_list $res [lindex $args $i]]
206  }
207  set res
208}
209
210proc and_merge_list {list1 list2} {
211  foreach i $list2 { set a($i) 1 }
212  set res [list]
213  foreach i $list1 {
214    if {[info exists a($i)]} {lappend res $i}
215  }
216  set res
217}
218
219
220proc and_merge_lists {args} {
221  set res [lindex $args 0]
222  for {set i 1} {$i < [llength $args]} {incr i} {
223    set res [and_merge_list $res [lindex $args $i]]
224  }
225  set res
226}
227
228proc filter_list {list langid} {
229  set res [list]
230  foreach i $list {
231    if {($i % 9) == $langid} {lappend res $i}
232  }
233  set res
234}
235
236do_test 2.0 {
237  reset_db
238  build_multilingual_db_1 db
239} {}
240
241proc do_test_query1 {tn query res_script} {
242  for {set langid 0} {$langid < 10} {incr langid} {
243    rowid_list_set_langid $langid
244    set res [eval $res_script]
245
246    set actual [
247      execsql {SELECT docid FROM t2 WHERE t2 MATCH $query AND l = $langid}
248    ]
249    do_test $tn.$langid [list set {} $actual] $res
250  }
251}
252
253# Run some queries.
254do_test_query1 2.1.1  {delta}          { rowid_list delta }
255do_test_query1 2.1.2  {"zero one two"} { rowid_list "zero one two" }
256do_test_query1 2.1.3  {zero one two} {
257  and_merge_lists [rowid_list zero] [rowid_list one] [rowid_list two]
258}
259do_test_query1 2.1.4  {"zero one" OR "one two"} {
260  or_merge_lists [rowid_list "zero one"] [rowid_list "one two"]
261}
262
263# Now try the same tests as above, but after running the 'optimize'
264# command on the FTS table.
265#
266do_execsql_test 2.2 {
267  INSERT INTO t2(t2) VALUES('optimize');
268  SELECT count(*) FROM t2_segdir;
269} {9}
270do_test_query1 2.2.1 {delta}          { rowid_list delta }
271do_test_query1 2.2.2 {"zero one two"} { rowid_list "zero one two" }
272do_test_query1 2.2.3 {zero one two} {
273  and_merge_lists [rowid_list zero] [rowid_list one] [rowid_list two]
274}
275do_test_query1 2.2.4 {"zero one" OR "one two"} {
276  or_merge_lists [rowid_list "zero one"] [rowid_list "one two"]
277}
278
279# And rebuild.
280#
281do_test 2.3 {
282  reset_db
283  build_multilingual_db_1 db
284  execsql { INSERT INTO t2(t2) VALUES('rebuild') }
285} {}
286do_test_query1 2.3.1 {delta}          { rowid_list delta }
287do_test_query1 2.3.2 {"zero one two"} { rowid_list "zero one two" }
288do_test_query1 2.3.3 {zero one two} {
289  and_merge_lists [rowid_list zero] [rowid_list one] [rowid_list two]
290}
291do_test_query1 2.3.4 {"zero one" OR "one two"} {
292  or_merge_lists [rowid_list "zero one"] [rowid_list "one two"]
293}
294
295#-------------------------------------------------------------------------
296# Test cases 3.*
297#
298do_test 3.0 {
299  reset_db
300  build_multilingual_db_1 db
301  execsql {
302    CREATE TABLE t3_data(l, x, y);
303    INSERT INTO t3_data(rowid, l, x, y) SELECT docid, l, x, y FROM t2;
304    DROP TABLE t2;
305  }
306} {}
307do_execsql_test 3.1 {
308  CREATE VIRTUAL TABLE t2 USING fts4(content=t3_data, languageid=l);
309  INSERT INTO t2(t2) VALUES('rebuild');
310}
311
312do_test_query1 3.1.1 {delta}          { rowid_list delta }
313do_test_query1 3.1.2 {"zero one two"} { rowid_list "zero one two" }
314do_test_query1 3.1.3 {zero one two} {
315  and_merge_lists [rowid_list zero] [rowid_list one] [rowid_list two]
316}
317do_test_query1 3.1.4 {"zero one" OR "one two"} {
318  or_merge_lists [rowid_list "zero one"] [rowid_list "one two"]
319}
320
321do_execsql_test 3.2.1 {
322  DROP TABLE t2;
323  CREATE VIRTUAL TABLE t2 USING fts4(x, y, languageid=l, content=nosuchtable);
324}
325
326do_execsql_test 3.2.2 {
327  INSERT INTO t2(docid, x, y, l) SELECT rowid, x, y, l FROM t3_data;
328}
329
330do_execsql_test 3.2.3 {
331  DROP TABLE t3_data;
332}
333
334do_test_query1 3.3.1 {delta}          { rowid_list delta }
335do_test_query1 3.3.2 {"zero one two"} { rowid_list "zero one two" }
336do_test_query1 3.3.3 {zero one two} {
337  and_merge_lists [rowid_list zero] [rowid_list one] [rowid_list two]
338}
339do_test_query1 3.3.4 {"zero one" OR "one two"} {
340  or_merge_lists [rowid_list "zero one"] [rowid_list "one two"]
341}
342
343do_execsql_test 3.4 {
344  CREATE TABLE t8c(a, b);
345  CREATE VIRTUAL TABLE t8 USING fts4(content=t8c, languageid=langid);
346  INSERT INTO t8(docid, a, b) VALUES(-1, 'one two three', 'x y z');
347  SELECT docid FROM t8 WHERE t8 MATCH 'one x' AND langid=0
348} {-1}
349
350#-------------------------------------------------------------------------
351# Test cases 4.*
352#
353proc build_multilingual_db_2 {db} {
354  $db eval {
355    CREATE VIRTUAL TABLE t4 USING fts4(
356        tokenize=testtokenizer,
357        languageid=lid
358    );
359  }
360  for {set i 0} {$i < 50} {incr i} {
361    execsql {
362      INSERT INTO t4(docid, content, lid) VALUES($i, 'The Quick Brown Fox', $i)
363    }
364  }
365}
366
367do_test 4.1.0 {
368  reset_db
369  set ptr [fts3_test_tokenizer]
370  sqlite3_db_config db SQLITE_DBCONFIG_ENABLE_FTS3_TOKENIZER 1
371  execsql { SELECT fts3_tokenizer('testtokenizer', $ptr) }
372  build_multilingual_db_2 db
373} {}
374do_execsql_test 4.1.1 {
375  SELECT docid FROM t4 WHERE t4 MATCH 'quick';
376} {0}
377do_execsql_test 4.1.2 {
378  SELECT docid FROM t4 WHERE t4 MATCH 'quick' AND lid=1;
379} {}
380do_execsql_test 4.1.3 {
381  SELECT docid FROM t4 WHERE t4 MATCH 'Quick' AND lid=1;
382} {1}
383for {set i 0} {$i < 50} {incr i} {
384  do_execsql_test 4.1.4.$i {
385    SELECT count(*) FROM t4 WHERE t4 MATCH 'fox' AND lid=$i;
386  } [expr 0==($i%2)]
387}
388do_catchsql_test 4.1.5 {
389  INSERT INTO t4(content, lid) VALUES('hello world', 101)
390} {1 {SQL logic error}}
391
392#-------------------------------------------------------------------------
393# Test cases 5.*
394#
395# The following test cases are designed to detect a 32-bit overflow bug
396# that existed at one point.
397#
398proc build_multilingual_db_3 {db} {
399  $db eval {
400    CREATE VIRTUAL TABLE t5 USING fts4(languageid=lid);
401  }
402  set languages [list 0 1 2 [expr 1<<30]]
403
404  foreach lid $languages {
405    execsql {
406      INSERT INTO t5(docid, content, lid) VALUES(
407          $lid, 'My language is ' || $lid, $lid
408      )
409    }
410  }
411}
412
413do_test 5.1.0 {
414  reset_db
415  build_multilingual_db_3 db
416} {}
417
418do_execsql_test 5.1.1 {
419  SELECT level FROM t5_segdir;
420} [list 0 1024 2048 [expr 1<<40]]
421
422do_execsql_test 5.1.2 {SELECT docid FROM t5 WHERE t5 MATCH 'language'} 0
423foreach langid [list 0 1 2 [expr 1<<30]] {
424  do_execsql_test 5.2.$langid {
425    SELECT docid FROM t5 WHERE t5 MATCH 'language' AND lid = $langid
426  } $langid
427}
428
429set lid [expr 1<<30]
430do_execsql_test 5.3.1 {
431  CREATE VIRTUAL TABLE t6 USING fts4(languageid=lid);
432  INSERT INTO t6 VALUES('I belong to language 0!');
433}
434do_test 5.3.2 {
435  for {set i 0} {$i < 20} {incr i} {
436    execsql {
437      INSERT INTO t6(content, lid) VALUES(
438        'I (row '||$i||') belong to langauge N!', $lid
439      );
440    }
441  }
442  execsql { SELECT docid FROM t6 WHERE t6 MATCH 'belong' }
443} {1}
444
445do_test 5.3.3 {
446  execsql { SELECT docid FROM t6 WHERE t6 MATCH 'belong' AND lid=$lid}
447} {2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21}
448
449do_execsql_test 5.3.4 { INSERT INTO t6(t6) VALUES('optimize') } {}
450do_execsql_test 5.3.5 { SELECT docid FROM t6 WHERE t6 MATCH 'belong' } {1}
451do_execsql_test 5.3.6 {
452  SELECT docid FROM t6 WHERE t6 MATCH 'belong' AND lid=$lid
453} {2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21}
454
455
456set lid [expr 1<<30]
457foreach lid [list 4 [expr 1<<30]] {
458  do_execsql_test 5.4.$lid.1 {
459    DELETE FROM t6;
460    SELECT count(*) FROM t6_segdir;
461    SELECT count(*) FROM t6_segments;
462  } {0 0}
463  do_execsql_test 5.4.$lid.2 {
464    INSERT INTO t6(content, lid) VALUES('zero zero zero', $lid);
465    INSERT INTO t6(content, lid) VALUES('zero zero one', $lid);
466    INSERT INTO t6(content, lid) VALUES('zero one zero', $lid);
467    INSERT INTO t6(content, lid) VALUES('zero one one', $lid);
468    INSERT INTO t6(content, lid) VALUES('one zero zero', $lid);
469    INSERT INTO t6(content, lid) VALUES('one zero one', $lid);
470    INSERT INTO t6(content, lid) VALUES('one one zero', $lid);
471    INSERT INTO t6(content, lid) VALUES('one one one', $lid);
472
473    SELECT docid FROM t6 WHERE t6 MATCH '"zero zero"' AND lid=$lid;
474  } {1 2 5}
475
476  do_execsql_test 5.4.$lid.3 {
477    SELECT count(*) FROM t6_segdir;
478    SELECT count(*) FROM t6_segments;
479  } {8 0}
480
481  do_execsql_test 5.4.$lid.4 {
482    INSERT INTO t6(t6) VALUES('merge=100,3');
483    INSERT INTO t6(t6) VALUES('merge=100,3');
484    SELECT docid FROM t6 WHERE t6 MATCH '"zero zero"' AND lid=$lid;
485  } {1 2 5}
486
487  do_execsql_test 5.4.$lid.5 {
488    SELECT count(*) FROM t6_segdir;
489    SELECT count(*) FROM t6_segments;
490  } {1 2}
491}
492
493reset_db
494do_execsql_test 6.0 {
495  CREATE VIRTUAL TABLE vt0 USING fts4(c0, languageid="lid");
496  INSERT INTO vt0 VALUES ('a'), ('b');
497  BEGIN;
498    UPDATE vt0 SET lid = 1 WHERE lid=0;
499}
500do_execsql_test 6.1 {
501  INSERT INTO vt0(vt0) VALUES('integrity-check');
502}
503do_execsql_test 6.2 {
504  COMMIT;
505  INSERT INTO vt0(vt0) VALUES('integrity-check');
506}
507finish_test
508