xref: /sqlite-3.40.0/test/fts4langid.test (revision 7aa3ebee)
1# 2012 March 01
2#
3# The author disclaims copyright to this source code.  In place of
4# a legal notice, here is a blessing:
5#
6#    May you do good and not evil.
7#    May you find forgiveness for yourself and forgive others.
8#    May you share freely, never taking more than you give.
9#
10#*************************************************************************
11# This file implements regression tests for SQLite library.  The
12# focus of this script is testing the languageid=xxx FTS4 option.
13#
14
15set testdir [file dirname $argv0]
16source $testdir/tester.tcl
17set ::testprefix fts4content
18
19# If SQLITE_ENABLE_FTS3 is defined, omit this file.
20ifcapable !fts3 {
21  finish_test
22  return
23}
24
25set ::testprefix fts4langid
26
27#---------------------------------------------------------------------------
28# Test plan:
29#
30#   1.* - Warm-body tests created for specific purposes during development.
31#         Passing these doesn't really prove much.
32#
33#   2.1.* - Test that FTS queries only ever return rows associated with
34#           the requested language.
35#
36#   2.2.* - Same as 2.1.*, after an 'optimize' command.
37#
38#   2.3.* - Same as 2.1.*, after a 'rebuild' command.
39#
40#   3.* - Tests with content= tables. Both where there is a real
41#         underlying content table and where there is not.
42#
43#   4.* - Test that if one is provided, the tokenizer xLanguage method
44#         is called to configure the tokenizer before tokenizing query
45#         or document text.
46#
47#   5.* - Test the fts4aux table when the associated FTS4 table contains
48#         multiple languages.
49#
50
51do_execsql_test 1.1 {
52  CREATE VIRTUAL TABLE t1 USING fts4(a, b, languageid=lang_id);
53}
54
55do_execsql_test 1.2 {
56  SELECT sql FROM sqlite_master WHERE name = 't1_content';
57} {{CREATE TABLE 't1_content'(docid INTEGER PRIMARY KEY, 'c0a', 'c1b', langid)}}
58
59do_execsql_test 1.3 {SELECT docid FROM t1} {}
60do_execsql_test 1.4 {SELECT lang_id FROM t1} {}
61
62do_execsql_test 1.5 {INSERT INTO t1(a, b) VALUES('aaa', 'bbb')}
63do_execsql_test 1.6 {SELECT lang_id FROM t1 } {0}
64
65do_execsql_test 1.7 {INSERT INTO t1(a, b, lang_id) VALUES('aaa', 'bbb', 4)}
66do_execsql_test 1.8 {SELECT lang_id FROM t1 } {0 4}
67
68do_execsql_test 1.9  {INSERT INTO t1(a, b, lang_id) VALUES('aaa', 'bbb', 'xyz')}
69do_execsql_test 1.10 {SELECT lang_id FROM t1} {0 4 0}
70
71do_execsql_test 1.11 {
72  CREATE VIRTUAL TABLE t2 USING fts4;
73  INSERT INTO t2 VALUES('abc');
74}
75do_execsql_test 1.12 { SELECT rowid FROM t2 WHERE content MATCH 'abc' } 1
76
77do_execsql_test 1.13 {
78  DROP TABLE t1;
79  CREATE VIRTUAL TABLE t1 USING fts4(languageid=lang_id);
80  INSERT INTO t1(content)          VALUES('a b c');
81  INSERT INTO t1(content, lang_id) VALUES('a b c', 1);
82}
83
84do_execsql_test 1.14 {
85  SELECT rowid FROM t1 WHERE t1 MATCH 'b';
86} {1}
87do_execsql_test 1.15 {
88  SELECT rowid FROM t1 WHERE t1 MATCH 'b' AND lang_id = 0;
89} {1}
90
91do_execsql_test 1.16 {
92  SELECT rowid FROM t1 WHERE t1 MATCH 'b' AND lang_id = 1;
93} {2}
94
95do_catchsql_test 1.17 {
96  INSERT INTO t1(content, lang_id) VALUES('123', -1);
97} {1 {constraint failed}}
98
99do_execsql_test 1.18 {
100  DROP TABLE t1;
101  CREATE VIRTUAL TABLE t1 USING fts4(languageid=lang_id);
102  INSERT INTO t1(content, lang_id) VALUES('A', 13);
103  INSERT INTO t1(content, lang_id) VALUES('B', 13);
104  INSERT INTO t1(content, lang_id) VALUES('C', 13);
105  INSERT INTO t1(content, lang_id) VALUES('D', 13);
106  INSERT INTO t1(content, lang_id) VALUES('E', 13);
107  INSERT INTO t1(content, lang_id) VALUES('F', 13);
108  INSERT INTO t1(content, lang_id) VALUES('G', 13);
109  INSERT INTO t1(content, lang_id) VALUES('H', 13);
110  INSERT INTO t1(content, lang_id) VALUES('I', 13);
111  INSERT INTO t1(content, lang_id) VALUES('J', 13);
112  INSERT INTO t1(content, lang_id) VALUES('K', 13);
113  INSERT INTO t1(content, lang_id) VALUES('L', 13);
114  INSERT INTO t1(content, lang_id) VALUES('M', 13);
115  INSERT INTO t1(content, lang_id) VALUES('N', 13);
116  INSERT INTO t1(content, lang_id) VALUES('O', 13);
117  INSERT INTO t1(content, lang_id) VALUES('P', 13);
118  INSERT INTO t1(content, lang_id) VALUES('Q', 13);
119  INSERT INTO t1(content, lang_id) VALUES('R', 13);
120  INSERT INTO t1(content, lang_id) VALUES('S', 13);
121  SELECT rowid FROM t1 WHERE t1 MATCH 'A';
122} {}
123
124
125#-------------------------------------------------------------------------
126# Test cases 2.*
127#
128proc build_multilingual_db_1 {db} {
129  $db eval { CREATE VIRTUAL TABLE t2 USING fts4(x, y, languageid=l) }
130
131  set xwords [list zero one two three four five six seven eight nine ten]
132  set ywords [list alpha beta gamma delta epsilon zeta eta theta iota kappa]
133
134  for {set i 0} {$i < 1000} {incr i} {
135    set iLangid [expr $i%9]
136    set x ""
137    set y ""
138
139    set x [list]
140    lappend x [lindex $xwords [expr ($i / 1000) % 10]]
141    lappend x [lindex $xwords [expr ($i / 100)  % 10]]
142    lappend x [lindex $xwords [expr ($i / 10)   % 10]]
143    lappend x [lindex $xwords [expr ($i / 1)   % 10]]
144
145    set y [list]
146    lappend y [lindex $ywords [expr ($i / 1000) % 10]]
147    lappend y [lindex $ywords [expr ($i / 100)  % 10]]
148    lappend y [lindex $ywords [expr ($i / 10)   % 10]]
149    lappend y [lindex $ywords [expr ($i / 1)   % 10]]
150
151    $db eval { INSERT INTO t2(docid, x, y, l) VALUES($i, $x, $y, $iLangid) }
152  }
153
154  $db eval {
155    CREATE TABLE data(x, y, l);
156    INSERT INTO data(rowid, x, y, l) SELECT docid, x, y, l FROM t2;
157  }
158}
159
160proc rowid_list_set_langid {langid} {
161  set ::rowid_list_langid $langid
162}
163proc rowid_list {pattern} {
164  set langid $::rowid_list_langid
165  set res [list]
166  db eval {SELECT rowid, x, y FROM data WHERE l = $langid ORDER BY rowid ASC} {
167    if {[string match "*$pattern*" $x] || [string match "*$pattern*" $y]} {
168      lappend res $rowid
169    }
170  }
171  return $res
172}
173
174proc or_merge_list {list1 list2} {
175  set res [list]
176
177  set i1 0
178  set i2 0
179
180  set n1 [llength $list1]
181  set n2 [llength $list2]
182
183  while {$i1 < $n1 && $i2 < $n2} {
184    set e1 [lindex $list1 $i1]
185    set e2 [lindex $list2 $i2]
186
187    if {$e1==$e2} {
188      lappend res $e1
189      incr i1
190      incr i2
191    } elseif {$e1 < $e2} {
192      lappend res $e1
193      incr i1
194    } else {
195      lappend res $e2
196      incr i2
197    }
198  }
199
200  concat $res [lrange $list1 $i1 end] [lrange $list2 $i2 end]
201}
202
203proc or_merge_lists {args} {
204  set res [lindex $args 0]
205  for {set i 1} {$i < [llength $args]} {incr i} {
206    set res [or_merge_list $res [lindex $args $i]]
207  }
208  set res
209}
210
211proc and_merge_list {list1 list2} {
212  foreach i $list2 { set a($i) 1 }
213  set res [list]
214  foreach i $list1 {
215    if {[info exists a($i)]} {lappend res $i}
216  }
217  set res
218}
219
220
221proc and_merge_lists {args} {
222  set res [lindex $args 0]
223  for {set i 1} {$i < [llength $args]} {incr i} {
224    set res [and_merge_list $res [lindex $args $i]]
225  }
226  set res
227}
228
229proc filter_list {list langid} {
230  set res [list]
231  foreach i $list {
232    if {($i % 9) == $langid} {lappend res $i}
233  }
234  set res
235}
236
237do_test 2.0 {
238  reset_db
239  build_multilingual_db_1 db
240} {}
241
242proc do_test_query1 {tn query res_script} {
243  for {set langid 0} {$langid < 10} {incr langid} {
244    rowid_list_set_langid $langid
245    set res [eval $res_script]
246
247    set actual [
248      execsql {SELECT docid FROM t2 WHERE t2 MATCH $query AND l = $langid}
249    ]
250    do_test $tn.$langid [list set {} $actual] $res
251  }
252}
253
254# Run some queries.
255do_test_query1 2.1.1  {delta}          { rowid_list delta }
256do_test_query1 2.1.2  {"zero one two"} { rowid_list "zero one two" }
257do_test_query1 2.1.3  {zero one two} {
258  and_merge_lists [rowid_list zero] [rowid_list one] [rowid_list two]
259}
260do_test_query1 2.1.4  {"zero one" OR "one two"} {
261  or_merge_lists [rowid_list "zero one"] [rowid_list "one two"]
262}
263
264# Now try the same tests as above, but after running the 'optimize'
265# command on the FTS table.
266#
267do_execsql_test 2.2 {
268  INSERT INTO t2(t2) VALUES('optimize');
269  SELECT count(*) FROM t2_segdir;
270} {9}
271do_test_query1 2.2.1 {delta}          { rowid_list delta }
272do_test_query1 2.2.2 {"zero one two"} { rowid_list "zero one two" }
273do_test_query1 2.2.3 {zero one two} {
274  and_merge_lists [rowid_list zero] [rowid_list one] [rowid_list two]
275}
276do_test_query1 2.2.4 {"zero one" OR "one two"} {
277  or_merge_lists [rowid_list "zero one"] [rowid_list "one two"]
278}
279
280# And rebuild.
281#
282do_test 2.3 {
283  reset_db
284  build_multilingual_db_1 db
285  execsql { INSERT INTO t2(t2) VALUES('rebuild') }
286} {}
287do_test_query1 2.3.1 {delta}          { rowid_list delta }
288do_test_query1 2.3.2 {"zero one two"} { rowid_list "zero one two" }
289do_test_query1 2.3.3 {zero one two} {
290  and_merge_lists [rowid_list zero] [rowid_list one] [rowid_list two]
291}
292do_test_query1 2.3.4 {"zero one" OR "one two"} {
293  or_merge_lists [rowid_list "zero one"] [rowid_list "one two"]
294}
295
296#-------------------------------------------------------------------------
297# Test cases 3.*
298#
299do_test 3.0 {
300  reset_db
301  build_multilingual_db_1 db
302  execsql {
303    CREATE TABLE t3_data(l, x, y);
304    INSERT INTO t3_data(rowid, l, x, y) SELECT docid, l, x, y FROM t2;
305    DROP TABLE t2;
306  }
307} {}
308do_execsql_test 3.1 {
309  CREATE VIRTUAL TABLE t2 USING fts4(content=t3_data, languageid=l);
310  INSERT INTO t2(t2) VALUES('rebuild');
311}
312
313do_test_query1 3.1.1 {delta}          { rowid_list delta }
314do_test_query1 3.1.2 {"zero one two"} { rowid_list "zero one two" }
315do_test_query1 3.1.3 {zero one two} {
316  and_merge_lists [rowid_list zero] [rowid_list one] [rowid_list two]
317}
318do_test_query1 3.1.4 {"zero one" OR "one two"} {
319  or_merge_lists [rowid_list "zero one"] [rowid_list "one two"]
320}
321
322do_execsql_test 3.2.1 {
323  DROP TABLE t2;
324  CREATE VIRTUAL TABLE t2 USING fts4(x, y, languageid=l, content=nosuchtable);
325}
326
327do_execsql_test 3.2.2 {
328  INSERT INTO t2(docid, x, y, l) SELECT rowid, x, y, l FROM t3_data;
329}
330
331do_execsql_test 3.2.3 {
332  DROP TABLE t3_data;
333}
334
335do_test_query1 3.3.1 {delta}          { rowid_list delta }
336do_test_query1 3.3.2 {"zero one two"} { rowid_list "zero one two" }
337do_test_query1 3.3.3 {zero one two} {
338  and_merge_lists [rowid_list zero] [rowid_list one] [rowid_list two]
339}
340do_test_query1 3.3.4 {"zero one" OR "one two"} {
341  or_merge_lists [rowid_list "zero one"] [rowid_list "one two"]
342}
343
344#-------------------------------------------------------------------------
345# Test cases 4.*
346#
347proc build_multilingual_db_2 {db} {
348  $db eval {
349    CREATE VIRTUAL TABLE t4 USING fts4(
350        tokenize=testtokenizer,
351        languageid=lid
352    );
353  }
354  for {set i 0} {$i < 50} {incr i} {
355    execsql {
356      INSERT INTO t4(docid, content, lid) VALUES($i, 'The Quick Brown Fox', $i)
357    }
358  }
359}
360
361ifcapable fts3_tokenizer {
362  do_test 4.1.0 {
363    reset_db
364    set ptr [fts3_test_tokenizer]
365    execsql { SELECT fts3_tokenizer('testtokenizer', $ptr) }
366    build_multilingual_db_2 db
367  } {}
368  do_execsql_test 4.1.1 {
369    SELECT docid FROM t4 WHERE t4 MATCH 'quick';
370  } {0}
371  do_execsql_test 4.1.2 {
372    SELECT docid FROM t4 WHERE t4 MATCH 'quick' AND lid=1;
373  } {}
374  do_execsql_test 4.1.3 {
375    SELECT docid FROM t4 WHERE t4 MATCH 'Quick' AND lid=1;
376  } {1}
377  for {set i 0} {$i < 50} {incr i} {
378    do_execsql_test 4.1.4.$i {
379      SELECT count(*) FROM t4 WHERE t4 MATCH 'fox' AND lid=$i;
380    } [expr 0==($i%2)]
381  }
382  do_catchsql_test 4.1.5 {
383    INSERT INTO t4(content, lid) VALUES('hello world', 101)
384  } {1 {SQL logic error or missing database}}
385}
386
387#-------------------------------------------------------------------------
388# Test cases 5.*
389#
390# The following test cases are designed to detect a 32-bit overflow bug
391# that existed at one point.
392#
393proc build_multilingual_db_3 {db} {
394  $db eval {
395    CREATE VIRTUAL TABLE t5 USING fts4(languageid=lid);
396  }
397  set languages [list 0 1 2 [expr 1<<30]]
398
399  foreach lid $languages {
400    execsql {
401      INSERT INTO t5(docid, content, lid) VALUES(
402          $lid, 'My language is ' || $lid, $lid
403      )
404    }
405  }
406}
407
408do_test 5.1.0 {
409  reset_db
410  build_multilingual_db_3 db
411} {}
412
413do_execsql_test 5.1.1 {
414  SELECT level FROM t5_segdir;
415} [list 0 1024 2048 [expr 1<<40]]
416
417do_execsql_test 5.1.2 {SELECT docid FROM t5 WHERE t5 MATCH 'language'} 0
418foreach langid [list 0 1 2 [expr 1<<30]] {
419  do_execsql_test 5.2.$langid {
420    SELECT docid FROM t5 WHERE t5 MATCH 'language' AND lid = $langid
421  } $langid
422}
423
424set lid [expr 1<<30]
425do_execsql_test 5.3.1 {
426  CREATE VIRTUAL TABLE t6 USING fts4(languageid=lid);
427  INSERT INTO t6 VALUES('I belong to language 0!');
428}
429do_test 5.3.2 {
430  for {set i 0} {$i < 20} {incr i} {
431    execsql {
432      INSERT INTO t6(content, lid) VALUES(
433        'I (row '||$i||') belong to langauge N!', $lid
434      );
435    }
436  }
437  execsql { SELECT docid FROM t6 WHERE t6 MATCH 'belong' }
438} {1}
439
440do_test 5.3.3 {
441  execsql { SELECT docid FROM t6 WHERE t6 MATCH 'belong' AND lid=$lid}
442} {2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21}
443
444do_execsql_test 5.3.4 { INSERT INTO t6(t6) VALUES('optimize') } {}
445do_execsql_test 5.3.5 { SELECT docid FROM t6 WHERE t6 MATCH 'belong' } {1}
446do_execsql_test 5.3.6 {
447  SELECT docid FROM t6 WHERE t6 MATCH 'belong' AND lid=$lid
448} {2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21}
449
450
451set lid [expr 1<<30]
452foreach lid [list 4 [expr 1<<30]] {
453  do_execsql_test 5.4.$lid.1 {
454    DELETE FROM t6;
455    SELECT count(*) FROM t6_segdir;
456    SELECT count(*) FROM t6_segments;
457  } {0 0}
458  do_execsql_test 5.4.$lid.2 {
459    INSERT INTO t6(content, lid) VALUES('zero zero zero', $lid);
460    INSERT INTO t6(content, lid) VALUES('zero zero one', $lid);
461    INSERT INTO t6(content, lid) VALUES('zero one zero', $lid);
462    INSERT INTO t6(content, lid) VALUES('zero one one', $lid);
463    INSERT INTO t6(content, lid) VALUES('one zero zero', $lid);
464    INSERT INTO t6(content, lid) VALUES('one zero one', $lid);
465    INSERT INTO t6(content, lid) VALUES('one one zero', $lid);
466    INSERT INTO t6(content, lid) VALUES('one one one', $lid);
467
468    SELECT docid FROM t6 WHERE t6 MATCH '"zero zero"' AND lid=$lid;
469  } {1 2 5}
470
471  do_execsql_test 5.4.$lid.3 {
472    SELECT count(*) FROM t6_segdir;
473    SELECT count(*) FROM t6_segments;
474  } {8 0}
475
476  do_execsql_test 5.4.$lid.4 {
477    INSERT INTO t6(t6) VALUES('merge=100,3');
478    INSERT INTO t6(t6) VALUES('merge=100,3');
479    SELECT docid FROM t6 WHERE t6 MATCH '"zero zero"' AND lid=$lid;
480  } {1 2 5}
481
482  do_execsql_test 5.4.$lid.5 {
483    SELECT count(*) FROM t6_segdir;
484    SELECT count(*) FROM t6_segments;
485  } {4 4}
486}
487finish_test
488