xref: /sqlite-3.40.0/test/fts3rnd.test (revision 786b0689)
1# 2009 December 03
2#
3#    May you do good and not evil.
4#    May you find forgiveness for yourself and forgive others.
5#    May you share freely, never taking more than you give.
6#
7#***********************************************************************
8#
9# Brute force (random data) tests for FTS3.
10#
11
12#-------------------------------------------------------------------------
13#
14# The FTS3 tests implemented in this file focus on testing that FTS3
15# returns the correct set of documents for various types of full-text
16# query. This is done using pseudo-randomly generated data and queries.
17# The expected result of each query is calculated using Tcl code.
18#
19#   1. The database is initialized to contain a single table with three
20#      columns. 100 rows are inserted into the table. Each of the three
21#      values in each row is a document consisting of between 0 and 100
22#      terms. Terms are selected from a vocabulary of $G(nVocab) terms.
23#
24#   2. The following is performed 100 times:
25#
26#      a. A row is inserted into the database. The row contents are
27#         generated as in step 1. The docid is a pseudo-randomly selected
28#         value between 0 and 1000000.
29#
30#      b. A psuedo-randomly selected row is updated. One of its columns is
31#         set to contain a new document generated in the same way as the
32#         documents in step 1.
33#
34#      c. A psuedo-randomly selected row is deleted.
35#
36#      d. For each of several types of fts3 queries, 10 SELECT queries
37#         of the form:
38#
39#           SELECT docid FROM <tbl> WHERE <tbl> MATCH '<query>'
40#
41#         are evaluated. The results are compared to those calculated by
42#         Tcl code in this file. The patterns used for the different query
43#         types are:
44#
45#           1.  query = <term>
46#           2.  query = <prefix>
47#           3.  query = "<term> <term>"
48#           4.  query = "<term> <term> <term>"
49#           5.  query = "<prefix> <prefix> <prefix>"
50#           6.  query = <term> NEAR <term>
51#           7.  query = <term> NEAR/11 <term> NEAR/11 <term>
52#           8.  query = <term> OR <term>
53#           9.  query = <term> NOT <term>
54#           10. query = <term> AND <term>
55#           11. query = <term> NEAR <term> OR <term> NEAR <term>
56#           12. query = <term> NEAR <term> NOT <term> NEAR <term>
57#           13. query = <term> NEAR <term> AND <term> NEAR <term>
58#
59#         where <term> is a term psuedo-randomly selected from the vocabulary
60#         and prefix is the first 2 characters of such a term followed by
61#         a "*" character.
62#
63#      Every second iteration, steps (a) through (d) above are performed
64#      within a single transaction. This forces the queries in (d) to
65#      read data from both the database and the in-memory hash table
66#      that caches the full-text index entries created by steps (a), (b)
67#      and (c) until the transaction is committed.
68#
69# The procedure above is run 5 times, using advisory fts3 node sizes of 50,
70# 500, 1000 and 2000 bytes.
71#
72# After the test using an advisory node-size of 50, an OOM test is run using
73# the database. This test is similar to step (d) above, except that it tests
74# the effects of transient and persistent OOM conditions encountered while
75# executing each query.
76#
77
78set testdir [file dirname $argv0]
79source $testdir/tester.tcl
80
81# If this build does not include FTS3, skip the tests in this file.
82#
83ifcapable !fts3 { finish_test ; return }
84source $testdir/fts3_common.tcl
85source $testdir/malloc_common.tcl
86
87set G(nVocab) 100
88
89set nVocab 100
90set lVocab [list]
91
92expr srand(0)
93
94# Generate a vocabulary of nVocab words. Each word is 3 characters long.
95#
96set lChar {a b c d e f g h i j k l m n o p q r s t u v w x y z}
97for {set i 0} {$i < $nVocab} {incr i} {
98  set len [expr int(rand()*3)+2]
99  set    word [lindex $lChar [expr int(rand()*26)]]
100  append word [lindex $lChar [expr int(rand()*26)]]
101  if {$len>2} { append word [lindex $lChar [expr int(rand()*26)]] }
102  if {$len>3} { append word [lindex $lChar [expr int(rand()*26)]] }
103  lappend lVocab $word
104}
105
106proc random_term {} {
107  lindex $::lVocab [expr {int(rand()*$::nVocab)}]
108}
109
110# Return a document consisting of $nWord arbitrarily selected terms
111# from the $::lVocab list.
112#
113proc generate_doc {nWord} {
114  set doc [list]
115  for {set i 0} {$i < $nWord} {incr i} {
116    lappend doc [random_term]
117  }
118  return $doc
119}
120
121
122
123# Primitives to update the table.
124#
125unset -nocomplain t1
126proc insert_row {rowid} {
127  set a [generate_doc [expr int((rand()*100))]]
128  set b [generate_doc [expr int((rand()*100))]]
129  set c [generate_doc [expr int((rand()*100))]]
130  execsql { INSERT INTO t1(docid, a, b, c) VALUES($rowid, $a, $b, $c) }
131  set ::t1($rowid) [list $a $b $c]
132}
133proc delete_row {rowid} {
134  execsql { DELETE FROM t1 WHERE rowid = $rowid }
135  catch {unset ::t1($rowid)}
136}
137proc update_row {rowid} {
138  set cols {a b c}
139  set iCol [expr int(rand()*3)]
140  set doc  [generate_doc [expr int((rand()*100))]]
141  lset ::t1($rowid) $iCol $doc
142  execsql "UPDATE t1 SET [lindex $cols $iCol] = \$doc WHERE rowid = \$rowid"
143}
144
145proc simple_phrase {zPrefix} {
146  set ret [list]
147
148  set reg [string map {* {[^ ]*}} $zPrefix]
149  set reg " $reg "
150
151  foreach key [lsort -integer [array names ::t1]] {
152    set value $::t1($key)
153    set cnt [list]
154    foreach col $value {
155      if {[regexp $reg " $col "]} { lappend ret $key ; break }
156    }
157  }
158
159  #lsort -uniq -integer $ret
160  set ret
161}
162
163# This [proc] is used to test the FTS3 matchinfo() function.
164#
165proc simple_token_matchinfo {zToken bDesc} {
166
167  set nDoc(0) 0
168  set nDoc(1) 0
169  set nDoc(2) 0
170  set nHit(0) 0
171  set nHit(1) 0
172  set nHit(2) 0
173
174  set dir -inc
175  if {$bDesc} { set dir -dec }
176
177  foreach key [array names ::t1] {
178    set value $::t1($key)
179    set a($key) [list]
180    foreach i {0 1 2} col $value {
181      set hit [llength [lsearch -all $col $zToken]]
182      lappend a($key) $hit
183      incr nHit($i) $hit
184      if {$hit>0} { incr nDoc($i) }
185    }
186  }
187
188  set ret [list]
189  foreach docid [lsort -integer $dir [array names a]] {
190    if { [lindex [lsort -integer $a($docid)] end] } {
191      set matchinfo [list 1 3]
192      foreach i {0 1 2} hit $a($docid) {
193        lappend matchinfo $hit $nHit($i) $nDoc($i)
194      }
195      lappend ret $docid $matchinfo
196    }
197  }
198
199  set ret
200}
201
202proc simple_near {termlist nNear} {
203  set ret [list]
204
205  foreach {key value} [array get ::t1] {
206    foreach v $value {
207
208      set l [lsearch -exact -all $v [lindex $termlist 0]]
209      foreach T [lrange $termlist 1 end] {
210        set l2 [list]
211        foreach i $l {
212          set iStart [expr $i - $nNear - 1]
213          set iEnd [expr $i + $nNear + 1]
214          if {$iStart < 0} {set iStart 0}
215          foreach i2 [lsearch -exact -all [lrange $v $iStart $iEnd] $T] {
216            incr i2 $iStart
217            if {$i2 != $i} { lappend l2 $i2 }
218          }
219        }
220        set l [lsort -uniq -integer $l2]
221      }
222
223      if {[llength $l]} {
224#puts "MATCH($key): $v"
225        lappend ret $key
226      }
227    }
228  }
229
230  lsort -unique -integer $ret
231}
232
233# The following three procs:
234#
235#   setup_not A B
236#   setup_or  A B
237#   setup_and A B
238#
239# each take two arguments. Both arguments must be lists of integer values
240# sorted by value. The return value is the list produced by evaluating
241# the equivalent of "A op B", where op is the FTS3 operator NOT, OR or
242# AND.
243#
244proc setop_not {A B} {
245  foreach b $B { set n($b) {} }
246  set ret [list]
247  foreach a $A { if {![info exists n($a)]} {lappend ret $a} }
248  return $ret
249}
250proc setop_or {A B} {
251  lsort -integer -uniq [concat $A $B]
252}
253proc setop_and {A B} {
254  foreach b $B { set n($b) {} }
255  set ret [list]
256  foreach a $A { if {[info exists n($a)]} {lappend ret $a} }
257  return $ret
258}
259
260proc mit {blob} {
261  set scan(littleEndian) i*
262  set scan(bigEndian) I*
263  binary scan $blob $scan($::tcl_platform(byteOrder)) r
264  return $r
265}
266db func mit mit
267set sqlite_fts3_enable_parentheses 1
268
269proc do_orderbydocid_test {tn sql res} {
270  uplevel [list do_select_test $tn.asc "$sql ORDER BY docid ASC" $res]
271  uplevel [list do_select_test $tn.desc "$sql ORDER BY docid DESC" \
272    [lsort -int -dec $res]
273  ]
274}
275
276set NUM_TRIALS 100
277
278foreach {nodesize order} {
279  50    DESC
280  50    ASC
281  500   ASC
282  1000  DESC
283  2000  ASC
284} {
285  catch { array unset ::t1 }
286  set testname "$nodesize/$order"
287
288  # Create the FTS3 table. Populate it (and the Tcl array) with 100 rows.
289  #
290  db transaction {
291    catchsql { DROP TABLE t1 }
292    execsql "CREATE VIRTUAL TABLE t1 USING fts4(a, b, c, order=$order)"
293    execsql "INSERT INTO t1(t1) VALUES('nodesize=$nodesize')"
294    for {set i 0} {$i < 100} {incr i} { insert_row $i }
295  }
296
297  for {set iTest 1} {$iTest <= $NUM_TRIALS} {incr iTest} {
298    catchsql COMMIT
299
300    set DO_MALLOC_TEST 0
301    set nRep 10
302    if {$iTest==100 && $nodesize==50} {
303      set DO_MALLOC_TEST 1
304      set nRep 2
305    }
306
307    set ::testprefix fts3rnd-1.$testname.$iTest
308
309    # Delete one row, update one row and insert one row.
310    #
311    set rows [array names ::t1]
312    set nRow [llength $rows]
313    set iUpdate [lindex $rows [expr {int(rand()*$nRow)}]]
314    set iDelete $iUpdate
315    while {$iDelete == $iUpdate} {
316      set iDelete [lindex $rows [expr {int(rand()*$nRow)}]]
317    }
318    set iInsert $iUpdate
319    while {[info exists ::t1($iInsert)]} {
320      set iInsert [expr {int(rand()*1000000)}]
321    }
322    execsql BEGIN
323      insert_row $iInsert
324      update_row $iUpdate
325      delete_row $iDelete
326    if {0==($iTest%2)} { execsql COMMIT }
327
328    if {0==($iTest%2)} {
329      #do_test 0 { fts3_integrity_check t1 } ok
330    }
331
332    # Pick 10 terms from the vocabulary. Check that the results of querying
333    # the database for the set of documents containing each of these terms
334    # is the same as the result obtained by scanning the contents of the Tcl
335    # array for each term.
336    #
337    for {set i 0} {$i < 10} {incr i} {
338      set term [random_term]
339      do_select_test 1.$i.asc {
340        SELECT docid, mit(matchinfo(t1)) FROM t1 WHERE t1 MATCH $term
341        ORDER BY docid ASC
342      } [simple_token_matchinfo $term 0]
343      do_select_test 1.$i.desc {
344        SELECT docid, mit(matchinfo(t1)) FROM t1 WHERE t1 MATCH $term
345        ORDER BY docid DESC
346      } [simple_token_matchinfo $term 1]
347    }
348
349    # This time, use the first two characters of each term as a term prefix
350    # to query for. Test that querying the Tcl array produces the same results
351    # as querying the FTS3 table for the prefix.
352    #
353    for {set i 0} {$i < $nRep} {incr i} {
354      set prefix [string range [random_term] 0 end-1]
355      set match "${prefix}*"
356      do_orderbydocid_test 2.$i {
357        SELECT docid FROM t1 WHERE t1 MATCH $match
358      } [simple_phrase $match]
359    }
360
361    # Similar to the above, except for phrase queries.
362    #
363    for {set i 0} {$i < $nRep} {incr i} {
364      set term [list [random_term] [random_term]]
365      set match "\"$term\""
366      do_orderbydocid_test 3.$i {
367        SELECT docid FROM t1 WHERE t1 MATCH $match
368      } [simple_phrase $term]
369    }
370
371    # Three word phrases.
372    #
373    for {set i 0} {$i < $nRep} {incr i} {
374      set term [list [random_term] [random_term] [random_term]]
375      set match "\"$term\""
376      do_orderbydocid_test 4.$i {
377        SELECT docid FROM t1 WHERE t1 MATCH $match
378      } [simple_phrase $term]
379    }
380
381    # Three word phrases made up of term-prefixes.
382    #
383    for {set i 0} {$i < $nRep} {incr i} {
384      set    query "[string range [random_term] 0 end-1]* "
385      append query "[string range [random_term] 0 end-1]* "
386      append query "[string range [random_term] 0 end-1]*"
387
388      set match "\"$query\""
389      do_orderbydocid_test 5.$i {
390        SELECT docid FROM t1 WHERE t1 MATCH $match
391      } [simple_phrase $query]
392    }
393
394    # A NEAR query with terms as the arguments:
395    #
396    #     ... MATCH '$term1 NEAR $term2' ...
397    #
398    for {set i 0} {$i < $nRep} {incr i} {
399      set terms [list [random_term] [random_term]]
400      set match [join $terms " NEAR "]
401      do_orderbydocid_test 6.$i {
402        SELECT docid FROM t1 WHERE t1 MATCH $match
403      } [simple_near $terms 10]
404    }
405
406    # A 3-way NEAR query with terms as the arguments.
407    #
408    for {set i 0} {$i < $nRep} {incr i} {
409      set terms [list [random_term] [random_term] [random_term]]
410      set nNear 11
411      set match [join $terms " NEAR/$nNear "]
412      do_orderbydocid_test 7.$i {
413        SELECT docid FROM t1 WHERE t1 MATCH $match
414      } [simple_near $terms $nNear]
415    }
416
417    # Set operations on simple term queries.
418    #
419    foreach {tn op proc} {
420      8  OR  setop_or
421      9  NOT setop_not
422      10 AND setop_and
423    } {
424      for {set i 0} {$i < $nRep} {incr i} {
425        set term1 [random_term]
426        set term2 [random_term]
427        set match "$term1 $op $term2"
428        do_orderbydocid_test $tn.$i {
429          SELECT docid FROM t1 WHERE t1 MATCH $match
430        } [$proc [simple_phrase $term1] [simple_phrase $term2]]
431      }
432    }
433
434    # Set operations on NEAR queries.
435    #
436    foreach {tn op proc} {
437      11 OR  setop_or
438      12 NOT setop_not
439      13 AND setop_and
440    } {
441      for {set i 0} {$i < $nRep} {incr i} {
442        set term1 [random_term]
443        set term2 [random_term]
444        set term3 [random_term]
445        set term4 [random_term]
446        set match "$term1 NEAR $term2 $op $term3 NEAR $term4"
447        do_orderbydocid_test $tn.$i {
448          SELECT docid FROM t1 WHERE t1 MATCH $match
449        } [$proc                                  \
450            [simple_near [list $term1 $term2] 10] \
451            [simple_near [list $term3 $term4] 10]
452          ]
453      }
454    }
455
456    catchsql COMMIT
457  }
458}
459
460finish_test
461