xref: /sqlite-3.40.0/test/fts3_common.tcl (revision 32bb700a)
1# 2009 November 04
2#
3# The author disclaims copyright to this source code.  In place of
4# a legal notice, here is a blessing:
5#
6#    May you do good and not evil.
7#    May you find forgiveness for yourself and forgive others.
8#    May you share freely, never taking more than you give.
9#
10#***********************************************************************
11#
12# This file contains common code used the fts3 tests. At one point
13# equivalent functionality was implemented in C code. But it is easier
14# to use Tcl.
15#
16
17#-------------------------------------------------------------------------
18# INSTRUCTIONS
19#
20# The following commands are available:
21#
22#   fts3_build_db_1 N
23#     Using database handle [db] create an FTS4 table named t1 and populate
24#     it with N rows of data. N must be less than 10,000. Refer to the
25#     header comments above the proc implementation below for details.
26#
27#   fts3_build_db_2 N
28#     Using database handle [db] create an FTS4 table named t2 and populate
29#     it with N rows of data. N must be less than 100,000. Refer to the
30#     header comments above the proc implementation below for details.
31#
32#   fts3_integrity_check TBL
33#     TBL must be an FTS table in the database currently opened by handle
34#     [db]. This proc loads and tokenizes all documents within the table,
35#     then checks that the current contents of the FTS index matches the
36#     results.
37#
38#   fts3_terms TBL WHERE
39#     Todo.
40#
41#   fts3_doclist TBL TERM WHERE
42#     Todo.
43#
44#
45#
46
47ifcapable fts3 {
48  sqlite3_fts3_may_be_corrupt 0
49}
50
51#-------------------------------------------------------------------------
52# USAGE: fts3_build_db_1 SWITCHES N
53#
54# Build a sample FTS table in the database opened by database connection
55# [db]. The name of the new table is "t1".
56#
57proc fts3_build_db_1 {args} {
58
59  set default(-module) fts4
60
61  set nArg [llength $args]
62  if {($nArg%2)==0} {
63    error "wrong # args: should be \"fts3_build_db_1 ?switches? n\""
64  }
65
66  set n [lindex $args [expr $nArg-1]]
67  array set opts [array get default]
68  array set opts [lrange $args 0 [expr $nArg-2]]
69  foreach k [array names opts] {
70    if {0==[info exists default($k)]} { error "unknown option: $k" }
71  }
72
73  if {$n > 10000} {error "n must be <= 10000"}
74  db eval "CREATE VIRTUAL TABLE t1 USING $opts(-module) (x, y)"
75
76  set xwords [list zero one two three four five six seven eight nine ten]
77  set ywords [list alpha beta gamma delta epsilon zeta eta theta iota kappa]
78
79  for {set i 0} {$i < $n} {incr i} {
80    set x ""
81    set y ""
82
83    set x [list]
84    lappend x [lindex $xwords [expr ($i / 1000) % 10]]
85    lappend x [lindex $xwords [expr ($i / 100)  % 10]]
86    lappend x [lindex $xwords [expr ($i / 10)   % 10]]
87    lappend x [lindex $xwords [expr ($i / 1)   % 10]]
88
89    set y [list]
90    lappend y [lindex $ywords [expr ($i / 1000) % 10]]
91    lappend y [lindex $ywords [expr ($i / 100)  % 10]]
92    lappend y [lindex $ywords [expr ($i / 10)   % 10]]
93    lappend y [lindex $ywords [expr ($i / 1)   % 10]]
94
95    db eval { INSERT INTO t1(docid, x, y) VALUES($i, $x, $y) }
96  }
97}
98
99#-------------------------------------------------------------------------
100# USAGE: fts3_build_db_2 N ARGS
101#
102# Build a sample FTS table in the database opened by database connection
103# [db]. The name of the new table is "t2".
104#
105proc fts3_build_db_2 {args} {
106
107  set default(-module) fts4
108  set default(-extra)   ""
109
110  set nArg [llength $args]
111  if {($nArg%2)==0} {
112    error "wrong # args: should be \"fts3_build_db_1 ?switches? n\""
113  }
114
115  set n [lindex $args [expr $nArg-1]]
116  array set opts [array get default]
117  array set opts [lrange $args 0 [expr $nArg-2]]
118  foreach k [array names opts] {
119    if {0==[info exists default($k)]} { error "unknown option: $k" }
120  }
121
122  if {$n > 100000} {error "n must be <= 100000"}
123
124  set sql "CREATE VIRTUAL TABLE t2 USING $opts(-module) (content"
125  if {$opts(-extra) != ""} {
126    append sql ", " $opts(-extra)
127  }
128  append sql ")"
129  db eval $sql
130
131  set chars [list a b c d e f g h  i j k l m n o p  q r s t u v w x  y z ""]
132
133  for {set i 0} {$i < $n} {incr i} {
134    set word ""
135    set nChar [llength $chars]
136    append word [lindex $chars [expr {($i / 1)   % $nChar}]]
137    append word [lindex $chars [expr {($i / $nChar)  % $nChar}]]
138    append word [lindex $chars [expr {($i / ($nChar*$nChar)) % $nChar}]]
139
140    db eval { INSERT INTO t2(docid, content) VALUES($i, $word) }
141  }
142}
143
144#-------------------------------------------------------------------------
145# USAGE: fts3_integrity_check TBL
146#
147# This proc is used to verify that the full-text index is consistent with
148# the contents of the fts3 table. In other words, it checks that the
149# data in the %_contents table matches that in the %_segdir and %_segments
150# tables.
151#
152# This is not an efficient procedure. It uses a lot of memory and a lot
153# of CPU. But it is better than not checking at all.
154#
155# The procedure is:
156#
157#   1) Read the entire full-text index from the %_segdir and %_segments
158#      tables into memory. For each entry in the index, the following is
159#      done:
160#
161#          set C($iDocid,$iCol,$iPosition) $zTerm
162#
163#   2) Iterate through each column of each row of the %_content table.
164#      Tokenize all documents, and check that for each token there is
165#      a corresponding entry in the $C array. After checking a token,
166#      [unset] the $C array entry.
167#
168#   3) Check that array $C is now empty.
169#
170#
171proc fts3_integrity_check {tbl} {
172
173  fts3_read2 $tbl 1 A
174
175  foreach zTerm [array names A] {
176    #puts $zTerm
177    foreach doclist $A($zTerm) {
178      set docid 0
179      while {[string length $doclist]>0} {
180        set iCol 0
181        set iPos 0
182        set lPos [list]
183        set lCol [list]
184
185        # First varint of a doclist-entry is the docid. Delta-compressed
186        # with respect to the docid of the previous entry.
187        #
188        incr docid [gobble_varint doclist]
189        if {[info exists D($zTerm,$docid)]} {
190          while {[set iDelta [gobble_varint doclist]] != 0} {}
191          continue
192        }
193        set D($zTerm,$docid) 1
194
195        # Gobble varints until the 0x00 that terminates the doclist-entry
196        # is found.
197        while {[set iDelta [gobble_varint doclist]] > 0} {
198          if {$iDelta == 1} {
199            set iCol [gobble_varint doclist]
200            set iPos 0
201          } else {
202            incr iPos $iDelta
203            incr iPos -2
204            set C($docid,$iCol,$iPos) $zTerm
205          }
206        }
207      }
208    }
209  }
210
211  foreach key [array names C] {
212    #puts "$key -> $C($key)"
213  }
214
215
216  db eval "SELECT * FROM ${tbl}_content" E {
217    set iCol 0
218    set iDoc $E(docid)
219    foreach col [lrange $E(*) 1 end] {
220      set c $E($col)
221      set sql {SELECT fts3_tokenizer_test('simple', $c)}
222
223      foreach {pos term dummy} [db one $sql] {
224        if {![info exists C($iDoc,$iCol,$pos)]} {
225          set es "Error at docid=$iDoc col=$iCol pos=$pos. Index is missing"
226          lappend errors $es
227        } else {
228          if {[string compare $C($iDoc,$iCol,$pos) $term]} {
229            set    es "Error at docid=$iDoc col=$iCol pos=$pos. Index "
230            append es "has \"$C($iDoc,$iCol,$pos)\", document has \"$term\""
231            lappend errors $es
232          }
233          unset C($iDoc,$iCol,$pos)
234        }
235      }
236      incr iCol
237    }
238  }
239
240  foreach c [array names C] {
241    lappend errors "Bad index entry: $c -> $C($c)"
242  }
243
244  if {[info exists errors]} { return [join $errors "\n"] }
245  return "ok"
246}
247
248# USAGE: fts3_terms TBL WHERE
249#
250# Argument TBL must be the name of an FTS3 table. Argument WHERE is an
251# SQL expression that will be used as the WHERE clause when scanning
252# the %_segdir table. As in the following query:
253#
254#   "SELECT * FROM ${TBL}_segdir WHERE ${WHERE}"
255#
256# This function returns a list of all terms present in the segments
257# selected by the statement above.
258#
259proc fts3_terms {tbl where} {
260  fts3_read $tbl $where a
261  return [lsort [array names a]]
262}
263
264
265# USAGE: fts3_doclist TBL TERM WHERE
266#
267# Argument TBL must be the name of an FTS3 table. TERM is a term that may
268# or may not be present in the table. Argument WHERE is used to select a
269# subset of the b-tree segments in the associated full-text index as
270# described above for [fts3_terms].
271#
272# This function returns the results of merging the doclists associated
273# with TERM in the selected segments. Each doclist is an element of the
274# returned list. Each doclist is formatted as follows:
275#
276#   [$docid ?$col[$off1 $off2...]?...]
277#
278# The formatting is odd for a Tcl command in order to be compatible with
279# the original C-language implementation. If argument WHERE is "1", then
280# any empty doclists are omitted from the returned list.
281#
282proc fts3_doclist {tbl term where} {
283  fts3_read $tbl $where a
284
285
286  foreach doclist $a($term) {
287    set docid 0
288
289    while {[string length $doclist]>0} {
290      set iCol 0
291      set iPos 0
292      set lPos [list]
293      set lCol [list]
294      incr docid [gobble_varint doclist]
295
296      while {[set iDelta [gobble_varint doclist]] > 0} {
297        if {$iDelta == 1} {
298          lappend lCol [list $iCol $lPos]
299          set iPos 0
300          set lPos [list]
301          set iCol [gobble_varint doclist]
302        } else {
303          incr iPos $iDelta
304          incr iPos -2
305          lappend lPos $iPos
306        }
307      }
308
309      if {[llength $lPos]>0} {
310        lappend lCol [list $iCol $lPos]
311      }
312
313      if {$where != "1" || [llength $lCol]>0} {
314        set ret($docid) $lCol
315      } else {
316        unset -nocomplain ret($docid)
317      }
318    }
319  }
320
321  set lDoc [list]
322  foreach docid [lsort -integer [array names ret]] {
323    set lCol [list]
324    set cols ""
325    foreach col $ret($docid) {
326      foreach {iCol lPos} $col {}
327      append cols " $iCol\[[join $lPos { }]\]"
328    }
329    lappend lDoc "\[${docid}${cols}\]"
330  }
331
332  join $lDoc " "
333}
334
335###########################################################################
336
337proc gobble_varint {varname} {
338  upvar $varname blob
339  set n [read_fts3varint $blob ret]
340  set blob [string range $blob $n end]
341  return $ret
342}
343proc gobble_string {varname nLength} {
344  upvar $varname blob
345  set ret [string range $blob 0 [expr $nLength-1]]
346  set blob [string range $blob $nLength end]
347  return $ret
348}
349
350# The argument is a blob of data representing an FTS3 segment leaf.
351# Return a list consisting of alternating terms (strings) and doclists
352# (blobs of data).
353#
354proc fts3_readleaf {blob} {
355  set zPrev ""
356  set terms [list]
357
358  while {[string length $blob] > 0} {
359    set nPrefix [gobble_varint blob]
360    set nSuffix [gobble_varint blob]
361
362    set zTerm [string range $zPrev 0 [expr $nPrefix-1]]
363    append zTerm [gobble_string blob $nSuffix]
364    set nDoclist [gobble_varint blob]
365    set doclist [gobble_string blob $nDoclist]
366
367    lappend terms $zTerm $doclist
368    set zPrev $zTerm
369  }
370
371  return $terms
372}
373
374proc fts3_read2 {tbl where varname} {
375  upvar $varname a
376  array unset a
377  db eval " SELECT start_block, leaves_end_block, root
378            FROM ${tbl}_segdir WHERE $where
379            ORDER BY level ASC, idx DESC
380  " {
381    set c 0
382    binary scan $root c c
383    if {$c==0} {
384      foreach {t d} [fts3_readleaf $root] { lappend a($t) $d }
385    } else {
386      db eval " SELECT block
387                FROM ${tbl}_segments
388                WHERE blockid>=$start_block AND blockid<=$leaves_end_block
389                ORDER BY blockid
390      " {
391        foreach {t d} [fts3_readleaf $block] { lappend a($t) $d }
392      }
393    }
394  }
395}
396
397proc fts3_read {tbl where varname} {
398  upvar $varname a
399  array unset a
400  db eval " SELECT start_block, leaves_end_block, root
401            FROM ${tbl}_segdir WHERE $where
402            ORDER BY level DESC, idx ASC
403  " {
404    if {$start_block == 0} {
405      foreach {t d} [fts3_readleaf $root] { lappend a($t) $d }
406    } else {
407      db eval " SELECT block
408                FROM ${tbl}_segments
409                WHERE blockid>=$start_block AND blockid<$leaves_end_block
410                ORDER BY blockid
411      " {
412        foreach {t d} [fts3_readleaf $block] { lappend a($t) $d }
413
414      }
415    }
416  }
417}
418
419##########################################################################
420
421