1# 2009 November 04 2# 3# The author disclaims copyright to this source code. In place of 4# a legal notice, here is a blessing: 5# 6# May you do good and not evil. 7# May you find forgiveness for yourself and forgive others. 8# May you share freely, never taking more than you give. 9# 10#*********************************************************************** 11# 12# This file contains common code used the fts3 tests. At one point 13# equivalent functionality was implemented in C code. But it is easier 14# to use Tcl. 15# 16 17#------------------------------------------------------------------------- 18# INSTRUCTIONS 19# 20# The following commands are available: 21# 22# fts3_build_db_1 N 23# Using database handle [db] create an FTS4 table named t1 and populate 24# it with N rows of data. N must be less than 10,000. Refer to the 25# header comments above the proc implementation below for details. 26# 27# fts3_build_db_2 N 28# Using database handle [db] create an FTS4 table named t2 and populate 29# it with N rows of data. N must be less than 100,000. Refer to the 30# header comments above the proc implementation below for details. 31# 32# fts3_integrity_check TBL 33# TBL must be an FTS table in the database currently opened by handle 34# [db]. This proc loads and tokenizes all documents within the table, 35# then checks that the current contents of the FTS index matches the 36# results. 37# 38# fts3_terms TBL WHERE 39# Todo. 40# 41# fts3_doclist TBL TERM WHERE 42# Todo. 43# 44# 45# 46 47ifcapable fts3 { 48 sqlite3_fts3_may_be_corrupt 0 49} 50 51#------------------------------------------------------------------------- 52# USAGE: fts3_build_db_1 SWITCHES N 53# 54# Build a sample FTS table in the database opened by database connection 55# [db]. The name of the new table is "t1". 56# 57proc fts3_build_db_1 {args} { 58 59 set default(-module) fts4 60 61 set nArg [llength $args] 62 if {($nArg%2)==0} { 63 error "wrong # args: should be \"fts3_build_db_1 ?switches? n\"" 64 } 65 66 set n [lindex $args [expr $nArg-1]] 67 array set opts [array get default] 68 array set opts [lrange $args 0 [expr $nArg-2]] 69 foreach k [array names opts] { 70 if {0==[info exists default($k)]} { error "unknown option: $k" } 71 } 72 73 if {$n > 10000} {error "n must be <= 10000"} 74 db eval "CREATE VIRTUAL TABLE t1 USING $opts(-module) (x, y)" 75 76 set xwords [list zero one two three four five six seven eight nine ten] 77 set ywords [list alpha beta gamma delta epsilon zeta eta theta iota kappa] 78 79 for {set i 0} {$i < $n} {incr i} { 80 set x "" 81 set y "" 82 83 set x [list] 84 lappend x [lindex $xwords [expr ($i / 1000) % 10]] 85 lappend x [lindex $xwords [expr ($i / 100) % 10]] 86 lappend x [lindex $xwords [expr ($i / 10) % 10]] 87 lappend x [lindex $xwords [expr ($i / 1) % 10]] 88 89 set y [list] 90 lappend y [lindex $ywords [expr ($i / 1000) % 10]] 91 lappend y [lindex $ywords [expr ($i / 100) % 10]] 92 lappend y [lindex $ywords [expr ($i / 10) % 10]] 93 lappend y [lindex $ywords [expr ($i / 1) % 10]] 94 95 db eval { INSERT INTO t1(docid, x, y) VALUES($i, $x, $y) } 96 } 97} 98 99#------------------------------------------------------------------------- 100# USAGE: fts3_build_db_2 N ARGS 101# 102# Build a sample FTS table in the database opened by database connection 103# [db]. The name of the new table is "t2". 104# 105proc fts3_build_db_2 {args} { 106 107 set default(-module) fts4 108 set default(-extra) "" 109 110 set nArg [llength $args] 111 if {($nArg%2)==0} { 112 error "wrong # args: should be \"fts3_build_db_1 ?switches? n\"" 113 } 114 115 set n [lindex $args [expr $nArg-1]] 116 array set opts [array get default] 117 array set opts [lrange $args 0 [expr $nArg-2]] 118 foreach k [array names opts] { 119 if {0==[info exists default($k)]} { error "unknown option: $k" } 120 } 121 122 if {$n > 100000} {error "n must be <= 100000"} 123 124 set sql "CREATE VIRTUAL TABLE t2 USING $opts(-module) (content" 125 if {$opts(-extra) != ""} { 126 append sql ", " $opts(-extra) 127 } 128 append sql ")" 129 db eval $sql 130 131 set chars [list a b c d e f g h i j k l m n o p q r s t u v w x y z ""] 132 133 for {set i 0} {$i < $n} {incr i} { 134 set word "" 135 set nChar [llength $chars] 136 append word [lindex $chars [expr {($i / 1) % $nChar}]] 137 append word [lindex $chars [expr {($i / $nChar) % $nChar}]] 138 append word [lindex $chars [expr {($i / ($nChar*$nChar)) % $nChar}]] 139 140 db eval { INSERT INTO t2(docid, content) VALUES($i, $word) } 141 } 142} 143 144#------------------------------------------------------------------------- 145# USAGE: fts3_integrity_check TBL 146# 147# This proc is used to verify that the full-text index is consistent with 148# the contents of the fts3 table. In other words, it checks that the 149# data in the %_contents table matches that in the %_segdir and %_segments 150# tables. 151# 152# This is not an efficient procedure. It uses a lot of memory and a lot 153# of CPU. But it is better than not checking at all. 154# 155# The procedure is: 156# 157# 1) Read the entire full-text index from the %_segdir and %_segments 158# tables into memory. For each entry in the index, the following is 159# done: 160# 161# set C($iDocid,$iCol,$iPosition) $zTerm 162# 163# 2) Iterate through each column of each row of the %_content table. 164# Tokenize all documents, and check that for each token there is 165# a corresponding entry in the $C array. After checking a token, 166# [unset] the $C array entry. 167# 168# 3) Check that array $C is now empty. 169# 170# 171proc fts3_integrity_check {tbl} { 172 173 fts3_read2 $tbl 1 A 174 175 foreach zTerm [array names A] { 176 #puts $zTerm 177 foreach doclist $A($zTerm) { 178 set docid 0 179 while {[string length $doclist]>0} { 180 set iCol 0 181 set iPos 0 182 set lPos [list] 183 set lCol [list] 184 185 # First varint of a doclist-entry is the docid. Delta-compressed 186 # with respect to the docid of the previous entry. 187 # 188 incr docid [gobble_varint doclist] 189 if {[info exists D($zTerm,$docid)]} { 190 while {[set iDelta [gobble_varint doclist]] != 0} {} 191 continue 192 } 193 set D($zTerm,$docid) 1 194 195 # Gobble varints until the 0x00 that terminates the doclist-entry 196 # is found. 197 while {[set iDelta [gobble_varint doclist]] > 0} { 198 if {$iDelta == 1} { 199 set iCol [gobble_varint doclist] 200 set iPos 0 201 } else { 202 incr iPos $iDelta 203 incr iPos -2 204 set C($docid,$iCol,$iPos) $zTerm 205 } 206 } 207 } 208 } 209 } 210 211 foreach key [array names C] { 212 #puts "$key -> $C($key)" 213 } 214 215 216 db eval "SELECT * FROM ${tbl}_content" E { 217 set iCol 0 218 set iDoc $E(docid) 219 foreach col [lrange $E(*) 1 end] { 220 set c $E($col) 221 set sql {SELECT fts3_tokenizer_test('simple', $c)} 222 223 foreach {pos term dummy} [db one $sql] { 224 if {![info exists C($iDoc,$iCol,$pos)]} { 225 set es "Error at docid=$iDoc col=$iCol pos=$pos. Index is missing" 226 lappend errors $es 227 } else { 228 if {[string compare $C($iDoc,$iCol,$pos) $term]} { 229 set es "Error at docid=$iDoc col=$iCol pos=$pos. Index " 230 append es "has \"$C($iDoc,$iCol,$pos)\", document has \"$term\"" 231 lappend errors $es 232 } 233 unset C($iDoc,$iCol,$pos) 234 } 235 } 236 incr iCol 237 } 238 } 239 240 foreach c [array names C] { 241 lappend errors "Bad index entry: $c -> $C($c)" 242 } 243 244 if {[info exists errors]} { return [join $errors "\n"] } 245 return "ok" 246} 247 248# USAGE: fts3_terms TBL WHERE 249# 250# Argument TBL must be the name of an FTS3 table. Argument WHERE is an 251# SQL expression that will be used as the WHERE clause when scanning 252# the %_segdir table. As in the following query: 253# 254# "SELECT * FROM ${TBL}_segdir WHERE ${WHERE}" 255# 256# This function returns a list of all terms present in the segments 257# selected by the statement above. 258# 259proc fts3_terms {tbl where} { 260 fts3_read $tbl $where a 261 return [lsort [array names a]] 262} 263 264 265# USAGE: fts3_doclist TBL TERM WHERE 266# 267# Argument TBL must be the name of an FTS3 table. TERM is a term that may 268# or may not be present in the table. Argument WHERE is used to select a 269# subset of the b-tree segments in the associated full-text index as 270# described above for [fts3_terms]. 271# 272# This function returns the results of merging the doclists associated 273# with TERM in the selected segments. Each doclist is an element of the 274# returned list. Each doclist is formatted as follows: 275# 276# [$docid ?$col[$off1 $off2...]?...] 277# 278# The formatting is odd for a Tcl command in order to be compatible with 279# the original C-language implementation. If argument WHERE is "1", then 280# any empty doclists are omitted from the returned list. 281# 282proc fts3_doclist {tbl term where} { 283 fts3_read $tbl $where a 284 285 286 foreach doclist $a($term) { 287 set docid 0 288 289 while {[string length $doclist]>0} { 290 set iCol 0 291 set iPos 0 292 set lPos [list] 293 set lCol [list] 294 incr docid [gobble_varint doclist] 295 296 while {[set iDelta [gobble_varint doclist]] > 0} { 297 if {$iDelta == 1} { 298 lappend lCol [list $iCol $lPos] 299 set iPos 0 300 set lPos [list] 301 set iCol [gobble_varint doclist] 302 } else { 303 incr iPos $iDelta 304 incr iPos -2 305 lappend lPos $iPos 306 } 307 } 308 309 if {[llength $lPos]>0} { 310 lappend lCol [list $iCol $lPos] 311 } 312 313 if {$where != "1" || [llength $lCol]>0} { 314 set ret($docid) $lCol 315 } else { 316 unset -nocomplain ret($docid) 317 } 318 } 319 } 320 321 set lDoc [list] 322 foreach docid [lsort -integer [array names ret]] { 323 set lCol [list] 324 set cols "" 325 foreach col $ret($docid) { 326 foreach {iCol lPos} $col {} 327 append cols " $iCol\[[join $lPos { }]\]" 328 } 329 lappend lDoc "\[${docid}${cols}\]" 330 } 331 332 join $lDoc " " 333} 334 335########################################################################### 336 337proc gobble_varint {varname} { 338 upvar $varname blob 339 set n [read_fts3varint $blob ret] 340 set blob [string range $blob $n end] 341 return $ret 342} 343proc gobble_string {varname nLength} { 344 upvar $varname blob 345 set ret [string range $blob 0 [expr $nLength-1]] 346 set blob [string range $blob $nLength end] 347 return $ret 348} 349 350# The argument is a blob of data representing an FTS3 segment leaf. 351# Return a list consisting of alternating terms (strings) and doclists 352# (blobs of data). 353# 354proc fts3_readleaf {blob} { 355 set zPrev "" 356 set terms [list] 357 358 while {[string length $blob] > 0} { 359 set nPrefix [gobble_varint blob] 360 set nSuffix [gobble_varint blob] 361 362 set zTerm [string range $zPrev 0 [expr $nPrefix-1]] 363 append zTerm [gobble_string blob $nSuffix] 364 set nDoclist [gobble_varint blob] 365 set doclist [gobble_string blob $nDoclist] 366 367 lappend terms $zTerm $doclist 368 set zPrev $zTerm 369 } 370 371 return $terms 372} 373 374proc fts3_read2 {tbl where varname} { 375 upvar $varname a 376 array unset a 377 db eval " SELECT start_block, leaves_end_block, root 378 FROM ${tbl}_segdir WHERE $where 379 ORDER BY level ASC, idx DESC 380 " { 381 set c 0 382 binary scan $root c c 383 if {$c==0} { 384 foreach {t d} [fts3_readleaf $root] { lappend a($t) $d } 385 } else { 386 db eval " SELECT block 387 FROM ${tbl}_segments 388 WHERE blockid>=$start_block AND blockid<=$leaves_end_block 389 ORDER BY blockid 390 " { 391 foreach {t d} [fts3_readleaf $block] { lappend a($t) $d } 392 } 393 } 394 } 395} 396 397proc fts3_read {tbl where varname} { 398 upvar $varname a 399 array unset a 400 db eval " SELECT start_block, leaves_end_block, root 401 FROM ${tbl}_segdir WHERE $where 402 ORDER BY level DESC, idx ASC 403 " { 404 if {$start_block == 0} { 405 foreach {t d} [fts3_readleaf $root] { lappend a($t) $d } 406 } else { 407 db eval " SELECT block 408 FROM ${tbl}_segments 409 WHERE blockid>=$start_block AND blockid<$leaves_end_block 410 ORDER BY blockid 411 " { 412 foreach {t d} [fts3_readleaf $block] { lappend a($t) $d } 413 414 } 415 } 416 } 417} 418 419########################################################################## 420 421