xref: /sqlite-3.40.0/ext/fts5/tool/fts5txt2db.tcl (revision a22c1c84)
1##########################################################################
2# 2016 Jan 27
3#
4# The author disclaims copyright to this source code.  In place of
5# a legal notice, here is a blessing:
6#
7#    May you do good and not evil.
8#    May you find forgiveness for yourself and forgive others.
9#    May you share freely, never taking more than you give.
10#
# Parse the script's command line ($::argv) into the global array ::A by
# handing a switch specification and a block of help text to the generic
# cmdline::process helper.
#
# Specification entries take one of three forms:
#
#   {NAME HELP}          boolean switch
#   {NAME DEFAULT HELP}  switch that takes a value
#   NAME or NAME...      positional argument ("..." collects the remainder)
#
proc process_cmdline {} {
  set lSpec {
    {fts5                 "use fts5 (this is the default)"}
    {fts4                 "use fts4"}
    {trigram              "Use tokenize=trigram"}
    {colsize   "10 10 10" "list of column sizes"}
    {tblname   "t1"       "table name to create"}
    {detail    "full"     "Fts5 detail mode to use"}
    {repeat    1          "Load each file this many times"}
    {prefix    ""         "Fts prefix= option"}
    {trans     1          "True to use a transaction"}
    database
    file...
  }

  set zHelp {
  This script is designed to create fts4/5 tables with more than one column.
  The -colsize option should be set to a Tcl list of integer values, one for
  each column in the table. Each value is the number of tokens that will be
  inserted into the column value for each row. For example, setting the -colsize
  option to "5 10" creates an FTS table with 2 columns, with roughly 5 and 10
  tokens per row in each, respectively.

  Each "FILE" argument should be a text file. The contents of these text files
  is split on whitespace characters to form a list of tokens. The first N1
  tokens are used for the first column of the first row, where N1 is the first
  element of the -colsize list. The next N2 are used for the second column of
  the first row, and so on. Rows are added to the table until the entire list
  of tokens is exhausted.
  }

  cmdline::process ::A $::argv $lSpec $zHelp
}
40
41###########################################################################
42###########################################################################
43# Command line options processor. This is generic code that can be copied
44# between scripts.
45#
namespace eval cmdline {

  # cmdline_error O E ?MSG?
  #
  # Write an optional error message MSG, a usage summary derived from the
  # command line specification O, and the free-form help text E to stderr,
  # then terminate the process with [exit -1]. This proc does not return.
  #
  proc cmdline_error {O E {msg ""}} {
    if {$msg ne ""} {
      puts stderr "Error: $msg"
      puts stderr ""
    }

    # Single-element spec entries are positional arguments. They are shown
    # in upper case on the usage line.
    set L [list]
    foreach o $O {
      if {[llength $o]==1} {
        lappend L [string toupper $o]
      }
    }

    puts stderr "Usage: $::argv0 ?SWITCHES? $L"
    puts stderr ""
    puts stderr "Switches are:"
    foreach o $O {
      if {[llength $o]==3} {
        # Switch taking a value: {name default help}
        foreach {a b c} $o {}
        puts stderr [format "    -%-15s %s (default \"%s\")" "$a VAL" $c $b]
      } elseif {[llength $o]==2} {
        # Boolean switch: {name help}
        foreach {a b} $o {}
        puts stderr [format "    -%-15s %s" $a $b]
      }
    }
    puts stderr ""
    puts stderr $E
    exit -1
  }

  # process AVAR LARGS O E
  #
  # Parse the argument list LARGS according to specification O and store
  # the results in the array named AVAR in the caller's scope.
  #
  #   AVAR   Name of the array to populate (resolved with upvar).
  #   LARGS  List of command line arguments, usually $::argv.
  #   O      Specification list. Each element is one of:
  #            {name help}          boolean switch, defaults to 0
  #            {name default help}  switch that takes a value
  #            name                 mandatory positional argument
  #            name...              collects all remaining arguments
  #   E      Help text printed after the usage summary on error.
  #
  # Switches may be abbreviated to any unambiguous prefix. An argument that
  # exactly matches a switch name always selects that switch. The argument
  # "--" terminates switch processing and is itself discarded. On any
  # parse error cmdline_error is invoked, which exits the process. An
  # invalid specification raises a Tcl error.
  #
  proc process {avar lArgs O E} {
    upvar $avar A
    set zTrailing ""       ;# Name of the "..." argument, if present in $O
    set lPosargs [list]

    # Populate A() with default values. Also, for each switch in the command
    # line spec, set an entry in the idx() array as follows:
    #
    #  {tblname t1 "table name to use"}
    #      -> set idx(-tblname) {tblname t1 "table name to use"}
    #
    # For each positional parameter, append its name to $lPosargs. If the
    # "..." specifier is present, set $zTrailing to the name of the prefix.
    #
    foreach o $O {
      set nm [lindex $o 0]
      switch -- [llength $o] {
        1 {
          if {[string range $nm end-2 end] eq "..."} {
            set zTrailing [string range $nm 0 end-3]
          } else {
            lappend lPosargs $nm
          }
        }
        2 {
          set A($nm) 0
          set idx(-$nm) $o
        }
        3 {
          set A($nm) [lindex $o 1]
          set idx(-$nm) $o
        }
        default {
          error "Error in command line specification"
        }
      }
    }

    # Set explicitly specified option values
    #
    set nArg [llength $lArgs]
    for {set i 0} {$i < $nArg} {incr i} {
      set opt [lindex $lArgs $i]

      if {$opt eq "--"} {
        # End-of-switches marker. Step past it so that it is not mistaken
        # for the first positional argument.
        incr i
        break
      }
      if {[string index $opt 0] ne "-"} break

      # Find the switches matching $opt. An exact match wins outright.
      # Otherwise collect every switch that $opt is a literal prefix of.
      # A plain prefix comparison is used so that glob characters typed
      # by the user are not interpreted as patterns.
      if {[info exists idx($opt)]} {
        set lMatch [list $opt]
      } else {
        set lMatch [list]
        set nOpt [string length $opt]
        foreach name [array names idx] {
          if {[string equal -length $nOpt $opt $name]} {
            lappend lMatch $name
          }
        }
      }
      if {[llength $lMatch]==0} {
        cmdline_error $O $E "Unrecognized option: $opt"
      }
      if {[llength $lMatch]>1} {
        cmdline_error $O $E "Ambiguous option: $opt"
      }
      set c [lindex $lMatch 0]

      if {[llength $idx($c)]==3} {
        # Switch with a value: the value is the next argument.
        if {$i == $nArg-1} {
          cmdline_error $O $E "Option requires argument: $c"
        }
        incr i
        set A([lindex $idx($c) 0]) [lindex $lArgs $i]
      } else {
        set A([lindex $idx($c) 0]) 1
      }
    }

    # Deal with positional arguments. There must be exactly one argument
    # for each positional parameter, unless a trailing "..." parameter is
    # present to absorb any extras.
    #
    set nPosarg [llength $lPosargs]
    set nRem [expr {$nArg - $i}]
    if {$nRem < $nPosarg || ($zTrailing eq "" && $nRem > $nPosarg)} {
      cmdline_error $O $E
    }
    for {set j 0} {$j < $nPosarg} {incr j} {
      set A([lindex $lPosargs $j]) [lindex $lArgs [expr {$j+$i}]]
    }
    if {$zTrailing ne ""} {
      set A($zTrailing) [lrange $lArgs [expr {$j+$i}] end]
    }
  }
} ;# namespace eval cmdline
152# End of command line options processor.
153###########################################################################
154###########################################################################
155
process_cmdline

# Select the FTS module to use: fts4 if the -fts4 switch was passed,
# fts5 in all other cases.
set A(fts) [expr {$A(fts4) ? "fts4" : "fts5"}]

sqlite3 db $A(database)
166
# Create the FTS table in the db. Return a list of the table columns.
#
# The table is named $A(tblname) and uses module $A(fts). It has one column
# for each element of $A(colsize). The first 26 columns are named "a"
# through "z"; any further columns are named "c27", "c28" and so on, so
# that the returned list always has as many entries as $A(colsize).
#
# The fts5 "detail" option is only added for fts5 tables, tokenize=trigram
# only if $A(trigram) is true, and the prefix option is always added.
#
# Raises an error if $A(colsize) is empty, as an FTS table must have at
# least one column.
#
proc create_table {} {
  global A

  set nCol [llength $A(colsize)]
  if {$nCol == 0} {
    error "-colsize must contain at least one column size"
  }

  set lLetter [list a b c d e f g h i j k l m n o p q r s t u v w x y z]
  set nLetter [llength $lLetter]
  set cols [list]
  for {set iCol 0} {$iCol < $nCol} {incr iCol} {
    if {$iCol < $nLetter} {
      lappend cols [lindex $lLetter $iCol]
    } else {
      # Out of single letters. Fall back to numbered names (1-based).
      lappend cols "c[expr {$iCol+1}]"
    }
  }

  set sql    "CREATE VIRTUAL TABLE IF NOT EXISTS $A(tblname) USING $A(fts) ("
  append sql [join $cols ,]
  if {$A(fts) eq "fts5"} { append sql ",detail=$A(detail)" }
  if {$A(trigram)} { append sql ",tokenize=trigram" }
  append sql ", prefix='$A(prefix)');"

  db eval $sql
  return $cols
}
185
# Return a list of tokens from the named file.
#
# The file contents are split on whitespace. Runs of adjacent whitespace
# characters (blank lines, double spaces, CRLF line endings and so on)
# make [split] produce empty elements. These are discarded so that every
# element of the returned list is a real, non-empty token.
#
proc readfile {file} {
  set fd [open $file]
  set data [read $fd]
  close $fd

  set lToken [list]
  foreach tok [split $data] {
    if {$tok ne ""} { lappend lToken $tok }
  }
  return $lToken
}
194
# Return the concatenation of n copies of list L. If n is zero or
# negative, the result is an empty list.
#
proc repeat {L n} {
  set out {}
  for {set pass 0} {$pass < $n} {incr pass} {
    set out [concat $out $L]
  }
  return $out
}
202
203
# Load all the data into a big list of tokens.
#
set tokens [list]
foreach f $A(file) {
  set tokens [concat $tokens [repeat [readfile $f] $A(repeat)]]
}

# Check the -colsize list before touching the database. Every entry must
# be a non-negative integer and each row must consume at least one token.
# Otherwise the insert loop below would never advance through the token
# list and would keep inserting rows forever.
#
set nPerRow 0
foreach s $A(colsize) {
  if {![string is integer -strict $s] || $s < 0} {
    error "-colsize values must be non-negative integers, got \"$s\""
  }
  incr nPerRow $s
}
if {$nPerRow <= 0} {
  error "-colsize must request at least one token per row"
}

# Create the table and build an INSERT statement that reads the value for
# each column from the corresponding element of array R.
#
set N [llength $tokens]
set i 0
set cols [create_table]
set sql "INSERT INTO $A(tblname) VALUES(\$R([lindex $cols 0])"
foreach c [lrange $cols 1 end] {
  append sql ", \$R($c)"
}
append sql ")"

# Insert rows until the tokens run out. For each row, each column takes
# the next $s tokens, where $s is that column's -colsize entry. The final
# row may be short. Tokens are joined with single spaces so that the text
# inserted is the tokens themselves, with no Tcl list quoting (braces or
# backslashes) added around tokens that contain special characters.
#
if {$A(trans)} { db eval BEGIN }
while {$i < $N} {
  foreach c $cols s $A(colsize) {
    set R($c) [join [lrange $tokens $i [expr {$i+$s-1}]] " "]
    incr i $s
  }
  db eval $sql
}
if {$A(trans)} { db eval COMMIT }
229
230
231
232