# 2007 June 21
#
# The author disclaims copyright to this source code.  In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#*************************************************************************
# This file implements regression tests for SQLite library. The focus
# of this script is testing the pluggable tokeniser feature of the
# FTS2 module.
#
# $Id: fts2token.test,v 1.3 2007/06/25 12:05:40 danielk1977 Exp $
#

set testdir [file dirname $argv0]
source $testdir/tester.tcl

# If SQLITE_ENABLE_FTS2 is not defined, omit this file.
ifcapable !fts2 {
  finish_test
  return
}

# Render a string as plain ASCII: characters with code points above 127
# are escaped as \xNNNN hex sequences so test output is printable.
proc escape_string {str} {
  set out ""
  foreach char [split $str ""] {
    scan $char %c i
    if {$i<=127} {
      append out $char
    } else {
      append out [format {\x%.4x} $i]
    }
  }
  set out
}

#--------------------------------------------------------------------------
# Test cases fts2token-1.* are the warm-body test for the SQL scalar
# function fts2_tokenizer(). The procedure is as follows:
#
#   1: Verify that there is no such fts2 tokenizer as 'blah'.
#
#   2: Query for the built-in tokenizer 'simple'. Insert a copy of the
#      retrieved value as tokenizer 'blah'.
#
#   3: Test that the value returned for tokenizer 'blah' is now the
#      same as that retrieved for 'simple'.
#
#   4: Test that it is now possible to create an fts2 table using
#      tokenizer 'blah' (it was not possible in step 1).
#
#   5: Test that the table created to use tokenizer 'blah' is usable.
#
# Step 1: tokenizer 'blah' must not exist yet, so table creation fails.
do_test fts2token-1.1 {
  catchsql {
    CREATE VIRTUAL TABLE t1 USING fts2(content, tokenize blah);
  }
} {1 {unknown tokenizer: blah}}
# Step 2: register the built-in 'simple' tokenizer pointer under the
# name 'blah'. The call returns the old value (non-NULL here).
do_test fts2token-1.2 {
  execsql {
    SELECT fts2_tokenizer('blah', fts2_tokenizer('simple')) IS NULL;
  }
} {0}
# Step 3: 'blah' now resolves to the same tokenizer pointer as 'simple'.
do_test fts2token-1.3 {
  execsql {
    SELECT fts2_tokenizer('blah') == fts2_tokenizer('simple');
  }
} {1}
# Step 4: creating a table with tokenizer 'blah' now succeeds.
do_test fts2token-1.4 {
  catchsql {
    CREATE VIRTUAL TABLE t1 USING fts2(content, tokenize blah);
  }
} {0 {}}
# Step 5: the table built on tokenizer 'blah' is fully usable.
do_test fts2token-1.5 {
  execsql {
    INSERT INTO t1(content) VALUES('There was movement at the station');
    INSERT INTO t1(content) VALUES('For the word has passed around');
    INSERT INTO t1(content) VALUES('That the colt from ol regret had got away');
    SELECT content FROM t1 WHERE content MATCH 'movement'
  }
} {{There was movement at the station}}

#--------------------------------------------------------------------------
# Test cases fts2token-2.* test error cases in the scalar function based
# API for getting and setting tokenizers.
#
# Requesting an unregistered tokenizer is an error.
do_test fts2token-2.1 {
  catchsql {
    SELECT fts2_tokenizer('nosuchtokenizer');
  }
} {1 {unknown tokenizer: nosuchtokenizer}}

#--------------------------------------------------------------------------
# Test cases fts2token-3.* test the three built-in tokenizers with a
# simple input string via the built-in test function. This is as much
# to test the test function as the tokenizer implementations.
#
do_test fts2token-3.1 {
  execsql {
    SELECT fts2_tokenizer_test('simple', 'I don''t see how');
  }
} {{0 i I 1 don don 2 t t 3 see see 4 how how}}
do_test fts2token-3.2 {
  execsql {
    SELECT fts2_tokenizer_test('porter', 'I don''t see how');
  }
} {{0 i I 1 don don 2 t t 3 see see 4 how how}}
# The ICU tokenizer keeps the apostrophe inside "don't" as one token,
# unlike 'simple' and 'porter' above.
ifcapable icu {
  do_test fts2token-3.3 {
    execsql {
      SELECT fts2_tokenizer_test('icu', 'I don''t see how');
    }
  } {{0 i I 1 don't don't 2 see see 3 how how}}
}

#--------------------------------------------------------------------------
# Test cases fts2token-4.* test the ICU tokenizer. In practice, this
# tokenizer only has two modes - "thai" and "everybody else". Some other
# Asian languages (Lao, Khmer etc.) require the same special treatment as
# Thai, but ICU doesn't support them yet.
#
ifcapable icu {

  # Run fts2_tokenizer_test('icu', $locale, $input) and check that the
  # first element of the result list matches $output exactly.
  proc do_icu_test {name locale input output} {
    set ::out [db eval { SELECT fts2_tokenizer_test('icu', $locale, $input) }]
    do_test $name {
      lindex $::out 0
    } $output
  }

  do_icu_test fts2token-4.1 en_US {} {}
  do_icu_test fts2token-4.2 en_US {Test cases fts2} [list \
    0 test Test 1 cases cases 2 fts2 fts2
  ]

  # The following test shows that ICU is smart enough to recognise
  # Thai characters, even when the locale is set to English/United
  # States.
  #
  set input "\u0e2d\u0e30\u0e44\u0e23\u0e19\u0e30\u0e04\u0e23\u0e31\u0e1a"
  set output    "0 \u0e2d\u0e30\u0e44\u0e23 \u0e2d\u0e30\u0e44\u0e23 "
  append output "1 \u0e19\u0e30 \u0e19\u0e30 "
  append output "2 \u0e04\u0e23\u0e31\u0e1a \u0e04\u0e23\u0e31\u0e1a"

  do_icu_test fts2token-4.3 th_TH $input $output
  do_icu_test fts2token-4.4 en_US $input $output

  # ICU handles an unknown locale by falling back to the default.
  # So this is not an error.
  do_icu_test fts2token-4.5 MiddleOfTheOcean $input $output

  # A token long enough to force a buffer realloc inside the ICU
  # tokenizer code.
  set longtoken "AReallyReallyLongTokenOneThatWillSurelyRequire"
  append longtoken "AReallocInTheIcuTokenizerCode"

  set input "short tokens then "
  append input $longtoken
  set output    "0 short short "
  append output "1 tokens tokens "
  append output "2 then then "
  append output "3 [string tolower $longtoken] $longtoken"

  do_icu_test fts2token-4.6 MiddleOfTheOcean $input $output
  do_icu_test fts2token-4.7 th_TH $input $output
  do_icu_test fts2token-4.8 en_US $input $output
}

# Exercise the C-level internal tokenizer sanity checks.
do_test fts2token-internal {
  execsql { SELECT fts2_tokenizer_internal_test() }
} {ok}

finish_test