# 2014 Dec 20
#
# The author disclaims copyright to this source code.  In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#***********************************************************************
#
# Tests focusing on the fts5 tokenizers
#

source [file join [file dirname [info script]] fts5_common.tcl]
set testprefix fts5unicode

# If SQLITE_ENABLE_FTS5 is not defined, omit this file.
ifcapable !fts5 {
  finish_test
  return
}

# tokenize_test TN TOKENIZER INPUT OUTPUT
#
# Register a do_test case named TN that runs [sqlite3_fts5_tokenize] on
# database handle "db" with tokenizer TOKENIZER and text INPUT, collects the
# first element of every three-element group in the returned list, and
# compares the collected list against OUTPUT.
#
#   tn         - test case name passed to do_test.
#   tokenizer  - tokenizer specification handed to sqlite3_fts5_tokenize.
#   input      - text to tokenize.
#   output     - expected list of tokens; it is re-built with [list] so that
#                it is compared in canonical list form.
#
proc tokenize_test {tn tokenizer input output} {
  # Build the tokenize command with [list] so that TOKENIZER and INPUT are
  # quoted correctly whatever they contain. Wrapping the raw values in
  # literal braces breaks as soon as a value holds an unbalanced brace or
  # ends in a backslash.
  set tokenize_cmd [list sqlite3_fts5_tokenize db $tokenizer $input]

  # [string map] makes a single pass and does not rescan the replacement,
  # so text inside the arguments can never be mistaken for the placeholder.
  set script [string map [list %TOKENIZE% $tokenize_cmd] {
    set ret {}
    foreach {z s e} [%TOKENIZE%] {
      lappend ret $z
    }
    set ret
  }]

  uplevel [list do_test $tn $script [list {*}$output]]
}

# Run the same basic checks against both built-in tokenizers: ASCII text is
# folded to lower case, punctuation separates tokens, and empty input
# produces no tokens.
foreach {tn t} {1 ascii 2 unicode61} {
  tokenize_test 1.$tn.0 $t {A B C D} {a b c d}
  tokenize_test 1.$tn.1 $t {May you share freely,} {may you share freely}
  tokenize_test 1.$tn.2 $t {..May...you.shAre.freely} {may you share freely}
  tokenize_test 1.$tn.3 $t {} {}
}

#-------------------------------------------------------------------------
# Check that "unicode61" really is the default tokenizer.
#
# Each table stores three upper-case accented Latin letters (U+00C0, U+00C8,
# U+00CC). The queries in 2.1 use the lower-case forms (U+00E0, U+00E8,
# U+00EC). The match is expected to succeed for t1 (no tokenizer given) and
# t2 (explicit unicode61), but not for t3 (ascii).
#
do_execsql_test 2.0 "
  CREATE VIRTUAL TABLE t1 USING fts5(x);
  CREATE VIRTUAL TABLE t2 USING fts5(x, tokenize = unicode61);
  CREATE VIRTUAL TABLE t3 USING fts5(x, tokenize = ascii);
  INSERT INTO t1 VALUES('\xC0\xC8\xCC');
  INSERT INTO t2 VALUES('\xC0\xC8\xCC');
  INSERT INTO t3 VALUES('\xC0\xC8\xCC');
"
do_execsql_test 2.1 "
  SELECT 't1' FROM t1 WHERE t1 MATCH '\xE0\xE8\xEC';
  SELECT 't2' FROM t2 WHERE t2 MATCH '\xE0\xE8\xEC';
  SELECT 't3' FROM t3 WHERE t3 MATCH '\xE0\xE8\xEC';
" {t1 t2}

#-------------------------------------------------------------------------
# Check that codepoints that require 4 bytes to store in utf-8 (those that
# require 17 or more bits to store) are handled correctly, both in indexed
# text and when named in the "separators" and "tokenchars" options of the
# unicode61 tokenizer.
#
# Variables A and B hold characters of type So; C and D hold characters of
# type Lo.
#
foreach {name codepoint} {
  A 0x1F75E
  B 0x1F5FD
  C 0x2F802
  D 0x2F808
} {
  set $name [db one "SELECT char($codepoint)"]
}

# C is declared a separator and A a token character. B and D are left with
# whatever classification the tokenizer gives them by default.
set tokenizer_spec "unicode61 separators '$C' tokenchars '$A'"

do_execsql_test 3.0 "
  CREATE VIRTUAL TABLE xyz USING fts5(x,
    tokenize = \"$tokenizer_spec\"
  );
  CREATE VIRTUAL TABLE xyz_v USING fts5vocab(xyz, row);

  INSERT INTO xyz VALUES('$A$B$C$D');
"

# Only A and D are expected in the vocabulary, each followed by the values
# 1 and 1. B and C must not show up as terms or as parts of terms.
set expected_vocab [list $A 1 1 $D 1 1]
do_execsql_test 3.1 {
  SELECT * FROM xyz_v;
} $expected_vocab





finish_test