xref: /sqlite-3.40.0/test/enc.test (revision 0ea2d42a)
# 2002 May 24
#
# The author disclaims copyright to this source code.  In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#***********************************************************************
# This file implements regression tests for the SQLite library.  The focus
# of this file is testing the SQLite routines used for converting between
# the various supported Unicode encodings (UTF-8, UTF-16, UTF-16le and
# UTF-16be).
#
# $Id: enc.test,v 1.7 2007/05/23 16:23:09 danielk1977 Exp $
1728d47b57Sdanielk1977
# Derive the test directory from the path of the running script and load
# tester.tcl from that directory.
set testdir [file dirname $argv0]
source $testdir/tester.tcl

# This whole file is skipped when the build does not support multiple
# encodings (the "utf16" capability is absent).
ifcapable !utf16 {
  finish_test
  return
}
276c62608fSdrh
# Compare two binary strings byte by byte through do_test.
#
# Both $got and $expect are unpacked into lists of signed 8-bit integers.
# The test script handed to do_test simply yields the list built from
# $got, and the list built from $expect is the expected result, so a
# failure report shows numeric byte values.
#
#   testname - name passed through to do_test
#   got      - binary string actually produced
#   expect   - binary string that was expected
proc do_bincmp_test {testname got expect} {
  binary scan $got    c* actualBytes
  binary scan $expect c* wantedBytes
  do_test $testname [list set dummy $actualBytes] $wantedBytes
}
3328d47b57Sdanielk1977
# $utf16 is a UTF-16 encoded string. Swap each pair of bytes around
# to change the byte-order of the string.
#
#   utf16   - binary string whose length is a multiple of two bytes
#   returns - binary string of the same length with the two bytes of
#             every 16-bit unit exchanged
#
# An empty input yields an empty result. An input with an odd number of
# bytes cannot be a sequence of 16-bit units and is rejected with an
# explicit error.
proc swap_byte_order {utf16} {
  if {[string length $utf16] % 2} {
    error "swap_byte_order: input length must be a multiple of 2 bytes"
  }

  # Read every unit as little-endian ("s") and write it back big-endian
  # ("S"): this exchanges the two bytes of each unit regardless of what
  # the original byte order was.
  binary scan $utf16 s* units
  return [binary format S* $units]
}
4628d47b57Sdanielk1977
#
# Check that the SQLite routines for converting between UTF encodings
# agree with the equivalent TCL conversions.
#
#   testname - prefix used for the names of all tests that are run
#   str      - string to convert (UTF-8, as is normal for TCL)
#
# Tests performed:
#
# 1. ($testname.1) Convert $str from UTF-8 to UTF-16le with both SQLite
#    and TCL and compare the results.
#
# 2. ($testname.2) Same as step 1, for UTF-16be.
#
# 3. ($testname.3) Have SQLite convert the native byte order UTF-16
#    representation back to UTF-8 and compare it with the original.
#
# 4. ($testname.4.le, $testname.4.be) Prefix each UTF-16 representation
#    with a byte-order mark and check that SQLite converts it back to
#    the original UTF-8.  For byte-order mark info, refer to section
#    3.10 of the unicode standard.
#
# 5. ($testname.5.*) Feed the byte-order marked strings from step 4 to
#    SQLite and check that they come back as UTF-16le / UTF-16be
#    strings without the BOM.
#
# Coverage, as listed by the original author:
#
# sqlite_utf8to16be (step 2)
# sqlite_utf8to16le (step 1)
# sqlite_utf16to8 (steps 3, 4)
# sqlite_utf16to16le (step 5)
# sqlite_utf16to16be (step 5)
#
proc test_conversion {testname str} {
  set hostIsLE [expr {$::tcl_platform(byteOrder) eq "littleEndian"}]

  # Step 1.  The TCL reference value is the "unicode" conversion of $str
  # plus a two-byte zero terminator, byte-swapped when the host is not
  # little-endian.
  set sqliteLE [test_translate $str UTF8 UTF16LE]
  set utf16le [encoding convertto unicode $str]
  append utf16le "\x00\x00"
  if {!$hostIsLE} {
    set utf16le [swap_byte_order $utf16le]
  }
  do_bincmp_test $testname.1 $sqliteLE $utf16le

  # Step 2.  Same construction, byte-swapped when the host IS
  # little-endian.
  set sqliteBE [test_translate $str UTF8 UTF16BE]
  set utf16be [encoding convertto unicode $str]
  append utf16be "\x00\x00"
  if {$hostIsLE} {
    set utf16be [swap_byte_order $utf16be]
  }
  do_bincmp_test $testname.2 $sqliteBE $utf16be

  # Step 3.  Pick whichever representation matches the host byte order.
  if {$hostIsLE} {
    set native $utf16le
  } else {
    set native $utf16be
  }
  do_bincmp_test $testname.3 \
      [test_translate $native UTF16 UTF8] [binarize $str]

  # Step 4 (little endian).
  # NOTE(review): this call passes an extra trailing argument (1) to
  # test_translate; its meaning is not visible from this file.
  set leWithBom "\xFF\xFE"
  append leWithBom $utf16le
  do_bincmp_test $testname.4.le \
      [test_translate $leWithBom UTF16 UTF8 1] [binarize $str]

  # Step 4 (big endian).
  set beWithBom "\xFE\xFF"
  append beWithBom $utf16be
  do_bincmp_test $testname.4.be \
      [test_translate $beWithBom UTF16 UTF8] [binarize $str]

  # Step 5.  Each row is: test-name suffix, input, source encoding,
  # target encoding, expected output.  The le.le case names UTF16LE as
  # its source encoding while the other three use UTF16.
  foreach {suffix input from to want} [list \
    le.le $leWithBom UTF16LE UTF16LE $utf16le \
    be.be $beWithBom UTF16   UTF16BE $utf16be \
    be.le $beWithBom UTF16   UTF16LE $utf16le \
    le.be $leWithBom UTF16   UTF16BE $utf16be \
  ] {
    do_bincmp_test $testname.5.$suffix \
        [test_translate $input $from $to] $want
  }
}
13828d47b57Sdanielk1977
# translate_selftest is provided outside this file; it is run once before
# the conversion cases below.
translate_selftest

# Each pair is a test-name prefix and the string handed to
# test_conversion.  enc-8 straddles U+007F/U+0080 (one- to two-byte
# UTF-8), enc-9 straddles U+07FF/U+0800 (two- to three-byte UTF-8), and
# enc-10 uses U+E000, the first code point above the surrogate range.
foreach {encCaseName encCaseStr} [list \
  enc-1  "hello world" \
  enc-2  "sqlite" \
  enc-3  "" \
  enc-X  "\u0100" \
  enc-4  "\u1234" \
  enc-5  "\u4321abc" \
  enc-6  "\u4321\u1234" \
  enc-7  [string repeat "abcde\u00EF\u00EE\uFFFCabc" 100] \
  enc-8  [string repeat "\u007E\u007F\u0080\u0081" 100] \
  enc-9  [string repeat "\u07FE\u07FF\u0800\u0801\uFFF0" 100] \
  enc-10 [string repeat "\uE000" 100] \
] {
  test_conversion $encCaseName $encCaseStr
}
# Do not leave the loop variables behind in the global scope.
unset encCaseName encCaseStr
152a9c16b0aSdanielk1977
# Comparison callback: orders $zLeft against $zRight with
# [string compare] and returns its result (-1, 0 or 1).  The $enc
# argument is accepted but not used.
proc test_collate {enc zLeft zRight} {
  string compare $zLeft $zRight
}
# NOTE(review): add_test_collate is defined outside this file; presumably
# it registers the "test_collate" collation used below - confirm the
# meaning of the three flag arguments against its definition.
add_test_collate $::DB 0 0 1

# enc-11.1: two rows whose first column uses COLLATE test_collate.  The
# first value, X'C388', is the UTF-8 encoding of U+00C8.  The second is a
# C0 lead byte followed by a long run of continuation bytes ending in
# 83 88.
do_execsql_test enc-11.1 {
  CREATE TABLE ab(a COLLATE test_collate, b);
  INSERT INTO ab VALUES(CAST (X'C388' AS TEXT), X'888800');
  INSERT INTO ab VALUES(CAST (X'C0808080808080808080808080808080808080808080808080808080808080808080808080808080808080808080808080808080808388' AS TEXT), X'888800');
  CREATE INDEX ab_i ON ab(a, b);
} {}

# enc-11.2: both rows are expected to match U+00C8 (code point 200).
do_test enc-11.2 {
  set cp200 "\u00C8"
  execsql {
    SELECT count(*) FROM ab WHERE a = $::cp200;
  }
} {2}
1717677c0ccSdanielk1977
#-------------------------------------------------------------------------
# enc-12.0 .. enc-12.3: the main database file is replaced, while the
# connection is open, by a backup of a database with a different text
# encoding.
#
reset_db
forcedelete test.db2
forcedelete test.db3

# Main database is UTF-8; test.db3 is attached as "aux" and populated.
do_execsql_test enc-12.0 {
  PRAGMA encoding = 'utf-8';
  CREATE TABLE t1(a, b, c);
  INSERT INTO t1 VALUES('a', 'b', 'c');
  ATTACH 'test.db3' AS aux;
  CREATE TABLE aux.t3(x, y, z);
  INSERT INTO t3 VALUES('xxx', 'yyy', 'zzz');
  PRAGMA encoding;
} {UTF-8}

# A second connection creates test.db2 as a UTF-16le database holding t2.
do_test enc-12.1 {
  sqlite3 db2 test.db2
  db2 eval {
    PRAGMA encoding = 'UTF-16le';
    CREATE TABLE t2(d, e, f);
    INSERT INTO t2 VALUES('d', 'e', 'f');
    PRAGMA encoding;
  }
} {UTF-16le}

# Back up db2 into the file test.db, then close db2.
do_test enc-12.2 {
  db2 backup test.db
  db2 close
} {}

# Table t2 is only created through db2 above.  Querying it from the
# original connection is expected to fail with the encoding-mismatch
# error instead of returning rows.
do_catchsql_test enc-12.3 {
  SELECT * FROM t2;
} {1 {attached databases must use the same text encoding as main database}}
205*0ea2d42aSdan
# enc-12.4: reopen on test.db3, the file that was attached as "aux" in
# enc-12.0.  The expected result shows the same row before and after
# the PRAGMA encoding statement.
db close
sqlite3 db test.db3
do_execsql_test enc-12.4 {
  SELECT * FROM t3;
  PRAGMA encoding = 'UTF-16le';
  SELECT * FROM t3;
} {xxx yyy zzz xxx yyy zzz}

# enc-12.5: on a fresh connection to the same file, issuing the PRAGMA
# before any other statement is still expected to leave the reported
# encoding at UTF-8.  (A leftover "breakpoint" debugging hook that used
# to precede this test has been removed.)
db close
sqlite3 db test.db3
do_execsql_test enc-12.5 {
  PRAGMA encoding = 'UTF-16le';
  PRAGMA encoding;
} {UTF-8}
221*0ea2d42aSdan
# enc-12.6 .. enc-12.8: same scenario as enc-12.0 .. enc-12.3, but with a
# TEMP table in place of an attached database.
reset_db
do_execsql_test enc-12.6 {
  PRAGMA encoding = 'UTF-8';
  CREATE TEMP TABLE t1(a, b, c);
  INSERT INTO t1 VALUES('xxx', 'yyy', 'zzz');
} {}

# Back up test.db2 (written as UTF-16le by enc-12.1) into test.db, then
# read the TEMP table through the original connection.
do_test enc-12.7 {
  sqlite3 db2 test.db2
  db2 backup test.db
  db2 close
  db eval {
    SELECT * FROM t1;
  }
} {xxx yyy zzz}

# Querying t2 (which exists only in the backed-up content) is expected
# to fail with the encoding-mismatch error.
do_catchsql_test enc-12.8 {
  SELECT * FROM t2;
  SELECT * FROM t1;
} {1 {attached databases must use the same text encoding as main database}}

# enc-12.9 .. enc-12.10: with a connection opened on test.db after the
# backup, a TEMP table and t2 are both expected to be readable.
db close
sqlite3 db test.db
do_execsql_test enc-12.9 {
  CREATE TEMP TABLE t1(a, b, c);
  INSERT INTO t1 VALUES('xxx', 'yyy', 'zzz');
} {}
do_execsql_test enc-12.10 {
  SELECT * FROM t2;
  SELECT * FROM t1;
} {d e f xxx yyy zzz}

finish_test
253