/*
** 2004 April 13
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** This file contains routines used to translate between UTF-8,
** UTF-16, UTF-16BE, and UTF-16LE.
**
** Notes on UTF-8:
**
**   Byte-0    Byte-1    Byte-2    Byte-3    Value
**  0xxxxxxx                                 00000000 00000000 0xxxxxxx
**  110yyyyy  10xxxxxx                       00000000 00000yyy yyxxxxxx
**  1110zzzz  10yyyyyy  10xxxxxx             00000000 zzzzyyyy yyxxxxxx
**  11110uuu  10uuzzzz  10yyyyyy  10xxxxxx   000uuuuu zzzzyyyy yyxxxxxx
**
**
** Notes on UTF-16:  (with wwww+1==uuuuu)
**
**      Word-0               Word-1          Value
**  110110ww wwzzzzyy   110111yy yyxxxxxx    000uuuuu zzzzyyyy yyxxxxxx
**  zzzzyyyy yyxxxxxx                        00000000 zzzzyyyy yyxxxxxx
**
**
** BOM or Byte Order Mark:
**     0xff 0xfe   little-endian utf-16 follows
**     0xfe 0xff   big-endian utf-16 follows
**
*/
36998b56c3Sdanielk1977 #include "sqliteInt.h"
37b659e9bfSdrh #include <assert.h>
38bfd6cce5Sdanielk1977 #include "vdbeInt.h"
39998b56c3Sdanielk1977
#if !defined(SQLITE_AMALGAMATION) && SQLITE_BYTEORDER==0
/*
** An integer constant holding the value 1.  It is referenced by the
** SQLITE_BIGENDIAN and SQLITE_LITTLEENDIAN macros.
*/
const int sqlite3one = 1;
#endif /* !defined(SQLITE_AMALGAMATION) && SQLITE_BYTEORDER==0 */
4738def054Sdrh
/*
** Lookup table that helps decode the first byte of a multi-byte UTF-8
** character.  The table is indexed by (lead byte - 0xc0); each entry is
** the initial value of the code point accumulator for that lead byte.
*/
static const unsigned char sqlite3Utf8Trans1[] = {
  /* lead bytes 0xc0..0xc7 */  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  /* lead bytes 0xc8..0xcf */  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  /* lead bytes 0xd0..0xd7 */  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  /* lead bytes 0xd8..0xdf */  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
  /* lead bytes 0xe0..0xe7 */  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  /* lead bytes 0xe8..0xef */  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  /* lead bytes 0xf0..0xf7 */  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  /* lead bytes 0xf8..0xff */  0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
};
62bfd6cce5Sdanielk1977
6366150956Sdrh
/*
** Append the UTF-8 encoding of the code point c to the byte buffer at
** zOut, advancing zOut past the bytes written (1 to 4 bytes):
**
**     c <  0x00080     1 byte
**     c <  0x00800     2 bytes
**     c <  0x10000     3 bytes
**     otherwise        4 bytes
**
** zOut must be a modifiable byte-pointer lvalue.  Both arguments are
** expanded more than once, so neither may have side effects.  The
** arguments are parenthesized so that an expression may be passed for c,
** and the body is wrapped in do{}while(0) so that the macro behaves as a
** single statement (for example inside an unbraced if/else).
*/
#define WRITE_UTF8(zOut, c) do {                           \
  if( (c)<0x00080 ){                                       \
    *(zOut)++ = (u8)((c)&0xFF);                            \
  }else if( (c)<0x00800 ){                                 \
    *(zOut)++ = 0xC0 + (u8)(((c)>>6)&0x1F);                \
    *(zOut)++ = 0x80 + (u8)((c) & 0x3F);                   \
  }else if( (c)<0x10000 ){                                 \
    *(zOut)++ = 0xE0 + (u8)(((c)>>12)&0x0F);               \
    *(zOut)++ = 0x80 + (u8)(((c)>>6) & 0x3F);              \
    *(zOut)++ = 0x80 + (u8)((c) & 0x3F);                   \
  }else{                                                   \
    *(zOut)++ = 0xF0 + (u8)(((c)>>18) & 0x07);             \
    *(zOut)++ = 0x80 + (u8)(((c)>>12) & 0x3F);             \
    *(zOut)++ = 0x80 + (u8)(((c)>>6) & 0x3F);              \
    *(zOut)++ = 0x80 + (u8)((c) & 0x3F);                   \
  }                                                        \
} while(0)
83bfd6cce5Sdanielk1977
/*
** Append the little-endian UTF-16 encoding of the code point c to the
** byte buffer at zOut, advancing zOut.  Values up to 0xFFFF are written
** as one 16-bit unit (2 bytes, low byte first); larger values are written
** as a surrogate pair (4 bytes, each unit low byte first).
**
** zOut must be a modifiable byte-pointer lvalue.  Both arguments are
** expanded more than once, so neither may have side effects.  The
** arguments are parenthesized and the body is wrapped in do{}while(0) so
** the macro is safe with expression arguments and in unbraced if/else.
*/
#define WRITE_UTF16LE(zOut, c) do {                                      \
  if( (c)<=0xFFFF ){                                                     \
    *(zOut)++ = (u8)((c)&0x00FF);                                        \
    *(zOut)++ = (u8)(((c)>>8)&0x00FF);                                   \
  }else{                                                                 \
    *(zOut)++ = (u8)((((c)>>10)&0x003F) + ((((c)-0x10000)>>10)&0x00C0)); \
    *(zOut)++ = (u8)(0x00D8 + ((((c)-0x10000)>>18)&0x03));               \
    *(zOut)++ = (u8)((c)&0x00FF);                                        \
    *(zOut)++ = (u8)(0x00DC + (((c)>>8)&0x03));                          \
  }                                                                      \
} while(0)
95bfd6cce5Sdanielk1977
/*
** Append the big-endian UTF-16 encoding of the code point c to the byte
** buffer at zOut, advancing zOut.  Values up to 0xFFFF are written as one
** 16-bit unit (2 bytes, high byte first); larger values are written as a
** surrogate pair (4 bytes, each unit high byte first).
**
** zOut must be a modifiable byte-pointer lvalue.  Both arguments are
** expanded more than once, so neither may have side effects.  The
** arguments are parenthesized and the body is wrapped in do{}while(0) so
** the macro is safe with expression arguments and in unbraced if/else.
*/
#define WRITE_UTF16BE(zOut, c) do {                                      \
  if( (c)<=0xFFFF ){                                                     \
    *(zOut)++ = (u8)(((c)>>8)&0x00FF);                                   \
    *(zOut)++ = (u8)((c)&0x00FF);                                        \
  }else{                                                                 \
    *(zOut)++ = (u8)(0x00D8 + ((((c)-0x10000)>>18)&0x03));               \
    *(zOut)++ = (u8)((((c)>>10)&0x003F) + ((((c)-0x10000)>>10)&0x00C0)); \
    *(zOut)++ = (u8)(0x00DC + (((c)>>8)&0x03));                          \
    *(zOut)++ = (u8)((c)&0x00FF);                                        \
  }                                                                      \
} while(0)
107bfd6cce5Sdanielk1977
/*
** Decode a single UTF-8 character starting at zIn and store its unicode
** value in c.  zIn is advanced to the next unread byte.
**
** Continuation bytes are consumed only while zIn!=zTerm, so during
** translation the byte that zTerm points to is treated as if it were
** a 0x00.
**
** zIn and c must be modifiable lvalues; every argument is expanded more
** than once, so none may have side effects.  The body is wrapped in
** do{}while(0) so that the macro behaves as a single statement.
**
** Notes On Invalid UTF-8:
**
**  *  This macro never allows a 7-bit character (0x00 through 0x7f) to
**     be encoded as a multi-byte character.  Any multi-byte character that
**     attempts to encode a value between 0x00 and 0x7f is rendered as 0xfffd.
**
**  *  This macro never allows a UTF16 surrogate value to be encoded.
**     If a multi-byte character attempts to encode a value in the range
**     0xd800 through 0xdfff then it is rendered as 0xfffd.
**
**  *  A multi-byte character that decodes to 0xfffe or 0xffff is also
**     rendered as 0xfffd.
**
**  *  Bytes in the range of 0x80 through 0xbf which occur as the first
**     byte of a character are interpreted as single-byte characters
**     and rendered as themselves even though they are technically
**     invalid characters.
**
**  *  This macro accepts over-length UTF8 encodings
**     for unicode values 0x80 and greater.  It does not change over-length
**     encodings to 0xfffd as some systems recommend.
*/
#define READ_UTF8(zIn, zTerm, c) do {                      \
  (c) = *((zIn)++);                                        \
  if( (c)>=0xc0 ){                                         \
    (c) = sqlite3Utf8Trans1[(c)-0xc0];                     \
    while( (zIn)!=(zTerm) && (*(zIn) & 0xc0)==0x80 ){      \
      (c) = ((c)<<6) + (0x3f & *((zIn)++));                \
    }                                                      \
    if( (c)<0x80                                           \
        || ((c)&0xFFFFF800)==0xD800                        \
        || ((c)&0xFFFFFFFE)==0xFFFE ){  (c) = 0xFFFD; }    \
  }                                                        \
} while(0)
sqlite3Utf8Read(const unsigned char ** pz)1460a32fa6dSdrh u32 sqlite3Utf8Read(
14742610961Sdrh const unsigned char **pz /* Pointer to string from which to read char */
14866150956Sdrh ){
149dba2cc43Sshaneh unsigned int c;
150769e97e0Sdrh
151769e97e0Sdrh /* Same as READ_UTF8() above but without the zTerm parameter.
152769e97e0Sdrh ** For this routine, we assume the UTF8 string is always zero-terminated.
153769e97e0Sdrh */
15442610961Sdrh c = *((*pz)++);
155769e97e0Sdrh if( c>=0xc0 ){
156769e97e0Sdrh c = sqlite3Utf8Trans1[c-0xc0];
15742610961Sdrh while( (*(*pz) & 0xc0)==0x80 ){
15842610961Sdrh c = (c<<6) + (0x3f & *((*pz)++));
159769e97e0Sdrh }
160769e97e0Sdrh if( c<0x80
161769e97e0Sdrh || (c&0xFFFFF800)==0xD800
162769e97e0Sdrh || (c&0xFFFFFFFE)==0xFFFE ){ c = 0xFFFD; }
163769e97e0Sdrh }
16466150956Sdrh return c;
16566150956Sdrh }
16666150956Sdrh
16766150956Sdrh
16866150956Sdrh
169ad76a81eSdanielk1977
17066150956Sdrh /*
171bfd6cce5Sdanielk1977 ** If the TRANSLATE_TRACE macro is defined, the value of each Mem is
172bfd6cce5Sdanielk1977 ** printed on stderr on the way into and out of sqlite3VdbeMemTranslate().
173bfd6cce5Sdanielk1977 */
174bfd6cce5Sdanielk1977 /* #define TRANSLATE_TRACE 1 */
175bfd6cce5Sdanielk1977
1766c62608fSdrh #ifndef SQLITE_OMIT_UTF16
177bfd6cce5Sdanielk1977 /*
178bfd6cce5Sdanielk1977 ** This routine transforms the internal text encoding used by pMem to
179bfd6cce5Sdanielk1977 ** desiredEnc. It is an error if the string is already of the desired
180bfd6cce5Sdanielk1977 ** encoding, or if *pMem does not contain a string value.
181bfd6cce5Sdanielk1977 */
sqlite3VdbeMemTranslate(Mem * pMem,u8 desiredEnc)1824274dae9Sdrh SQLITE_NOINLINE int sqlite3VdbeMemTranslate(Mem *pMem, u8 desiredEnc){
183d4de9f7bSdrh sqlite3_int64 len; /* Maximum length of output string in bytes */
184bfd6cce5Sdanielk1977 unsigned char *zOut; /* Output buffer */
185bfd6cce5Sdanielk1977 unsigned char *zIn; /* Input iterator */
186bfd6cce5Sdanielk1977 unsigned char *zTerm; /* End of input */
187bfd6cce5Sdanielk1977 unsigned char *z; /* Output iterator */
188a39f4c5cSdrh unsigned int c;
189bfd6cce5Sdanielk1977
190b21c8cd4Sdrh assert( pMem->db==0 || sqlite3_mutex_held(pMem->db->mutex) );
191bfd6cce5Sdanielk1977 assert( pMem->flags&MEM_Str );
192bfd6cce5Sdanielk1977 assert( pMem->enc!=desiredEnc );
193bfd6cce5Sdanielk1977 assert( pMem->enc!=0 );
194bfd6cce5Sdanielk1977 assert( pMem->n>=0 );
195bfd6cce5Sdanielk1977
196b5402fbfSdanielk1977 #if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG)
197bfd6cce5Sdanielk1977 {
1985ca06329Sdrh StrAccum acc;
1995ca06329Sdrh char zBuf[1000];
2005ca06329Sdrh sqlite3StrAccumInit(&acc, 0, zBuf, sizeof(zBuf), 0);
2015ca06329Sdrh sqlite3VdbeMemPrettyPrint(pMem, &acc);
2025ca06329Sdrh fprintf(stderr, "INPUT: %s\n", sqlite3StrAccumFinish(&acc));
203ad7dd425Sdanielk1977 }
204ad7dd425Sdanielk1977 #endif
205ad7dd425Sdanielk1977
206bfd6cce5Sdanielk1977 /* If the translation is between UTF-16 little and big endian, then
207bfd6cce5Sdanielk1977 ** all that is required is to swap the byte order. This case is handled
208bfd6cce5Sdanielk1977 ** differently from the others.
209d02eb1fdSdanielk1977 */
210bfd6cce5Sdanielk1977 if( pMem->enc!=SQLITE_UTF8 && desiredEnc!=SQLITE_UTF8 ){
211bfd6cce5Sdanielk1977 u8 temp;
21271c697efSdrh int rc;
213b21c8cd4Sdrh rc = sqlite3VdbeMemMakeWriteable(pMem);
21471c697efSdrh if( rc!=SQLITE_OK ){
21571c697efSdrh assert( rc==SQLITE_NOMEM );
216fad3039cSmistachkin return SQLITE_NOMEM_BKPT;
21771c697efSdrh }
2182646da7eSdrh zIn = (u8*)pMem->z;
219bbf695d6Sdrh zTerm = &zIn[pMem->n&~1];
220bfd6cce5Sdanielk1977 while( zIn<zTerm ){
221bfd6cce5Sdanielk1977 temp = *zIn;
222bfd6cce5Sdanielk1977 *zIn = *(zIn+1);
223bfd6cce5Sdanielk1977 zIn++;
224bfd6cce5Sdanielk1977 *zIn++ = temp;
225bfd6cce5Sdanielk1977 }
226bfd6cce5Sdanielk1977 pMem->enc = desiredEnc;
227bfd6cce5Sdanielk1977 goto translate_out;
228d02eb1fdSdanielk1977 }
229d02eb1fdSdanielk1977
230d7e69648Sdanielk1977 /* Set len to the maximum number of bytes required in the output buffer. */
231d7e69648Sdanielk1977 if( desiredEnc==SQLITE_UTF8 ){
232d7e69648Sdanielk1977 /* When converting from UTF-16, the maximum growth results from
233a49b8611Sdrh ** translating a 2-byte character to a 4-byte UTF-8 character.
234a49b8611Sdrh ** A single byte is required for the output string
235d7e69648Sdanielk1977 ** nul-terminator.
236d7e69648Sdanielk1977 */
237bbf695d6Sdrh pMem->n &= ~1;
238d4de9f7bSdrh len = 2 * (sqlite3_int64)pMem->n + 1;
239d7e69648Sdanielk1977 }else{
240d7e69648Sdanielk1977 /* When converting from UTF-8 to UTF-16 the maximum growth is caused
241d7e69648Sdanielk1977 ** when a 1-byte UTF-8 character is translated into a 2-byte UTF-16
242d7e69648Sdanielk1977 ** character. Two bytes are required in the output buffer for the
243d7e69648Sdanielk1977 ** nul-terminator.
244d7e69648Sdanielk1977 */
245d4de9f7bSdrh len = 2 * (sqlite3_int64)pMem->n + 2;
246d7e69648Sdanielk1977 }
247d7e69648Sdanielk1977
248bfd6cce5Sdanielk1977 /* Set zIn to point at the start of the input buffer and zTerm to point 1
249bfd6cce5Sdanielk1977 ** byte past the end.
250bfd6cce5Sdanielk1977 **
251a7a8e14bSdanielk1977 ** Variable zOut is set to point at the output buffer, space obtained
252a7a8e14bSdanielk1977 ** from sqlite3_malloc().
253d02eb1fdSdanielk1977 */
2542646da7eSdrh zIn = (u8*)pMem->z;
255bfd6cce5Sdanielk1977 zTerm = &zIn[pMem->n];
256b21c8cd4Sdrh zOut = sqlite3DbMallocRaw(pMem->db, len);
257b21c8cd4Sdrh if( !zOut ){
258fad3039cSmistachkin return SQLITE_NOMEM_BKPT;
259b21c8cd4Sdrh }
260bfd6cce5Sdanielk1977 z = zOut;
261bfd6cce5Sdanielk1977
262bfd6cce5Sdanielk1977 if( pMem->enc==SQLITE_UTF8 ){
263bfd6cce5Sdanielk1977 if( desiredEnc==SQLITE_UTF16LE ){
264bfd6cce5Sdanielk1977 /* UTF-8 -> UTF-16 Little-endian */
265bfd6cce5Sdanielk1977 while( zIn<zTerm ){
266ad76a81eSdanielk1977 READ_UTF8(zIn, zTerm, c);
267bfd6cce5Sdanielk1977 WRITE_UTF16LE(z, c);
268bfd6cce5Sdanielk1977 }
269b8dd3155Sdrh }else{
270b8dd3155Sdrh assert( desiredEnc==SQLITE_UTF16BE );
271bfd6cce5Sdanielk1977 /* UTF-8 -> UTF-16 Big-endian */
272bfd6cce5Sdanielk1977 while( zIn<zTerm ){
273ad76a81eSdanielk1977 READ_UTF8(zIn, zTerm, c);
274bfd6cce5Sdanielk1977 WRITE_UTF16BE(z, c);
275bfd6cce5Sdanielk1977 }
276bfd6cce5Sdanielk1977 }
277ea678832Sdrh pMem->n = (int)(z - zOut);
278b8dd3155Sdrh *z++ = 0;
279bfd6cce5Sdanielk1977 }else{
280bfd6cce5Sdanielk1977 assert( desiredEnc==SQLITE_UTF8 );
281bfd6cce5Sdanielk1977 if( pMem->enc==SQLITE_UTF16LE ){
282bfd6cce5Sdanielk1977 /* UTF-16 Little-endian -> UTF-8 */
283bfd6cce5Sdanielk1977 while( zIn<zTerm ){
2840184a256Sdrh c = *(zIn++);
2850184a256Sdrh c += (*(zIn++))<<8;
2860184a256Sdrh if( c>=0xd800 && c<0xe000 ){
2874f1315a4Sdrh #ifdef SQLITE_REPLACE_INVALID_UTF
2880184a256Sdrh if( c>=0xdc00 || zIn>=zTerm ){
2890184a256Sdrh c = 0xfffd;
2900184a256Sdrh }else{
2910184a256Sdrh int c2 = *(zIn++);
2920184a256Sdrh c2 += (*(zIn++))<<8;
2930184a256Sdrh if( c2<0xdc00 || c2>=0xe000 ){
2940184a256Sdrh zIn -= 2;
2950184a256Sdrh c = 0xfffd;
2960184a256Sdrh }else{
2970184a256Sdrh c = ((c&0x3ff)<<10) + (c2&0x3ff) + 0x10000;
2980184a256Sdrh }
2990184a256Sdrh }
3004f1315a4Sdrh #else
3014f1315a4Sdrh if( zIn<zTerm ){
3024f1315a4Sdrh int c2 = (*zIn++);
3034f1315a4Sdrh c2 += ((*zIn++)<<8);
3044f1315a4Sdrh c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10);
3054f1315a4Sdrh }
3064f1315a4Sdrh #endif
3070184a256Sdrh }
308bfd6cce5Sdanielk1977 WRITE_UTF8(z, c);
309bfd6cce5Sdanielk1977 }
310bfd6cce5Sdanielk1977 }else{
3117ffb2b5fSmihailim /* UTF-16 Big-endian -> UTF-8 */
312bfd6cce5Sdanielk1977 while( zIn<zTerm ){
3130184a256Sdrh c = (*(zIn++))<<8;
3140184a256Sdrh c += *(zIn++);
3150184a256Sdrh if( c>=0xd800 && c<0xe000 ){
3164f1315a4Sdrh #ifdef SQLITE_REPLACE_INVALID_UTF
3170184a256Sdrh if( c>=0xdc00 || zIn>=zTerm ){
3180184a256Sdrh c = 0xfffd;
3190184a256Sdrh }else{
3200184a256Sdrh int c2 = (*(zIn++))<<8;
3210184a256Sdrh c2 += *(zIn++);
3220184a256Sdrh if( c2<0xdc00 || c2>=0xe000 ){
3230184a256Sdrh zIn -= 2;
3240184a256Sdrh c = 0xfffd;
3250184a256Sdrh }else{
3260184a256Sdrh c = ((c&0x3ff)<<10) + (c2&0x3ff) + 0x10000;
3270184a256Sdrh }
3280184a256Sdrh }
3294f1315a4Sdrh #else
3304f1315a4Sdrh if( zIn<zTerm ){
3314f1315a4Sdrh int c2 = ((*zIn++)<<8);
3324f1315a4Sdrh c2 += (*zIn++);
3334f1315a4Sdrh c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10);
3344f1315a4Sdrh }
3354f1315a4Sdrh #endif
3360184a256Sdrh }
337bfd6cce5Sdanielk1977 WRITE_UTF8(z, c);
338bfd6cce5Sdanielk1977 }
339bfd6cce5Sdanielk1977 }
340aa78bec9Sdrh pMem->n = (int)(z - zOut);
341d02eb1fdSdanielk1977 }
342b8dd3155Sdrh *z = 0;
343d7e69648Sdanielk1977 assert( (pMem->n+(desiredEnc==SQLITE_UTF8?1:2))<=len );
344d02eb1fdSdanielk1977
345*21b473deSdrh c = MEM_Str|MEM_Term|(pMem->flags&(MEM_AffMask|MEM_Subtype));
346bfd6cce5Sdanielk1977 sqlite3VdbeMemRelease(pMem);
347*21b473deSdrh pMem->flags = c;
348bfd6cce5Sdanielk1977 pMem->enc = desiredEnc;
3492646da7eSdrh pMem->z = (char*)zOut;
3505f096135Sdanielk1977 pMem->zMalloc = pMem->z;
35117bcb102Sdrh pMem->szMalloc = sqlite3DbMallocSize(pMem->db, pMem->z);
352d02eb1fdSdanielk1977
353bfd6cce5Sdanielk1977 translate_out:
354b5402fbfSdanielk1977 #if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG)
355bfd6cce5Sdanielk1977 {
3565ca06329Sdrh StrAccum acc;
3575ca06329Sdrh char zBuf[1000];
3585ca06329Sdrh sqlite3StrAccumInit(&acc, 0, zBuf, sizeof(zBuf), 0);
3595ca06329Sdrh sqlite3VdbeMemPrettyPrint(pMem, &acc);
3605ca06329Sdrh fprintf(stderr, "OUTPUT: %s\n", sqlite3StrAccumFinish(&acc));
361d02eb1fdSdanielk1977 }
362bfd6cce5Sdanielk1977 #endif
363bfd6cce5Sdanielk1977 return SQLITE_OK;
364d02eb1fdSdanielk1977 }
365f0f44b79Sdrh #endif /* SQLITE_OMIT_UTF16 */
366d02eb1fdSdanielk1977
367f0f44b79Sdrh #ifndef SQLITE_OMIT_UTF16
368d02eb1fdSdanielk1977 /*
369bfd6cce5Sdanielk1977 ** This routine checks for a byte-order mark at the beginning of the
370bfd6cce5Sdanielk1977 ** UTF-16 string stored in *pMem. If one is present, it is removed and
371bfd6cce5Sdanielk1977 ** the encoding of the Mem adjusted. This routine does not do any
372bfd6cce5Sdanielk1977 ** byte-swapping, it just sets Mem.enc appropriately.
373998b56c3Sdanielk1977 **
374bfd6cce5Sdanielk1977 ** The allocation (static, dynamic etc.) and encoding of the Mem may be
375bfd6cce5Sdanielk1977 ** changed by this function.
376998b56c3Sdanielk1977 */
sqlite3VdbeMemHandleBom(Mem * pMem)377b21c8cd4Sdrh int sqlite3VdbeMemHandleBom(Mem *pMem){
378bfd6cce5Sdanielk1977 int rc = SQLITE_OK;
379bfd6cce5Sdanielk1977 u8 bom = 0;
380998b56c3Sdanielk1977
381769e97e0Sdrh assert( pMem->n>=0 );
382769e97e0Sdrh if( pMem->n>1 ){
383bfd6cce5Sdanielk1977 u8 b1 = *(u8 *)pMem->z;
384bfd6cce5Sdanielk1977 u8 b2 = *(((u8 *)pMem->z) + 1);
38593d4675dSdanielk1977 if( b1==0xFE && b2==0xFF ){
386bfd6cce5Sdanielk1977 bom = SQLITE_UTF16BE;
38793d4675dSdanielk1977 }
38893d4675dSdanielk1977 if( b1==0xFF && b2==0xFE ){
389bfd6cce5Sdanielk1977 bom = SQLITE_UTF16LE;
39093d4675dSdanielk1977 }
39193d4675dSdanielk1977 }
392bfd6cce5Sdanielk1977
393bfd6cce5Sdanielk1977 if( bom ){
394a7a8e14bSdanielk1977 rc = sqlite3VdbeMemMakeWriteable(pMem);
395a7a8e14bSdanielk1977 if( rc==SQLITE_OK ){
396a7a8e14bSdanielk1977 pMem->n -= 2;
397a7a8e14bSdanielk1977 memmove(pMem->z, &pMem->z[2], pMem->n);
398a7a8e14bSdanielk1977 pMem->z[pMem->n] = '\0';
399a7a8e14bSdanielk1977 pMem->z[pMem->n+1] = '\0';
400a7a8e14bSdanielk1977 pMem->flags |= MEM_Term;
401a7a8e14bSdanielk1977 pMem->enc = bom;
402998b56c3Sdanielk1977 }
403bfd6cce5Sdanielk1977 }
404bfd6cce5Sdanielk1977 return rc;
405998b56c3Sdanielk1977 }
4066c62608fSdrh #endif /* SQLITE_OMIT_UTF16 */
407998b56c3Sdanielk1977
/*
** zIn is a UTF-8 encoded unicode string.  If nByte is less than zero,
** return the number of unicode characters in zIn up to (but not including)
** the first 0x00 byte.  If nByte is not less than zero, return the
** number of unicode characters in the first nByte bytes of zIn (or up to
** the first 0x00, whichever comes first).
**
** A character is a first byte together with, when that first byte is 0xc0
** or greater, any run of continuation bytes (0x80 through 0xbf) that
** follows it.  A character that begins inside the nByte limit is counted
** even if its continuation bytes would extend beyond the limit.
**
** When nByte is non-negative, no byte at or beyond zIn[nByte] is ever
** read, so the buffer does not need to be zero-terminated.
*/
int sqlite3Utf8CharLen(const char *zIn, int nByte){
  int r = 0;
  const unsigned char *z = (const unsigned char*)zIn;

  if( nByte<0 ){
    /* Unbounded case: the string must be zero-terminated */
    while( *z!=0 ){
      if( *(z++)>=0xc0 ){
        while( (*z & 0xc0)==0x80 ){ z++; }
      }
      r++;
    }
  }else{
    const unsigned char *zTerm = &z[nByte];
    /* Check the bound before dereferencing so that z[nByte] is not read */
    while( z<zTerm && *z!=0 ){
      if( *(z++)>=0xc0 ){
        while( z<zTerm && (*z & 0xc0)==0x80 ){ z++; }
      }
      r++;
    }
  }
  return r;
}
4316622cce3Sdanielk1977
/* This test function is not currently used by the automated test-suite.
** Hence it is only available in debug builds.
*/
#if defined(SQLITE_TEST) && defined(SQLITE_DEBUG)
/*
** Translate UTF-8 to UTF-8, in place.
**
** Each character is decoded with sqlite3Utf8Read() and written back with
** WRITE_UTF8().  Characters that decode to 0xfffd are dropped.
**
** The loop stops at the first 0x00 byte, or as soon as the write position
** has moved ahead of the read position.  The result is zero-terminated
** and its length in bytes is returned.
*/
int sqlite3Utf8To8(unsigned char *zIn){
  unsigned char *const zBase = zIn;   /* Start of the string */
  unsigned char *zWr = zIn;           /* Next byte to write */
  u32 c;

  for(;;){
    if( zIn[0]==0 || zWr>zIn ) break;
    c = sqlite3Utf8Read((const u8**)&zIn);
    if( c==0xfffd ) continue;
    WRITE_UTF8(zWr, c);
  }
  *zWr = 0;
  return (int)(zWr - zBase);
}
#endif
4604152e677Sdanielk1977
4616c62608fSdrh #ifndef SQLITE_OMIT_UTF16
4626622cce3Sdanielk1977 /*
463af9a7c22Sdrh ** Convert a UTF-16 string in the native encoding into a UTF-8 string.
46417435752Sdrh ** Memory to hold the UTF-8 string is obtained from sqlite3_malloc and must
46517435752Sdrh ** be freed by the calling function.
466af9a7c22Sdrh **
467af9a7c22Sdrh ** NULL is returned if there is an allocation error.
468af9a7c22Sdrh */
sqlite3Utf16to8(sqlite3 * db,const void * z,int nByte,u8 enc)469b7dca7d7Sdan char *sqlite3Utf16to8(sqlite3 *db, const void *z, int nByte, u8 enc){
470af9a7c22Sdrh Mem m;
471af9a7c22Sdrh memset(&m, 0, sizeof(m));
472b21c8cd4Sdrh m.db = db;
473b7dca7d7Sdan sqlite3VdbeMemSetStr(&m, z, nByte, enc, SQLITE_STATIC);
474b21c8cd4Sdrh sqlite3VdbeChangeEncoding(&m, SQLITE_UTF8);
475ae72d982Sdanielk1977 if( db->mallocFailed ){
476ae72d982Sdanielk1977 sqlite3VdbeMemRelease(&m);
477ae72d982Sdanielk1977 m.z = 0;
478ae72d982Sdanielk1977 }
47917435752Sdrh assert( (m.flags & MEM_Term)!=0 || db->mallocFailed );
48017435752Sdrh assert( (m.flags & MEM_Str)!=0 || db->mallocFailed );
481b7dca7d7Sdan assert( m.z || db->mallocFailed );
482b7dca7d7Sdan return m.z;
483af9a7c22Sdrh }
484af9a7c22Sdrh
485af9a7c22Sdrh /*
4861faca757Sdrh ** zIn is a UTF-16 encoded unicode string at least nChar characters long.
487aed382f9Sdrh ** Return the number of bytes in the first nChar unicode characters
488aed382f9Sdrh ** in pZ. nChar must be non-negative.
4896622cce3Sdanielk1977 */
sqlite3Utf16ByteLen(const void * zIn,int nChar)490ee85813cSdrh int sqlite3Utf16ByteLen(const void *zIn, int nChar){
491aed382f9Sdrh int c;
492aed382f9Sdrh unsigned char const *z = zIn;
493bfd6cce5Sdanielk1977 int n = 0;
4946d116cacSdrh
4950184a256Sdrh if( SQLITE_UTF16NATIVE==SQLITE_UTF16LE ) z++;
496aed382f9Sdrh while( n<nChar ){
4970184a256Sdrh c = z[0];
4980184a256Sdrh z += 2;
4990184a256Sdrh if( c>=0xd8 && c<0xdc && z[0]>=0xdc && z[0]<0xe0 ) z += 2;
500bfd6cce5Sdanielk1977 n++;
501998b56c3Sdanielk1977 }
5020184a256Sdrh return (int)(z-(unsigned char const *)zIn)
5030184a256Sdrh - (SQLITE_UTF16NATIVE==SQLITE_UTF16LE);
504998b56c3Sdanielk1977 }
505998b56c3Sdanielk1977
#if defined(SQLITE_TEST)
/*
** This routine is called from the TCL test function "translate_selftest".
** For every value from 0 up to 0x10FFFF it encodes the value with
** WRITE_UTF8(), decodes it again with sqlite3Utf8Read(), and asserts that
** the decoded value and the number of bytes consumed match.  Values that
** the decoder replaces (0xD800..0xDFFF, 0xFFFE and 0xFFFF) are expected
** to come back as 0xFFFD.
*/
void sqlite3UtfSelfTest(void){
  unsigned int cp;              /* Value under test */
  unsigned char aEnc[20];       /* Encoded form of cp */

  for(cp=0; cp<0x00110000; cp++){
    unsigned char *zPos = aEnc;
    unsigned int expected = cp;
    unsigned int got;
    int nEnc;

    WRITE_UTF8(zPos, cp);
    nEnc = (int)(zPos-aEnc);
    assert( nEnc>0 && nEnc<=4 );
    zPos[0] = 0;

    zPos = aEnc;
    got = sqlite3Utf8Read((const u8**)&zPos);
    if( (cp>=0xD800 && cp<=0xDFFF) || (cp&0xFFFFFFFE)==0xFFFE ){
      expected = 0xFFFD;
    }
    assert( got==expected );
    assert( (zPos-aEnc)==nEnc );
  }
}
#endif /* SQLITE_TEST */
5356c62608fSdrh #endif /* SQLITE_OMIT_UTF16 */
536