/* xref: /sqlite-3.40.0/src/utf.c (revision 21b473de) */
/*
** 2004 April 13
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** This file contains routines used to translate between UTF-8,
** UTF-16, UTF-16BE, and UTF-16LE.
**
** Notes on UTF-8:
**
**   Byte-0    Byte-1    Byte-2    Byte-3    Value
**  0xxxxxxx                                 00000000 00000000 0xxxxxxx
**  110yyyyy  10xxxxxx                       00000000 00000yyy yyxxxxxx
**  1110zzzz  10yyyyyy  10xxxxxx             00000000 zzzzyyyy yyxxxxxx
**  11110uuu  10uuzzzz  10yyyyyy  10xxxxxx   000uuuuu zzzzyyyy yyxxxxxx
**
**
** Notes on UTF-16:  (with wwww+1==uuuuu)
**
**      Word-0               Word-1          Value
**  110110ww wwzzzzyy   110111yy yyxxxxxx    000uuuuu zzzzyyyy yyxxxxxx
**  zzzzyyyy yyxxxxxx                        00000000 zzzzyyyy yyxxxxxx
**
**
** BOM or Byte Order Mark:
**     0xff 0xfe   little-endian utf-16 follows
**     0xfe 0xff   big-endian utf-16 follows
**
*/
#include "sqliteInt.h"
#include <assert.h>
#include "vdbeInt.h"

#if !defined(SQLITE_AMALGAMATION) && SQLITE_BYTEORDER==0
/*
** The following constant value is used by the SQLITE_BIGENDIAN and
** SQLITE_LITTLEENDIAN macros.  It is only emitted when this file is
** compiled outside the amalgamation and SQLITE_BYTEORDER is 0.
*/
const int sqlite3one = 1;
#endif /* !SQLITE_AMALGAMATION && SQLITE_BYTEORDER==0 */

/*
** This lookup table is used to help decode the first byte of
** a multi-byte UTF8 character.
**
** The table is indexed by (leadByte - 0xc0), so it covers lead bytes
** 0xc0 through 0xff (64 entries).  Each entry holds the payload bits
** carried by that lead byte:
**
**    0xc0..0xdf   low 5 bits
**    0xe0..0xef   low 4 bits
**    0xf0..0xf7   low 3 bits
**    0xf8..0xfb   low 2 bits
**    0xfc..0xfd   low 1 bit
**    0xfe..0xff   zero
*/
static const unsigned char sqlite3Utf8Trans1[] = {
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
};

/*
** Append the UTF-8 encoding of the value c to the byte buffer at zOut
** and advance zOut past the bytes written (1, 2, 3 or 4 bytes).
**
** zOut must be a modifiable pointer lvalue.  Both arguments are
** evaluated more than once, so they must not have side effects.
** No validation is done on c: any value of 0x10000 or more is written
** as four bytes using only its low 21 bits.
**
** The body is wrapped in do{...}while(0) so that the macro followed by
** a semicolon behaves as exactly one statement.
*/
#define WRITE_UTF8(zOut, c) do{                          \
  if( (c)<0x00080 ){                                     \
    *(zOut)++ = (u8)((c)&0xFF);                          \
  }                                                      \
  else if( (c)<0x00800 ){                                \
    *(zOut)++ = 0xC0 + (u8)(((c)>>6)&0x1F);              \
    *(zOut)++ = 0x80 + (u8)((c) & 0x3F);                 \
  }                                                      \
  else if( (c)<0x10000 ){                                \
    *(zOut)++ = 0xE0 + (u8)(((c)>>12)&0x0F);             \
    *(zOut)++ = 0x80 + (u8)(((c)>>6) & 0x3F);            \
    *(zOut)++ = 0x80 + (u8)((c) & 0x3F);                 \
  }else{                                                 \
    *(zOut)++ = 0xF0 + (u8)(((c)>>18) & 0x07);           \
    *(zOut)++ = 0x80 + (u8)(((c)>>12) & 0x3F);           \
    *(zOut)++ = 0x80 + (u8)(((c)>>6) & 0x3F);            \
    *(zOut)++ = 0x80 + (u8)((c) & 0x3F);                 \
  }                                                      \
}while(0)

/*
** Append the little-endian UTF-16 encoding of the value c to the byte
** buffer at zOut and advance zOut past the bytes written.  Values up to
** 0xFFFF are written as one 2-byte code unit; larger values are written
** as a 4-byte surrogate pair (high surrogate first).
**
** zOut must be a modifiable pointer lvalue.  Both arguments are
** evaluated more than once, so they must not have side effects.
** No validation is done on c.
*/
#define WRITE_UTF16LE(zOut, c) do{                                        \
  if( (c)<=0xFFFF ){                                                      \
    *(zOut)++ = (u8)((c)&0x00FF);                                         \
    *(zOut)++ = (u8)(((c)>>8)&0x00FF);                                    \
  }else{                                                                  \
    *(zOut)++ = (u8)((((c)>>10)&0x003F) + ((((c)-0x10000)>>10)&0x00C0));  \
    *(zOut)++ = (u8)(0x00D8 + ((((c)-0x10000)>>18)&0x03));                \
    *(zOut)++ = (u8)((c)&0x00FF);                                         \
    *(zOut)++ = (u8)(0x00DC + (((c)>>8)&0x03));                           \
  }                                                                       \
}while(0)

/*
** Append the big-endian UTF-16 encoding of the value c to the byte
** buffer at zOut and advance zOut past the bytes written.  Values up to
** 0xFFFF are written as one 2-byte code unit; larger values are written
** as a 4-byte surrogate pair (high surrogate first).
**
** zOut must be a modifiable pointer lvalue.  Both arguments are
** evaluated more than once, so they must not have side effects.
** No validation is done on c.
*/
#define WRITE_UTF16BE(zOut, c) do{                                        \
  if( (c)<=0xFFFF ){                                                      \
    *(zOut)++ = (u8)(((c)>>8)&0x00FF);                                    \
    *(zOut)++ = (u8)((c)&0x00FF);                                         \
  }else{                                                                  \
    *(zOut)++ = (u8)(0x00D8 + ((((c)-0x10000)>>18)&0x03));                \
    *(zOut)++ = (u8)((((c)>>10)&0x003F) + ((((c)-0x10000)>>10)&0x00C0));  \
    *(zOut)++ = (u8)(0x00DC + (((c)>>8)&0x03));                           \
    *(zOut)++ = (u8)((c)&0x00FF);                                         \
  }                                                                       \
}while(0)

/*
** Translate a single UTF-8 character.  The unicode value is stored
** in c.
**
** zIn is advanced so that it points to the next unread byte.  Reading
** of continuation bytes stops when zIn reaches zTerm; in other words,
** during translation the byte that zTerm points to is treated as if it
** were a 0x00.  The first byte is read unconditionally, so the caller
** must ensure zIn!=zTerm on entry.
**
** zIn and c must be modifiable lvalues.  All arguments are evaluated
** more than once, so they must not have side effects.  The body is
** wrapped in do{...}while(0) so that the macro followed by a semicolon
** behaves as exactly one statement.
**
** Notes On Invalid UTF-8:
**
**  *  This routine never allows a 7-bit character (0x00 through 0x7f) to
**     be encoded as a multi-byte character.  Any multi-byte character that
**     attempts to encode a value between 0x00 and 0x7f is rendered as 0xfffd.
**
**  *  This routine never allows a UTF16 surrogate value to be encoded.
**     If a multi-byte character attempts to encode a value between
**     0xd800 and 0xdfff then it is rendered as 0xfffd.
**
**  *  A multi-byte character that encodes 0xfffe or 0xffff is rendered
**     as 0xfffd.
**
**  *  Bytes in the range of 0x80 through 0xbf which occur as the first
**     byte of a character are interpreted as single-byte characters
**     and rendered as themselves even though they are technically
**     invalid characters.
**
**  *  This routine accepts over-length UTF8 encodings
**     for unicode values 0x80 and greater.  It does not change over-length
**     encodings to 0xfffd as some systems recommend.
*/
#define READ_UTF8(zIn, zTerm, c) do{                         \
  (c) = *((zIn)++);                                          \
  if( (c)>=0xc0 ){                                           \
    (c) = sqlite3Utf8Trans1[(c)-0xc0];                       \
    while( (zIn)!=(zTerm) && (*(zIn) & 0xc0)==0x80 ){        \
      (c) = ((c)<<6) + (0x3f & *((zIn)++));                  \
    }                                                        \
    if( (c)<0x80                                             \
        || ((c)&0xFFFFF800)==0xD800                          \
        || ((c)&0xFFFFFFFE)==0xFFFE ){  (c) = 0xFFFD; }      \
  }                                                          \
}while(0)
sqlite3Utf8Read(const unsigned char ** pz)1460a32fa6dSdrh u32 sqlite3Utf8Read(
14742610961Sdrh   const unsigned char **pz    /* Pointer to string from which to read char */
14866150956Sdrh ){
149dba2cc43Sshaneh   unsigned int c;
150769e97e0Sdrh 
151769e97e0Sdrh   /* Same as READ_UTF8() above but without the zTerm parameter.
152769e97e0Sdrh   ** For this routine, we assume the UTF8 string is always zero-terminated.
153769e97e0Sdrh   */
15442610961Sdrh   c = *((*pz)++);
155769e97e0Sdrh   if( c>=0xc0 ){
156769e97e0Sdrh     c = sqlite3Utf8Trans1[c-0xc0];
15742610961Sdrh     while( (*(*pz) & 0xc0)==0x80 ){
15842610961Sdrh       c = (c<<6) + (0x3f & *((*pz)++));
159769e97e0Sdrh     }
160769e97e0Sdrh     if( c<0x80
161769e97e0Sdrh         || (c&0xFFFFF800)==0xD800
162769e97e0Sdrh         || (c&0xFFFFFFFE)==0xFFFE ){  c = 0xFFFD; }
163769e97e0Sdrh   }
16466150956Sdrh   return c;
16566150956Sdrh }
16666150956Sdrh 
16766150956Sdrh 
16866150956Sdrh 
169ad76a81eSdanielk1977 
/*
** If the TRANSLATE_TRACE macro is defined (and SQLITE_DEBUG is also
** defined), the value of each Mem is printed on stderr on the way into
** and out of sqlite3VdbeMemTranslate().
*/
/* #define TRANSLATE_TRACE 1 */

1766c62608fSdrh #ifndef SQLITE_OMIT_UTF16
177bfd6cce5Sdanielk1977 /*
178bfd6cce5Sdanielk1977 ** This routine transforms the internal text encoding used by pMem to
179bfd6cce5Sdanielk1977 ** desiredEnc. It is an error if the string is already of the desired
180bfd6cce5Sdanielk1977 ** encoding, or if *pMem does not contain a string value.
181bfd6cce5Sdanielk1977 */
sqlite3VdbeMemTranslate(Mem * pMem,u8 desiredEnc)1824274dae9Sdrh SQLITE_NOINLINE int sqlite3VdbeMemTranslate(Mem *pMem, u8 desiredEnc){
183d4de9f7bSdrh   sqlite3_int64 len;          /* Maximum length of output string in bytes */
184bfd6cce5Sdanielk1977   unsigned char *zOut;        /* Output buffer */
185bfd6cce5Sdanielk1977   unsigned char *zIn;         /* Input iterator */
186bfd6cce5Sdanielk1977   unsigned char *zTerm;       /* End of input */
187bfd6cce5Sdanielk1977   unsigned char *z;           /* Output iterator */
188a39f4c5cSdrh   unsigned int c;
189bfd6cce5Sdanielk1977 
190b21c8cd4Sdrh   assert( pMem->db==0 || sqlite3_mutex_held(pMem->db->mutex) );
191bfd6cce5Sdanielk1977   assert( pMem->flags&MEM_Str );
192bfd6cce5Sdanielk1977   assert( pMem->enc!=desiredEnc );
193bfd6cce5Sdanielk1977   assert( pMem->enc!=0 );
194bfd6cce5Sdanielk1977   assert( pMem->n>=0 );
195bfd6cce5Sdanielk1977 
196b5402fbfSdanielk1977 #if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG)
197bfd6cce5Sdanielk1977   {
1985ca06329Sdrh     StrAccum acc;
1995ca06329Sdrh     char zBuf[1000];
2005ca06329Sdrh     sqlite3StrAccumInit(&acc, 0, zBuf, sizeof(zBuf), 0);
2015ca06329Sdrh     sqlite3VdbeMemPrettyPrint(pMem, &acc);
2025ca06329Sdrh     fprintf(stderr, "INPUT:  %s\n", sqlite3StrAccumFinish(&acc));
203ad7dd425Sdanielk1977   }
204ad7dd425Sdanielk1977 #endif
205ad7dd425Sdanielk1977 
206bfd6cce5Sdanielk1977   /* If the translation is between UTF-16 little and big endian, then
207bfd6cce5Sdanielk1977   ** all that is required is to swap the byte order. This case is handled
208bfd6cce5Sdanielk1977   ** differently from the others.
209d02eb1fdSdanielk1977   */
210bfd6cce5Sdanielk1977   if( pMem->enc!=SQLITE_UTF8 && desiredEnc!=SQLITE_UTF8 ){
211bfd6cce5Sdanielk1977     u8 temp;
21271c697efSdrh     int rc;
213b21c8cd4Sdrh     rc = sqlite3VdbeMemMakeWriteable(pMem);
21471c697efSdrh     if( rc!=SQLITE_OK ){
21571c697efSdrh       assert( rc==SQLITE_NOMEM );
216fad3039cSmistachkin       return SQLITE_NOMEM_BKPT;
21771c697efSdrh     }
2182646da7eSdrh     zIn = (u8*)pMem->z;
219bbf695d6Sdrh     zTerm = &zIn[pMem->n&~1];
220bfd6cce5Sdanielk1977     while( zIn<zTerm ){
221bfd6cce5Sdanielk1977       temp = *zIn;
222bfd6cce5Sdanielk1977       *zIn = *(zIn+1);
223bfd6cce5Sdanielk1977       zIn++;
224bfd6cce5Sdanielk1977       *zIn++ = temp;
225bfd6cce5Sdanielk1977     }
226bfd6cce5Sdanielk1977     pMem->enc = desiredEnc;
227bfd6cce5Sdanielk1977     goto translate_out;
228d02eb1fdSdanielk1977   }
229d02eb1fdSdanielk1977 
230d7e69648Sdanielk1977   /* Set len to the maximum number of bytes required in the output buffer. */
231d7e69648Sdanielk1977   if( desiredEnc==SQLITE_UTF8 ){
232d7e69648Sdanielk1977     /* When converting from UTF-16, the maximum growth results from
233a49b8611Sdrh     ** translating a 2-byte character to a 4-byte UTF-8 character.
234a49b8611Sdrh     ** A single byte is required for the output string
235d7e69648Sdanielk1977     ** nul-terminator.
236d7e69648Sdanielk1977     */
237bbf695d6Sdrh     pMem->n &= ~1;
238d4de9f7bSdrh     len = 2 * (sqlite3_int64)pMem->n + 1;
239d7e69648Sdanielk1977   }else{
240d7e69648Sdanielk1977     /* When converting from UTF-8 to UTF-16 the maximum growth is caused
241d7e69648Sdanielk1977     ** when a 1-byte UTF-8 character is translated into a 2-byte UTF-16
242d7e69648Sdanielk1977     ** character. Two bytes are required in the output buffer for the
243d7e69648Sdanielk1977     ** nul-terminator.
244d7e69648Sdanielk1977     */
245d4de9f7bSdrh     len = 2 * (sqlite3_int64)pMem->n + 2;
246d7e69648Sdanielk1977   }
247d7e69648Sdanielk1977 
248bfd6cce5Sdanielk1977   /* Set zIn to point at the start of the input buffer and zTerm to point 1
249bfd6cce5Sdanielk1977   ** byte past the end.
250bfd6cce5Sdanielk1977   **
251a7a8e14bSdanielk1977   ** Variable zOut is set to point at the output buffer, space obtained
252a7a8e14bSdanielk1977   ** from sqlite3_malloc().
253d02eb1fdSdanielk1977   */
2542646da7eSdrh   zIn = (u8*)pMem->z;
255bfd6cce5Sdanielk1977   zTerm = &zIn[pMem->n];
256b21c8cd4Sdrh   zOut = sqlite3DbMallocRaw(pMem->db, len);
257b21c8cd4Sdrh   if( !zOut ){
258fad3039cSmistachkin     return SQLITE_NOMEM_BKPT;
259b21c8cd4Sdrh   }
260bfd6cce5Sdanielk1977   z = zOut;
261bfd6cce5Sdanielk1977 
262bfd6cce5Sdanielk1977   if( pMem->enc==SQLITE_UTF8 ){
263bfd6cce5Sdanielk1977     if( desiredEnc==SQLITE_UTF16LE ){
264bfd6cce5Sdanielk1977       /* UTF-8 -> UTF-16 Little-endian */
265bfd6cce5Sdanielk1977       while( zIn<zTerm ){
266ad76a81eSdanielk1977         READ_UTF8(zIn, zTerm, c);
267bfd6cce5Sdanielk1977         WRITE_UTF16LE(z, c);
268bfd6cce5Sdanielk1977       }
269b8dd3155Sdrh     }else{
270b8dd3155Sdrh       assert( desiredEnc==SQLITE_UTF16BE );
271bfd6cce5Sdanielk1977       /* UTF-8 -> UTF-16 Big-endian */
272bfd6cce5Sdanielk1977       while( zIn<zTerm ){
273ad76a81eSdanielk1977         READ_UTF8(zIn, zTerm, c);
274bfd6cce5Sdanielk1977         WRITE_UTF16BE(z, c);
275bfd6cce5Sdanielk1977       }
276bfd6cce5Sdanielk1977     }
277ea678832Sdrh     pMem->n = (int)(z - zOut);
278b8dd3155Sdrh     *z++ = 0;
279bfd6cce5Sdanielk1977   }else{
280bfd6cce5Sdanielk1977     assert( desiredEnc==SQLITE_UTF8 );
281bfd6cce5Sdanielk1977     if( pMem->enc==SQLITE_UTF16LE ){
282bfd6cce5Sdanielk1977       /* UTF-16 Little-endian -> UTF-8 */
283bfd6cce5Sdanielk1977       while( zIn<zTerm ){
2840184a256Sdrh         c = *(zIn++);
2850184a256Sdrh         c += (*(zIn++))<<8;
2860184a256Sdrh         if( c>=0xd800 && c<0xe000 ){
2874f1315a4Sdrh #ifdef SQLITE_REPLACE_INVALID_UTF
2880184a256Sdrh           if( c>=0xdc00 || zIn>=zTerm ){
2890184a256Sdrh             c = 0xfffd;
2900184a256Sdrh           }else{
2910184a256Sdrh             int c2 = *(zIn++);
2920184a256Sdrh             c2 += (*(zIn++))<<8;
2930184a256Sdrh             if( c2<0xdc00 || c2>=0xe000 ){
2940184a256Sdrh               zIn -= 2;
2950184a256Sdrh               c = 0xfffd;
2960184a256Sdrh             }else{
2970184a256Sdrh               c = ((c&0x3ff)<<10) + (c2&0x3ff) + 0x10000;
2980184a256Sdrh             }
2990184a256Sdrh           }
3004f1315a4Sdrh #else
3014f1315a4Sdrh           if( zIn<zTerm ){
3024f1315a4Sdrh             int c2 = (*zIn++);
3034f1315a4Sdrh             c2 += ((*zIn++)<<8);
3044f1315a4Sdrh             c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10);
3054f1315a4Sdrh           }
3064f1315a4Sdrh #endif
3070184a256Sdrh         }
308bfd6cce5Sdanielk1977         WRITE_UTF8(z, c);
309bfd6cce5Sdanielk1977       }
310bfd6cce5Sdanielk1977     }else{
3117ffb2b5fSmihailim       /* UTF-16 Big-endian -> UTF-8 */
312bfd6cce5Sdanielk1977       while( zIn<zTerm ){
3130184a256Sdrh         c = (*(zIn++))<<8;
3140184a256Sdrh         c += *(zIn++);
3150184a256Sdrh         if( c>=0xd800 && c<0xe000 ){
3164f1315a4Sdrh #ifdef SQLITE_REPLACE_INVALID_UTF
3170184a256Sdrh           if( c>=0xdc00 || zIn>=zTerm ){
3180184a256Sdrh             c = 0xfffd;
3190184a256Sdrh           }else{
3200184a256Sdrh             int c2 = (*(zIn++))<<8;
3210184a256Sdrh             c2 += *(zIn++);
3220184a256Sdrh             if( c2<0xdc00 || c2>=0xe000 ){
3230184a256Sdrh               zIn -= 2;
3240184a256Sdrh               c = 0xfffd;
3250184a256Sdrh             }else{
3260184a256Sdrh               c = ((c&0x3ff)<<10) + (c2&0x3ff) + 0x10000;
3270184a256Sdrh             }
3280184a256Sdrh           }
3294f1315a4Sdrh #else
3304f1315a4Sdrh           if( zIn<zTerm ){
3314f1315a4Sdrh             int c2 = ((*zIn++)<<8);
3324f1315a4Sdrh             c2 += (*zIn++);
3334f1315a4Sdrh             c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10);
3344f1315a4Sdrh           }
3354f1315a4Sdrh #endif
3360184a256Sdrh         }
337bfd6cce5Sdanielk1977         WRITE_UTF8(z, c);
338bfd6cce5Sdanielk1977       }
339bfd6cce5Sdanielk1977     }
340aa78bec9Sdrh     pMem->n = (int)(z - zOut);
341d02eb1fdSdanielk1977   }
342b8dd3155Sdrh   *z = 0;
343d7e69648Sdanielk1977   assert( (pMem->n+(desiredEnc==SQLITE_UTF8?1:2))<=len );
344d02eb1fdSdanielk1977 
345*21b473deSdrh   c = MEM_Str|MEM_Term|(pMem->flags&(MEM_AffMask|MEM_Subtype));
346bfd6cce5Sdanielk1977   sqlite3VdbeMemRelease(pMem);
347*21b473deSdrh   pMem->flags = c;
348bfd6cce5Sdanielk1977   pMem->enc = desiredEnc;
3492646da7eSdrh   pMem->z = (char*)zOut;
3505f096135Sdanielk1977   pMem->zMalloc = pMem->z;
35117bcb102Sdrh   pMem->szMalloc = sqlite3DbMallocSize(pMem->db, pMem->z);
352d02eb1fdSdanielk1977 
353bfd6cce5Sdanielk1977 translate_out:
354b5402fbfSdanielk1977 #if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG)
355bfd6cce5Sdanielk1977   {
3565ca06329Sdrh     StrAccum acc;
3575ca06329Sdrh     char zBuf[1000];
3585ca06329Sdrh     sqlite3StrAccumInit(&acc, 0, zBuf, sizeof(zBuf), 0);
3595ca06329Sdrh     sqlite3VdbeMemPrettyPrint(pMem, &acc);
3605ca06329Sdrh     fprintf(stderr, "OUTPUT: %s\n", sqlite3StrAccumFinish(&acc));
361d02eb1fdSdanielk1977   }
362bfd6cce5Sdanielk1977 #endif
363bfd6cce5Sdanielk1977   return SQLITE_OK;
364d02eb1fdSdanielk1977 }
365f0f44b79Sdrh #endif /* SQLITE_OMIT_UTF16 */
366d02eb1fdSdanielk1977 
367f0f44b79Sdrh #ifndef SQLITE_OMIT_UTF16
368d02eb1fdSdanielk1977 /*
369bfd6cce5Sdanielk1977 ** This routine checks for a byte-order mark at the beginning of the
370bfd6cce5Sdanielk1977 ** UTF-16 string stored in *pMem. If one is present, it is removed and
371bfd6cce5Sdanielk1977 ** the encoding of the Mem adjusted. This routine does not do any
372bfd6cce5Sdanielk1977 ** byte-swapping, it just sets Mem.enc appropriately.
373998b56c3Sdanielk1977 **
374bfd6cce5Sdanielk1977 ** The allocation (static, dynamic etc.) and encoding of the Mem may be
375bfd6cce5Sdanielk1977 ** changed by this function.
376998b56c3Sdanielk1977 */
sqlite3VdbeMemHandleBom(Mem * pMem)377b21c8cd4Sdrh int sqlite3VdbeMemHandleBom(Mem *pMem){
378bfd6cce5Sdanielk1977   int rc = SQLITE_OK;
379bfd6cce5Sdanielk1977   u8 bom = 0;
380998b56c3Sdanielk1977 
381769e97e0Sdrh   assert( pMem->n>=0 );
382769e97e0Sdrh   if( pMem->n>1 ){
383bfd6cce5Sdanielk1977     u8 b1 = *(u8 *)pMem->z;
384bfd6cce5Sdanielk1977     u8 b2 = *(((u8 *)pMem->z) + 1);
38593d4675dSdanielk1977     if( b1==0xFE && b2==0xFF ){
386bfd6cce5Sdanielk1977       bom = SQLITE_UTF16BE;
38793d4675dSdanielk1977     }
38893d4675dSdanielk1977     if( b1==0xFF && b2==0xFE ){
389bfd6cce5Sdanielk1977       bom = SQLITE_UTF16LE;
39093d4675dSdanielk1977     }
39193d4675dSdanielk1977   }
392bfd6cce5Sdanielk1977 
393bfd6cce5Sdanielk1977   if( bom ){
394a7a8e14bSdanielk1977     rc = sqlite3VdbeMemMakeWriteable(pMem);
395a7a8e14bSdanielk1977     if( rc==SQLITE_OK ){
396a7a8e14bSdanielk1977       pMem->n -= 2;
397a7a8e14bSdanielk1977       memmove(pMem->z, &pMem->z[2], pMem->n);
398a7a8e14bSdanielk1977       pMem->z[pMem->n] = '\0';
399a7a8e14bSdanielk1977       pMem->z[pMem->n+1] = '\0';
400a7a8e14bSdanielk1977       pMem->flags |= MEM_Term;
401a7a8e14bSdanielk1977       pMem->enc = bom;
402998b56c3Sdanielk1977     }
403bfd6cce5Sdanielk1977   }
404bfd6cce5Sdanielk1977   return rc;
405998b56c3Sdanielk1977 }
4066c62608fSdrh #endif /* SQLITE_OMIT_UTF16 */
407998b56c3Sdanielk1977 
/*
** zIn is a UTF-8 encoded unicode string. If nByte is less than zero,
** return the number of unicode characters in zIn up to (but not including)
** the first 0x00 byte. If nByte is not less than zero, return the
** number of unicode characters in the first nByte of zIn (or up to
** the first 0x00, whichever comes first).
**
** A character is one byte, plus any continuation bytes (10xxxxxx) that
** follow a lead byte of 0xc0 or greater.  A character that starts
** inside the first nByte bytes is counted even if its continuation
** bytes extend past that limit.
**
** When nByte is not negative, no byte at or beyond zIn[nByte] is ever
** read, so the input does not need to be zero-terminated in that case.
*/
int sqlite3Utf8CharLen(const char *zIn, int nByte){
  int r = 0;
  const unsigned char *z = (const unsigned char*)zIn;

  if( nByte<0 ){
    /* Unbounded scan: the 0x00 terminator is the only stop condition */
    while( *z!=0 ){
      if( *(z++)>=0xc0 ){
        while( (*z & 0xc0)==0x80 ){ z++; }
      }
      r++;
    }
  }else{
    const unsigned char *zTerm = &z[nByte];
    /* The bound is tested before each dereference so that zTerm itself
    ** is never read. */
    while( z<zTerm && *z!=0 ){
      if( *(z++)>=0xc0 ){
        while( z<zTerm && (*z & 0xc0)==0x80 ){ z++; }
      }
      r++;
    }
  }
  return r;
}

/* This test function is not currently used by the automated test-suite.
** Hence it is only available in debug builds.
*/
#if defined(SQLITE_TEST) && defined(SQLITE_DEBUG)
/*
** Translate UTF-8 to UTF-8.
**
** This has the effect of making sure that the string is well-formed
** UTF-8.  Miscoded characters are removed: any character that
** sqlite3Utf8Read() reports as 0xfffd is dropped from the output.
**
** The translation is done in-place on the zero-terminated string zIn.
** Some inputs re-encode to more bytes than they occupied (for example a
** stray byte in the range 0x80..0xbf is re-encoded as two bytes).  The
** translation stops before any write that would pass the read position,
** so unread input is never overwritten and the terminator written at
** the end always lies within the original string.
**
** Returns the length in bytes of the resulting string, not counting
** the zero terminator.
*/
int sqlite3Utf8To8(unsigned char *zIn){
  unsigned char *zOut = zIn;
  unsigned char *zStart = zIn;
  u32 c;
  int nOut;                   /* Bytes WRITE_UTF8() will emit for c */

  while( zIn[0] ){
    c = sqlite3Utf8Read((const u8**)&zIn);
    if( c==0xfffd ) continue;
    nOut = c<0x00080 ? 1 : c<0x00800 ? 2 : c<0x10000 ? 3 : 4;
    if( zOut+nOut>zIn ) break;    /* Output would overrun the input */
    WRITE_UTF8(zOut, c);
  }
  *zOut = 0;
  return (int)(zOut - zStart);
}
#endif

4616c62608fSdrh #ifndef SQLITE_OMIT_UTF16
4626622cce3Sdanielk1977 /*
463af9a7c22Sdrh ** Convert a UTF-16 string in the native encoding into a UTF-8 string.
46417435752Sdrh ** Memory to hold the UTF-8 string is obtained from sqlite3_malloc and must
46517435752Sdrh ** be freed by the calling function.
466af9a7c22Sdrh **
467af9a7c22Sdrh ** NULL is returned if there is an allocation error.
468af9a7c22Sdrh */
sqlite3Utf16to8(sqlite3 * db,const void * z,int nByte,u8 enc)469b7dca7d7Sdan char *sqlite3Utf16to8(sqlite3 *db, const void *z, int nByte, u8 enc){
470af9a7c22Sdrh   Mem m;
471af9a7c22Sdrh   memset(&m, 0, sizeof(m));
472b21c8cd4Sdrh   m.db = db;
473b7dca7d7Sdan   sqlite3VdbeMemSetStr(&m, z, nByte, enc, SQLITE_STATIC);
474b21c8cd4Sdrh   sqlite3VdbeChangeEncoding(&m, SQLITE_UTF8);
475ae72d982Sdanielk1977   if( db->mallocFailed ){
476ae72d982Sdanielk1977     sqlite3VdbeMemRelease(&m);
477ae72d982Sdanielk1977     m.z = 0;
478ae72d982Sdanielk1977   }
47917435752Sdrh   assert( (m.flags & MEM_Term)!=0 || db->mallocFailed );
48017435752Sdrh   assert( (m.flags & MEM_Str)!=0 || db->mallocFailed );
481b7dca7d7Sdan   assert( m.z || db->mallocFailed );
482b7dca7d7Sdan   return m.z;
483af9a7c22Sdrh }
484af9a7c22Sdrh 
485af9a7c22Sdrh /*
4861faca757Sdrh ** zIn is a UTF-16 encoded unicode string at least nChar characters long.
487aed382f9Sdrh ** Return the number of bytes in the first nChar unicode characters
488aed382f9Sdrh ** in pZ.  nChar must be non-negative.
4896622cce3Sdanielk1977 */
sqlite3Utf16ByteLen(const void * zIn,int nChar)490ee85813cSdrh int sqlite3Utf16ByteLen(const void *zIn, int nChar){
491aed382f9Sdrh   int c;
492aed382f9Sdrh   unsigned char const *z = zIn;
493bfd6cce5Sdanielk1977   int n = 0;
4946d116cacSdrh 
4950184a256Sdrh   if( SQLITE_UTF16NATIVE==SQLITE_UTF16LE ) z++;
496aed382f9Sdrh   while( n<nChar ){
4970184a256Sdrh     c = z[0];
4980184a256Sdrh     z += 2;
4990184a256Sdrh     if( c>=0xd8 && c<0xdc && z[0]>=0xdc && z[0]<0xe0 ) z += 2;
500bfd6cce5Sdanielk1977     n++;
501998b56c3Sdanielk1977   }
5020184a256Sdrh   return (int)(z-(unsigned char const *)zIn)
5030184a256Sdrh               - (SQLITE_UTF16NATIVE==SQLITE_UTF16LE);
504998b56c3Sdanielk1977 }
505998b56c3Sdanielk1977 
#if defined(SQLITE_TEST)
/*
** This routine is called from the TCL test function "translate_selftest".
** It checks that the primitives for serializing and deserializing
** UTF-8 characters, WRITE_UTF8() and sqlite3Utf8Read(), are inverses of
** each other for every value from 0 through 0x10ffff.
**
** Values in 0xd800..0xdfff and the values 0xfffe and 0xffff are expected
** to read back as 0xfffd.  All checks are done with assert(), so this
** routine has no effect when assert() is disabled.
*/
void sqlite3UtfSelfTest(void){
  unsigned int i, t;
  unsigned char zBuf[20];
  unsigned char *z;
  int n;
  unsigned int c;

  for(i=0; i<0x00110000; i++){
    /* Encode i, then terminate the buffer for sqlite3Utf8Read() */
    z = zBuf;
    WRITE_UTF8(z, i);
    n = (int)(z-zBuf);
    assert( n>0 && n<=4 );
    z[0] = 0;

    /* Decode and compare against the expected value t */
    z = zBuf;
    c = sqlite3Utf8Read((const u8**)&z);
    t = i;
    if( i>=0xD800 && i<=0xDFFF ) t = 0xFFFD;
    if( (i&0xFFFFFFFE)==0xFFFE ) t = 0xFFFD;
    assert( c==t );
    assert( (z-zBuf)==n );
  }
}
#endif /* SQLITE_TEST */
5356c62608fSdrh #endif /* SQLITE_OMIT_UTF16 */
536