xref: /sqlite-3.40.0/src/utf.c (revision 5d00d0a8)
1 /*
2 ** 2004 April 13
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 *************************************************************************
12 ** This file contains routines used to translate between UTF-8,
13 ** UTF-16, UTF-16BE, and UTF-16LE.
14 **
15 ** $Id: utf.c,v 1.73 2009/04/01 18:40:32 drh Exp $
16 **
17 ** Notes on UTF-8:
18 **
19 **   Byte-0    Byte-1    Byte-2    Byte-3    Value
20 **  0xxxxxxx                                 00000000 00000000 0xxxxxxx
21 **  110yyyyy  10xxxxxx                       00000000 00000yyy yyxxxxxx
22 **  1110zzzz  10yyyyyy  10xxxxxx             00000000 zzzzyyyy yyxxxxxx
23 **  11110uuu  10uuzzzz  10yyyyyy  10xxxxxx   000uuuuu zzzzyyyy yyxxxxxx
24 **
25 **
26 ** Notes on UTF-16:  (with wwww+1==uuuuu)
27 **
28 **      Word-0               Word-1          Value
29 **  110110ww wwzzzzyy   110111yy yyxxxxxx    000uuuuu zzzzyyyy yyxxxxxx
30 **  zzzzyyyy yyxxxxxx                        00000000 zzzzyyyy yyxxxxxx
31 **
32 **
33 ** BOM or Byte Order Mark:
34 **     0xff 0xfe   little-endian utf-16 follows
35 **     0xfe 0xff   big-endian utf-16 follows
36 **
37 */
38 #include "sqliteInt.h"
39 #include <assert.h>
40 #include "vdbeInt.h"
41 
#ifndef SQLITE_AMALGAMATION
/*
** A constant integer holding the value 1.  The SQLITE_BIGENDIAN and
** SQLITE_LITTLEENDIAN macros make use of it.  The definition is skipped
** when SQLITE_AMALGAMATION is defined.
*/
const int sqlite3one = 1;
#endif /* SQLITE_AMALGAMATION */
49 
/*
** Lookup table that helps decode the first byte of a multi-byte UTF-8
** character.  The decoders in this file index it with (lead byte - 0xC0)
** and use the entry as the initial value of the character being built,
** i.e. the payload bits carried by the lead byte itself.
**
** Each row below covers sixteen consecutive lead bytes.
*/
static const unsigned char sqlite3Utf8Trans1[] = {
  /* 0xC0..0xCF */
  0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
  /* 0xD0..0xDF */
  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
  /* 0xE0..0xEF */
  0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
  /* 0xF0..0xFF */
  0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  0,  1,  0,  0,
};
64 
65 
/*
** Append the UTF-8 encoding of the value c at zOut and advance zOut past
** the bytes written (1 to 4 of them, selected by the magnitude of c).
**
** zOut must be a modifiable pointer to unsigned char.  Both arguments are
** expanded more than once, so neither may have side effects.  Every use
** of an argument is parenthesized so that an expression such as "a+b" is
** encoded correctly, and the body is wrapped in do{...}while(0) so the
** macro behaves as a single statement (for example in an if/else).
*/
#define WRITE_UTF8(zOut, c) do {                         \
  if( (c)<0x00080 ){                                     \
    *(zOut)++ = (u8)((c)&0xFF);                          \
  }                                                      \
  else if( (c)<0x00800 ){                                \
    *(zOut)++ = 0xC0 + (u8)(((c)>>6)&0x1F);              \
    *(zOut)++ = 0x80 + (u8)((c) & 0x3F);                 \
  }                                                      \
  else if( (c)<0x10000 ){                                \
    *(zOut)++ = 0xE0 + (u8)(((c)>>12)&0x0F);             \
    *(zOut)++ = 0x80 + (u8)(((c)>>6) & 0x3F);            \
    *(zOut)++ = 0x80 + (u8)((c) & 0x3F);                 \
  }else{                                                 \
    *(zOut)++ = 0xF0 + (u8)(((c)>>18) & 0x07);           \
    *(zOut)++ = 0x80 + (u8)(((c)>>12) & 0x3F);           \
    *(zOut)++ = 0x80 + (u8)(((c)>>6) & 0x3F);            \
    *(zOut)++ = 0x80 + (u8)((c) & 0x3F);                 \
  }                                                      \
} while(0)
85 
/*
** Append the little-endian UTF-16 encoding of the value c at zOut and
** advance zOut.  Values up to 0xFFFF are written as one 16-bit unit (two
** bytes); larger values are written as a surrogate pair (four bytes).
**
** zOut must be a modifiable pointer to unsigned char.  Both arguments are
** expanded more than once, so neither may have side effects.  Arguments
** are parenthesized and the body is wrapped in do{...}while(0) so that
** the macro is safe with expression arguments and in if/else statements.
*/
#define WRITE_UTF16LE(zOut, c) do {                                       \
  if( (c)<=0xFFFF ){                                                      \
    *(zOut)++ = (u8)((c)&0x00FF);                                         \
    *(zOut)++ = (u8)(((c)>>8)&0x00FF);                                    \
  }else{                                                                  \
    *(zOut)++ = (u8)((((c)>>10)&0x003F) + ((((c)-0x10000)>>10)&0x00C0));  \
    *(zOut)++ = (u8)(0x00D8 + ((((c)-0x10000)>>18)&0x03));                \
    *(zOut)++ = (u8)((c)&0x00FF);                                         \
    *(zOut)++ = (u8)(0x00DC + (((c)>>8)&0x03));                           \
  }                                                                       \
} while(0)
97 
/*
** Append the big-endian UTF-16 encoding of the value c at zOut and
** advance zOut.  Values up to 0xFFFF are written as one 16-bit unit (two
** bytes); larger values are written as a surrogate pair (four bytes).
**
** zOut must be a modifiable pointer to unsigned char.  Both arguments are
** expanded more than once, so neither may have side effects.  Arguments
** are parenthesized and the body is wrapped in do{...}while(0) so that
** the macro is safe with expression arguments and in if/else statements.
*/
#define WRITE_UTF16BE(zOut, c) do {                                       \
  if( (c)<=0xFFFF ){                                                      \
    *(zOut)++ = (u8)(((c)>>8)&0x00FF);                                    \
    *(zOut)++ = (u8)((c)&0x00FF);                                         \
  }else{                                                                  \
    *(zOut)++ = (u8)(0x00D8 + ((((c)-0x10000)>>18)&0x03));                \
    *(zOut)++ = (u8)((((c)>>10)&0x003F) + ((((c)-0x10000)>>10)&0x00C0));  \
    *(zOut)++ = (u8)(0x00DC + (((c)>>8)&0x03));                           \
    *(zOut)++ = (u8)((c)&0x00FF);                                         \
  }                                                                       \
} while(0)
109 
/*
** Read one character of little-endian UTF-16 starting at zIn, store its
** value in c and advance zIn past the bytes consumed.
**
** If the first 16-bit unit lies in the range 0xD800..0xDFFF it is treated
** as the first half of a surrogate pair and a second 16-bit unit is read
** unconditionally.  The macro has no end-of-input argument, so the caller
** must guarantee that those two extra bytes are readable.
**
** zIn must be a modifiable byte pointer and c a modifiable integer; both
** are expanded more than once and must not have side effects.  Arguments
** are parenthesized and the body is wrapped in do{...}while(0) so that
** the macro acts as a single statement.
*/
#define READ_UTF16LE(zIn, c) do {                                           \
  (c) = (*(zIn)++);                                                         \
  (c) += ((*(zIn)++)<<8);                                                   \
  if( (c)>=0xD800 && (c)<0xE000 ){                                          \
    int c2 = (*(zIn)++);                                                    \
    c2 += ((*(zIn)++)<<8);                                                  \
    (c) = (c2&0x03FF) + (((c)&0x003F)<<10) + ((((c)&0x03C0)+0x0040)<<10);   \
  }                                                                         \
} while(0)
119 
/*
** Read one character of big-endian UTF-16 starting at zIn, store its
** value in c and advance zIn past the bytes consumed.
**
** If the first 16-bit unit lies in the range 0xD800..0xDFFF it is treated
** as the first half of a surrogate pair and a second 16-bit unit is read
** unconditionally.  The macro has no end-of-input argument, so the caller
** must guarantee that those two extra bytes are readable.
**
** zIn must be a modifiable byte pointer and c a modifiable integer; both
** are expanded more than once and must not have side effects.  Arguments
** are parenthesized and the body is wrapped in do{...}while(0) so that
** the macro acts as a single statement.
*/
#define READ_UTF16BE(zIn, c) do {                                           \
  (c) = ((*(zIn)++)<<8);                                                    \
  (c) += (*(zIn)++);                                                        \
  if( (c)>=0xD800 && (c)<0xE000 ){                                          \
    int c2 = ((*(zIn)++)<<8);                                               \
    c2 += (*(zIn)++);                                                       \
    (c) = (c2&0x03FF) + (((c)&0x003F)<<10) + ((((c)&0x03C0)+0x0040)<<10);   \
  }                                                                         \
} while(0)
129 
/*
** Translate a single UTF-8 character.  The unicode value is stored in c
** and zIn is advanced to the next unread byte.
**
** The first byte is always read, so the caller must ensure zIn!=zTerm on
** entry.  Continuation bytes are consumed only while zIn!=zTerm; in other
** words, translation proceeds as if the byte that zTerm points to were
** a 0x00.
**
** zIn must be a modifiable byte pointer and c a modifiable integer.  All
** arguments are expanded more than once and must not have side effects.
** The body is wrapped in do{...}while(0) so that the macro acts as one
** statement even when it is the unbraced body of an "if" or a loop.
**
** Notes On Invalid UTF-8 (these apply equally to sqlite3Utf8Read()):
**
**  *  This routine never allows a 7-bit character (0x00 through 0x7f) to
**     be encoded as a multi-byte character.  Any multi-byte character that
**     attempts to encode a value between 0x00 and 0x7f is rendered as 0xfffd.
**
**  *  This routine never allows a UTF16 surrogate value to be encoded.
**     If a multi-byte character attempts to encode a value between
**     0xd800 and 0xdfff then it is rendered as 0xfffd.
**
**  *  A multi-byte character that encodes 0xfffe or 0xffff is also
**     rendered as 0xfffd.
**
**  *  Bytes in the range of 0x80 through 0xbf which occur as the first
**     byte of a character are interpreted as single-byte characters
**     and rendered as themselves even though they are technically
**     invalid characters.
**
**  *  This routine accepts an infinite number of different UTF8 encodings
**     for unicode values 0x80 and greater.  It does not change over-length
**     encodings to 0xfffd as some systems recommend.
*/
#define READ_UTF8(zIn, zTerm, c) do {                      \
  (c) = *((zIn)++);                                        \
  if( (c)>=0xc0 ){                                         \
    (c) = sqlite3Utf8Trans1[(c)-0xc0];                     \
    while( (zIn)!=(zTerm) && (*(zIn) & 0xc0)==0x80 ){      \
      (c) = ((c)<<6) + (0x3f & *((zIn)++));                \
    }                                                      \
    if( (c)<0x80                                           \
        || ((c)&0xFFFFF800)==0xD800                        \
        || ((c)&0xFFFFFFFE)==0xFFFE ){  (c) = 0xFFFD; }    \
  }                                                        \
} while(0)
168 int sqlite3Utf8Read(
169   const unsigned char *zIn,       /* First byte of UTF-8 character */
170   const unsigned char **pzNext    /* Write first byte past UTF-8 char here */
171 ){
172   int c;
173 
174   /* Same as READ_UTF8() above but without the zTerm parameter.
175   ** For this routine, we assume the UTF8 string is always zero-terminated.
176   */
177   c = *(zIn++);
178   if( c>=0xc0 ){
179     c = sqlite3Utf8Trans1[c-0xc0];
180     while( (*zIn & 0xc0)==0x80 ){
181       c = (c<<6) + (0x3f & *(zIn++));
182     }
183     if( c<0x80
184         || (c&0xFFFFF800)==0xD800
185         || (c&0xFFFFFFFE)==0xFFFE ){  c = 0xFFFD; }
186   }
187   *pzNext = zIn;
188   return c;
189 }
190 
191 
192 
193 
194 /*
195 ** If the TRANSLATE_TRACE macro is defined, the value of each Mem is
196 ** printed on stderr on the way into and out of sqlite3VdbeMemTranslate().
197 */
198 /* #define TRANSLATE_TRACE 1 */
199 
200 #ifndef SQLITE_OMIT_UTF16
201 /*
202 ** This routine transforms the internal text encoding used by pMem to
203 ** desiredEnc. It is an error if the string is already of the desired
204 ** encoding, or if *pMem does not contain a string value.
205 */
206 int sqlite3VdbeMemTranslate(Mem *pMem, u8 desiredEnc){
207   int len;                    /* Maximum length of output string in bytes */
208   unsigned char *zOut;                  /* Output buffer */
209   unsigned char *zIn;                   /* Input iterator */
210   unsigned char *zTerm;                 /* End of input */
211   unsigned char *z;                     /* Output iterator */
212   unsigned int c;
213 
214   assert( pMem->db==0 || sqlite3_mutex_held(pMem->db->mutex) );
215   assert( pMem->flags&MEM_Str );
216   assert( pMem->enc!=desiredEnc );
217   assert( pMem->enc!=0 );
218   assert( pMem->n>=0 );
219 
220 #if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG)
221   {
222     char zBuf[100];
223     sqlite3VdbeMemPrettyPrint(pMem, zBuf);
224     fprintf(stderr, "INPUT:  %s\n", zBuf);
225   }
226 #endif
227 
228   /* If the translation is between UTF-16 little and big endian, then
229   ** all that is required is to swap the byte order. This case is handled
230   ** differently from the others.
231   */
232   if( pMem->enc!=SQLITE_UTF8 && desiredEnc!=SQLITE_UTF8 ){
233     u8 temp;
234     int rc;
235     rc = sqlite3VdbeMemMakeWriteable(pMem);
236     if( rc!=SQLITE_OK ){
237       assert( rc==SQLITE_NOMEM );
238       return SQLITE_NOMEM;
239     }
240     zIn = (u8*)pMem->z;
241     zTerm = &zIn[pMem->n&~1];
242     while( zIn<zTerm ){
243       temp = *zIn;
244       *zIn = *(zIn+1);
245       zIn++;
246       *zIn++ = temp;
247     }
248     pMem->enc = desiredEnc;
249     goto translate_out;
250   }
251 
252   /* Set len to the maximum number of bytes required in the output buffer. */
253   if( desiredEnc==SQLITE_UTF8 ){
254     /* When converting from UTF-16, the maximum growth results from
255     ** translating a 2-byte character to a 4-byte UTF-8 character.
256     ** A single byte is required for the output string
257     ** nul-terminator.
258     */
259     pMem->n &= ~1;
260     len = pMem->n * 2 + 1;
261   }else{
262     /* When converting from UTF-8 to UTF-16 the maximum growth is caused
263     ** when a 1-byte UTF-8 character is translated into a 2-byte UTF-16
264     ** character. Two bytes are required in the output buffer for the
265     ** nul-terminator.
266     */
267     len = pMem->n * 2 + 2;
268   }
269 
270   /* Set zIn to point at the start of the input buffer and zTerm to point 1
271   ** byte past the end.
272   **
273   ** Variable zOut is set to point at the output buffer, space obtained
274   ** from sqlite3_malloc().
275   */
276   zIn = (u8*)pMem->z;
277   zTerm = &zIn[pMem->n];
278   zOut = sqlite3DbMallocRaw(pMem->db, len);
279   if( !zOut ){
280     return SQLITE_NOMEM;
281   }
282   z = zOut;
283 
284   if( pMem->enc==SQLITE_UTF8 ){
285     if( desiredEnc==SQLITE_UTF16LE ){
286       /* UTF-8 -> UTF-16 Little-endian */
287       while( zIn<zTerm ){
288         /* c = sqlite3Utf8Read(zIn, zTerm, (const u8**)&zIn); */
289         READ_UTF8(zIn, zTerm, c);
290         WRITE_UTF16LE(z, c);
291       }
292     }else{
293       assert( desiredEnc==SQLITE_UTF16BE );
294       /* UTF-8 -> UTF-16 Big-endian */
295       while( zIn<zTerm ){
296         /* c = sqlite3Utf8Read(zIn, zTerm, (const u8**)&zIn); */
297         READ_UTF8(zIn, zTerm, c);
298         WRITE_UTF16BE(z, c);
299       }
300     }
301     pMem->n = (int)(z - zOut);
302     *z++ = 0;
303   }else{
304     assert( desiredEnc==SQLITE_UTF8 );
305     if( pMem->enc==SQLITE_UTF16LE ){
306       /* UTF-16 Little-endian -> UTF-8 */
307       while( zIn<zTerm ){
308         READ_UTF16LE(zIn, c);
309         WRITE_UTF8(z, c);
310       }
311     }else{
312       /* UTF-16 Big-endian -> UTF-8 */
313       while( zIn<zTerm ){
314         READ_UTF16BE(zIn, c);
315         WRITE_UTF8(z, c);
316       }
317     }
318     pMem->n = (int)(z - zOut);
319   }
320   *z = 0;
321   assert( (pMem->n+(desiredEnc==SQLITE_UTF8?1:2))<=len );
322 
323   sqlite3VdbeMemRelease(pMem);
324   pMem->flags &= ~(MEM_Static|MEM_Dyn|MEM_Ephem);
325   pMem->enc = desiredEnc;
326   pMem->flags |= (MEM_Term|MEM_Dyn);
327   pMem->z = (char*)zOut;
328   pMem->zMalloc = pMem->z;
329 
330 translate_out:
331 #if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG)
332   {
333     char zBuf[100];
334     sqlite3VdbeMemPrettyPrint(pMem, zBuf);
335     fprintf(stderr, "OUTPUT: %s\n", zBuf);
336   }
337 #endif
338   return SQLITE_OK;
339 }
340 
341 /*
342 ** This routine checks for a byte-order mark at the beginning of the
343 ** UTF-16 string stored in *pMem. If one is present, it is removed and
344 ** the encoding of the Mem adjusted. This routine does not do any
345 ** byte-swapping, it just sets Mem.enc appropriately.
346 **
347 ** The allocation (static, dynamic etc.) and encoding of the Mem may be
348 ** changed by this function.
349 */
350 int sqlite3VdbeMemHandleBom(Mem *pMem){
351   int rc = SQLITE_OK;
352   u8 bom = 0;
353 
354   assert( pMem->n>=0 );
355   if( pMem->n>1 ){
356     u8 b1 = *(u8 *)pMem->z;
357     u8 b2 = *(((u8 *)pMem->z) + 1);
358     if( b1==0xFE && b2==0xFF ){
359       bom = SQLITE_UTF16BE;
360     }
361     if( b1==0xFF && b2==0xFE ){
362       bom = SQLITE_UTF16LE;
363     }
364   }
365 
366   if( bom ){
367     rc = sqlite3VdbeMemMakeWriteable(pMem);
368     if( rc==SQLITE_OK ){
369       pMem->n -= 2;
370       memmove(pMem->z, &pMem->z[2], pMem->n);
371       pMem->z[pMem->n] = '\0';
372       pMem->z[pMem->n+1] = '\0';
373       pMem->flags |= MEM_Term;
374       pMem->enc = bom;
375     }
376   }
377   return rc;
378 }
379 #endif /* SQLITE_OMIT_UTF16 */
380 
381 /*
382 ** pZ is a UTF-8 encoded unicode string. If nByte is less than zero,
383 ** return the number of unicode characters in pZ up to (but not including)
384 ** the first 0x00 byte. If nByte is not less than zero, return the
385 ** number of unicode characters in the first nByte of pZ (or up to
386 ** the first 0x00, whichever comes first).
387 */
388 int sqlite3Utf8CharLen(const char *zIn, int nByte){
389   int r = 0;
390   const u8 *z = (const u8*)zIn;
391   const u8 *zTerm;
392   if( nByte>=0 ){
393     zTerm = &z[nByte];
394   }else{
395     zTerm = (const u8*)(-1);
396   }
397   assert( z<=zTerm );
398   while( *z!=0 && z<zTerm ){
399     SQLITE_SKIP_UTF8(z);
400     r++;
401   }
402   return r;
403 }
404 
/* This test function is not currently used by the automated test-suite.
** Hence it is only available in debug builds.
*/
#if defined(SQLITE_TEST) && defined(SQLITE_DEBUG)
/*
** Translate UTF-8 to UTF-8.
**
** This has the effect of making sure that the string is well-formed
** UTF-8.  Miscoded characters are removed.
**
** The translation is done in-place, so the output must never become
** longer than the input that has been consumed.  sqlite3Utf8Read() can
** return values whose UTF-8 encoding is longer than the bytes it read
** (for example a stray byte in the range 0x80..0xbf, which it returns
** as itself, needs two bytes when re-encoded).  Such characters are
** treated as miscoded and removed, as are characters that decode to
** 0xfffd.
**
** The return value is the length in bytes of the rewritten string, not
** counting the 0x00 terminator that is stored after it.
*/
int sqlite3Utf8To8(unsigned char *zIn){
  unsigned char *zOut = zIn;
  unsigned char *zStart = zIn;
  u32 c;

  while( zIn[0] ){
    const unsigned char *zChar = zIn;   /* Start of the current character */
    int nIn;                            /* Bytes consumed from the input */
    int nOut;                           /* Bytes WRITE_UTF8() would emit */

    c = sqlite3Utf8Read(zIn, (const u8**)&zIn);
    if( c==0xfffd ) continue;
    nIn = (int)(zIn - zChar);
    nOut = c<0x00080 ? 1 : c<0x00800 ? 2 : c<0x10000 ? 3 : 4;
    /* Writing more than was read would overwrite unread input. */
    if( nOut>nIn ) continue;
    WRITE_UTF8(zOut, c);
  }
  *zOut = 0;
  return (int)(zOut - zStart);
}
#endif
433 
434 #ifndef SQLITE_OMIT_UTF16
435 /*
436 ** Convert a UTF-16 string in the native encoding into a UTF-8 string.
437 ** Memory to hold the UTF-8 string is obtained from sqlite3_malloc and must
438 ** be freed by the calling function.
439 **
440 ** NULL is returned if there is an allocation error.
441 */
442 char *sqlite3Utf16to8(sqlite3 *db, const void *z, int nByte){
443   Mem m;
444   memset(&m, 0, sizeof(m));
445   m.db = db;
446   sqlite3VdbeMemSetStr(&m, z, nByte, SQLITE_UTF16NATIVE, SQLITE_STATIC);
447   sqlite3VdbeChangeEncoding(&m, SQLITE_UTF8);
448   if( db->mallocFailed ){
449     sqlite3VdbeMemRelease(&m);
450     m.z = 0;
451   }
452   assert( (m.flags & MEM_Term)!=0 || db->mallocFailed );
453   assert( (m.flags & MEM_Str)!=0 || db->mallocFailed );
454   return (m.flags & MEM_Dyn)!=0 ? m.z : sqlite3DbStrDup(db, m.z);
455 }
456 
/*
** Convert the UTF-8 string z (length argument n) to the UTF-16 encoding
** selected by enc.  On success the new string is returned and its length
** in bytes is stored in *pnOut.  The caller should arrange to call
** sqlite3DbFree() on the returned pointer when it is no longer required.
**
** If the translation fails, NULL is returned and *pnOut is left
** untouched; db->mallocFailed is expected to be set in that case.
**
** Only compiled when SQLITE_ENABLE_STAT2 is defined.
*/
#ifdef SQLITE_ENABLE_STAT2
char *sqlite3Utf8to16(sqlite3 *db, int enc, char *z, int n, int *pnOut){
  Mem sCvt;             /* Scratch cell used to perform the conversion */
  int rc;

  memset(&sCvt, 0, sizeof(sCvt));
  sCvt.db = db;
  sqlite3VdbeMemSetStr(&sCvt, z, n, SQLITE_UTF8, SQLITE_STATIC);
  rc = sqlite3VdbeMemTranslate(&sCvt, enc);
  if( rc!=0 ){
    assert( db->mallocFailed );
    return 0;
  }
  assert( sCvt.z==sCvt.zMalloc );
  *pnOut = sCvt.n;
  return sCvt.z;
}
#endif
482 
483 /*
484 ** pZ is a UTF-16 encoded unicode string at least nChar characters long.
485 ** Return the number of bytes in the first nChar unicode characters
486 ** in pZ.  nChar must be non-negative.
487 */
488 int sqlite3Utf16ByteLen(const void *zIn, int nChar){
489   int c;
490   unsigned char const *z = zIn;
491   int n = 0;
492   if( SQLITE_UTF16NATIVE==SQLITE_UTF16BE ){
493     /* Using an "if (SQLITE_UTF16NATIVE==SQLITE_UTF16BE)" construct here
494     ** and in other parts of this file means that at one branch will
495     ** not be covered by coverage testing on any single host. But coverage
496     ** will be complete if the tests are run on both a little-endian and
497     ** big-endian host. Because both the UTF16NATIVE and SQLITE_UTF16BE
498     ** macros are constant at compile time the compiler can determine
499     ** which branch will be followed. It is therefore assumed that no runtime
500     ** penalty is paid for this "if" statement.
501     */
502     while( n<nChar ){
503       READ_UTF16BE(z, c);
504       n++;
505     }
506   }else{
507     while( n<nChar ){
508       READ_UTF16LE(z, c);
509       n++;
510     }
511   }
512   return (int)(z-(unsigned char const *)zIn);
513 }
514 
#if defined(SQLITE_TEST)
/*
** This routine is called from the TCL test function "translate_selftest".
** For every value from 0 up to (but not including) 0x110000 it writes the
** value with one of the WRITE_* macros, reads it back with the matching
** reader, and asserts that the same value and the same byte count come
** back.
**
** For UTF-8 the surrogate values 0xD800..0xDFFF and the values 0xFFFE and
** 0xFFFF are expected to read back as 0xFFFD.  For UTF-16 the surrogate
** values are skipped.
*/
void sqlite3UtfSelfTest(void){
  unsigned int cp;              /* Value being round-tripped */
  unsigned int expected;        /* Value the UTF-8 reader should return */
  unsigned int decoded;         /* Value actually read back */
  unsigned char aBuf[20];       /* Scratch buffer for the encoded bytes */
  unsigned char *p;             /* Cursor into aBuf */
  int nByte;                    /* Number of bytes the writer produced */

  /* UTF-8 */
  for(cp=0; cp<0x00110000; cp++){
    p = aBuf;
    WRITE_UTF8(p, cp);
    nByte = (int)(p-aBuf);
    assert( nByte>0 && nByte<=4 );
    p[0] = 0;
    p = aBuf;
    decoded = sqlite3Utf8Read(p, (const u8**)&p);
    expected = cp;
    if( (cp>=0xD800 && cp<=0xDFFF) || (cp&0xFFFFFFFE)==0xFFFE ){
      expected = 0xFFFD;
    }
    assert( decoded==expected );
    assert( (p-aBuf)==nByte );
  }

  /* UTF-16 little-endian */
  for(cp=0; cp<0x00110000; cp++){
    if( cp<0xD800 || cp>=0xE000 ){
      p = aBuf;
      WRITE_UTF16LE(p, cp);
      nByte = (int)(p-aBuf);
      assert( nByte>0 && nByte<=4 );
      p[0] = 0;
      p = aBuf;
      READ_UTF16LE(p, decoded);
      assert( decoded==cp );
      assert( (p-aBuf)==nByte );
    }
  }

  /* UTF-16 big-endian */
  for(cp=0; cp<0x00110000; cp++){
    if( cp<0xD800 || cp>=0xE000 ){
      p = aBuf;
      WRITE_UTF16BE(p, cp);
      nByte = (int)(p-aBuf);
      assert( nByte>0 && nByte<=4 );
      p[0] = 0;
      p = aBuf;
      READ_UTF16BE(p, decoded);
      assert( decoded==cp );
      assert( (p-aBuf)==nByte );
    }
  }
}
#endif /* SQLITE_TEST */
568 #endif /* SQLITE_OMIT_UTF16 */
569