xref: /sqlite-3.40.0/src/utf.c (revision 5665b3ea)
1 /*
2 ** 2004 April 13
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 *************************************************************************
12 ** This file contains routines used to translate between UTF-8,
13 ** UTF-16, UTF-16BE, and UTF-16LE.
14 **
15 ** $Id: utf.c,v 1.51 2007/05/23 16:23:09 danielk1977 Exp $
16 **
17 ** Notes on UTF-8:
18 **
19 **   Byte-0    Byte-1    Byte-2    Byte-3    Value
20 **  0xxxxxxx                                 00000000 00000000 0xxxxxxx
21 **  110yyyyy  10xxxxxx                       00000000 00000yyy yyxxxxxx
22 **  1110zzzz  10yyyyyy  10xxxxxx             00000000 zzzzyyyy yyxxxxxx
23 **  11110uuu  10uuzzzz  10yyyyyy  10xxxxxx   000uuuuu zzzzyyyy yyxxxxxx
24 **
25 **
26 ** Notes on UTF-16:  (with wwww+1==uuuuu)
27 **
28 **      Word-0               Word-1          Value
29 **  110110ww wwzzzzyy   110111yy yyxxxxxx    000uuuuu zzzzyyyy yyxxxxxx
30 **  zzzzyyyy yyxxxxxx                        00000000 zzzzyyyy yyxxxxxx
31 **
32 **
33 ** BOM or Byte Order Mark:
34 **     0xff 0xfe   little-endian utf-16 follows
35 **     0xfe 0xff   big-endian utf-16 follows
36 **
37 */
38 #include "sqliteInt.h"
39 #include <assert.h>
40 #include "vdbeInt.h"
41 
/*
** The following constant value is used by the SQLITE_BIGENDIAN and
** SQLITE_LITTLEENDIAN macros.
**
** NOTE(review): those macros are defined outside this file.  Presumably
** they look at the first byte of this integer's in-memory representation
** to determine the byte order of the host at run time -- confirm against
** their definitions.  The object is declared without "static", so it is
** visible to other translation units.
*/
const int sqlite3one = 1;
47 
/*
** Lookup table used to help decode the first byte of a multi-byte
** UTF-8 character.
**
** The table has 64 entries.  Reading the values, entry i holds the
** payload bits (the bits left after stripping the length prefix) of the
** byte value 0xC0+i.  The indexing itself is done by code outside this
** file -- presumably the SQLITE_READ_UTF8() macro; confirm there.
*/
const unsigned char sqlite3UtfTrans1[] = {
  /* 0xC0..0xDF: 110yyyyy -> low five bits */
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,

  /* 0xE0..0xEF: 1110zzzz -> low four bits */
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,

  /* 0xF0..0xF7: 11110uuu -> low three bits */
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,

  /* 0xF8..0xFB: low two bits */
  0x00, 0x01, 0x02, 0x03,

  /* 0xFC..0xFD: low bit */
  0x00, 0x01,

  /* 0xFE, 0xFF: no payload bits */
  0x00, 0x00,
};
62 
/*
** WRITE_UTF8(zOut, c)
**
** Store the UTF-8 encoding of the value c at *zOut and advance zOut past
** the bytes written: 1 byte for c<0x80, 2 bytes for c<0x800, 3 bytes for
** c<0x10000 and 4 bytes otherwise.  No validation of c is performed.
**
** zOut must be a modifiable pointer to byte-sized elements.  Both
** arguments are evaluated more than once, so neither may have side
** effects.  Every use of an argument is parenthesized so that an
** expression such as (a|b) may be passed for c, and the body is wrapped
** in do{}while(0) so the macro behaves as a single statement (for
** example between "if" and "else").
*/
#define WRITE_UTF8(zOut, c) do{                        \
  if( (c)<0x00080 ){                                   \
    *(zOut)++ = ((c)&0xFF);                            \
  }                                                    \
  else if( (c)<0x00800 ){                              \
    *(zOut)++ = 0xC0 + (((c)>>6)&0x1F);                \
    *(zOut)++ = 0x80 + ((c) & 0x3F);                   \
  }                                                    \
  else if( (c)<0x10000 ){                              \
    *(zOut)++ = 0xE0 + (((c)>>12)&0x0F);               \
    *(zOut)++ = 0x80 + (((c)>>6) & 0x3F);              \
    *(zOut)++ = 0x80 + ((c) & 0x3F);                   \
  }else{                                               \
    *(zOut)++ = 0xF0 + (((c)>>18) & 0x07);             \
    *(zOut)++ = 0x80 + (((c)>>12) & 0x3F);             \
    *(zOut)++ = 0x80 + (((c)>>6) & 0x3F);              \
    *(zOut)++ = 0x80 + ((c) & 0x3F);                   \
  }                                                    \
}while(0)
82 
/*
** WRITE_UTF16LE(zOut, c)
**
** Store the little-endian UTF-16 encoding of the value c at *zOut and
** advance zOut past the bytes written: 2 bytes when c<=0xFFFF, otherwise
** a 4-byte surrogate pair (high surrogate first, each 16-bit unit stored
** low byte first).  No validation of c is performed.
**
** zOut must be a modifiable pointer to byte-sized elements.  Both
** arguments are evaluated more than once, so neither may have side
** effects.  Arguments are parenthesized so that an expression may be
** passed for c, and the body is a single do{}while(0) statement.
*/
#define WRITE_UTF16LE(zOut, c) do{                                      \
  if( (c)<=0xFFFF ){                                                    \
    *(zOut)++ = ((c)&0x00FF);                                           \
    *(zOut)++ = (((c)>>8)&0x00FF);                                      \
  }else{                                                                \
    *(zOut)++ = ((((c)>>10)&0x003F) + ((((c)-0x10000)>>10)&0x00C0));    \
    *(zOut)++ = (0x00D8 + ((((c)-0x10000)>>18)&0x03));                  \
    *(zOut)++ = ((c)&0x00FF);                                           \
    *(zOut)++ = (0x00DC + (((c)>>8)&0x03));                             \
  }                                                                     \
}while(0)
94 
/*
** WRITE_UTF16BE(zOut, c)
**
** Store the big-endian UTF-16 encoding of the value c at *zOut and
** advance zOut past the bytes written: 2 bytes when c<=0xFFFF, otherwise
** a 4-byte surrogate pair (high surrogate first, each 16-bit unit stored
** high byte first).  No validation of c is performed.
**
** zOut must be a modifiable pointer to byte-sized elements.  Both
** arguments are evaluated more than once, so neither may have side
** effects.  Arguments are parenthesized so that an expression may be
** passed for c, and the body is a single do{}while(0) statement.
*/
#define WRITE_UTF16BE(zOut, c) do{                                      \
  if( (c)<=0xFFFF ){                                                    \
    *(zOut)++ = (((c)>>8)&0x00FF);                                      \
    *(zOut)++ = ((c)&0x00FF);                                           \
  }else{                                                                \
    *(zOut)++ = (0x00D8 + ((((c)-0x10000)>>18)&0x03));                  \
    *(zOut)++ = ((((c)>>10)&0x003F) + ((((c)-0x10000)>>10)&0x00C0));    \
    *(zOut)++ = (0x00DC + (((c)>>8)&0x03));                             \
    *(zOut)++ = ((c)&0x00FF);                                           \
  }                                                                     \
}while(0)
106 
/*
** READ_UTF16LE(zIn, c)
**
** Decode one character of little-endian UTF-16 starting at *zIn, store
** its value in c and advance zIn past the bytes consumed.  Two bytes are
** always read.  If the resulting 16-bit unit lies in 0xD800..0xDFFF a
** second 16-bit unit is read unconditionally and the two are combined
** as a surrogate pair; the macro does not check that the second unit is
** available or that it is a low surrogate.
**
** Each byte is converted to unsigned char before being widened, so the
** result is the same whether zIn points to char or unsigned char data
** (without this, bytes >=0x80 read through a signed char pointer are
** sign-extended and surrogates are not recognised).
**
** zIn must be a modifiable pointer and c a modifiable integer lvalue;
** both are evaluated more than once.  The macro declares a local named
** c2, so c must not itself be a variable called c2.
*/
#define READ_UTF16LE(zIn, c) do{                                            \
  (c) = (unsigned char)(*(zIn)++);                                          \
  (c) += (((unsigned char)(*(zIn)++))<<8);                                  \
  if( (c)>=0xD800 && (c)<0xE000 ){                                          \
    int c2 = (unsigned char)(*(zIn)++);                                     \
    c2 += (((unsigned char)(*(zIn)++))<<8);                                 \
    (c) = (c2&0x03FF) + (((c)&0x003F)<<10) + ((((c)&0x03C0)+0x0040)<<10);   \
    if( ((c) & 0xFFFF0000)==0 ) (c) = 0xFFFD;                               \
  }                                                                         \
}while(0)
117 
/*
** READ_UTF16BE(zIn, c)
**
** Decode one character of big-endian UTF-16 starting at *zIn, store its
** value in c and advance zIn past the bytes consumed.  Two bytes are
** always read.  If the resulting 16-bit unit lies in 0xD800..0xDFFF a
** second 16-bit unit is read unconditionally and the two are combined
** as a surrogate pair; the macro does not check that the second unit is
** available or that it is a low surrogate.
**
** Each byte is converted to unsigned char before being widened, so the
** result is the same whether zIn points to char or unsigned char data
** (without this, bytes >=0x80 read through a signed char pointer are
** sign-extended and surrogates are not recognised).
**
** zIn must be a modifiable pointer and c a modifiable integer lvalue;
** both are evaluated more than once.  The macro declares a local named
** c2, so c must not itself be a variable called c2.
*/
#define READ_UTF16BE(zIn, c) do{                                            \
  (c) = (((unsigned char)(*(zIn)++))<<8);                                   \
  (c) += (unsigned char)(*(zIn)++);                                         \
  if( (c)>=0xD800 && (c)<0xE000 ){                                          \
    int c2 = (((unsigned char)(*(zIn)++))<<8);                              \
    c2 += (unsigned char)(*(zIn)++);                                        \
    (c) = (c2&0x03FF) + (((c)&0x003F)<<10) + ((((c)&0x03C0)+0x0040)<<10);   \
    if( ((c) & 0xFFFF0000)==0 ) (c) = 0xFFFD;                               \
  }                                                                         \
}while(0)
128 
129 /*
130 ** If the TRANSLATE_TRACE macro is defined, the value of each Mem is
131 ** printed on stderr on the way into and out of sqlite3VdbeMemTranslate().
132 */
133 /* #define TRANSLATE_TRACE 1 */
134 
135 #ifndef SQLITE_OMIT_UTF16
136 /*
137 ** This routine transforms the internal text encoding used by pMem to
138 ** desiredEnc. It is an error if the string is already of the desired
139 ** encoding, or if *pMem does not contain a string value.
140 */
141 int sqlite3VdbeMemTranslate(Mem *pMem, u8 desiredEnc){
142   unsigned char zShort[NBFS]; /* Temporary short output buffer */
143   int len;                    /* Maximum length of output string in bytes */
144   unsigned char *zOut;                  /* Output buffer */
145   unsigned char *zIn;                   /* Input iterator */
146   unsigned char *zTerm;                 /* End of input */
147   unsigned char *z;                     /* Output iterator */
148   unsigned int c;
149 
150   assert( pMem->flags&MEM_Str );
151   assert( pMem->enc!=desiredEnc );
152   assert( pMem->enc!=0 );
153   assert( pMem->n>=0 );
154 
155 #if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG)
156   {
157     char zBuf[100];
158     sqlite3VdbeMemPrettyPrint(pMem, zBuf);
159     fprintf(stderr, "INPUT:  %s\n", zBuf);
160   }
161 #endif
162 
163   /* If the translation is between UTF-16 little and big endian, then
164   ** all that is required is to swap the byte order. This case is handled
165   ** differently from the others.
166   */
167   if( pMem->enc!=SQLITE_UTF8 && desiredEnc!=SQLITE_UTF8 ){
168     u8 temp;
169     int rc;
170     rc = sqlite3VdbeMemMakeWriteable(pMem);
171     if( rc!=SQLITE_OK ){
172       assert( rc==SQLITE_NOMEM );
173       return SQLITE_NOMEM;
174     }
175     zIn = (u8*)pMem->z;
176     zTerm = &zIn[pMem->n];
177     while( zIn<zTerm ){
178       temp = *zIn;
179       *zIn = *(zIn+1);
180       zIn++;
181       *zIn++ = temp;
182     }
183     pMem->enc = desiredEnc;
184     goto translate_out;
185   }
186 
187   /* Set len to the maximum number of bytes required in the output buffer. */
188   if( desiredEnc==SQLITE_UTF8 ){
189     /* When converting from UTF-16, the maximum growth results from
190     ** translating a 2-byte character to a 4-byte UTF-8 character.
191     ** A single byte is required for the output string
192     ** nul-terminator.
193     */
194     len = pMem->n * 2 + 1;
195   }else{
196     /* When converting from UTF-8 to UTF-16 the maximum growth is caused
197     ** when a 1-byte UTF-8 character is translated into a 2-byte UTF-16
198     ** character. Two bytes are required in the output buffer for the
199     ** nul-terminator.
200     */
201     len = pMem->n * 2 + 2;
202   }
203 
204   /* Set zIn to point at the start of the input buffer and zTerm to point 1
205   ** byte past the end.
206   **
207   ** Variable zOut is set to point at the output buffer. This may be space
208   ** obtained from malloc(), or Mem.zShort, if it large enough and not in
209   ** use, or the zShort array on the stack (see above).
210   */
211   zIn = (u8*)pMem->z;
212   zTerm = &zIn[pMem->n];
213   if( len>NBFS ){
214     zOut = sqliteMallocRaw(len);
215     if( !zOut ) return SQLITE_NOMEM;
216   }else{
217     zOut = zShort;
218   }
219   z = zOut;
220 
221   if( pMem->enc==SQLITE_UTF8 ){
222     unsigned int iExtra = 0xD800;
223 
224     if( 0==(pMem->flags&MEM_Term) && zTerm>zIn && (zTerm[-1]&0x80) ){
225       /* This UTF8 string is not nul-terminated, and the last byte is
226       ** not a character in the ascii range (codpoints 0..127). This
227       ** means the SQLITE_READ_UTF8() macro might read past the end
228       ** of the allocated buffer.
229       **
230       ** There are four possibilities:
231       **
232       **   1. The last byte is the first byte of a non-ASCII character,
233       **
234       **   2. The final N bytes of the input string are continuation bytes
235       **      and immediately preceding them is the first byte of a
236       **      non-ASCII character.
237       **
238       **   3. The final N bytes of the input string are continuation bytes
239       **      and immediately preceding them is a byte that encodes a
240       **      character in the ASCII range.
241       **
242       **   4. The entire string consists of continuation characters.
243       **
244       ** Cases (3) and (4) require no special handling. The SQLITE_READ_UTF8()
245       ** macro will not overread the buffer in these cases.
246       */
247       unsigned char *zExtra = &zTerm[-1];
248       while( zExtra>zIn && (zExtra[0]&0xC0)==0x80 ){
249         zExtra--;
250       }
251 
252       if( (zExtra[0]&0xC0)==0xC0 ){
253         /* Make a copy of the last character encoding in the input string.
254         ** Then make sure it is nul-terminated and use SQLITE_READ_UTF8()
255         ** to decode the codepoint. Store the codepoint in variable iExtra,
256         ** it will be appended to the output string later.
257         */
258         unsigned char *zFree = 0;
259         unsigned char zBuf[16];
260         int nExtra = (pMem->n+zIn-zExtra);
261         zTerm = zExtra;
262         if( nExtra>15 ){
263           zExtra = sqliteMallocRaw(nExtra+1);
264           if( !zExtra ){
265             return SQLITE_NOMEM;
266           }
267           zFree = zExtra;
268         }else{
269           zExtra = zBuf;
270         }
271         memcpy(zExtra, zTerm, nExtra);
272         zExtra[nExtra] = '\0';
273         SQLITE_READ_UTF8(zExtra, iExtra);
274         sqliteFree(zFree);
275       }
276     }
277 
278     if( desiredEnc==SQLITE_UTF16LE ){
279       /* UTF-8 -> UTF-16 Little-endian */
280       while( zIn<zTerm ){
281         SQLITE_READ_UTF8(zIn, c);
282         WRITE_UTF16LE(z, c);
283       }
284       if( iExtra!=0xD800 ){
285         WRITE_UTF16LE(z, iExtra);
286       }
287     }else{
288       assert( desiredEnc==SQLITE_UTF16BE );
289       /* UTF-8 -> UTF-16 Big-endian */
290       while( zIn<zTerm ){
291         SQLITE_READ_UTF8(zIn, c);
292         WRITE_UTF16BE(z, c);
293       }
294       if( iExtra!=0xD800 ){
295         WRITE_UTF16BE(z, iExtra);
296       }
297     }
298     pMem->n = z - zOut;
299     *z++ = 0;
300   }else{
301     assert( desiredEnc==SQLITE_UTF8 );
302     if( pMem->enc==SQLITE_UTF16LE ){
303       /* UTF-16 Little-endian -> UTF-8 */
304       while( zIn<zTerm ){
305         READ_UTF16LE(zIn, c);
306         WRITE_UTF8(z, c);
307       }
308     }else{
309       /* UTF-16 Little-endian -> UTF-8 */
310       while( zIn<zTerm ){
311         READ_UTF16BE(zIn, c);
312         WRITE_UTF8(z, c);
313       }
314     }
315     pMem->n = z - zOut;
316   }
317   *z = 0;
318   assert( (pMem->n+(desiredEnc==SQLITE_UTF8?1:2))<=len );
319 
320   sqlite3VdbeMemRelease(pMem);
321   pMem->flags &= ~(MEM_Static|MEM_Dyn|MEM_Ephem|MEM_Short);
322   pMem->enc = desiredEnc;
323   if( zOut==zShort ){
324     memcpy(pMem->zShort, zOut, len);
325     zOut = (u8*)pMem->zShort;
326     pMem->flags |= (MEM_Term|MEM_Short);
327   }else{
328     pMem->flags |= (MEM_Term|MEM_Dyn);
329   }
330   pMem->z = (char*)zOut;
331 
332 translate_out:
333 #if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG)
334   {
335     char zBuf[100];
336     sqlite3VdbeMemPrettyPrint(pMem, zBuf);
337     fprintf(stderr, "OUTPUT: %s\n", zBuf);
338   }
339 #endif
340   return SQLITE_OK;
341 }
342 
343 /*
344 ** This routine checks for a byte-order mark at the beginning of the
345 ** UTF-16 string stored in *pMem. If one is present, it is removed and
346 ** the encoding of the Mem adjusted. This routine does not do any
347 ** byte-swapping, it just sets Mem.enc appropriately.
348 **
349 ** The allocation (static, dynamic etc.) and encoding of the Mem may be
350 ** changed by this function.
351 */
352 int sqlite3VdbeMemHandleBom(Mem *pMem){
353   int rc = SQLITE_OK;
354   u8 bom = 0;
355 
356   if( pMem->n<0 || pMem->n>1 ){
357     u8 b1 = *(u8 *)pMem->z;
358     u8 b2 = *(((u8 *)pMem->z) + 1);
359     if( b1==0xFE && b2==0xFF ){
360       bom = SQLITE_UTF16BE;
361     }
362     if( b1==0xFF && b2==0xFE ){
363       bom = SQLITE_UTF16LE;
364     }
365   }
366 
367   if( bom ){
368     /* This function is called as soon as a string is stored in a Mem*,
369     ** from within sqlite3VdbeMemSetStr(). At that point it is not possible
370     ** for the string to be stored in Mem.zShort, or for it to be stored
371     ** in dynamic memory with no destructor.
372     */
373     assert( !(pMem->flags&MEM_Short) );
374     assert( !(pMem->flags&MEM_Dyn) || pMem->xDel );
375     if( pMem->flags & MEM_Dyn ){
376       void (*xDel)(void*) = pMem->xDel;
377       char *z = pMem->z;
378       pMem->z = 0;
379       pMem->xDel = 0;
380       rc = sqlite3VdbeMemSetStr(pMem, &z[2], pMem->n-2, bom, SQLITE_TRANSIENT);
381       xDel(z);
382     }else{
383       rc = sqlite3VdbeMemSetStr(pMem, &pMem->z[2], pMem->n-2, bom,
384           SQLITE_TRANSIENT);
385     }
386   }
387   return rc;
388 }
389 #endif /* SQLITE_OMIT_UTF16 */
390 
391 /*
392 ** pZ is a UTF-8 encoded unicode string. If nByte is less than zero,
393 ** return the number of unicode characters in pZ up to (but not including)
394 ** the first 0x00 byte. If nByte is not less than zero, return the
395 ** number of unicode characters in the first nByte of pZ (or up to
396 ** the first 0x00, whichever comes first).
397 */
398 int sqlite3Utf8CharLen(const char *zIn, int nByte){
399   int r = 0;
400   const u8 *z = (const u8*)zIn;
401   const u8 *zTerm;
402   if( nByte>=0 ){
403     zTerm = &z[nByte];
404   }else{
405     zTerm = (const u8*)(-1);
406   }
407   assert( z<=zTerm );
408   while( *z!=0 && z<zTerm ){
409     SQLITE_SKIP_UTF8(z);
410     r++;
411   }
412   return r;
413 }
414 
415 #ifndef SQLITE_OMIT_UTF16
416 /*
417 ** Convert a UTF-16 string in the native encoding into a UTF-8 string.
418 ** Memory to hold the UTF-8 string is obtained from malloc and must be
419 ** freed by the calling function.
420 **
421 ** NULL is returned if there is an allocation error.
422 */
423 char *sqlite3Utf16to8(const void *z, int nByte){
424   Mem m;
425   memset(&m, 0, sizeof(m));
426   sqlite3VdbeMemSetStr(&m, z, nByte, SQLITE_UTF16NATIVE, SQLITE_STATIC);
427   sqlite3VdbeChangeEncoding(&m, SQLITE_UTF8);
428   assert( (m.flags & MEM_Term)!=0 || sqlite3MallocFailed() );
429   assert( (m.flags & MEM_Str)!=0 || sqlite3MallocFailed() );
430   return (m.flags & MEM_Dyn)!=0 ? m.z : sqliteStrDup(m.z);
431 }
432 
433 /*
434 ** pZ is a UTF-16 encoded unicode string. If nChar is less than zero,
435 ** return the number of bytes up to (but not including), the first pair
436 ** of consecutive 0x00 bytes in pZ. If nChar is not less than zero,
437 ** then return the number of bytes in the first nChar unicode characters
438 ** in pZ (or up until the first pair of 0x00 bytes, whichever comes first).
439 */
440 int sqlite3Utf16ByteLen(const void *zIn, int nChar){
441   unsigned int c = 1;
442   char const *z = zIn;
443   int n = 0;
444   if( SQLITE_UTF16NATIVE==SQLITE_UTF16BE ){
445     /* Using an "if (SQLITE_UTF16NATIVE==SQLITE_UTF16BE)" construct here
446     ** and in other parts of this file means that at one branch will
447     ** not be covered by coverage testing on any single host. But coverage
448     ** will be complete if the tests are run on both a little-endian and
449     ** big-endian host. Because both the UTF16NATIVE and SQLITE_UTF16BE
450     ** macros are constant at compile time the compiler can determine
451     ** which branch will be followed. It is therefore assumed that no runtime
452     ** penalty is paid for this "if" statement.
453     */
454     while( c && ((nChar<0) || n<nChar) ){
455       READ_UTF16BE(z, c);
456       n++;
457     }
458   }else{
459     while( c && ((nChar<0) || n<nChar) ){
460       READ_UTF16LE(z, c);
461       n++;
462     }
463   }
464   return (z-(char const *)zIn)-((c==0)?2:0);
465 }
466 
467 #if defined(SQLITE_TEST)
/*
** Rewrite the nul-terminated UTF-8 string zIn as UTF-8, in place.
**
** Each character is decoded with SQLITE_READ_UTF8() and re-encoded with
** WRITE_UTF8().  Characters that decode to 0xFFFD are dropped, which has
** the effect of removing miscoded characters and leaving well-formed
** UTF-8.
**
** Working in place is possible because the correct encoding of a
** character is never longer than a malformed encoding of it.
**
** Returns the length in bytes of the rewritten string, not counting the
** nul terminator that is stored after it.
*/
int sqlite3Utf8To8(unsigned char *zIn){
  unsigned char *zRead = zIn;     /* Next input byte to decode */
  unsigned char *zWrite = zIn;    /* Where the next output byte goes */
  int c;

  for(;;){
    SQLITE_READ_UTF8(zRead, c);
    if( c==0 ){
      break;
    }
    if( c==0xfffd ){
      continue;
    }
    WRITE_UTF8(zWrite, c);
  }
  *zWrite = 0;
  return zWrite - zIn;
}
492 #endif
493 
494 #if defined(SQLITE_TEST)
/*
** This routine is called from the TCL test function "translate_selftest".
** It checks, by means of assert(), that the primitives for serializing
** and deserializing characters in each encoding are inverses of each
** other: every value from 0 up to 0x10FFFF is written to a scratch
** buffer and read back, and both the decoded value and the number of
** bytes consumed are compared with what was written.
*/
void sqlite3UtfSelfTest(){
  unsigned char aBuf[20];     /* Scratch buffer for one encoded character */
  unsigned char *p;           /* Read/write cursor within aBuf */
  unsigned int cp;            /* Value being round-tripped */
  unsigned int expect;        /* Value the UTF-8 decoder should produce */
  unsigned int got;           /* Value actually decoded */
  int nWritten;               /* Bytes produced by the encoder */

  /* UTF-8: surrogates and 0xFFFE/0xFFFF are expected to read back as
  ** 0xFFFD.
  */
  for(cp=0; cp<0x00110000; cp++){
    p = aBuf;
    WRITE_UTF8(p, cp);
    nWritten = p-aBuf;
    *p = 0;
    p = aBuf;
    SQLITE_READ_UTF8(p, got);
    if( (cp>=0xD800 && cp<=0xDFFF) || (cp&0xFFFFFFFE)==0xFFFE ){
      expect = 0xFFFD;
    }else{
      expect = cp;
    }
    assert( got==expect );
    assert( (p-aBuf)==nWritten );
  }

  /* UTF-16 little-endian: the surrogate range is skipped. */
  for(cp=0; cp<0x00110000; cp++){
    if( cp<0xD800 || cp>=0xE000 ){
      p = aBuf;
      WRITE_UTF16LE(p, cp);
      nWritten = p-aBuf;
      *p = 0;
      p = aBuf;
      READ_UTF16LE(p, got);
      assert( got==cp );
      assert( (p-aBuf)==nWritten );
    }
  }

  /* UTF-16 big-endian: the surrogate range is skipped. */
  for(cp=0; cp<0x00110000; cp++){
    if( cp<0xD800 || cp>=0xE000 ){
      p = aBuf;
      WRITE_UTF16BE(p, cp);
      nWritten = p-aBuf;
      *p = 0;
      p = aBuf;
      READ_UTF16BE(p, got);
      assert( got==cp );
      assert( (p-aBuf)==nWritten );
    }
  }
}
543 #endif /* SQLITE_TEST */
544 #endif /* SQLITE_OMIT_UTF16 */
545