xref: /sqlite-3.40.0/src/utf.c (revision ef5ecb41)
1 /*
2 ** 2004 April 13
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 *************************************************************************
12 ** This file contains routines used to translate between UTF-8,
13 ** UTF-16, UTF-16BE, and UTF-16LE.
14 **
15 ** $Id: utf.c,v 1.18 2004/06/06 12:41:50 danielk1977 Exp $
16 **
17 ** Notes on UTF-8:
18 **
19 **   Byte-0    Byte-1    Byte-2    Byte-3    Value
20 **  0xxxxxxx                                 00000000 00000000 0xxxxxxx
21 **  110yyyyy  10xxxxxx                       00000000 00000yyy yyxxxxxx
22 **  1110zzzz  10yyyyyy  10xxxxxx             00000000 zzzzyyyy yyxxxxxx
23 **  11110uuu  10uuzzzz  10yyyyyy  10xxxxxx   000uuuuu zzzzyyyy yyxxxxxx
24 **
25 **
26 ** Notes on UTF-16:  (with wwww+1==uuuuu)
27 **
28 **      Word-0               Word-1          Value
29 **  110110ww wwzzzzyy   110111yy yyxxxxxx    000uuuuu zzzzyyyy yyxxxxxx
30 **  zzzzyyyy yyxxxxxx                        00000000 zzzzyyyy yyxxxxxx
31 **
32 **
33 ** BOM or Byte Order Mark:
34 **     0xff 0xfe   little-endian utf-16 follows
35 **     0xfe 0xff   big-endian utf-16 follows
36 **
37 **
38 ** Handling of malformed strings:
39 **
40 ** SQLite accepts and processes malformed strings without an error wherever
41 ** possible. However this is not possible when converting between UTF-8 and
42 ** UTF-16.
43 **
** When converting malformed UTF-8 strings to UTF-16, one instance of the
** replacement character U+FFFD is emitted for each byte that cannot be
** interpreted as part of a valid unicode character.
47 **
** When converting malformed UTF-16 strings to UTF-8, one instance of the
** replacement character U+FFFD is emitted for each pair of bytes that
** cannot be interpreted as part of a valid unicode character.
51 */
52 #include <assert.h>
53 #include "sqliteInt.h"
54 
/*
** A UtfString is a byte buffer together with a cursor. The reader and
** writer helpers in this file advance the cursor (c) as they consume or
** produce bytes of pZ.
*/
typedef struct UtfString {
  unsigned char *pZ;    /* First byte of the raw string data */
  int n;                /* Size of pZ in bytes. Some callers in this file
                        ** store a negative value when the size is unknown */
  int c;                /* Offset of the next byte of pZ to read or write */
} UtfString;
61 
/*
** These two macros interpret the first two bytes of the array pZ as a
** 16-bit unsigned quantity: BE16() takes pZ[0] as the most significant
** byte (big-endian), LE16() takes pZ[1] as the most significant byte
** (little-endian).
**
** Each byte is converted to unsigned char before it is widened, so the
** result is the same whether pZ points at char, signed char or unsigned
** char data. Without that conversion a byte >= 0x80 read through a signed
** char pointer would be sign-extended and corrupt the result.
**
** pZ is evaluated more than once; do not pass an expression with side
** effects.
*/
#define BE16(pZ) (((u16)(unsigned char)((pZ)[0])<<8) + \
                  (u16)(unsigned char)((pZ)[1]))
#define LE16(pZ) (((u16)(unsigned char)((pZ)[1])<<8) + \
                  (u16)(unsigned char)((pZ)[0]))

/*
** READ_16 interprets the first two bytes of the array pZ as a 16-bit
** unsigned int. If big_endian is non-zero the interpretation is
** big-endian, otherwise little-endian. The big_endian argument is
** parenthesised so that any expression may be passed for it.
*/
#define READ_16(pZ,big_endian) ((big_endian)?BE16(pZ):LE16(pZ))
76 
/*
** The following macro, LOWERCASE(x), takes an integer representing a
** unicode code point. The value returned is the same code point folded to
** lower case, if applicable. SQLite currently understands the upper/lower
** case relationship between the 26 characters used in the English
** language only.
**
** This means that characters with umlauts etc. will not be folded
** correctly (unless they are encoded as composite characters, which would
** doubtless cause much trouble).
**
** UpperToLower[] maps each value 0..90 to its folded value: 'A'..'Z'
** (65..90) map to 'a'..'z' (97..122), every other entry maps to itself.
**
** The macro expands to a plain expression (no trailing semicolon). The
** unsigned comparison sends negative values, as well as values above 90,
** down the "return x unchanged" path so the table is never indexed out of
** range. x is evaluated more than once; do not pass an expression with
** side effects.
*/
static const unsigned char UpperToLower[91] = {
      0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
     18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
     36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
     54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 97, 98, 99,100,101,102,103,
    104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,
    122,
};
#define LOWERCASE(x) ((unsigned)(x)<91u ? (int)(UpperToLower[(x)]) : (x))
97 
98 /*
99 ** The first parameter, zStr, points at a unicode string. This routine
100 ** reads a single character from the string and returns the codepoint value
101 ** of the character read.
102 **
103 ** The value of *pEnc is the string encoding. If *pEnc is TEXT_Utf16le or
104 ** TEXT_Utf16be, and the first character read is a byte-order-mark, then
105 ** the value of *pEnc is modified if necessary. In this case the next
106 ** character is read and it's code-point value returned.
107 **
108 ** The value of *pOffset is the byte-offset in zStr from which to begin
109 ** reading. It is incremented by the number of bytes read by this function.
110 **
111 ** If the fourth parameter, fold, is non-zero, then codepoint values are
112 ** folded to lower-case before being returned. See comments for macro
113 ** LOWERCASE(x) for details.
114 */
115 int sqlite3ReadUniChar(const char *zStr, int *pOffset, u8 *pEnc, int fold){
116   int ret = 0;
117 
118   switch( *pEnc ){
119     case TEXT_Utf8: {
120 
121 #if 0
122   static const int initVal[] = {
123       0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,
124      15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,
125      30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,
126      45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,
127      60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,
128      75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,
129      90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
130     105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
131     120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
132     135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
133     150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
134     165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
135     180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,   0,   1,   2,
136       3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,  16,  17,
137      18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,   0,
138       1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,
139       0,   1,   2,   3,   4,   5,   6,   7,   0,   1,   2,   3,   0,   1, 254,
140     255,
141   };
142   ret = initVal[(unsigned char)zStr[(*pOffset)++]];
143   while( (0xc0&zStr[*pOffset])==0x80 ){
144     ret = (ret<<6) | (0x3f&(zStr[(*pOffset)++]));
145   }
146 #endif
147 
148       struct Utf8TblRow {
149         u8 b1_mask;
150         u8 b1_masked_val;
151         u8 b1_value_mask;
152         int trailing_bytes;
153       };
154       static const struct Utf8TblRow utf8tbl[] = {
155         { 0x80, 0x00, 0x7F, 0 },
156         { 0xE0, 0xC0, 0x1F, 1 },
157         { 0xF0, 0xE0, 0x0F, 2 },
158         { 0xF8, 0xF0, 0x0E, 3 },
159         { 0, 0, 0, 0}
160       };
161 
162       u8 b1;   /* First byte of the potentially multi-byte utf-8 character */
163       int ii;
164       struct Utf8TblRow const *pRow;
165 
166       pRow = &(utf8tbl[0]);
167 
168       b1 = zStr[(*pOffset)++];
169       while( pRow->b1_mask && (b1&pRow->b1_mask)!=pRow->b1_masked_val ){
170         pRow++;
171       }
172       if( !pRow->b1_mask ){
173         return (int)0xFFFD;
174       }
175 
176       ret = (u32)(b1&pRow->b1_value_mask);
177       for( ii=0; ii<pRow->trailing_bytes; ii++ ){
178         u8 b = zStr[(*pOffset)++];
179         if( (b&0xC0)!=0x80 ){
180           return (int)0xFFFD;
181         }
182         ret = (ret<<6) + (u32)(b&0x3F);
183       }
184       break;
185     }
186 
187     case TEXT_Utf16le:
188     case TEXT_Utf16be: {
189       u32 code_point;   /* the first code-point in the character */
190       u32 code_point2;  /* the second code-point in the character, if any */
191 
192       code_point = READ_16(&zStr[*pOffset], (*pEnc==TEXT_Utf16be));
193       *pOffset += 2;
194 
195       /* If this is a non-surrogate code-point, just cast it to an int and
196       ** this is the code-point value.
197       */
198       if( code_point<0xD800 || code_point>0xE000 ){
199         ret = code_point;
200         break;
201       }
202 
203       /* If this is a trailing surrogate code-point, then the string is
204       ** malformed; return the replacement character.
205       */
206       if( code_point>0xDBFF ){
207         return (int)0xFFFD;
208       }
209 
210       /* The code-point just read is a leading surrogate code-point. If their
211       ** is not enough data left or the next code-point is not a trailing
212       ** surrogate, return the replacement character.
213       */
214       code_point2 = READ_16(&zStr[*pOffset], (*pEnc==TEXT_Utf16be));
215       *pOffset += 2;
216       if( code_point2<0xDC00 || code_point>0xDFFF ){
217         return (int)0xFFFD;
218       }
219 
220       ret = (
221           (((code_point&0x03C0)+0x0040)<<16) +   /* uuuuu */
222           ((code_point&0x003F)<<10) +            /* xxxxxx */
223           (code_point2&0x03FF)                   /* yy yyyyyyyy */
224       );
225     }
226     default:
227       assert(0);
228   }
229 
230   if( fold ){
231     return LOWERCASE(ret);
232   }
233   return ret;
234 }
235 
236 /*
237 ** Read the BOM from the start of *pStr, if one is present. Return zero
238 ** for little-endian, non-zero for big-endian. If no BOM is present, return
239 ** the value of the parameter "big_endian".
240 **
241 ** Return values:
242 **     1 -> big-endian string
243 **     0 -> little-endian string
244 */
245 static int readUtf16Bom(UtfString *pStr, int big_endian){
246   /* The BOM must be the first thing read from the string */
247   assert( pStr->c==0 );
248 
249   /* If the string data consists of 1 byte or less, the BOM will make no
250   ** difference anyway. In this case just fall through to the default case
251   ** and return the native byte-order for this machine.
252   **
253   ** Otherwise, check the first 2 bytes of the string to see if a BOM is
254   ** present.
255   */
256   if( pStr->n>1 ){
257     u8 bom = sqlite3UtfReadBom(pStr->pZ, 2);
258     if( bom ){
259       pStr->c += 2;
260       return (bom==TEXT_Utf16le)?0:1;
261     }
262   }
263 
264   return big_endian;
265 }
266 
267 /*
268 ** zData is a UTF-16 encoded string, nData bytes in length. This routine
269 ** checks if there is a byte-order mark at the start of zData. If no
270 ** byte order mark is found 0 is returned. Otherwise TEXT_Utf16be or
271 ** TEXT_Utf16le is returned, depending on whether The BOM indicates that
272 ** the text is big-endian or little-endian.
273 */
274 u8 sqlite3UtfReadBom(const void *zData, int nData){
275   if( nData<0 || nData>1 ){
276     u8 b1 = *(u8 *)zData;
277     u8 b2 = *(((u8 *)zData) + 1);
278     if( b1==0xFE && b2==0xFF ){
279       return TEXT_Utf16be;
280     }
281     if( b1==0xFF && b2==0xFE ){
282       return TEXT_Utf16le;
283     }
284   }
285   return 0;
286 }
287 
288 
289 /*
290 ** Read a single unicode character from the UTF-8 encoded string *pStr. The
291 ** value returned is a unicode scalar value. In the case of malformed
292 ** strings, the unicode replacement character U+FFFD may be returned.
293 */
294 static u32 readUtf8(UtfString *pStr){
295   u8 enc = TEXT_Utf8;
296   return sqlite3ReadUniChar(pStr->pZ, &pStr->c, &enc, 0);
297 }
298 
299 /*
300 ** Write the unicode character 'code' to the string pStr using UTF-8
301 ** encoding. SQLITE_NOMEM may be returned if sqlite3Malloc() fails.
302 */
303 static int writeUtf8(UtfString *pStr, u32 code){
304   struct Utf8WriteTblRow {
305     u32 max_code;
306     int trailing_bytes;
307     u8 b1_and_mask;
308     u8 b1_or_mask;
309   };
310   static const struct Utf8WriteTblRow utf8tbl[] = {
311     {0x0000007F, 0, 0x7F, 0x00},
312     {0x000007FF, 1, 0xDF, 0xC0},
313     {0x0000FFFF, 2, 0xEF, 0xE0},
314     {0x0010FFFF, 3, 0xF7, 0xF0},
315     {0x00000000, 0, 0x00, 0x00}
316   };
317   const struct Utf8WriteTblRow *pRow = &utf8tbl[0];
318 
319   while( code>pRow->max_code ){
320     assert( pRow->max_code );
321     pRow++;
322   }
323 
324   /* Ensure there is enough room left in the output buffer to write
325   ** this UTF-8 character.
326   */
327   assert( (pStr->n-pStr->c)>=(pRow->trailing_bytes+1) );
328 
329   /* Write the UTF-8 encoded character to pStr. All cases below are
330   ** intentionally fall-through.
331   */
332   switch( pRow->trailing_bytes ){
333     case 3:
334       pStr->pZ[pStr->c+3] = (((u8)code)&0x3F)|0x80;
335       code = code>>6;
336     case 2:
337       pStr->pZ[pStr->c+2] = (((u8)code)&0x3F)|0x80;
338       code = code>>6;
339     case 1:
340       pStr->pZ[pStr->c+1] = (((u8)code)&0x3F)|0x80;
341       code = code>>6;
342     case 0:
343       pStr->pZ[pStr->c] = (((u8)code)&(pRow->b1_and_mask))|(pRow->b1_or_mask);
344   }
345   pStr->c += (pRow->trailing_bytes + 1);
346 
347   return 0;
348 }
349 
350 /*
351 ** Read a single unicode character from the UTF-16 encoded string *pStr. The
352 ** value returned is a unicode scalar value. In the case of malformed
353 ** strings, the unicode replacement character U+FFFD may be returned.
354 **
355 ** If big_endian is true, the string is assumed to be UTF-16BE encoded.
356 ** Otherwise, it is UTF-16LE encoded.
357 */
358 static u32 readUtf16(UtfString *pStr, int big_endian){
359   u32 code_point;   /* the first code-point in the character */
360 
361   /* If there is only one byte of data left in the string, return the
362   ** replacement character.
363   */
364   if( (pStr->n-pStr->c)==1 ){
365     pStr->c++;
366     return (int)0xFFFD;
367   }
368 
369   code_point = READ_16(&(pStr->pZ[pStr->c]), big_endian);
370   pStr->c += 2;
371 
372   /* If this is a non-surrogate code-point, just cast it to an int and
373   ** return the code-point value.
374   */
375   if( code_point<0xD800 || code_point>0xE000 ){
376     return code_point;
377   }
378 
379   /* If this is a trailing surrogate code-point, then the string is
380   ** malformed; return the replacement character.
381   */
382   if( code_point>0xDBFF ){
383     return 0xFFFD;
384   }
385 
386   /* The code-point just read is a leading surrogate code-point. If their
387   ** is not enough data left or the next code-point is not a trailing
388   ** surrogate, return the replacement character.
389   */
390   if( (pStr->n-pStr->c)>1 ){
391     u32 code_point2 = READ_16(&pStr->pZ[pStr->c], big_endian);
392     if( code_point2<0xDC00 || code_point>0xDFFF ){
393       return 0xFFFD;
394     }
395     pStr->c += 2;
396 
397     return (
398         (((code_point&0x03C0)+0x0040)<<16) +   /* uuuuu */
399         ((code_point&0x003F)<<10) +            /* xxxxxx */
400         (code_point2&0x03FF)                   /* yy yyyyyyyy */
401     );
402 
403   }else{
404     return (int)0xFFFD;
405   }
406 
407   /* not reached */
408 }
409 
410 static int writeUtf16(UtfString *pStr, int code, int big_endian){
411   int bytes;
412   unsigned char *hi_byte;
413   unsigned char *lo_byte;
414 
415   bytes = (code>0x0000FFFF?4:2);
416 
417   /* Ensure there is enough room left in the output buffer to write
418   ** this UTF-8 character.
419   */
420   assert( (pStr->n-pStr->c)>=bytes );
421 
422   /* Initialise hi_byte and lo_byte to point at the locations into which
423   ** the MSB and LSB of the (first) 16-bit unicode code-point written for
424   ** this character.
425   */
426   hi_byte = (big_endian?&pStr->pZ[pStr->c]:&pStr->pZ[pStr->c+1]);
427   lo_byte = (big_endian?&pStr->pZ[pStr->c+1]:&pStr->pZ[pStr->c]);
428 
429   if( bytes==2 ){
430     *hi_byte = (u8)((code&0x0000FF00)>>8);
431     *lo_byte = (u8)(code&0x000000FF);
432   }else{
433     u32 wrd;
434     wrd = ((((code&0x001F0000)-0x00010000)+(code&0x0000FC00))>>10)|0x0000D800;
435     *hi_byte = (u8)((wrd&0x0000FF00)>>8);
436     *lo_byte = (u8)(wrd&0x000000FF);
437 
438     wrd = (code&0x000003FF)|0x0000DC00;
439     *(hi_byte+2) = (u8)((wrd&0x0000FF00)>>8);
440     *(lo_byte+2) = (u8)(wrd&0x000000FF);
441   }
442 
443   pStr->c += bytes;
444 
445   return 0;
446 }
447 
448 /*
449 ** pZ is a UTF-8 encoded unicode string. If nByte is less than zero,
450 ** return the number of unicode characters in pZ up to (but not including)
451 ** the first 0x00 byte. If nByte is not less than zero, return the
452 ** number of unicode characters in the first nByte of pZ (or up to
453 ** the first 0x00, whichever comes first).
454 */
455 int sqlite3utf8CharLen(const char *pZ, int nByte){
456   UtfString str;
457   int ret = 0;
458   u32 code = 1;
459 
460   str.pZ = (char *)pZ;
461   str.n = nByte;
462   str.c = 0;
463 
464   while( (nByte<0 || str.c<str.n) && code!=0 ){
465     code = readUtf8(&str);
466     ret++;
467   }
468   if( code==0 ) ret--;
469 
470   return ret;
471 }
472 
473 /*
474 ** pZ is a UTF-16 encoded unicode string. If nChar is less than zero,
475 ** return the number of bytes up to (but not including), the first pair
476 ** of consecutive 0x00 bytes in pZ. If nChar is not less than zero,
477 ** then return the number of bytes in the first nChar unicode characters
478 ** in pZ (or up until the first pair of 0x00 bytes, whichever comes first).
479 */
480 int sqlite3utf16ByteLen(const void *pZ, int nChar){
481   if( nChar<0 ){
482     const unsigned char *pC1 = (unsigned char *)pZ;
483     const unsigned char *pC2 = (unsigned char *)pZ+1;
484     while( *pC1 || *pC2 ){
485       pC1 += 2;
486       pC2 += 2;
487     }
488     return pC1-(unsigned char *)pZ;
489   }else{
490     UtfString str;
491     u32 code = 1;
492     int big_endian;
493     int nRead = 0;
494     int ret;
495 
496     str.pZ = (char *)pZ;
497     str.c = 0;
498     str.n = -1;
499 
500     /* Check for a BOM. We just ignore it if there is one, it's only read
501     ** so that it is not counted as a character.
502     */
503     big_endian = readUtf16Bom(&str, 0);
504     ret = 0-str.c;
505 
506     while( code!=0 && nRead<nChar ){
507       code = readUtf16(&str, big_endian);
508       nRead++;
509     }
510     if( code==0 ){
511       ret -= 2;
512     }
513     return str.c + ret;
514   }
515 }
516 
517 /*
518 ** Convert a string in UTF-16 native byte (or with a Byte-order-mark or
519 ** "BOM") into a UTF-8 string.  The UTF-8 string is written into space
520 ** obtained from sqlite3Malloc() and must be released by the calling function.
521 **
522 ** The parameter N is the number of bytes in the UTF-16 string.  If N is
523 ** negative, the entire string up to the first \u0000 character is translated.
524 **
525 ** The returned UTF-8 string is always \000 terminated.
526 */
527 unsigned char *sqlite3utf16to8(const void *pData, int N, int big_endian){
528   UtfString in;
529   UtfString out;
530 
531   out.pZ = 0;
532 
533   in.pZ = (unsigned char *)pData;
534   in.n = N;
535   in.c = 0;
536 
537   if( in.n<0 ){
538     in.n = sqlite3utf16ByteLen(in.pZ, -1);
539   }
540 
541   /* A UTF-8 encoding of a unicode string can require at most 1.5 times as
542   ** much space to store as the same string encoded using UTF-16. Allocate
543   ** this now.
544   */
545   out.n = (in.n*1.5) + 1;
546   out.pZ = sqliteMalloc(out.n);
547   if( !out.pZ ){
548     return 0;
549   }
550   out.c = 0;
551 
552   big_endian = readUtf16Bom(&in, big_endian);
553   while( in.c<in.n ){
554     writeUtf8(&out, readUtf16(&in, big_endian));
555   }
556 
557   /* Add the NULL-terminator character */
558   assert( out.c<out.n );
559   out.pZ[out.c] = 0x00;
560 
561   return out.pZ;
562 }
563 
564 static void *utf8toUtf16(const unsigned char *pIn, int N, int big_endian){
565   UtfString in;
566   UtfString out;
567 
568   in.pZ = (unsigned char *)pIn;
569   in.n = N;
570   in.c = 0;
571 
572   if( in.n<0 ){
573     in.n = strlen(in.pZ);
574   }
575 
576   /* A UTF-16 encoding of a unicode string can require at most twice as
577   ** much space to store as the same string encoded using UTF-8. Allocate
578   ** this now.
579   */
580   out.n = (in.n*2) + 2;
581   out.pZ = sqliteMalloc(out.n);
582   if( !out.pZ ){
583     return 0;
584   }
585   out.c = 0;
586 
587   while( in.c<in.n ){
588     writeUtf16(&out, readUtf8(&in), big_endian);
589   }
590 
591   /* Add the NULL-terminator character */
592   assert( (out.c+1)<out.n );
593   out.pZ[out.c] = 0x00;
594   out.pZ[out.c+1] = 0x00;
595 
596   return out.pZ;
597 }
598 
/*
** Translate the UTF-8 string pIn (N bytes, or up to its terminator if N
** is negative) to UTF-16BE by calling utf8toUtf16() with big_endian set.
** Returns whatever utf8toUtf16() returns.
*/
void *sqlite3utf8to16be(const unsigned char *pIn, int N){
  const int bigEndian = 1;
  return utf8toUtf16(pIn, N, bigEndian);
}
605 
/*
** Translate the UTF-8 string pIn (N bytes, or up to its terminator if N
** is negative) to UTF-16LE by calling utf8toUtf16() with big_endian
** clear. Returns whatever utf8toUtf16() returns.
*/
void *sqlite3utf8to16le(const unsigned char *pIn, int N){
  const int bigEndian = 0;
  return utf8toUtf16(pIn, N, bigEndian);
}
609 
610 /*
611 ** This routine does the work for sqlite3utf16to16le() and
612 ** sqlite3utf16to16be(). If big_endian is 1 the input string is
613 ** transformed in place to UTF-16BE encoding. If big_endian is 0 then
614 ** the input is transformed to UTF-16LE.
615 **
616 ** Unless the first two bytes of the input string is a BOM, the input is
617 ** assumed to be UTF-16 encoded using the machines native byte ordering.
618 */
619 static void utf16to16(void *pData, int N, int big_endian){
620   UtfString inout;
621   inout.pZ = (unsigned char *)pData;
622   inout.c = 0;
623   inout.n = N;
624 
625   if( inout.n<0 ){
626     inout.n = sqlite3utf16ByteLen(inout.pZ, -1);
627   }
628 
629   if( readUtf16Bom(&inout, SQLITE_BIGENDIAN)!=big_endian ){
630     /* swab(&inout.pZ[inout.c], inout.pZ, inout.n-inout.c); */
631     int i;
632     for(i=0; i<(inout.n-inout.c); i += 2){
633       char c1 = inout.pZ[i+inout.c];
634       char c2 = inout.pZ[i+inout.c+1];
635       inout.pZ[i] = c2;
636       inout.pZ[i+1] = c1;
637     }
638   }else if( inout.c ){
639     memmove(inout.pZ, &inout.pZ[inout.c], inout.n-inout.c);
640   }
641 
642   inout.pZ[inout.n-inout.c] = 0x00;
643   inout.pZ[inout.n-inout.c+1] = 0x00;
644 }
645 
/*
** Convert a UTF-16 string, either in native byte order or starting with
** a BOM, into a UTF-16LE string. The conversion occurs in-place: the
** output overwrites the input. N bytes are converted. If N is negative
** everything is converted up to the first \u0000 character.
**
** All of the work is done by utf16to16() with big_endian set to 0. That
** routine removes a leading BOM, swaps the bytes of each 16-bit unit when
** the input is not already little-endian, and stores two 0x00 bytes after
** the converted text.
*/
void sqlite3utf16to16le(void *pData, int N){
  const int bigEndian = 0;
  utf16to16(pData, N, bigEndian);
}
662 
/*
** Convert a UTF-16 string, either in native byte order or starting with
** a BOM, into a UTF-16BE string. The conversion occurs in-place: the
** output overwrites the input. N bytes are converted. If N is negative
** everything is converted up to the first \u0000 character.
**
** All of the work is done by utf16to16() with big_endian set to 1. That
** routine removes a leading BOM, swaps the bytes of each 16-bit unit when
** the input is not already big-endian, and stores two 0x00 bytes after
** the converted text.
*/
void sqlite3utf16to16be(void *pData, int N){
  const int bigEndian = 1;
  utf16to16(pData, N, bigEndian);
}
679 
680 /*
681 ** This function is used to translate between UTF-8 and UTF-16. The
682 ** result is returned in dynamically allocated memory.
683 */
684 int sqlite3utfTranslate(
685   const void *zData, int nData,  /* Input string */
686   u8 enc1,                       /* Encoding of zData */
687   void **zOut, int *nOut,        /* Output string */
688   u8 enc2                        /* Desired encoding of output */
689 ){
690   assert( enc1==TEXT_Utf8 || enc1==TEXT_Utf16le || enc1==TEXT_Utf16be );
691   assert( enc2==TEXT_Utf8 || enc2==TEXT_Utf16le || enc2==TEXT_Utf16be );
692   assert(
693     (enc1==TEXT_Utf8 && (enc2==TEXT_Utf16le || enc2==TEXT_Utf16be)) ||
694     (enc2==TEXT_Utf8 && (enc1==TEXT_Utf16le || enc1==TEXT_Utf16be))
695   );
696 
697   if( enc1==TEXT_Utf8 ){
698     if( enc2==TEXT_Utf16le ){
699       *zOut = sqlite3utf8to16le(zData, nData);
700     }else{
701       *zOut = sqlite3utf8to16be(zData, nData);
702     }
703     if( !(*zOut) ) return SQLITE_NOMEM;
704     *nOut = sqlite3utf16ByteLen(*zOut, -1);
705   }else{
706     *zOut = sqlite3utf16to8(zData, nData, enc1==TEXT_Utf16be);
707     if( !(*zOut) ) return SQLITE_NOMEM;
708     *nOut = strlen(*zOut);
709   }
710   return SQLITE_OK;
711 }
712