1 /* 2 ** 2004 April 13 3 ** 4 ** The author disclaims copyright to this source code. In place of 5 ** a legal notice, here is a blessing: 6 ** 7 ** May you do good and not evil. 8 ** May you find forgiveness for yourself and forgive others. 9 ** May you share freely, never taking more than you give. 10 ** 11 ************************************************************************* 12 ** This file contains routines used to translate between UTF-8, 13 ** UTF-16, UTF-16BE, and UTF-16LE. 14 ** 15 ** $Id: utf.c,v 1.18 2004/06/06 12:41:50 danielk1977 Exp $ 16 ** 17 ** Notes on UTF-8: 18 ** 19 ** Byte-0 Byte-1 Byte-2 Byte-3 Value 20 ** 0xxxxxxx 00000000 00000000 0xxxxxxx 21 ** 110yyyyy 10xxxxxx 00000000 00000yyy yyxxxxxx 22 ** 1110zzzz 10yyyyyy 10xxxxxx 00000000 zzzzyyyy yyxxxxxx 23 ** 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx 000uuuuu zzzzyyyy yyxxxxxx 24 ** 25 ** 26 ** Notes on UTF-16: (with wwww+1==uuuuu) 27 ** 28 ** Word-0 Word-1 Value 29 ** 110110ww wwzzzzyy 110111yy yyxxxxxx 000uuuuu zzzzyyyy yyxxxxxx 30 ** zzzzyyyy yyxxxxxx 00000000 zzzzyyyy yyxxxxxx 31 ** 32 ** 33 ** BOM or Byte Order Mark: 34 ** 0xff 0xfe little-endian utf-16 follows 35 ** 0xfe 0xff big-endian utf-16 follows 36 ** 37 ** 38 ** Handling of malformed strings: 39 ** 40 ** SQLite accepts and processes malformed strings without an error wherever 41 ** possible. However this is not possible when converting between UTF-8 and 42 ** UTF-16. 43 ** 44 ** When converting malformed UTF-8 strings to UTF-16, one instance of the 45 ** replacement character U+FFFD for each byte that cannot be interpeted as 46 ** part of a valid unicode character. 47 ** 48 ** When converting malformed UTF-16 strings to UTF-8, one instance of the 49 ** replacement character U+FFFD for each pair of bytes that cannot be 50 ** interpeted as part of a valid unicode character. 51 */ 52 #include <assert.h> 53 #include "sqliteInt.h" 54 55 typedef struct UtfString UtfString; 56 struct UtfString { 57 unsigned char *pZ; /* Raw string data */ 58 int n; /* Allocated length of pZ in bytes */ 59 int c; /* Number of pZ bytes already read or written */ 60 }; 61 62 /* 63 ** These two macros are used to interpret the first two bytes of the 64 ** unsigned char array pZ as a 16-bit unsigned int. BE16() for a big-endian 65 ** interpretation, LE16() for little-endian. 66 */ 67 #define BE16(pZ) (((u16)((pZ)[0])<<8) + (u16)((pZ)[1])) 68 #define LE16(pZ) (((u16)((pZ)[1])<<8) + (u16)((pZ)[0])) 69 70 /* 71 ** READ_16 interprets the first two bytes of the unsigned char array pZ 72 ** as a 16-bit unsigned int. If big_endian is non-zero the intepretation 73 ** is big-endian, otherwise little-endian. 74 */ 75 #define READ_16(pZ,big_endian) (big_endian?BE16(pZ):LE16(pZ)) 76 77 /* 78 ** The following macro, LOWERCASE(x), takes an integer representing a 79 ** unicode code point. The value returned is the same code point folded to 80 ** lower case, if applicable. SQLite currently understands the upper/lower 81 ** case relationship between the 26 characters used in the English 82 ** language only. 83 ** 84 ** This means that characters with umlauts etc. will not be folded 85 ** correctly (unless they are encoded as composite characters, which would 86 ** doubtless cause much trouble). 87 */ 88 #define LOWERCASE(x) (x<91?(int)(UpperToLower[x]):x); 89 static unsigned char UpperToLower[91] = { 90 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 91 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 92 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 93 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 97, 98, 99,100,101,102,103, 94 104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121, 95 122, 96 }; 97 98 /* 99 ** The first parameter, zStr, points at a unicode string. This routine 100 ** reads a single character from the string and returns the codepoint value 101 ** of the character read. 102 ** 103 ** The value of *pEnc is the string encoding. If *pEnc is TEXT_Utf16le or 104 ** TEXT_Utf16be, and the first character read is a byte-order-mark, then 105 ** the value of *pEnc is modified if necessary. In this case the next 106 ** character is read and it's code-point value returned. 107 ** 108 ** The value of *pOffset is the byte-offset in zStr from which to begin 109 ** reading. It is incremented by the number of bytes read by this function. 110 ** 111 ** If the fourth parameter, fold, is non-zero, then codepoint values are 112 ** folded to lower-case before being returned. See comments for macro 113 ** LOWERCASE(x) for details. 114 */ 115 int sqlite3ReadUniChar(const char *zStr, int *pOffset, u8 *pEnc, int fold){ 116 int ret = 0; 117 118 switch( *pEnc ){ 119 case TEXT_Utf8: { 120 121 #if 0 122 static const int initVal[] = { 123 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 124 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 125 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 126 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 127 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 128 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 129 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 130 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 131 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 132 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 133 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 134 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 135 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 0, 1, 2, 136 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 137 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 138 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 139 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 254, 140 255, 141 }; 142 ret = initVal[(unsigned char)zStr[(*pOffset)++]]; 143 while( (0xc0&zStr[*pOffset])==0x80 ){ 144 ret = (ret<<6) | (0x3f&(zStr[(*pOffset)++])); 145 } 146 #endif 147 148 struct Utf8TblRow { 149 u8 b1_mask; 150 u8 b1_masked_val; 151 u8 b1_value_mask; 152 int trailing_bytes; 153 }; 154 static const struct Utf8TblRow utf8tbl[] = { 155 { 0x80, 0x00, 0x7F, 0 }, 156 { 0xE0, 0xC0, 0x1F, 1 }, 157 { 0xF0, 0xE0, 0x0F, 2 }, 158 { 0xF8, 0xF0, 0x0E, 3 }, 159 { 0, 0, 0, 0} 160 }; 161 162 u8 b1; /* First byte of the potentially multi-byte utf-8 character */ 163 int ii; 164 struct Utf8TblRow const *pRow; 165 166 pRow = &(utf8tbl[0]); 167 168 b1 = zStr[(*pOffset)++]; 169 while( pRow->b1_mask && (b1&pRow->b1_mask)!=pRow->b1_masked_val ){ 170 pRow++; 171 } 172 if( !pRow->b1_mask ){ 173 return (int)0xFFFD; 174 } 175 176 ret = (u32)(b1&pRow->b1_value_mask); 177 for( ii=0; ii<pRow->trailing_bytes; ii++ ){ 178 u8 b = zStr[(*pOffset)++]; 179 if( (b&0xC0)!=0x80 ){ 180 return (int)0xFFFD; 181 } 182 ret = (ret<<6) + (u32)(b&0x3F); 183 } 184 break; 185 } 186 187 case TEXT_Utf16le: 188 case TEXT_Utf16be: { 189 u32 code_point; /* the first code-point in the character */ 190 u32 code_point2; /* the second code-point in the character, if any */ 191 192 code_point = READ_16(&zStr[*pOffset], (*pEnc==TEXT_Utf16be)); 193 *pOffset += 2; 194 195 /* If this is a non-surrogate code-point, just cast it to an int and 196 ** this is the code-point value. 197 */ 198 if( code_point<0xD800 || code_point>0xE000 ){ 199 ret = code_point; 200 break; 201 } 202 203 /* If this is a trailing surrogate code-point, then the string is 204 ** malformed; return the replacement character. 205 */ 206 if( code_point>0xDBFF ){ 207 return (int)0xFFFD; 208 } 209 210 /* The code-point just read is a leading surrogate code-point. If their 211 ** is not enough data left or the next code-point is not a trailing 212 ** surrogate, return the replacement character. 213 */ 214 code_point2 = READ_16(&zStr[*pOffset], (*pEnc==TEXT_Utf16be)); 215 *pOffset += 2; 216 if( code_point2<0xDC00 || code_point>0xDFFF ){ 217 return (int)0xFFFD; 218 } 219 220 ret = ( 221 (((code_point&0x03C0)+0x0040)<<16) + /* uuuuu */ 222 ((code_point&0x003F)<<10) + /* xxxxxx */ 223 (code_point2&0x03FF) /* yy yyyyyyyy */ 224 ); 225 } 226 default: 227 assert(0); 228 } 229 230 if( fold ){ 231 return LOWERCASE(ret); 232 } 233 return ret; 234 } 235 236 /* 237 ** Read the BOM from the start of *pStr, if one is present. Return zero 238 ** for little-endian, non-zero for big-endian. If no BOM is present, return 239 ** the value of the parameter "big_endian". 240 ** 241 ** Return values: 242 ** 1 -> big-endian string 243 ** 0 -> little-endian string 244 */ 245 static int readUtf16Bom(UtfString *pStr, int big_endian){ 246 /* The BOM must be the first thing read from the string */ 247 assert( pStr->c==0 ); 248 249 /* If the string data consists of 1 byte or less, the BOM will make no 250 ** difference anyway. In this case just fall through to the default case 251 ** and return the native byte-order for this machine. 252 ** 253 ** Otherwise, check the first 2 bytes of the string to see if a BOM is 254 ** present. 255 */ 256 if( pStr->n>1 ){ 257 u8 bom = sqlite3UtfReadBom(pStr->pZ, 2); 258 if( bom ){ 259 pStr->c += 2; 260 return (bom==TEXT_Utf16le)?0:1; 261 } 262 } 263 264 return big_endian; 265 } 266 267 /* 268 ** zData is a UTF-16 encoded string, nData bytes in length. This routine 269 ** checks if there is a byte-order mark at the start of zData. If no 270 ** byte order mark is found 0 is returned. Otherwise TEXT_Utf16be or 271 ** TEXT_Utf16le is returned, depending on whether The BOM indicates that 272 ** the text is big-endian or little-endian. 273 */ 274 u8 sqlite3UtfReadBom(const void *zData, int nData){ 275 if( nData<0 || nData>1 ){ 276 u8 b1 = *(u8 *)zData; 277 u8 b2 = *(((u8 *)zData) + 1); 278 if( b1==0xFE && b2==0xFF ){ 279 return TEXT_Utf16be; 280 } 281 if( b1==0xFF && b2==0xFE ){ 282 return TEXT_Utf16le; 283 } 284 } 285 return 0; 286 } 287 288 289 /* 290 ** Read a single unicode character from the UTF-8 encoded string *pStr. The 291 ** value returned is a unicode scalar value. In the case of malformed 292 ** strings, the unicode replacement character U+FFFD may be returned. 293 */ 294 static u32 readUtf8(UtfString *pStr){ 295 u8 enc = TEXT_Utf8; 296 return sqlite3ReadUniChar(pStr->pZ, &pStr->c, &enc, 0); 297 } 298 299 /* 300 ** Write the unicode character 'code' to the string pStr using UTF-8 301 ** encoding. SQLITE_NOMEM may be returned if sqlite3Malloc() fails. 302 */ 303 static int writeUtf8(UtfString *pStr, u32 code){ 304 struct Utf8WriteTblRow { 305 u32 max_code; 306 int trailing_bytes; 307 u8 b1_and_mask; 308 u8 b1_or_mask; 309 }; 310 static const struct Utf8WriteTblRow utf8tbl[] = { 311 {0x0000007F, 0, 0x7F, 0x00}, 312 {0x000007FF, 1, 0xDF, 0xC0}, 313 {0x0000FFFF, 2, 0xEF, 0xE0}, 314 {0x0010FFFF, 3, 0xF7, 0xF0}, 315 {0x00000000, 0, 0x00, 0x00} 316 }; 317 const struct Utf8WriteTblRow *pRow = &utf8tbl[0]; 318 319 while( code>pRow->max_code ){ 320 assert( pRow->max_code ); 321 pRow++; 322 } 323 324 /* Ensure there is enough room left in the output buffer to write 325 ** this UTF-8 character. 326 */ 327 assert( (pStr->n-pStr->c)>=(pRow->trailing_bytes+1) ); 328 329 /* Write the UTF-8 encoded character to pStr. All cases below are 330 ** intentionally fall-through. 331 */ 332 switch( pRow->trailing_bytes ){ 333 case 3: 334 pStr->pZ[pStr->c+3] = (((u8)code)&0x3F)|0x80; 335 code = code>>6; 336 case 2: 337 pStr->pZ[pStr->c+2] = (((u8)code)&0x3F)|0x80; 338 code = code>>6; 339 case 1: 340 pStr->pZ[pStr->c+1] = (((u8)code)&0x3F)|0x80; 341 code = code>>6; 342 case 0: 343 pStr->pZ[pStr->c] = (((u8)code)&(pRow->b1_and_mask))|(pRow->b1_or_mask); 344 } 345 pStr->c += (pRow->trailing_bytes + 1); 346 347 return 0; 348 } 349 350 /* 351 ** Read a single unicode character from the UTF-16 encoded string *pStr. The 352 ** value returned is a unicode scalar value. In the case of malformed 353 ** strings, the unicode replacement character U+FFFD may be returned. 354 ** 355 ** If big_endian is true, the string is assumed to be UTF-16BE encoded. 356 ** Otherwise, it is UTF-16LE encoded. 357 */ 358 static u32 readUtf16(UtfString *pStr, int big_endian){ 359 u32 code_point; /* the first code-point in the character */ 360 361 /* If there is only one byte of data left in the string, return the 362 ** replacement character. 363 */ 364 if( (pStr->n-pStr->c)==1 ){ 365 pStr->c++; 366 return (int)0xFFFD; 367 } 368 369 code_point = READ_16(&(pStr->pZ[pStr->c]), big_endian); 370 pStr->c += 2; 371 372 /* If this is a non-surrogate code-point, just cast it to an int and 373 ** return the code-point value. 374 */ 375 if( code_point<0xD800 || code_point>0xE000 ){ 376 return code_point; 377 } 378 379 /* If this is a trailing surrogate code-point, then the string is 380 ** malformed; return the replacement character. 381 */ 382 if( code_point>0xDBFF ){ 383 return 0xFFFD; 384 } 385 386 /* The code-point just read is a leading surrogate code-point. If their 387 ** is not enough data left or the next code-point is not a trailing 388 ** surrogate, return the replacement character. 389 */ 390 if( (pStr->n-pStr->c)>1 ){ 391 u32 code_point2 = READ_16(&pStr->pZ[pStr->c], big_endian); 392 if( code_point2<0xDC00 || code_point>0xDFFF ){ 393 return 0xFFFD; 394 } 395 pStr->c += 2; 396 397 return ( 398 (((code_point&0x03C0)+0x0040)<<16) + /* uuuuu */ 399 ((code_point&0x003F)<<10) + /* xxxxxx */ 400 (code_point2&0x03FF) /* yy yyyyyyyy */ 401 ); 402 403 }else{ 404 return (int)0xFFFD; 405 } 406 407 /* not reached */ 408 } 409 410 static int writeUtf16(UtfString *pStr, int code, int big_endian){ 411 int bytes; 412 unsigned char *hi_byte; 413 unsigned char *lo_byte; 414 415 bytes = (code>0x0000FFFF?4:2); 416 417 /* Ensure there is enough room left in the output buffer to write 418 ** this UTF-8 character. 419 */ 420 assert( (pStr->n-pStr->c)>=bytes ); 421 422 /* Initialise hi_byte and lo_byte to point at the locations into which 423 ** the MSB and LSB of the (first) 16-bit unicode code-point written for 424 ** this character. 425 */ 426 hi_byte = (big_endian?&pStr->pZ[pStr->c]:&pStr->pZ[pStr->c+1]); 427 lo_byte = (big_endian?&pStr->pZ[pStr->c+1]:&pStr->pZ[pStr->c]); 428 429 if( bytes==2 ){ 430 *hi_byte = (u8)((code&0x0000FF00)>>8); 431 *lo_byte = (u8)(code&0x000000FF); 432 }else{ 433 u32 wrd; 434 wrd = ((((code&0x001F0000)-0x00010000)+(code&0x0000FC00))>>10)|0x0000D800; 435 *hi_byte = (u8)((wrd&0x0000FF00)>>8); 436 *lo_byte = (u8)(wrd&0x000000FF); 437 438 wrd = (code&0x000003FF)|0x0000DC00; 439 *(hi_byte+2) = (u8)((wrd&0x0000FF00)>>8); 440 *(lo_byte+2) = (u8)(wrd&0x000000FF); 441 } 442 443 pStr->c += bytes; 444 445 return 0; 446 } 447 448 /* 449 ** pZ is a UTF-8 encoded unicode string. If nByte is less than zero, 450 ** return the number of unicode characters in pZ up to (but not including) 451 ** the first 0x00 byte. If nByte is not less than zero, return the 452 ** number of unicode characters in the first nByte of pZ (or up to 453 ** the first 0x00, whichever comes first). 454 */ 455 int sqlite3utf8CharLen(const char *pZ, int nByte){ 456 UtfString str; 457 int ret = 0; 458 u32 code = 1; 459 460 str.pZ = (char *)pZ; 461 str.n = nByte; 462 str.c = 0; 463 464 while( (nByte<0 || str.c<str.n) && code!=0 ){ 465 code = readUtf8(&str); 466 ret++; 467 } 468 if( code==0 ) ret--; 469 470 return ret; 471 } 472 473 /* 474 ** pZ is a UTF-16 encoded unicode string. If nChar is less than zero, 475 ** return the number of bytes up to (but not including), the first pair 476 ** of consecutive 0x00 bytes in pZ. If nChar is not less than zero, 477 ** then return the number of bytes in the first nChar unicode characters 478 ** in pZ (or up until the first pair of 0x00 bytes, whichever comes first). 479 */ 480 int sqlite3utf16ByteLen(const void *pZ, int nChar){ 481 if( nChar<0 ){ 482 const unsigned char *pC1 = (unsigned char *)pZ; 483 const unsigned char *pC2 = (unsigned char *)pZ+1; 484 while( *pC1 || *pC2 ){ 485 pC1 += 2; 486 pC2 += 2; 487 } 488 return pC1-(unsigned char *)pZ; 489 }else{ 490 UtfString str; 491 u32 code = 1; 492 int big_endian; 493 int nRead = 0; 494 int ret; 495 496 str.pZ = (char *)pZ; 497 str.c = 0; 498 str.n = -1; 499 500 /* Check for a BOM. We just ignore it if there is one, it's only read 501 ** so that it is not counted as a character. 502 */ 503 big_endian = readUtf16Bom(&str, 0); 504 ret = 0-str.c; 505 506 while( code!=0 && nRead<nChar ){ 507 code = readUtf16(&str, big_endian); 508 nRead++; 509 } 510 if( code==0 ){ 511 ret -= 2; 512 } 513 return str.c + ret; 514 } 515 } 516 517 /* 518 ** Convert a string in UTF-16 native byte (or with a Byte-order-mark or 519 ** "BOM") into a UTF-8 string. The UTF-8 string is written into space 520 ** obtained from sqlite3Malloc() and must be released by the calling function. 521 ** 522 ** The parameter N is the number of bytes in the UTF-16 string. If N is 523 ** negative, the entire string up to the first \u0000 character is translated. 524 ** 525 ** The returned UTF-8 string is always \000 terminated. 526 */ 527 unsigned char *sqlite3utf16to8(const void *pData, int N, int big_endian){ 528 UtfString in; 529 UtfString out; 530 531 out.pZ = 0; 532 533 in.pZ = (unsigned char *)pData; 534 in.n = N; 535 in.c = 0; 536 537 if( in.n<0 ){ 538 in.n = sqlite3utf16ByteLen(in.pZ, -1); 539 } 540 541 /* A UTF-8 encoding of a unicode string can require at most 1.5 times as 542 ** much space to store as the same string encoded using UTF-16. Allocate 543 ** this now. 544 */ 545 out.n = (in.n*1.5) + 1; 546 out.pZ = sqliteMalloc(out.n); 547 if( !out.pZ ){ 548 return 0; 549 } 550 out.c = 0; 551 552 big_endian = readUtf16Bom(&in, big_endian); 553 while( in.c<in.n ){ 554 writeUtf8(&out, readUtf16(&in, big_endian)); 555 } 556 557 /* Add the NULL-terminator character */ 558 assert( out.c<out.n ); 559 out.pZ[out.c] = 0x00; 560 561 return out.pZ; 562 } 563 564 static void *utf8toUtf16(const unsigned char *pIn, int N, int big_endian){ 565 UtfString in; 566 UtfString out; 567 568 in.pZ = (unsigned char *)pIn; 569 in.n = N; 570 in.c = 0; 571 572 if( in.n<0 ){ 573 in.n = strlen(in.pZ); 574 } 575 576 /* A UTF-16 encoding of a unicode string can require at most twice as 577 ** much space to store as the same string encoded using UTF-8. Allocate 578 ** this now. 579 */ 580 out.n = (in.n*2) + 2; 581 out.pZ = sqliteMalloc(out.n); 582 if( !out.pZ ){ 583 return 0; 584 } 585 out.c = 0; 586 587 while( in.c<in.n ){ 588 writeUtf16(&out, readUtf8(&in), big_endian); 589 } 590 591 /* Add the NULL-terminator character */ 592 assert( (out.c+1)<out.n ); 593 out.pZ[out.c] = 0x00; 594 out.pZ[out.c+1] = 0x00; 595 596 return out.pZ; 597 } 598 599 /* 600 ** Translate UTF-8 to UTF-16BE or UTF-16LE 601 */ 602 void *sqlite3utf8to16be(const unsigned char *pIn, int N){ 603 return utf8toUtf16(pIn, N, 1); 604 } 605 606 void *sqlite3utf8to16le(const unsigned char *pIn, int N){ 607 return utf8toUtf16(pIn, N, 0); 608 } 609 610 /* 611 ** This routine does the work for sqlite3utf16to16le() and 612 ** sqlite3utf16to16be(). If big_endian is 1 the input string is 613 ** transformed in place to UTF-16BE encoding. If big_endian is 0 then 614 ** the input is transformed to UTF-16LE. 615 ** 616 ** Unless the first two bytes of the input string is a BOM, the input is 617 ** assumed to be UTF-16 encoded using the machines native byte ordering. 618 */ 619 static void utf16to16(void *pData, int N, int big_endian){ 620 UtfString inout; 621 inout.pZ = (unsigned char *)pData; 622 inout.c = 0; 623 inout.n = N; 624 625 if( inout.n<0 ){ 626 inout.n = sqlite3utf16ByteLen(inout.pZ, -1); 627 } 628 629 if( readUtf16Bom(&inout, SQLITE_BIGENDIAN)!=big_endian ){ 630 /* swab(&inout.pZ[inout.c], inout.pZ, inout.n-inout.c); */ 631 int i; 632 for(i=0; i<(inout.n-inout.c); i += 2){ 633 char c1 = inout.pZ[i+inout.c]; 634 char c2 = inout.pZ[i+inout.c+1]; 635 inout.pZ[i] = c2; 636 inout.pZ[i+1] = c1; 637 } 638 }else if( inout.c ){ 639 memmove(inout.pZ, &inout.pZ[inout.c], inout.n-inout.c); 640 } 641 642 inout.pZ[inout.n-inout.c] = 0x00; 643 inout.pZ[inout.n-inout.c+1] = 0x00; 644 } 645 646 /* 647 ** Convert a string in UTF-16 native byte or with a BOM into a UTF-16LE 648 ** string. The conversion occurs in-place. The output overwrites the 649 ** input. N bytes are converted. If N is negative everything is converted 650 ** up to the first \u0000 character. 651 ** 652 ** If the native byte order is little-endian and there is no BOM, then 653 ** this routine is a no-op. If there is a BOM at the start of the string, 654 ** it is removed. 655 ** 656 ** Translation from UTF-16LE to UTF-16BE and back again is accomplished 657 ** using the library function swab(). 658 */ 659 void sqlite3utf16to16le(void *pData, int N){ 660 utf16to16(pData, N, 0); 661 } 662 663 /* 664 ** Convert a string in UTF-16 native byte or with a BOM into a UTF-16BE 665 ** string. The conversion occurs in-place. The output overwrites the 666 ** input. N bytes are converted. If N is negative everything is converted 667 ** up to the first \u0000 character. 668 ** 669 ** If the native byte order is little-endian and there is no BOM, then 670 ** this routine is a no-op. If there is a BOM at the start of the string, 671 ** it is removed. 672 ** 673 ** Translation from UTF-16LE to UTF-16BE and back again is accomplished 674 ** using the library function swab(). 675 */ 676 void sqlite3utf16to16be(void *pData, int N){ 677 utf16to16(pData, N, 1); 678 } 679 680 /* 681 ** This function is used to translate between UTF-8 and UTF-16. The 682 ** result is returned in dynamically allocated memory. 683 */ 684 int sqlite3utfTranslate( 685 const void *zData, int nData, /* Input string */ 686 u8 enc1, /* Encoding of zData */ 687 void **zOut, int *nOut, /* Output string */ 688 u8 enc2 /* Desired encoding of output */ 689 ){ 690 assert( enc1==TEXT_Utf8 || enc1==TEXT_Utf16le || enc1==TEXT_Utf16be ); 691 assert( enc2==TEXT_Utf8 || enc2==TEXT_Utf16le || enc2==TEXT_Utf16be ); 692 assert( 693 (enc1==TEXT_Utf8 && (enc2==TEXT_Utf16le || enc2==TEXT_Utf16be)) || 694 (enc2==TEXT_Utf8 && (enc1==TEXT_Utf16le || enc1==TEXT_Utf16be)) 695 ); 696 697 if( enc1==TEXT_Utf8 ){ 698 if( enc2==TEXT_Utf16le ){ 699 *zOut = sqlite3utf8to16le(zData, nData); 700 }else{ 701 *zOut = sqlite3utf8to16be(zData, nData); 702 } 703 if( !(*zOut) ) return SQLITE_NOMEM; 704 *nOut = sqlite3utf16ByteLen(*zOut, -1); 705 }else{ 706 *zOut = sqlite3utf16to8(zData, nData, enc1==TEXT_Utf16be); 707 if( !(*zOut) ) return SQLITE_NOMEM; 708 *nOut = strlen(*zOut); 709 } 710 return SQLITE_OK; 711 } 712