/*
** 2004 April 13
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** This file contains routines used to translate between UTF-8,
** UTF-16, UTF-16BE, and UTF-16LE.
**
** $Id: utf.c,v 1.51 2007/05/23 16:23:09 danielk1977 Exp $
**
** Notes on UTF-8:
**
**   Byte-0    Byte-1    Byte-2    Byte-3    Value
**  0xxxxxxx                                 00000000 00000000 0xxxxxxx
**  110yyyyy  10xxxxxx                       00000000 00000yyy yyxxxxxx
**  1110zzzz  10yyyyyy  10xxxxxx             00000000 zzzzyyyy yyxxxxxx
**  11110uuu  10uuzzzz  10yyyyyy  10xxxxxx   000uuuuu zzzzyyyy yyxxxxxx
**
**
** Notes on UTF-16:  (with wwww+1==uuuuu)
**
**      Word-0               Word-1          Value
**  110110ww wwzzzzyy   110111yy yyxxxxxx    000uuuuu zzzzyyyy yyxxxxxx
**  zzzzyyyy yyxxxxxx                        00000000 zzzzyyyy yyxxxxxx
**
**
** BOM or Byte Order Mark:
**     0xff 0xfe   little-endian utf-16 follows
**     0xfe 0xff   big-endian utf-16 follows
**
*/
#include "sqliteInt.h"
#include <assert.h>
#include "vdbeInt.h"

/*
** The following constant value is used by the SQLITE_BIGENDIAN and
** SQLITE_LITTLEENDIAN macros.
*/
const int sqlite3one = 1;

/*
** This lookup table is used to help decode the first byte of
** a multi-byte UTF8 character.
51 */ 52 const unsigned char sqlite3UtfTrans1[] = { 53 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 54 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 55 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 56 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 57 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 58 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 59 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 60 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00, 61 }; 62 63 #define WRITE_UTF8(zOut, c) { \ 64 if( c<0x00080 ){ \ 65 *zOut++ = (c&0xFF); \ 66 } \ 67 else if( c<0x00800 ){ \ 68 *zOut++ = 0xC0 + ((c>>6)&0x1F); \ 69 *zOut++ = 0x80 + (c & 0x3F); \ 70 } \ 71 else if( c<0x10000 ){ \ 72 *zOut++ = 0xE0 + ((c>>12)&0x0F); \ 73 *zOut++ = 0x80 + ((c>>6) & 0x3F); \ 74 *zOut++ = 0x80 + (c & 0x3F); \ 75 }else{ \ 76 *zOut++ = 0xF0 + ((c>>18) & 0x07); \ 77 *zOut++ = 0x80 + ((c>>12) & 0x3F); \ 78 *zOut++ = 0x80 + ((c>>6) & 0x3F); \ 79 *zOut++ = 0x80 + (c & 0x3F); \ 80 } \ 81 } 82 83 #define WRITE_UTF16LE(zOut, c) { \ 84 if( c<=0xFFFF ){ \ 85 *zOut++ = (c&0x00FF); \ 86 *zOut++ = ((c>>8)&0x00FF); \ 87 }else{ \ 88 *zOut++ = (((c>>10)&0x003F) + (((c-0x10000)>>10)&0x00C0)); \ 89 *zOut++ = (0x00D8 + (((c-0x10000)>>18)&0x03)); \ 90 *zOut++ = (c&0x00FF); \ 91 *zOut++ = (0x00DC + ((c>>8)&0x03)); \ 92 } \ 93 } 94 95 #define WRITE_UTF16BE(zOut, c) { \ 96 if( c<=0xFFFF ){ \ 97 *zOut++ = ((c>>8)&0x00FF); \ 98 *zOut++ = (c&0x00FF); \ 99 }else{ \ 100 *zOut++ = (0x00D8 + (((c-0x10000)>>18)&0x03)); \ 101 *zOut++ = (((c>>10)&0x003F) + (((c-0x10000)>>10)&0x00C0)); \ 102 *zOut++ = (0x00DC + ((c>>8)&0x03)); \ 103 *zOut++ = (c&0x00FF); \ 104 } \ 105 } 106 107 #define READ_UTF16LE(zIn, c){ \ 108 c = (*zIn++); \ 109 c += ((*zIn++)<<8); \ 110 if( c>=0xD800 && c<0xE000 ){ \ 111 int c2 = (*zIn++); \ 112 c2 += ((*zIn++)<<8); \ 113 c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10); \ 114 if( (c & 0xFFFF0000)==0 ) c = 0xFFFD; \ 115 } \ 116 } 117 118 #define READ_UTF16BE(zIn, c){ \ 119 c = 
((*zIn++)<<8); \ 120 c += (*zIn++); \ 121 if( c>=0xD800 && c<0xE000 ){ \ 122 int c2 = ((*zIn++)<<8); \ 123 c2 += (*zIn++); \ 124 c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10); \ 125 if( (c & 0xFFFF0000)==0 ) c = 0xFFFD; \ 126 } \ 127 } 128 129 /* 130 ** If the TRANSLATE_TRACE macro is defined, the value of each Mem is 131 ** printed on stderr on the way into and out of sqlite3VdbeMemTranslate(). 132 */ 133 /* #define TRANSLATE_TRACE 1 */ 134 135 #ifndef SQLITE_OMIT_UTF16 136 /* 137 ** This routine transforms the internal text encoding used by pMem to 138 ** desiredEnc. It is an error if the string is already of the desired 139 ** encoding, or if *pMem does not contain a string value. 140 */ 141 int sqlite3VdbeMemTranslate(Mem *pMem, u8 desiredEnc){ 142 unsigned char zShort[NBFS]; /* Temporary short output buffer */ 143 int len; /* Maximum length of output string in bytes */ 144 unsigned char *zOut; /* Output buffer */ 145 unsigned char *zIn; /* Input iterator */ 146 unsigned char *zTerm; /* End of input */ 147 unsigned char *z; /* Output iterator */ 148 unsigned int c; 149 150 assert( pMem->flags&MEM_Str ); 151 assert( pMem->enc!=desiredEnc ); 152 assert( pMem->enc!=0 ); 153 assert( pMem->n>=0 ); 154 155 #if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG) 156 { 157 char zBuf[100]; 158 sqlite3VdbeMemPrettyPrint(pMem, zBuf); 159 fprintf(stderr, "INPUT: %s\n", zBuf); 160 } 161 #endif 162 163 /* If the translation is between UTF-16 little and big endian, then 164 ** all that is required is to swap the byte order. This case is handled 165 ** differently from the others. 
166 */ 167 if( pMem->enc!=SQLITE_UTF8 && desiredEnc!=SQLITE_UTF8 ){ 168 u8 temp; 169 int rc; 170 rc = sqlite3VdbeMemMakeWriteable(pMem); 171 if( rc!=SQLITE_OK ){ 172 assert( rc==SQLITE_NOMEM ); 173 return SQLITE_NOMEM; 174 } 175 zIn = (u8*)pMem->z; 176 zTerm = &zIn[pMem->n]; 177 while( zIn<zTerm ){ 178 temp = *zIn; 179 *zIn = *(zIn+1); 180 zIn++; 181 *zIn++ = temp; 182 } 183 pMem->enc = desiredEnc; 184 goto translate_out; 185 } 186 187 /* Set len to the maximum number of bytes required in the output buffer. */ 188 if( desiredEnc==SQLITE_UTF8 ){ 189 /* When converting from UTF-16, the maximum growth results from 190 ** translating a 2-byte character to a 4-byte UTF-8 character. 191 ** A single byte is required for the output string 192 ** nul-terminator. 193 */ 194 len = pMem->n * 2 + 1; 195 }else{ 196 /* When converting from UTF-8 to UTF-16 the maximum growth is caused 197 ** when a 1-byte UTF-8 character is translated into a 2-byte UTF-16 198 ** character. Two bytes are required in the output buffer for the 199 ** nul-terminator. 200 */ 201 len = pMem->n * 2 + 2; 202 } 203 204 /* Set zIn to point at the start of the input buffer and zTerm to point 1 205 ** byte past the end. 206 ** 207 ** Variable zOut is set to point at the output buffer. This may be space 208 ** obtained from malloc(), or Mem.zShort, if it large enough and not in 209 ** use, or the zShort array on the stack (see above). 210 */ 211 zIn = (u8*)pMem->z; 212 zTerm = &zIn[pMem->n]; 213 if( len>NBFS ){ 214 zOut = sqliteMallocRaw(len); 215 if( !zOut ) return SQLITE_NOMEM; 216 }else{ 217 zOut = zShort; 218 } 219 z = zOut; 220 221 if( pMem->enc==SQLITE_UTF8 ){ 222 unsigned int iExtra = 0xD800; 223 224 if( 0==(pMem->flags&MEM_Term) && zTerm>zIn && (zTerm[-1]&0x80) ){ 225 /* This UTF8 string is not nul-terminated, and the last byte is 226 ** not a character in the ascii range (codpoints 0..127). This 227 ** means the SQLITE_READ_UTF8() macro might read past the end 228 ** of the allocated buffer. 
229 ** 230 ** There are four possibilities: 231 ** 232 ** 1. The last byte is the first byte of a non-ASCII character, 233 ** 234 ** 2. The final N bytes of the input string are continuation bytes 235 ** and immediately preceding them is the first byte of a 236 ** non-ASCII character. 237 ** 238 ** 3. The final N bytes of the input string are continuation bytes 239 ** and immediately preceding them is a byte that encodes a 240 ** character in the ASCII range. 241 ** 242 ** 4. The entire string consists of continuation characters. 243 ** 244 ** Cases (3) and (4) require no special handling. The SQLITE_READ_UTF8() 245 ** macro will not overread the buffer in these cases. 246 */ 247 unsigned char *zExtra = &zTerm[-1]; 248 while( zExtra>zIn && (zExtra[0]&0xC0)==0x80 ){ 249 zExtra--; 250 } 251 252 if( (zExtra[0]&0xC0)==0xC0 ){ 253 /* Make a copy of the last character encoding in the input string. 254 ** Then make sure it is nul-terminated and use SQLITE_READ_UTF8() 255 ** to decode the codepoint. Store the codepoint in variable iExtra, 256 ** it will be appended to the output string later. 
257 */ 258 unsigned char *zFree = 0; 259 unsigned char zBuf[16]; 260 int nExtra = (pMem->n+zIn-zExtra); 261 zTerm = zExtra; 262 if( nExtra>15 ){ 263 zExtra = sqliteMallocRaw(nExtra+1); 264 if( !zExtra ){ 265 return SQLITE_NOMEM; 266 } 267 zFree = zExtra; 268 }else{ 269 zExtra = zBuf; 270 } 271 memcpy(zExtra, zTerm, nExtra); 272 zExtra[nExtra] = '\0'; 273 SQLITE_READ_UTF8(zExtra, iExtra); 274 sqliteFree(zFree); 275 } 276 } 277 278 if( desiredEnc==SQLITE_UTF16LE ){ 279 /* UTF-8 -> UTF-16 Little-endian */ 280 while( zIn<zTerm ){ 281 SQLITE_READ_UTF8(zIn, c); 282 WRITE_UTF16LE(z, c); 283 } 284 if( iExtra!=0xD800 ){ 285 WRITE_UTF16LE(z, iExtra); 286 } 287 }else{ 288 assert( desiredEnc==SQLITE_UTF16BE ); 289 /* UTF-8 -> UTF-16 Big-endian */ 290 while( zIn<zTerm ){ 291 SQLITE_READ_UTF8(zIn, c); 292 WRITE_UTF16BE(z, c); 293 } 294 if( iExtra!=0xD800 ){ 295 WRITE_UTF16BE(z, iExtra); 296 } 297 } 298 pMem->n = z - zOut; 299 *z++ = 0; 300 }else{ 301 assert( desiredEnc==SQLITE_UTF8 ); 302 if( pMem->enc==SQLITE_UTF16LE ){ 303 /* UTF-16 Little-endian -> UTF-8 */ 304 while( zIn<zTerm ){ 305 READ_UTF16LE(zIn, c); 306 WRITE_UTF8(z, c); 307 } 308 }else{ 309 /* UTF-16 Little-endian -> UTF-8 */ 310 while( zIn<zTerm ){ 311 READ_UTF16BE(zIn, c); 312 WRITE_UTF8(z, c); 313 } 314 } 315 pMem->n = z - zOut; 316 } 317 *z = 0; 318 assert( (pMem->n+(desiredEnc==SQLITE_UTF8?1:2))<=len ); 319 320 sqlite3VdbeMemRelease(pMem); 321 pMem->flags &= ~(MEM_Static|MEM_Dyn|MEM_Ephem|MEM_Short); 322 pMem->enc = desiredEnc; 323 if( zOut==zShort ){ 324 memcpy(pMem->zShort, zOut, len); 325 zOut = (u8*)pMem->zShort; 326 pMem->flags |= (MEM_Term|MEM_Short); 327 }else{ 328 pMem->flags |= (MEM_Term|MEM_Dyn); 329 } 330 pMem->z = (char*)zOut; 331 332 translate_out: 333 #if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG) 334 { 335 char zBuf[100]; 336 sqlite3VdbeMemPrettyPrint(pMem, zBuf); 337 fprintf(stderr, "OUTPUT: %s\n", zBuf); 338 } 339 #endif 340 return SQLITE_OK; 341 } 342 343 /* 344 ** This routine checks for 
a byte-order mark at the beginning of the 345 ** UTF-16 string stored in *pMem. If one is present, it is removed and 346 ** the encoding of the Mem adjusted. This routine does not do any 347 ** byte-swapping, it just sets Mem.enc appropriately. 348 ** 349 ** The allocation (static, dynamic etc.) and encoding of the Mem may be 350 ** changed by this function. 351 */ 352 int sqlite3VdbeMemHandleBom(Mem *pMem){ 353 int rc = SQLITE_OK; 354 u8 bom = 0; 355 356 if( pMem->n<0 || pMem->n>1 ){ 357 u8 b1 = *(u8 *)pMem->z; 358 u8 b2 = *(((u8 *)pMem->z) + 1); 359 if( b1==0xFE && b2==0xFF ){ 360 bom = SQLITE_UTF16BE; 361 } 362 if( b1==0xFF && b2==0xFE ){ 363 bom = SQLITE_UTF16LE; 364 } 365 } 366 367 if( bom ){ 368 /* This function is called as soon as a string is stored in a Mem*, 369 ** from within sqlite3VdbeMemSetStr(). At that point it is not possible 370 ** for the string to be stored in Mem.zShort, or for it to be stored 371 ** in dynamic memory with no destructor. 372 */ 373 assert( !(pMem->flags&MEM_Short) ); 374 assert( !(pMem->flags&MEM_Dyn) || pMem->xDel ); 375 if( pMem->flags & MEM_Dyn ){ 376 void (*xDel)(void*) = pMem->xDel; 377 char *z = pMem->z; 378 pMem->z = 0; 379 pMem->xDel = 0; 380 rc = sqlite3VdbeMemSetStr(pMem, &z[2], pMem->n-2, bom, SQLITE_TRANSIENT); 381 xDel(z); 382 }else{ 383 rc = sqlite3VdbeMemSetStr(pMem, &pMem->z[2], pMem->n-2, bom, 384 SQLITE_TRANSIENT); 385 } 386 } 387 return rc; 388 } 389 #endif /* SQLITE_OMIT_UTF16 */ 390 391 /* 392 ** pZ is a UTF-8 encoded unicode string. If nByte is less than zero, 393 ** return the number of unicode characters in pZ up to (but not including) 394 ** the first 0x00 byte. If nByte is not less than zero, return the 395 ** number of unicode characters in the first nByte of pZ (or up to 396 ** the first 0x00, whichever comes first). 
397 */ 398 int sqlite3Utf8CharLen(const char *zIn, int nByte){ 399 int r = 0; 400 const u8 *z = (const u8*)zIn; 401 const u8 *zTerm; 402 if( nByte>=0 ){ 403 zTerm = &z[nByte]; 404 }else{ 405 zTerm = (const u8*)(-1); 406 } 407 assert( z<=zTerm ); 408 while( *z!=0 && z<zTerm ){ 409 SQLITE_SKIP_UTF8(z); 410 r++; 411 } 412 return r; 413 } 414 415 #ifndef SQLITE_OMIT_UTF16 416 /* 417 ** Convert a UTF-16 string in the native encoding into a UTF-8 string. 418 ** Memory to hold the UTF-8 string is obtained from malloc and must be 419 ** freed by the calling function. 420 ** 421 ** NULL is returned if there is an allocation error. 422 */ 423 char *sqlite3Utf16to8(const void *z, int nByte){ 424 Mem m; 425 memset(&m, 0, sizeof(m)); 426 sqlite3VdbeMemSetStr(&m, z, nByte, SQLITE_UTF16NATIVE, SQLITE_STATIC); 427 sqlite3VdbeChangeEncoding(&m, SQLITE_UTF8); 428 assert( (m.flags & MEM_Term)!=0 || sqlite3MallocFailed() ); 429 assert( (m.flags & MEM_Str)!=0 || sqlite3MallocFailed() ); 430 return (m.flags & MEM_Dyn)!=0 ? m.z : sqliteStrDup(m.z); 431 } 432 433 /* 434 ** pZ is a UTF-16 encoded unicode string. If nChar is less than zero, 435 ** return the number of bytes up to (but not including), the first pair 436 ** of consecutive 0x00 bytes in pZ. If nChar is not less than zero, 437 ** then return the number of bytes in the first nChar unicode characters 438 ** in pZ (or up until the first pair of 0x00 bytes, whichever comes first). 439 */ 440 int sqlite3Utf16ByteLen(const void *zIn, int nChar){ 441 unsigned int c = 1; 442 char const *z = zIn; 443 int n = 0; 444 if( SQLITE_UTF16NATIVE==SQLITE_UTF16BE ){ 445 /* Using an "if (SQLITE_UTF16NATIVE==SQLITE_UTF16BE)" construct here 446 ** and in other parts of this file means that at one branch will 447 ** not be covered by coverage testing on any single host. But coverage 448 ** will be complete if the tests are run on both a little-endian and 449 ** big-endian host. 
Because both the UTF16NATIVE and SQLITE_UTF16BE 450 ** macros are constant at compile time the compiler can determine 451 ** which branch will be followed. It is therefore assumed that no runtime 452 ** penalty is paid for this "if" statement. 453 */ 454 while( c && ((nChar<0) || n<nChar) ){ 455 READ_UTF16BE(z, c); 456 n++; 457 } 458 }else{ 459 while( c && ((nChar<0) || n<nChar) ){ 460 READ_UTF16LE(z, c); 461 n++; 462 } 463 } 464 return (z-(char const *)zIn)-((c==0)?2:0); 465 } 466 467 #if defined(SQLITE_TEST) 468 /* 469 ** Translate UTF-8 to UTF-8. 470 ** 471 ** This has the effect of making sure that the string is well-formed 472 ** UTF-8. Miscoded characters are removed. 473 ** 474 ** The translation is done in-place (since it is impossible for the 475 ** correct UTF-8 encoding to be longer than a malformed encoding). 476 */ 477 int sqlite3Utf8To8(unsigned char *zIn){ 478 unsigned char *zOut = zIn; 479 unsigned char *zStart = zIn; 480 int c; 481 482 while(1){ 483 SQLITE_READ_UTF8(zIn, c); 484 if( c==0 ) break; 485 if( c!=0xfffd ){ 486 WRITE_UTF8(zOut, c); 487 } 488 } 489 *zOut = 0; 490 return zOut - zStart; 491 } 492 #endif 493 494 #if defined(SQLITE_TEST) 495 /* 496 ** This routine is called from the TCL test function "translate_selftest". 497 ** It checks that the primitives for serializing and deserializing 498 ** characters in each encoding are inverses of each other. 
499 */ 500 void sqlite3UtfSelfTest(){ 501 unsigned int i, t; 502 unsigned char zBuf[20]; 503 unsigned char *z; 504 int n; 505 unsigned int c; 506 507 for(i=0; i<0x00110000; i++){ 508 z = zBuf; 509 WRITE_UTF8(z, i); 510 n = z-zBuf; 511 z[0] = 0; 512 z = zBuf; 513 SQLITE_READ_UTF8(z, c); 514 t = i; 515 if( i>=0xD800 && i<=0xDFFF ) t = 0xFFFD; 516 if( (i&0xFFFFFFFE)==0xFFFE ) t = 0xFFFD; 517 assert( c==t ); 518 assert( (z-zBuf)==n ); 519 } 520 for(i=0; i<0x00110000; i++){ 521 if( i>=0xD800 && i<0xE000 ) continue; 522 z = zBuf; 523 WRITE_UTF16LE(z, i); 524 n = z-zBuf; 525 z[0] = 0; 526 z = zBuf; 527 READ_UTF16LE(z, c); 528 assert( c==i ); 529 assert( (z-zBuf)==n ); 530 } 531 for(i=0; i<0x00110000; i++){ 532 if( i>=0xD800 && i<0xE000 ) continue; 533 z = zBuf; 534 WRITE_UTF16BE(z, i); 535 n = z-zBuf; 536 z[0] = 0; 537 z = zBuf; 538 READ_UTF16BE(z, c); 539 assert( c==i ); 540 assert( (z-zBuf)==n ); 541 } 542 } 543 #endif /* SQLITE_TEST */ 544 #endif /* SQLITE_OMIT_UTF16 */ 545