/*
** 2004 April 13
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** This file contains routines used to translate between UTF-8,
** UTF-16, UTF-16BE, and UTF-16LE.
**
** $Id: utf.c,v 1.32 2005/01/28 01:29:08 drh Exp $
**
** Notes on UTF-8:
**
**   Byte-0    Byte-1    Byte-2    Byte-3    Value
**  0xxxxxxx                                 00000000 00000000 0xxxxxxx
**  110yyyyy  10xxxxxx                       00000000 00000yyy yyxxxxxx
**  1110zzzz  10yyyyyy  10xxxxxx             00000000 zzzzyyyy yyxxxxxx
**  11110uuu  10uuzzzz  10yyyyyy  10xxxxxx   000uuuuu zzzzyyyy yyxxxxxx
**
**
** Notes on UTF-16:  (with wwww+1==uuuuu)
**
**      Word-0               Word-1          Value
**  110110ww wwzzzzyy   110111yy yyxxxxxx    000uuuuu zzzzyyyy yyxxxxxx
**  zzzzyyyy yyxxxxxx                        00000000 zzzzyyyy yyxxxxxx
**
**
** BOM or Byte Order Mark:
**     0xff 0xfe   little-endian utf-16 follows
**     0xfe 0xff   big-endian utf-16 follows
**
**
** Handling of malformed strings:
**
** SQLite accepts and processes malformed strings without an error wherever
** possible. However this is not possible when converting between UTF-8 and
** UTF-16.
**
** When converting malformed UTF-8 strings to UTF-16, one instance of the
** replacement character U+FFFD is emitted for each byte that cannot be
** interpreted as part of a valid unicode character.
**
** When converting malformed UTF-16 strings to UTF-8, one instance of the
** replacement character U+FFFD is emitted for each pair of bytes that
** cannot be interpreted as part of a valid unicode character.
**
** This file contains the following public routines:
**
** sqlite3VdbeMemTranslate() - Translate the encoding used by a Mem* string.
55 ** sqlite3VdbeMemHandleBom() - Handle byte-order-marks in UTF16 Mem* strings. 56 ** sqlite3utf16ByteLen() - Calculate byte-length of a void* UTF16 string. 57 ** sqlite3utf8CharLen() - Calculate char-length of a char* UTF8 string. 58 ** sqlite3utf8LikeCompare() - Do a LIKE match given two UTF8 char* strings. 59 ** 60 */ 61 #include "sqliteInt.h" 62 #include <assert.h> 63 #include "vdbeInt.h" 64 65 /* 66 ** This table maps from the first byte of a UTF-8 character to the number 67 ** of trailing bytes expected. A value '255' indicates that the table key 68 ** is not a legal first byte for a UTF-8 character. 69 */ 70 static const u8 xtra_utf8_bytes[256] = { 71 /* 0xxxxxxx */ 72 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 73 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 74 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 75 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 76 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 77 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 78 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 79 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 80 81 /* 10wwwwww */ 82 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 83 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 84 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 85 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 86 87 /* 110yyyyy */ 88 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 89 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 90 91 /* 1110zzzz */ 92 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 93 94 /* 11110yyy */ 95 3, 3, 3, 3, 3, 3, 3, 3, 255, 255, 255, 255, 255, 255, 255, 255, 96 }; 97 98 /* 99 ** This table maps from the number of trailing bytes in a UTF-8 character 100 ** to an integer constant that is effectively calculated for each character 101 ** read by a naive implementation of a UTF-8 character reader. 
The code 102 ** in the READ_UTF8 macro explains things best. 103 */ 104 static const int xtra_utf8_bits[4] = { 105 0, 106 12416, /* (0xC0 << 6) + (0x80) */ 107 925824, /* (0xE0 << 12) + (0x80 << 6) + (0x80) */ 108 63447168 /* (0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */ 109 }; 110 111 #define READ_UTF8(zIn, c) { \ 112 int xtra; \ 113 c = *(zIn)++; \ 114 xtra = xtra_utf8_bytes[c]; \ 115 switch( xtra ){ \ 116 case 255: c = (int)0xFFFD; break; \ 117 case 3: c = (c<<6) + *(zIn)++; \ 118 case 2: c = (c<<6) + *(zIn)++; \ 119 case 1: c = (c<<6) + *(zIn)++; \ 120 c -= xtra_utf8_bits[xtra]; \ 121 } \ 122 } 123 int sqlite3ReadUtf8(const unsigned char *z){ 124 int c; 125 READ_UTF8(z, c); 126 return c; 127 } 128 129 #define SKIP_UTF8(zIn) { \ 130 zIn += (xtra_utf8_bytes[*(u8 *)zIn] + 1); \ 131 } 132 133 #define WRITE_UTF8(zOut, c) { \ 134 if( c<0x00080 ){ \ 135 *zOut++ = (c&0xFF); \ 136 } \ 137 else if( c<0x00800 ){ \ 138 *zOut++ = 0xC0 + ((c>>6)&0x1F); \ 139 *zOut++ = 0x80 + (c & 0x3F); \ 140 } \ 141 else if( c<0x10000 ){ \ 142 *zOut++ = 0xE0 + ((c>>12)&0x0F); \ 143 *zOut++ = 0x80 + ((c>>6) & 0x3F); \ 144 *zOut++ = 0x80 + (c & 0x3F); \ 145 }else{ \ 146 *zOut++ = 0xF0 + ((c>>18) & 0x07); \ 147 *zOut++ = 0x80 + ((c>>12) & 0x3F); \ 148 *zOut++ = 0x80 + ((c>>6) & 0x3F); \ 149 *zOut++ = 0x80 + (c & 0x3F); \ 150 } \ 151 } 152 153 #define WRITE_UTF16LE(zOut, c) { \ 154 if( c<=0xFFFF ){ \ 155 *zOut++ = (c&0x00FF); \ 156 *zOut++ = ((c>>8)&0x00FF); \ 157 }else{ \ 158 *zOut++ = (((c>>10)&0x003F) + (((c-0x10000)>>10)&0x00C0)); \ 159 *zOut++ = (0x00D8 + (((c-0x10000)>>18)&0x03)); \ 160 *zOut++ = (c&0x00FF); \ 161 *zOut++ = (0x00DC + ((c>>8)&0x03)); \ 162 } \ 163 } 164 165 #define WRITE_UTF16BE(zOut, c) { \ 166 if( c<=0xFFFF ){ \ 167 *zOut++ = ((c>>8)&0x00FF); \ 168 *zOut++ = (c&0x00FF); \ 169 }else{ \ 170 *zOut++ = (0x00D8 + (((c-0x10000)>>18)&0x03)); \ 171 *zOut++ = (((c>>10)&0x003F) + (((c-0x10000)>>10)&0x00C0)); \ 172 *zOut++ = (0x00DC + ((c>>8)&0x03)); \ 173 *zOut++ = 
(c&0x00FF); \ 174 } \ 175 } 176 177 #define READ_UTF16LE(zIn, c){ \ 178 c = (*zIn++); \ 179 c += ((*zIn++)<<8); \ 180 if( c>=0xD800 && c<=0xE000 ){ \ 181 int c2 = (*zIn++); \ 182 c2 += ((*zIn++)<<8); \ 183 c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10); \ 184 } \ 185 } 186 187 #define READ_UTF16BE(zIn, c){ \ 188 c = ((*zIn++)<<8); \ 189 c += (*zIn++); \ 190 if( c>=0xD800 && c<=0xE000 ){ \ 191 int c2 = ((*zIn++)<<8); \ 192 c2 += (*zIn++); \ 193 c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10); \ 194 } \ 195 } 196 197 #define SKIP_UTF16BE(zIn){ \ 198 if( *zIn>=0xD8 && (*zIn<0xE0 || (*zIn==0xE0 && *(zIn+1)==0x00)) ){ \ 199 zIn += 4; \ 200 }else{ \ 201 zIn += 2; \ 202 } \ 203 } 204 #define SKIP_UTF16LE(zIn){ \ 205 zIn++; \ 206 if( *zIn>=0xD8 && (*zIn<0xE0 || (*zIn==0xE0 && *(zIn-1)==0x00)) ){ \ 207 zIn += 3; \ 208 }else{ \ 209 zIn += 1; \ 210 } \ 211 } 212 213 #define RSKIP_UTF16LE(zIn){ \ 214 if( *zIn>=0xD8 && (*zIn<0xE0 || (*zIn==0xE0 && *(zIn-1)==0x00)) ){ \ 215 zIn -= 4; \ 216 }else{ \ 217 zIn -= 2; \ 218 } \ 219 } 220 #define RSKIP_UTF16BE(zIn){ \ 221 zIn--; \ 222 if( *zIn>=0xD8 && (*zIn<0xE0 || (*zIn==0xE0 && *(zIn+1)==0x00)) ){ \ 223 zIn -= 3; \ 224 }else{ \ 225 zIn -= 1; \ 226 } \ 227 } 228 229 /* 230 ** If the TRANSLATE_TRACE macro is defined, the value of each Mem is 231 ** printed on stderr on the way into and out of sqlite3VdbeMemTranslate(). 232 */ 233 /* #define TRANSLATE_TRACE 1 */ 234 235 #ifndef SQLITE_OMIT_UTF16 236 /* 237 ** This routine transforms the internal text encoding used by pMem to 238 ** desiredEnc. It is an error if the string is already of the desired 239 ** encoding, or if *pMem does not contain a string value. 
240 */ 241 int sqlite3VdbeMemTranslate(Mem *pMem, u8 desiredEnc){ 242 unsigned char zShort[NBFS]; /* Temporary short output buffer */ 243 int len; /* Maximum length of output string in bytes */ 244 unsigned char *zOut; /* Output buffer */ 245 unsigned char *zIn; /* Input iterator */ 246 unsigned char *zTerm; /* End of input */ 247 unsigned char *z; /* Output iterator */ 248 int c; 249 250 assert( pMem->flags&MEM_Str ); 251 assert( pMem->enc!=desiredEnc ); 252 assert( pMem->enc!=0 ); 253 assert( pMem->n>=0 ); 254 255 #if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG) 256 { 257 char zBuf[100]; 258 sqlite3VdbeMemPrettyPrint(pMem, zBuf, 100); 259 fprintf(stderr, "INPUT: %s\n", zBuf); 260 } 261 #endif 262 263 /* If the translation is between UTF-16 little and big endian, then 264 ** all that is required is to swap the byte order. This case is handled 265 ** differently from the others. 266 */ 267 if( pMem->enc!=SQLITE_UTF8 && desiredEnc!=SQLITE_UTF8 ){ 268 u8 temp; 269 int rc; 270 rc = sqlite3VdbeMemMakeWriteable(pMem); 271 if( rc!=SQLITE_OK ){ 272 assert( rc==SQLITE_NOMEM ); 273 return SQLITE_NOMEM; 274 } 275 zIn = pMem->z; 276 zTerm = &zIn[pMem->n]; 277 while( zIn<zTerm ){ 278 temp = *zIn; 279 *zIn = *(zIn+1); 280 zIn++; 281 *zIn++ = temp; 282 } 283 pMem->enc = desiredEnc; 284 goto translate_out; 285 } 286 287 /* Set len to the maximum number of bytes required in the output buffer. */ 288 if( desiredEnc==SQLITE_UTF8 ){ 289 /* When converting from UTF-16, the maximum growth results from 290 ** translating a 2-byte character to a 3-byte UTF-8 character (i.e. 291 ** code-point 0xFFFC). A single byte is required for the output string 292 ** nul-terminator. 293 */ 294 len = (pMem->n/2) * 3 + 1; 295 }else{ 296 /* When converting from UTF-8 to UTF-16 the maximum growth is caused 297 ** when a 1-byte UTF-8 character is translated into a 2-byte UTF-16 298 ** character. Two bytes are required in the output buffer for the 299 ** nul-terminator. 
300 */ 301 len = pMem->n * 2 + 2; 302 } 303 304 /* Set zIn to point at the start of the input buffer and zTerm to point 1 305 ** byte past the end. 306 ** 307 ** Variable zOut is set to point at the output buffer. This may be space 308 ** obtained from malloc(), or Mem.zShort, if it large enough and not in 309 ** use, or the zShort array on the stack (see above). 310 */ 311 zIn = pMem->z; 312 zTerm = &zIn[pMem->n]; 313 if( len>NBFS ){ 314 zOut = sqliteMallocRaw(len); 315 if( !zOut ) return SQLITE_NOMEM; 316 }else{ 317 zOut = zShort; 318 } 319 z = zOut; 320 321 if( pMem->enc==SQLITE_UTF8 ){ 322 if( desiredEnc==SQLITE_UTF16LE ){ 323 /* UTF-8 -> UTF-16 Little-endian */ 324 while( zIn<zTerm ){ 325 READ_UTF8(zIn, c); 326 WRITE_UTF16LE(z, c); 327 } 328 }else{ 329 assert( desiredEnc==SQLITE_UTF16BE ); 330 /* UTF-8 -> UTF-16 Big-endian */ 331 while( zIn<zTerm ){ 332 READ_UTF8(zIn, c); 333 WRITE_UTF16BE(z, c); 334 } 335 } 336 pMem->n = z - zOut; 337 *z++ = 0; 338 }else{ 339 assert( desiredEnc==SQLITE_UTF8 ); 340 if( pMem->enc==SQLITE_UTF16LE ){ 341 /* UTF-16 Little-endian -> UTF-8 */ 342 while( zIn<zTerm ){ 343 READ_UTF16LE(zIn, c); 344 WRITE_UTF8(z, c); 345 } 346 }else{ 347 /* UTF-16 Little-endian -> UTF-8 */ 348 while( zIn<zTerm ){ 349 READ_UTF16BE(zIn, c); 350 WRITE_UTF8(z, c); 351 } 352 } 353 pMem->n = z - zOut; 354 } 355 *z = 0; 356 assert( (pMem->n+(desiredEnc==SQLITE_UTF8?1:2))<=len ); 357 358 sqlite3VdbeMemRelease(pMem); 359 pMem->flags &= ~(MEM_Static|MEM_Dyn|MEM_Ephem|MEM_Short); 360 pMem->enc = desiredEnc; 361 if( zOut==zShort ){ 362 memcpy(pMem->zShort, zOut, len); 363 zOut = pMem->zShort; 364 pMem->flags |= (MEM_Term|MEM_Short); 365 }else{ 366 pMem->flags |= (MEM_Term|MEM_Dyn); 367 } 368 pMem->z = zOut; 369 370 translate_out: 371 #if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG) 372 { 373 char zBuf[100]; 374 sqlite3VdbeMemPrettyPrint(pMem, zBuf, 100); 375 fprintf(stderr, "OUTPUT: %s\n", zBuf); 376 } 377 #endif 378 return SQLITE_OK; 379 } 380 381 /* 382 ** 
This routine checks for a byte-order mark at the beginning of the 383 ** UTF-16 string stored in *pMem. If one is present, it is removed and 384 ** the encoding of the Mem adjusted. This routine does not do any 385 ** byte-swapping, it just sets Mem.enc appropriately. 386 ** 387 ** The allocation (static, dynamic etc.) and encoding of the Mem may be 388 ** changed by this function. 389 */ 390 int sqlite3VdbeMemHandleBom(Mem *pMem){ 391 int rc = SQLITE_OK; 392 u8 bom = 0; 393 394 if( pMem->n<0 || pMem->n>1 ){ 395 u8 b1 = *(u8 *)pMem->z; 396 u8 b2 = *(((u8 *)pMem->z) + 1); 397 if( b1==0xFE && b2==0xFF ){ 398 bom = SQLITE_UTF16BE; 399 } 400 if( b1==0xFF && b2==0xFE ){ 401 bom = SQLITE_UTF16LE; 402 } 403 } 404 405 if( bom ){ 406 /* This function is called as soon as a string is stored in a Mem*, 407 ** from within sqlite3VdbeMemSetStr(). At that point it is not possible 408 ** for the string to be stored in Mem.zShort, or for it to be stored 409 ** in dynamic memory with no destructor. 410 */ 411 assert( !(pMem->flags&MEM_Short) ); 412 assert( !(pMem->flags&MEM_Dyn) || pMem->xDel ); 413 if( pMem->flags & MEM_Dyn ){ 414 void (*xDel)(void*) = pMem->xDel; 415 char *z = pMem->z; 416 pMem->z = 0; 417 pMem->xDel = 0; 418 rc = sqlite3VdbeMemSetStr(pMem, &z[2], pMem->n-2, bom, SQLITE_TRANSIENT); 419 xDel(z); 420 }else{ 421 rc = sqlite3VdbeMemSetStr(pMem, &pMem->z[2], pMem->n-2, bom, 422 SQLITE_TRANSIENT); 423 } 424 } 425 return rc; 426 } 427 #endif /* SQLITE_OMIT_UTF16 */ 428 429 /* 430 ** pZ is a UTF-8 encoded unicode string. If nByte is less than zero, 431 ** return the number of unicode characters in pZ up to (but not including) 432 ** the first 0x00 byte. If nByte is not less than zero, return the 433 ** number of unicode characters in the first nByte of pZ (or up to 434 ** the first 0x00, whichever comes first). 
435 */ 436 int sqlite3utf8CharLen(const char *z, int nByte){ 437 int r = 0; 438 const char *zTerm; 439 if( nByte>=0 ){ 440 zTerm = &z[nByte]; 441 }else{ 442 zTerm = (const char *)(-1); 443 } 444 assert( z<=zTerm ); 445 while( *z!=0 && z<zTerm ){ 446 SKIP_UTF8(z); 447 r++; 448 } 449 return r; 450 } 451 452 #ifndef SQLITE_OMIT_UTF16 453 /* 454 ** pZ is a UTF-16 encoded unicode string. If nChar is less than zero, 455 ** return the number of bytes up to (but not including), the first pair 456 ** of consecutive 0x00 bytes in pZ. If nChar is not less than zero, 457 ** then return the number of bytes in the first nChar unicode characters 458 ** in pZ (or up until the first pair of 0x00 bytes, whichever comes first). 459 */ 460 int sqlite3utf16ByteLen(const void *zIn, int nChar){ 461 int c = 1; 462 char const *z = zIn; 463 int n = 0; 464 if( SQLITE_UTF16NATIVE==SQLITE_UTF16BE ){ 465 while( c && ((nChar<0) || n<nChar) ){ 466 READ_UTF16BE(z, c); 467 n++; 468 } 469 }else{ 470 while( c && ((nChar<0) || n<nChar) ){ 471 READ_UTF16LE(z, c); 472 n++; 473 } 474 } 475 return (z-(char const *)zIn)-((c==0)?2:0); 476 } 477 478 /* 479 ** UTF-16 implementation of the substr() 480 */ 481 void sqlite3utf16Substr( 482 sqlite3_context *context, 483 int argc, 484 sqlite3_value **argv 485 ){ 486 int y, z; 487 unsigned char const *zStr; 488 unsigned char const *zStrEnd; 489 unsigned char const *zStart; 490 unsigned char const *zEnd; 491 int i; 492 493 zStr = (unsigned char const *)sqlite3_value_text16(argv[0]); 494 zStrEnd = &zStr[sqlite3_value_bytes16(argv[0])]; 495 y = sqlite3_value_int(argv[1]); 496 z = sqlite3_value_int(argv[2]); 497 498 if( y>0 ){ 499 y = y-1; 500 zStart = zStr; 501 if( SQLITE_UTF16BE==SQLITE_UTF16NATIVE ){ 502 for(i=0; i<y && zStart<zStrEnd; i++) SKIP_UTF16BE(zStart); 503 }else{ 504 for(i=0; i<y && zStart<zStrEnd; i++) SKIP_UTF16LE(zStart); 505 } 506 }else{ 507 zStart = zStrEnd; 508 if( SQLITE_UTF16BE==SQLITE_UTF16NATIVE ){ 509 for(i=y; i<0 && zStart>zStr; i++) 
RSKIP_UTF16BE(zStart); 510 }else{ 511 for(i=y; i<0 && zStart>zStr; i++) RSKIP_UTF16LE(zStart); 512 } 513 for(; i<0; i++) z -= 1; 514 } 515 516 zEnd = zStart; 517 if( SQLITE_UTF16BE==SQLITE_UTF16NATIVE ){ 518 for(i=0; i<z && zEnd<zStrEnd; i++) SKIP_UTF16BE(zEnd); 519 }else{ 520 for(i=0; i<z && zEnd<zStrEnd; i++) SKIP_UTF16LE(zEnd); 521 } 522 523 sqlite3_result_text16(context, zStart, zEnd-zStart, SQLITE_TRANSIENT); 524 } 525 526 #if defined(SQLITE_TEST) 527 /* 528 ** This routine is called from the TCL test function "translate_selftest". 529 ** It checks that the primitives for serializing and deserializing 530 ** characters in each encoding are inverses of each other. 531 */ 532 void sqlite3utfSelfTest(){ 533 int i; 534 unsigned char zBuf[20]; 535 unsigned char *z; 536 int n; 537 int c; 538 539 for(i=0; i<0x00110000; i++){ 540 z = zBuf; 541 WRITE_UTF8(z, i); 542 n = z-zBuf; 543 z = zBuf; 544 READ_UTF8(z, c); 545 assert( c==i ); 546 assert( (z-zBuf)==n ); 547 } 548 for(i=0; i<0x00110000; i++){ 549 if( i>=0xD800 && i<=0xE000 ) continue; 550 z = zBuf; 551 WRITE_UTF16LE(z, i); 552 n = z-zBuf; 553 z = zBuf; 554 READ_UTF16LE(z, c); 555 assert( c==i ); 556 assert( (z-zBuf)==n ); 557 } 558 for(i=0; i<0x00110000; i++){ 559 if( i>=0xD800 && i<=0xE000 ) continue; 560 z = zBuf; 561 WRITE_UTF16BE(z, i); 562 n = z-zBuf; 563 z = zBuf; 564 READ_UTF16BE(z, c); 565 assert( c==i ); 566 assert( (z-zBuf)==n ); 567 } 568 } 569 #endif /* SQLITE_TEST */ 570 #endif /* SQLITE_OMIT_UTF16 */ 571