1 /* 2 ** 2004 April 13 3 ** 4 ** The author disclaims copyright to this source code. In place of 5 ** a legal notice, here is a blessing: 6 ** 7 ** May you do good and not evil. 8 ** May you find forgiveness for yourself and forgive others. 9 ** May you share freely, never taking more than you give. 10 ** 11 ************************************************************************* 12 ** This file contains routines used to translate between UTF-8, 13 ** UTF-16, UTF-16BE, and UTF-16LE. 14 ** 15 ** $Id: utf.c,v 1.50 2007/05/16 18:23:05 danielk1977 Exp $ 16 ** 17 ** Notes on UTF-8: 18 ** 19 ** Byte-0 Byte-1 Byte-2 Byte-3 Value 20 ** 0xxxxxxx 00000000 00000000 0xxxxxxx 21 ** 110yyyyy 10xxxxxx 00000000 00000yyy yyxxxxxx 22 ** 1110zzzz 10yyyyyy 10xxxxxx 00000000 zzzzyyyy yyxxxxxx 23 ** 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx 000uuuuu zzzzyyyy yyxxxxxx 24 ** 25 ** 26 ** Notes on UTF-16: (with wwww+1==uuuuu) 27 ** 28 ** Word-0 Word-1 Value 29 ** 110110ww wwzzzzyy 110111yy yyxxxxxx 000uuuuu zzzzyyyy yyxxxxxx 30 ** zzzzyyyy yyxxxxxx 00000000 zzzzyyyy yyxxxxxx 31 ** 32 ** 33 ** BOM or Byte Order Mark: 34 ** 0xff 0xfe little-endian utf-16 follows 35 ** 0xfe 0xff big-endian utf-16 follows 36 ** 37 */ 38 #include "sqliteInt.h" 39 #include <assert.h> 40 #include "vdbeInt.h" 41 42 /* 43 ** The following constant value is used by the SQLITE_BIGENDIAN and 44 ** SQLITE_LITTLEENDIAN macros. 45 */ 46 const int sqlite3one = 1; 47 48 /* 49 ** This lookup table is used to help decode the first byte of 50 ** a multi-byte UTF8 character. 51 */ 52 const unsigned char sqlite3UtfTrans1[] = { 53 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 54 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 55 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 56 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 57 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 58 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 59 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 60 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00, 61 }; 62 63 #define WRITE_UTF8(zOut, c) { \ 64 if( c<0x00080 ){ \ 65 *zOut++ = (c&0xFF); \ 66 } \ 67 else if( c<0x00800 ){ \ 68 *zOut++ = 0xC0 + ((c>>6)&0x1F); \ 69 *zOut++ = 0x80 + (c & 0x3F); \ 70 } \ 71 else if( c<0x10000 ){ \ 72 *zOut++ = 0xE0 + ((c>>12)&0x0F); \ 73 *zOut++ = 0x80 + ((c>>6) & 0x3F); \ 74 *zOut++ = 0x80 + (c & 0x3F); \ 75 }else{ \ 76 *zOut++ = 0xF0 + ((c>>18) & 0x07); \ 77 *zOut++ = 0x80 + ((c>>12) & 0x3F); \ 78 *zOut++ = 0x80 + ((c>>6) & 0x3F); \ 79 *zOut++ = 0x80 + (c & 0x3F); \ 80 } \ 81 } 82 83 #define WRITE_UTF16LE(zOut, c) { \ 84 if( c<=0xFFFF ){ \ 85 *zOut++ = (c&0x00FF); \ 86 *zOut++ = ((c>>8)&0x00FF); \ 87 }else{ \ 88 *zOut++ = (((c>>10)&0x003F) + (((c-0x10000)>>10)&0x00C0)); \ 89 *zOut++ = (0x00D8 + (((c-0x10000)>>18)&0x03)); \ 90 *zOut++ = (c&0x00FF); \ 91 *zOut++ = (0x00DC + ((c>>8)&0x03)); \ 92 } \ 93 } 94 95 #define WRITE_UTF16BE(zOut, c) { \ 96 if( c<=0xFFFF ){ \ 97 *zOut++ = ((c>>8)&0x00FF); \ 98 *zOut++ = (c&0x00FF); \ 99 }else{ \ 100 *zOut++ = (0x00D8 + (((c-0x10000)>>18)&0x03)); \ 101 *zOut++ = (((c>>10)&0x003F) + (((c-0x10000)>>10)&0x00C0)); \ 102 *zOut++ = (0x00DC + ((c>>8)&0x03)); \ 103 *zOut++ = (c&0x00FF); \ 104 } \ 105 } 106 107 #define READ_UTF16LE(zIn, c){ \ 108 c = (*zIn++); \ 109 c += ((*zIn++)<<8); \ 110 if( c>=0xD800 && c<0xE000 ){ \ 111 int c2 = (*zIn++); \ 112 c2 += ((*zIn++)<<8); \ 113 c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10); \ 114 if( (c & 0xFFFF0000)==0 ) c = 0xFFFD; \ 115 } \ 116 } 117 118 #define READ_UTF16BE(zIn, c){ \ 119 c = ((*zIn++)<<8); \ 120 c += (*zIn++); \ 121 if( c>=0xD800 && c<0xE000 ){ \ 122 int c2 = ((*zIn++)<<8); \ 123 c2 += (*zIn++); \ 124 c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10); \ 125 if( (c & 0xFFFF0000)==0 ) c = 0xFFFD; \ 126 } \ 127 } 128 129 /* 130 ** If the TRANSLATE_TRACE macro is defined, the value of each Mem is 131 ** printed on stderr on the way into and out of sqlite3VdbeMemTranslate(). 132 */ 133 /* #define TRANSLATE_TRACE 1 */ 134 135 #ifndef SQLITE_OMIT_UTF16 136 /* 137 ** This routine transforms the internal text encoding used by pMem to 138 ** desiredEnc. It is an error if the string is already of the desired 139 ** encoding, or if *pMem does not contain a string value. 140 */ 141 int sqlite3VdbeMemTranslate(Mem *pMem, u8 desiredEnc){ 142 unsigned char zShort[NBFS]; /* Temporary short output buffer */ 143 int len; /* Maximum length of output string in bytes */ 144 unsigned char *zOut; /* Output buffer */ 145 unsigned char *zIn; /* Input iterator */ 146 unsigned char *zTerm; /* End of input */ 147 unsigned char *z; /* Output iterator */ 148 unsigned int c; 149 150 assert( pMem->flags&MEM_Str ); 151 assert( pMem->enc!=desiredEnc ); 152 assert( pMem->enc!=0 ); 153 assert( pMem->n>=0 ); 154 155 #if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG) 156 { 157 char zBuf[100]; 158 sqlite3VdbeMemPrettyPrint(pMem, zBuf); 159 fprintf(stderr, "INPUT: %s\n", zBuf); 160 } 161 #endif 162 163 /* If the translation is between UTF-16 little and big endian, then 164 ** all that is required is to swap the byte order. This case is handled 165 ** differently from the others. 166 */ 167 if( pMem->enc!=SQLITE_UTF8 && desiredEnc!=SQLITE_UTF8 ){ 168 u8 temp; 169 int rc; 170 rc = sqlite3VdbeMemMakeWriteable(pMem); 171 if( rc!=SQLITE_OK ){ 172 assert( rc==SQLITE_NOMEM ); 173 return SQLITE_NOMEM; 174 } 175 zIn = (u8*)pMem->z; 176 zTerm = &zIn[pMem->n]; 177 while( zIn<zTerm ){ 178 temp = *zIn; 179 *zIn = *(zIn+1); 180 zIn++; 181 *zIn++ = temp; 182 } 183 pMem->enc = desiredEnc; 184 goto translate_out; 185 } 186 187 /* Set len to the maximum number of bytes required in the output buffer. */ 188 if( desiredEnc==SQLITE_UTF8 ){ 189 /* When converting from UTF-16, the maximum growth results from 190 ** translating a 2-byte character to a 4-byte UTF-8 character. 191 ** A single byte is required for the output string 192 ** nul-terminator. 193 */ 194 len = pMem->n * 2 + 1; 195 }else{ 196 /* When converting from UTF-8 to UTF-16 the maximum growth is caused 197 ** when a 1-byte UTF-8 character is translated into a 2-byte UTF-16 198 ** character. Two bytes are required in the output buffer for the 199 ** nul-terminator. 200 */ 201 len = pMem->n * 2 + 2; 202 } 203 204 /* Set zIn to point at the start of the input buffer and zTerm to point 1 205 ** byte past the end. 206 ** 207 ** Variable zOut is set to point at the output buffer. This may be space 208 ** obtained from malloc(), or Mem.zShort, if it large enough and not in 209 ** use, or the zShort array on the stack (see above). 210 */ 211 zIn = (u8*)pMem->z; 212 zTerm = &zIn[pMem->n]; 213 if( len>NBFS ){ 214 zOut = sqliteMallocRaw(len); 215 if( !zOut ) return SQLITE_NOMEM; 216 }else{ 217 zOut = zShort; 218 } 219 z = zOut; 220 221 if( pMem->enc==SQLITE_UTF8 ){ 222 if( desiredEnc==SQLITE_UTF16LE ){ 223 /* UTF-8 -> UTF-16 Little-endian */ 224 while( zIn<zTerm ){ 225 SQLITE_READ_UTF8(zIn, c); 226 WRITE_UTF16LE(z, c); 227 } 228 }else{ 229 assert( desiredEnc==SQLITE_UTF16BE ); 230 /* UTF-8 -> UTF-16 Big-endian */ 231 while( zIn<zTerm ){ 232 SQLITE_READ_UTF8(zIn, c); 233 WRITE_UTF16BE(z, c); 234 } 235 } 236 pMem->n = z - zOut; 237 *z++ = 0; 238 }else{ 239 assert( desiredEnc==SQLITE_UTF8 ); 240 if( pMem->enc==SQLITE_UTF16LE ){ 241 /* UTF-16 Little-endian -> UTF-8 */ 242 while( zIn<zTerm ){ 243 READ_UTF16LE(zIn, c); 244 WRITE_UTF8(z, c); 245 } 246 }else{ 247 /* UTF-16 Little-endian -> UTF-8 */ 248 while( zIn<zTerm ){ 249 READ_UTF16BE(zIn, c); 250 WRITE_UTF8(z, c); 251 } 252 } 253 pMem->n = z - zOut; 254 } 255 *z = 0; 256 assert( (pMem->n+(desiredEnc==SQLITE_UTF8?1:2))<=len ); 257 258 sqlite3VdbeMemRelease(pMem); 259 pMem->flags &= ~(MEM_Static|MEM_Dyn|MEM_Ephem|MEM_Short); 260 pMem->enc = desiredEnc; 261 if( zOut==zShort ){ 262 memcpy(pMem->zShort, zOut, len); 263 zOut = (u8*)pMem->zShort; 264 pMem->flags |= (MEM_Term|MEM_Short); 265 }else{ 266 pMem->flags |= (MEM_Term|MEM_Dyn); 267 } 268 pMem->z = (char*)zOut; 269 270 translate_out: 271 #if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG) 272 { 273 char zBuf[100]; 274 sqlite3VdbeMemPrettyPrint(pMem, zBuf); 275 fprintf(stderr, "OUTPUT: %s\n", zBuf); 276 } 277 #endif 278 return SQLITE_OK; 279 } 280 281 /* 282 ** This routine checks for a byte-order mark at the beginning of the 283 ** UTF-16 string stored in *pMem. If one is present, it is removed and 284 ** the encoding of the Mem adjusted. This routine does not do any 285 ** byte-swapping, it just sets Mem.enc appropriately. 286 ** 287 ** The allocation (static, dynamic etc.) and encoding of the Mem may be 288 ** changed by this function. 289 */ 290 int sqlite3VdbeMemHandleBom(Mem *pMem){ 291 int rc = SQLITE_OK; 292 u8 bom = 0; 293 294 if( pMem->n<0 || pMem->n>1 ){ 295 u8 b1 = *(u8 *)pMem->z; 296 u8 b2 = *(((u8 *)pMem->z) + 1); 297 if( b1==0xFE && b2==0xFF ){ 298 bom = SQLITE_UTF16BE; 299 } 300 if( b1==0xFF && b2==0xFE ){ 301 bom = SQLITE_UTF16LE; 302 } 303 } 304 305 if( bom ){ 306 /* This function is called as soon as a string is stored in a Mem*, 307 ** from within sqlite3VdbeMemSetStr(). At that point it is not possible 308 ** for the string to be stored in Mem.zShort, or for it to be stored 309 ** in dynamic memory with no destructor. 310 */ 311 assert( !(pMem->flags&MEM_Short) ); 312 assert( !(pMem->flags&MEM_Dyn) || pMem->xDel ); 313 if( pMem->flags & MEM_Dyn ){ 314 void (*xDel)(void*) = pMem->xDel; 315 char *z = pMem->z; 316 pMem->z = 0; 317 pMem->xDel = 0; 318 rc = sqlite3VdbeMemSetStr(pMem, &z[2], pMem->n-2, bom, SQLITE_TRANSIENT); 319 xDel(z); 320 }else{ 321 rc = sqlite3VdbeMemSetStr(pMem, &pMem->z[2], pMem->n-2, bom, 322 SQLITE_TRANSIENT); 323 } 324 } 325 return rc; 326 } 327 #endif /* SQLITE_OMIT_UTF16 */ 328 329 /* 330 ** pZ is a UTF-8 encoded unicode string. If nByte is less than zero, 331 ** return the number of unicode characters in pZ up to (but not including) 332 ** the first 0x00 byte. If nByte is not less than zero, return the 333 ** number of unicode characters in the first nByte of pZ (or up to 334 ** the first 0x00, whichever comes first). 335 */ 336 int sqlite3Utf8CharLen(const char *zIn, int nByte){ 337 int r = 0; 338 const u8 *z = (const u8*)zIn; 339 const u8 *zTerm; 340 if( nByte>=0 ){ 341 zTerm = &z[nByte]; 342 }else{ 343 zTerm = (const u8*)(-1); 344 } 345 assert( z<=zTerm ); 346 while( *z!=0 && z<zTerm ){ 347 SQLITE_SKIP_UTF8(z); 348 r++; 349 } 350 return r; 351 } 352 353 #ifndef SQLITE_OMIT_UTF16 354 /* 355 ** Convert a UTF-16 string in the native encoding into a UTF-8 string. 356 ** Memory to hold the UTF-8 string is obtained from malloc and must be 357 ** freed by the calling function. 358 ** 359 ** NULL is returned if there is an allocation error. 360 */ 361 char *sqlite3Utf16to8(const void *z, int nByte){ 362 Mem m; 363 memset(&m, 0, sizeof(m)); 364 sqlite3VdbeMemSetStr(&m, z, nByte, SQLITE_UTF16NATIVE, SQLITE_STATIC); 365 sqlite3VdbeChangeEncoding(&m, SQLITE_UTF8); 366 assert( (m.flags & MEM_Term)!=0 || sqlite3MallocFailed() ); 367 assert( (m.flags & MEM_Str)!=0 || sqlite3MallocFailed() ); 368 return (m.flags & MEM_Dyn)!=0 ? m.z : sqliteStrDup(m.z); 369 } 370 371 /* 372 ** pZ is a UTF-16 encoded unicode string. If nChar is less than zero, 373 ** return the number of bytes up to (but not including), the first pair 374 ** of consecutive 0x00 bytes in pZ. If nChar is not less than zero, 375 ** then return the number of bytes in the first nChar unicode characters 376 ** in pZ (or up until the first pair of 0x00 bytes, whichever comes first). 377 */ 378 int sqlite3Utf16ByteLen(const void *zIn, int nChar){ 379 unsigned int c = 1; 380 char const *z = zIn; 381 int n = 0; 382 if( SQLITE_UTF16NATIVE==SQLITE_UTF16BE ){ 383 /* Using an "if (SQLITE_UTF16NATIVE==SQLITE_UTF16BE)" construct here 384 ** and in other parts of this file means that at one branch will 385 ** not be covered by coverage testing on any single host. But coverage 386 ** will be complete if the tests are run on both a little-endian and 387 ** big-endian host. Because both the UTF16NATIVE and SQLITE_UTF16BE 388 ** macros are constant at compile time the compiler can determine 389 ** which branch will be followed. It is therefore assumed that no runtime 390 ** penalty is paid for this "if" statement. 391 */ 392 while( c && ((nChar<0) || n<nChar) ){ 393 READ_UTF16BE(z, c); 394 n++; 395 } 396 }else{ 397 while( c && ((nChar<0) || n<nChar) ){ 398 READ_UTF16LE(z, c); 399 n++; 400 } 401 } 402 return (z-(char const *)zIn)-((c==0)?2:0); 403 } 404 405 #if defined(SQLITE_TEST) 406 /* 407 ** Translate UTF-8 to UTF-8. 408 ** 409 ** This has the effect of making sure that the string is well-formed 410 ** UTF-8. Miscoded characters are removed. 411 ** 412 ** The translation is done in-place (since it is impossible for the 413 ** correct UTF-8 encoding to be longer than a malformed encoding). 414 */ 415 int sqlite3Utf8To8(unsigned char *zIn){ 416 unsigned char *zOut = zIn; 417 unsigned char *zStart = zIn; 418 int c; 419 420 while(1){ 421 SQLITE_READ_UTF8(zIn, c); 422 if( c==0 ) break; 423 if( c!=0xfffd ){ 424 WRITE_UTF8(zOut, c); 425 } 426 } 427 *zOut = 0; 428 return zOut - zStart; 429 } 430 #endif 431 432 #if defined(SQLITE_TEST) 433 /* 434 ** This routine is called from the TCL test function "translate_selftest". 435 ** It checks that the primitives for serializing and deserializing 436 ** characters in each encoding are inverses of each other. 437 */ 438 void sqlite3UtfSelfTest(){ 439 unsigned int i, t; 440 unsigned char zBuf[20]; 441 unsigned char *z; 442 int n; 443 unsigned int c; 444 445 for(i=0; i<0x00110000; i++){ 446 z = zBuf; 447 WRITE_UTF8(z, i); 448 n = z-zBuf; 449 z[0] = 0; 450 z = zBuf; 451 SQLITE_READ_UTF8(z, c); 452 t = i; 453 if( i>=0xD800 && i<=0xDFFF ) t = 0xFFFD; 454 if( (i&0xFFFFFFFE)==0xFFFE ) t = 0xFFFD; 455 assert( c==t ); 456 assert( (z-zBuf)==n ); 457 } 458 for(i=0; i<0x00110000; i++){ 459 if( i>=0xD800 && i<0xE000 ) continue; 460 z = zBuf; 461 WRITE_UTF16LE(z, i); 462 n = z-zBuf; 463 z[0] = 0; 464 z = zBuf; 465 READ_UTF16LE(z, c); 466 assert( c==i ); 467 assert( (z-zBuf)==n ); 468 } 469 for(i=0; i<0x00110000; i++){ 470 if( i>=0xD800 && i<0xE000 ) continue; 471 z = zBuf; 472 WRITE_UTF16BE(z, i); 473 n = z-zBuf; 474 z[0] = 0; 475 z = zBuf; 476 READ_UTF16BE(z, c); 477 assert( c==i ); 478 assert( (z-zBuf)==n ); 479 } 480 } 481 #endif /* SQLITE_TEST */ 482 #endif /* SQLITE_OMIT_UTF16 */ 483