19091055eSJustin Lebar /*===--- ConvertUTF.c - Universal Character Names conversions ---------------===
29091055eSJustin Lebar *
32946cd70SChandler Carruth * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
42946cd70SChandler Carruth * See https://llvm.org/LICENSE.txt for license information.
52946cd70SChandler Carruth * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
69091055eSJustin Lebar *
79091055eSJustin Lebar *===------------------------------------------------------------------------=*/
89091055eSJustin Lebar /*
99091055eSJustin Lebar * Copyright 2001-2004 Unicode, Inc.
109091055eSJustin Lebar *
119091055eSJustin Lebar * Disclaimer
129091055eSJustin Lebar *
139091055eSJustin Lebar * This source code is provided as is by Unicode, Inc. No claims are
149091055eSJustin Lebar * made as to fitness for any particular purpose. No warranties of any
159091055eSJustin Lebar * kind are expressed or implied. The recipient agrees to determine
169091055eSJustin Lebar * applicability of information provided. If this file has been
179091055eSJustin Lebar * purchased on magnetic or optical media from Unicode, Inc., the
189091055eSJustin Lebar * sole remedy for any claim will be exchange of defective media
199091055eSJustin Lebar * within 90 days of receipt.
209091055eSJustin Lebar *
219091055eSJustin Lebar * Limitations on Rights to Redistribute This Code
229091055eSJustin Lebar *
239091055eSJustin Lebar * Unicode, Inc. hereby grants the right to freely use the information
249091055eSJustin Lebar * supplied in this file in the creation of products supporting the
259091055eSJustin Lebar * Unicode Standard, and to make copies of this file in any form
269091055eSJustin Lebar * for internal or external distribution as long as this notice
279091055eSJustin Lebar * remains attached.
289091055eSJustin Lebar */
299091055eSJustin Lebar
309091055eSJustin Lebar /* ---------------------------------------------------------------------
319091055eSJustin Lebar
329091055eSJustin Lebar Conversions between UTF32, UTF-16, and UTF-8. Source code file.
339091055eSJustin Lebar Author: Mark E. Davis, 1994.
349091055eSJustin Lebar Rev History: Rick McGowan, fixes & updates May 2001.
359091055eSJustin Lebar Sept 2001: fixed const & error conditions per
369091055eSJustin Lebar mods suggested by S. Parent & A. Lillich.
379091055eSJustin Lebar June 2002: Tim Dodd added detection and handling of incomplete
389091055eSJustin Lebar source sequences, enhanced error detection, added casts
399091055eSJustin Lebar to eliminate compiler warnings.
409091055eSJustin Lebar July 2003: slight mods to back out aggressive FFFE detection.
419091055eSJustin Lebar Jan 2004: updated switches in from-UTF8 conversions.
429091055eSJustin Lebar Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
439091055eSJustin Lebar
449091055eSJustin Lebar See the header file "ConvertUTF.h" for complete documentation.
459091055eSJustin Lebar
469091055eSJustin Lebar ------------------------------------------------------------------------ */
479091055eSJustin Lebar
489091055eSJustin Lebar #include "llvm/Support/ConvertUTF.h"
499091055eSJustin Lebar #ifdef CVTUTF_DEBUG
509091055eSJustin Lebar #include <stdio.h>
519091055eSJustin Lebar #endif
529091055eSJustin Lebar #include <assert.h>
539091055eSJustin Lebar
54229c9c11SGalina Kistanova /*
55229c9c11SGalina Kistanova * This code extensively uses fall-through switches.
56229c9c11SGalina Kistanova * Keep the compiler from warning about that.
57229c9c11SGalina Kistanova */
58229c9c11SGalina Kistanova #if defined(__clang__) && defined(__has_warning)
59229c9c11SGalina Kistanova # if __has_warning("-Wimplicit-fallthrough")
60229c9c11SGalina Kistanova # define ConvertUTF_DISABLE_WARNINGS \
61229c9c11SGalina Kistanova _Pragma("clang diagnostic push") \
62229c9c11SGalina Kistanova _Pragma("clang diagnostic ignored \"-Wimplicit-fallthrough\"")
63229c9c11SGalina Kistanova # define ConvertUTF_RESTORE_WARNINGS \
64229c9c11SGalina Kistanova _Pragma("clang diagnostic pop")
65229c9c11SGalina Kistanova # endif
66229c9c11SGalina Kistanova #elif defined(__GNUC__) && __GNUC__ > 6
67229c9c11SGalina Kistanova # define ConvertUTF_DISABLE_WARNINGS \
68229c9c11SGalina Kistanova _Pragma("GCC diagnostic push") \
69229c9c11SGalina Kistanova _Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"")
70229c9c11SGalina Kistanova # define ConvertUTF_RESTORE_WARNINGS \
71229c9c11SGalina Kistanova _Pragma("GCC diagnostic pop")
72229c9c11SGalina Kistanova #endif
73229c9c11SGalina Kistanova #ifndef ConvertUTF_DISABLE_WARNINGS
74229c9c11SGalina Kistanova # define ConvertUTF_DISABLE_WARNINGS
75229c9c11SGalina Kistanova #endif
76229c9c11SGalina Kistanova #ifndef ConvertUTF_RESTORE_WARNINGS
77229c9c11SGalina Kistanova # define ConvertUTF_RESTORE_WARNINGS
78229c9c11SGalina Kistanova #endif
79229c9c11SGalina Kistanova
80229c9c11SGalina Kistanova ConvertUTF_DISABLE_WARNINGS
81229c9c11SGalina Kistanova
829091055eSJustin Lebar namespace llvm {
839091055eSJustin Lebar
849091055eSJustin Lebar static const int halfShift = 10; /* used for shifting by 10 bits */
859091055eSJustin Lebar
869091055eSJustin Lebar static const UTF32 halfBase = 0x0010000UL;
879091055eSJustin Lebar static const UTF32 halfMask = 0x3FFUL;
889091055eSJustin Lebar
899091055eSJustin Lebar #define UNI_SUR_HIGH_START (UTF32)0xD800
909091055eSJustin Lebar #define UNI_SUR_HIGH_END (UTF32)0xDBFF
919091055eSJustin Lebar #define UNI_SUR_LOW_START (UTF32)0xDC00
929091055eSJustin Lebar #define UNI_SUR_LOW_END (UTF32)0xDFFF
939091055eSJustin Lebar
949091055eSJustin Lebar /* --------------------------------------------------------------------- */
959091055eSJustin Lebar
/*
 * Index into the table below with the first byte of a UTF-8 sequence to
 * get the number of trailing bytes that are supposed to follow it.
 * Note that *legal* UTF-8 sequences never have 4 or 5 trailing bytes. The
 * entries for those lead bytes are left as-is for anyone who may want to do
 * such conversion, which was allowed in earlier algorithms.
 *
 * Each row below covers 16 consecutive lead-byte values.
 */
static const char trailingBytesForUTF8[256] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xA0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xB0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xD0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xE0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xF0 */ 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
};
1139091055eSJustin Lebar
1149091055eSJustin Lebar /*
1159091055eSJustin Lebar * Magic values subtracted from a buffer value during UTF8 conversion.
1169091055eSJustin Lebar * This table contains as many values as there might be trailing bytes
1179091055eSJustin Lebar * in a UTF-8 sequence.
1189091055eSJustin Lebar */
1199091055eSJustin Lebar static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
1209091055eSJustin Lebar 0x03C82080UL, 0xFA082080UL, 0x82082080UL };
1219091055eSJustin Lebar
1229091055eSJustin Lebar /*
1239091055eSJustin Lebar * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
1249091055eSJustin Lebar * into the first byte, depending on how many bytes follow. There are
1259091055eSJustin Lebar * as many entries in this table as there are UTF-8 sequence types.
1269091055eSJustin Lebar * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
1279091055eSJustin Lebar * for *legal* UTF-8 will be 4 or fewer bytes total.
1289091055eSJustin Lebar */
1299091055eSJustin Lebar static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
1309091055eSJustin Lebar
1319091055eSJustin Lebar /* --------------------------------------------------------------------- */
1329091055eSJustin Lebar
1339091055eSJustin Lebar /* The interface converts a whole buffer to avoid function-call overhead.
1349091055eSJustin Lebar * Constants have been gathered. Loops & conditionals have been removed as
1359091055eSJustin Lebar * much as possible for efficiency, in favor of drop-through switches.
1369091055eSJustin Lebar * (See "Note A" at the bottom of the file for equivalent code.)
1379091055eSJustin Lebar * If your compiler supports it, the "isLegalUTF8" call can be turned
1389091055eSJustin Lebar * into an inline function.
1399091055eSJustin Lebar */
1409091055eSJustin Lebar
1419091055eSJustin Lebar
1429091055eSJustin Lebar /* --------------------------------------------------------------------- */
1439091055eSJustin Lebar
ConvertUTF32toUTF16(const UTF32 ** sourceStart,const UTF32 * sourceEnd,UTF16 ** targetStart,UTF16 * targetEnd,ConversionFlags flags)1449091055eSJustin Lebar ConversionResult ConvertUTF32toUTF16 (
1459091055eSJustin Lebar const UTF32** sourceStart, const UTF32* sourceEnd,
1469091055eSJustin Lebar UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
1479091055eSJustin Lebar ConversionResult result = conversionOK;
1489091055eSJustin Lebar const UTF32* source = *sourceStart;
1499091055eSJustin Lebar UTF16* target = *targetStart;
1509091055eSJustin Lebar while (source < sourceEnd) {
1519091055eSJustin Lebar UTF32 ch;
1529091055eSJustin Lebar if (target >= targetEnd) {
1539091055eSJustin Lebar result = targetExhausted; break;
1549091055eSJustin Lebar }
1559091055eSJustin Lebar ch = *source++;
1569091055eSJustin Lebar if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
1579091055eSJustin Lebar /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
1589091055eSJustin Lebar if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
1599091055eSJustin Lebar if (flags == strictConversion) {
1609091055eSJustin Lebar --source; /* return to the illegal value itself */
1619091055eSJustin Lebar result = sourceIllegal;
1629091055eSJustin Lebar break;
1639091055eSJustin Lebar } else {
1649091055eSJustin Lebar *target++ = UNI_REPLACEMENT_CHAR;
1659091055eSJustin Lebar }
1669091055eSJustin Lebar } else {
1679091055eSJustin Lebar *target++ = (UTF16)ch; /* normal case */
1689091055eSJustin Lebar }
1699091055eSJustin Lebar } else if (ch > UNI_MAX_LEGAL_UTF32) {
1709091055eSJustin Lebar if (flags == strictConversion) {
1719091055eSJustin Lebar result = sourceIllegal;
1729091055eSJustin Lebar } else {
1739091055eSJustin Lebar *target++ = UNI_REPLACEMENT_CHAR;
1749091055eSJustin Lebar }
1759091055eSJustin Lebar } else {
1769091055eSJustin Lebar /* target is a character in range 0xFFFF - 0x10FFFF. */
1779091055eSJustin Lebar if (target + 1 >= targetEnd) {
1789091055eSJustin Lebar --source; /* Back up source pointer! */
1799091055eSJustin Lebar result = targetExhausted; break;
1809091055eSJustin Lebar }
1819091055eSJustin Lebar ch -= halfBase;
1829091055eSJustin Lebar *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
1839091055eSJustin Lebar *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
1849091055eSJustin Lebar }
1859091055eSJustin Lebar }
1869091055eSJustin Lebar *sourceStart = source;
1879091055eSJustin Lebar *targetStart = target;
1889091055eSJustin Lebar return result;
1899091055eSJustin Lebar }
1909091055eSJustin Lebar
1919091055eSJustin Lebar /* --------------------------------------------------------------------- */
1929091055eSJustin Lebar
ConvertUTF16toUTF32(const UTF16 ** sourceStart,const UTF16 * sourceEnd,UTF32 ** targetStart,UTF32 * targetEnd,ConversionFlags flags)1939091055eSJustin Lebar ConversionResult ConvertUTF16toUTF32 (
1949091055eSJustin Lebar const UTF16** sourceStart, const UTF16* sourceEnd,
1959091055eSJustin Lebar UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
1969091055eSJustin Lebar ConversionResult result = conversionOK;
1979091055eSJustin Lebar const UTF16* source = *sourceStart;
1989091055eSJustin Lebar UTF32* target = *targetStart;
1999091055eSJustin Lebar UTF32 ch, ch2;
2009091055eSJustin Lebar while (source < sourceEnd) {
2019091055eSJustin Lebar const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
2029091055eSJustin Lebar ch = *source++;
2039091055eSJustin Lebar /* If we have a surrogate pair, convert to UTF32 first. */
2049091055eSJustin Lebar if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
2059091055eSJustin Lebar /* If the 16 bits following the high surrogate are in the source buffer... */
2069091055eSJustin Lebar if (source < sourceEnd) {
2079091055eSJustin Lebar ch2 = *source;
2089091055eSJustin Lebar /* If it's a low surrogate, convert to UTF32. */
2099091055eSJustin Lebar if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
2109091055eSJustin Lebar ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
2119091055eSJustin Lebar + (ch2 - UNI_SUR_LOW_START) + halfBase;
2129091055eSJustin Lebar ++source;
2139091055eSJustin Lebar } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
2149091055eSJustin Lebar --source; /* return to the illegal value itself */
2159091055eSJustin Lebar result = sourceIllegal;
2169091055eSJustin Lebar break;
2179091055eSJustin Lebar }
2189091055eSJustin Lebar } else { /* We don't have the 16 bits following the high surrogate. */
2199091055eSJustin Lebar --source; /* return to the high surrogate */
2209091055eSJustin Lebar result = sourceExhausted;
2219091055eSJustin Lebar break;
2229091055eSJustin Lebar }
2239091055eSJustin Lebar } else if (flags == strictConversion) {
2249091055eSJustin Lebar /* UTF-16 surrogate values are illegal in UTF-32 */
2259091055eSJustin Lebar if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
2269091055eSJustin Lebar --source; /* return to the illegal value itself */
2279091055eSJustin Lebar result = sourceIllegal;
2289091055eSJustin Lebar break;
2299091055eSJustin Lebar }
2309091055eSJustin Lebar }
2319091055eSJustin Lebar if (target >= targetEnd) {
2329091055eSJustin Lebar source = oldSource; /* Back up source pointer! */
2339091055eSJustin Lebar result = targetExhausted; break;
2349091055eSJustin Lebar }
2359091055eSJustin Lebar *target++ = ch;
2369091055eSJustin Lebar }
2379091055eSJustin Lebar *sourceStart = source;
2389091055eSJustin Lebar *targetStart = target;
2399091055eSJustin Lebar #ifdef CVTUTF_DEBUG
2409091055eSJustin Lebar if (result == sourceIllegal) {
2419091055eSJustin Lebar fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
2429091055eSJustin Lebar fflush(stderr);
2439091055eSJustin Lebar }
2449091055eSJustin Lebar #endif
2459091055eSJustin Lebar return result;
2469091055eSJustin Lebar }
ConvertUTF16toUTF8(const UTF16 ** sourceStart,const UTF16 * sourceEnd,UTF8 ** targetStart,UTF8 * targetEnd,ConversionFlags flags)2479091055eSJustin Lebar ConversionResult ConvertUTF16toUTF8 (
2489091055eSJustin Lebar const UTF16** sourceStart, const UTF16* sourceEnd,
2499091055eSJustin Lebar UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
2509091055eSJustin Lebar ConversionResult result = conversionOK;
2519091055eSJustin Lebar const UTF16* source = *sourceStart;
2529091055eSJustin Lebar UTF8* target = *targetStart;
2539091055eSJustin Lebar while (source < sourceEnd) {
2549091055eSJustin Lebar UTF32 ch;
2559091055eSJustin Lebar unsigned short bytesToWrite = 0;
2569091055eSJustin Lebar const UTF32 byteMask = 0xBF;
2579091055eSJustin Lebar const UTF32 byteMark = 0x80;
2589091055eSJustin Lebar const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
2599091055eSJustin Lebar ch = *source++;
2609091055eSJustin Lebar /* If we have a surrogate pair, convert to UTF32 first. */
2619091055eSJustin Lebar if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
2629091055eSJustin Lebar /* If the 16 bits following the high surrogate are in the source buffer... */
2639091055eSJustin Lebar if (source < sourceEnd) {
2649091055eSJustin Lebar UTF32 ch2 = *source;
2659091055eSJustin Lebar /* If it's a low surrogate, convert to UTF32. */
2669091055eSJustin Lebar if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
2679091055eSJustin Lebar ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
2689091055eSJustin Lebar + (ch2 - UNI_SUR_LOW_START) + halfBase;
2699091055eSJustin Lebar ++source;
2709091055eSJustin Lebar } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
2719091055eSJustin Lebar --source; /* return to the illegal value itself */
2729091055eSJustin Lebar result = sourceIllegal;
2739091055eSJustin Lebar break;
2749091055eSJustin Lebar }
2759091055eSJustin Lebar } else { /* We don't have the 16 bits following the high surrogate. */
2769091055eSJustin Lebar --source; /* return to the high surrogate */
2779091055eSJustin Lebar result = sourceExhausted;
2789091055eSJustin Lebar break;
2799091055eSJustin Lebar }
2809091055eSJustin Lebar } else if (flags == strictConversion) {
2819091055eSJustin Lebar /* UTF-16 surrogate values are illegal in UTF-32 */
2829091055eSJustin Lebar if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
2839091055eSJustin Lebar --source; /* return to the illegal value itself */
2849091055eSJustin Lebar result = sourceIllegal;
2859091055eSJustin Lebar break;
2869091055eSJustin Lebar }
2879091055eSJustin Lebar }
2889091055eSJustin Lebar /* Figure out how many bytes the result will require */
2899091055eSJustin Lebar if (ch < (UTF32)0x80) { bytesToWrite = 1;
2909091055eSJustin Lebar } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
2919091055eSJustin Lebar } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
2929091055eSJustin Lebar } else if (ch < (UTF32)0x110000) { bytesToWrite = 4;
2939091055eSJustin Lebar } else { bytesToWrite = 3;
2949091055eSJustin Lebar ch = UNI_REPLACEMENT_CHAR;
2959091055eSJustin Lebar }
2969091055eSJustin Lebar
2979091055eSJustin Lebar target += bytesToWrite;
2989091055eSJustin Lebar if (target > targetEnd) {
2999091055eSJustin Lebar source = oldSource; /* Back up source pointer! */
3009091055eSJustin Lebar target -= bytesToWrite; result = targetExhausted; break;
3019091055eSJustin Lebar }
3029091055eSJustin Lebar switch (bytesToWrite) { /* note: everything falls through. */
3039091055eSJustin Lebar case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
3049091055eSJustin Lebar case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
3059091055eSJustin Lebar case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
3069091055eSJustin Lebar case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]);
3079091055eSJustin Lebar }
3089091055eSJustin Lebar target += bytesToWrite;
3099091055eSJustin Lebar }
3109091055eSJustin Lebar *sourceStart = source;
3119091055eSJustin Lebar *targetStart = target;
3129091055eSJustin Lebar return result;
3139091055eSJustin Lebar }
3149091055eSJustin Lebar
3159091055eSJustin Lebar /* --------------------------------------------------------------------- */
3169091055eSJustin Lebar
ConvertUTF32toUTF8(const UTF32 ** sourceStart,const UTF32 * sourceEnd,UTF8 ** targetStart,UTF8 * targetEnd,ConversionFlags flags)3179091055eSJustin Lebar ConversionResult ConvertUTF32toUTF8 (
3189091055eSJustin Lebar const UTF32** sourceStart, const UTF32* sourceEnd,
3199091055eSJustin Lebar UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
3209091055eSJustin Lebar ConversionResult result = conversionOK;
3219091055eSJustin Lebar const UTF32* source = *sourceStart;
3229091055eSJustin Lebar UTF8* target = *targetStart;
3239091055eSJustin Lebar while (source < sourceEnd) {
3249091055eSJustin Lebar UTF32 ch;
3259091055eSJustin Lebar unsigned short bytesToWrite = 0;
3269091055eSJustin Lebar const UTF32 byteMask = 0xBF;
3279091055eSJustin Lebar const UTF32 byteMark = 0x80;
3289091055eSJustin Lebar ch = *source++;
3299091055eSJustin Lebar if (flags == strictConversion ) {
3309091055eSJustin Lebar /* UTF-16 surrogate values are illegal in UTF-32 */
3319091055eSJustin Lebar if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
3329091055eSJustin Lebar --source; /* return to the illegal value itself */
3339091055eSJustin Lebar result = sourceIllegal;
3349091055eSJustin Lebar break;
3359091055eSJustin Lebar }
3369091055eSJustin Lebar }
3379091055eSJustin Lebar /*
3389091055eSJustin Lebar * Figure out how many bytes the result will require. Turn any
3399091055eSJustin Lebar * illegally large UTF32 things (> Plane 17) into replacement chars.
3409091055eSJustin Lebar */
3419091055eSJustin Lebar if (ch < (UTF32)0x80) { bytesToWrite = 1;
3429091055eSJustin Lebar } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
3439091055eSJustin Lebar } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
3449091055eSJustin Lebar } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4;
3459091055eSJustin Lebar } else { bytesToWrite = 3;
3469091055eSJustin Lebar ch = UNI_REPLACEMENT_CHAR;
3479091055eSJustin Lebar result = sourceIllegal;
3489091055eSJustin Lebar }
3499091055eSJustin Lebar
3509091055eSJustin Lebar target += bytesToWrite;
3519091055eSJustin Lebar if (target > targetEnd) {
3529091055eSJustin Lebar --source; /* Back up source pointer! */
3539091055eSJustin Lebar target -= bytesToWrite; result = targetExhausted; break;
3549091055eSJustin Lebar }
3559091055eSJustin Lebar switch (bytesToWrite) { /* note: everything falls through. */
3569091055eSJustin Lebar case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
3579091055eSJustin Lebar case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
3589091055eSJustin Lebar case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
3599091055eSJustin Lebar case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
3609091055eSJustin Lebar }
3619091055eSJustin Lebar target += bytesToWrite;
3629091055eSJustin Lebar }
3639091055eSJustin Lebar *sourceStart = source;
3649091055eSJustin Lebar *targetStart = target;
3659091055eSJustin Lebar return result;
3669091055eSJustin Lebar }
3679091055eSJustin Lebar
3689091055eSJustin Lebar /* --------------------------------------------------------------------- */
3699091055eSJustin Lebar
3709091055eSJustin Lebar /*
3719091055eSJustin Lebar * Utility routine to tell whether a sequence of bytes is legal UTF-8.
3729091055eSJustin Lebar * This must be called with the length pre-determined by the first byte.
3739091055eSJustin Lebar * If not calling this from ConvertUTF8to*, then the length can be set by:
3749091055eSJustin Lebar * length = trailingBytesForUTF8[*source]+1;
3759091055eSJustin Lebar * and the sequence is illegal right away if there aren't that many bytes
3769091055eSJustin Lebar * available.
3779091055eSJustin Lebar * If presented with a length > 4, this returns false. The Unicode
3789091055eSJustin Lebar * definition of UTF-8 goes up to 4-byte sequences.
3799091055eSJustin Lebar */
3809091055eSJustin Lebar
isLegalUTF8(const UTF8 * source,int length)3819091055eSJustin Lebar static Boolean isLegalUTF8(const UTF8 *source, int length) {
3829091055eSJustin Lebar UTF8 a;
3839091055eSJustin Lebar const UTF8 *srcptr = source+length;
3849091055eSJustin Lebar switch (length) {
3859091055eSJustin Lebar default: return false;
3869091055eSJustin Lebar /* Everything else falls through when "true"... */
3879091055eSJustin Lebar case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
3889091055eSJustin Lebar case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
3899091055eSJustin Lebar case 2: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
3909091055eSJustin Lebar
3919091055eSJustin Lebar switch (*source) {
3929091055eSJustin Lebar /* no fall-through in this inner switch */
3939091055eSJustin Lebar case 0xE0: if (a < 0xA0) return false; break;
3949091055eSJustin Lebar case 0xED: if (a > 0x9F) return false; break;
3959091055eSJustin Lebar case 0xF0: if (a < 0x90) return false; break;
3969091055eSJustin Lebar case 0xF4: if (a > 0x8F) return false; break;
3979091055eSJustin Lebar default: if (a < 0x80) return false;
3989091055eSJustin Lebar }
3999091055eSJustin Lebar
4009091055eSJustin Lebar case 1: if (*source >= 0x80 && *source < 0xC2) return false;
4019091055eSJustin Lebar }
4029091055eSJustin Lebar if (*source > 0xF4) return false;
4039091055eSJustin Lebar return true;
4049091055eSJustin Lebar }
4059091055eSJustin Lebar
4069091055eSJustin Lebar /* --------------------------------------------------------------------- */
4079091055eSJustin Lebar
4089091055eSJustin Lebar /*
4099091055eSJustin Lebar * Exported function to return whether a UTF-8 sequence is legal or not.
4109091055eSJustin Lebar * This is not used here; it's just exported.
4119091055eSJustin Lebar */
isLegalUTF8Sequence(const UTF8 * source,const UTF8 * sourceEnd)4129091055eSJustin Lebar Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
4139091055eSJustin Lebar int length = trailingBytesForUTF8[*source]+1;
4149091055eSJustin Lebar if (length > sourceEnd - source) {
4159091055eSJustin Lebar return false;
4169091055eSJustin Lebar }
4179091055eSJustin Lebar return isLegalUTF8(source, length);
4189091055eSJustin Lebar }
4199091055eSJustin Lebar
420*d4892a16SCorentin Jabot /*
421*d4892a16SCorentin Jabot * Exported function to return the size of the first utf-8 code unit sequence,
422*d4892a16SCorentin Jabot * Or 0 if the sequence is not valid;
423*d4892a16SCorentin Jabot */
getUTF8SequenceSize(const UTF8 * source,const UTF8 * sourceEnd)424*d4892a16SCorentin Jabot unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd) {
425*d4892a16SCorentin Jabot int length = trailingBytesForUTF8[*source] + 1;
426*d4892a16SCorentin Jabot return (length <= sourceEnd - source && isLegalUTF8(source, length)) ? length
427*d4892a16SCorentin Jabot : 0;
428*d4892a16SCorentin Jabot }
429*d4892a16SCorentin Jabot
4309091055eSJustin Lebar /* --------------------------------------------------------------------- */
4319091055eSJustin Lebar
4329091055eSJustin Lebar static unsigned
findMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 * source,const UTF8 * sourceEnd)4339091055eSJustin Lebar findMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 *source,
4349091055eSJustin Lebar const UTF8 *sourceEnd) {
4359091055eSJustin Lebar UTF8 b1, b2, b3;
4369091055eSJustin Lebar
4379091055eSJustin Lebar assert(!isLegalUTF8Sequence(source, sourceEnd));
4389091055eSJustin Lebar
4399091055eSJustin Lebar /*
4409091055eSJustin Lebar * Unicode 6.3.0, D93b:
4419091055eSJustin Lebar *
4429091055eSJustin Lebar * Maximal subpart of an ill-formed subsequence: The longest code unit
4439091055eSJustin Lebar * subsequence starting at an unconvertible offset that is either:
4449091055eSJustin Lebar * a. the initial subsequence of a well-formed code unit sequence, or
4459091055eSJustin Lebar * b. a subsequence of length one.
4469091055eSJustin Lebar */
4479091055eSJustin Lebar
4489091055eSJustin Lebar if (source == sourceEnd)
4499091055eSJustin Lebar return 0;
4509091055eSJustin Lebar
4519091055eSJustin Lebar /*
4529091055eSJustin Lebar * Perform case analysis. See Unicode 6.3.0, Table 3-7. Well-Formed UTF-8
4539091055eSJustin Lebar * Byte Sequences.
4549091055eSJustin Lebar */
4559091055eSJustin Lebar
4569091055eSJustin Lebar b1 = *source;
4579091055eSJustin Lebar ++source;
4589091055eSJustin Lebar if (b1 >= 0xC2 && b1 <= 0xDF) {
4599091055eSJustin Lebar /*
4609091055eSJustin Lebar * First byte is valid, but we know that this code unit sequence is
4619091055eSJustin Lebar * invalid, so the maximal subpart has to end after the first byte.
4629091055eSJustin Lebar */
4639091055eSJustin Lebar return 1;
4649091055eSJustin Lebar }
4659091055eSJustin Lebar
4669091055eSJustin Lebar if (source == sourceEnd)
4679091055eSJustin Lebar return 1;
4689091055eSJustin Lebar
4699091055eSJustin Lebar b2 = *source;
4709091055eSJustin Lebar ++source;
4719091055eSJustin Lebar
4729091055eSJustin Lebar if (b1 == 0xE0) {
4739091055eSJustin Lebar return (b2 >= 0xA0 && b2 <= 0xBF) ? 2 : 1;
4749091055eSJustin Lebar }
4759091055eSJustin Lebar if (b1 >= 0xE1 && b1 <= 0xEC) {
4769091055eSJustin Lebar return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
4779091055eSJustin Lebar }
4789091055eSJustin Lebar if (b1 == 0xED) {
4799091055eSJustin Lebar return (b2 >= 0x80 && b2 <= 0x9F) ? 2 : 1;
4809091055eSJustin Lebar }
4819091055eSJustin Lebar if (b1 >= 0xEE && b1 <= 0xEF) {
4829091055eSJustin Lebar return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
4839091055eSJustin Lebar }
4849091055eSJustin Lebar if (b1 == 0xF0) {
4859091055eSJustin Lebar if (b2 >= 0x90 && b2 <= 0xBF) {
4869091055eSJustin Lebar if (source == sourceEnd)
4879091055eSJustin Lebar return 2;
4889091055eSJustin Lebar
4899091055eSJustin Lebar b3 = *source;
4909091055eSJustin Lebar return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
4919091055eSJustin Lebar }
4929091055eSJustin Lebar return 1;
4939091055eSJustin Lebar }
4949091055eSJustin Lebar if (b1 >= 0xF1 && b1 <= 0xF3) {
4959091055eSJustin Lebar if (b2 >= 0x80 && b2 <= 0xBF) {
4969091055eSJustin Lebar if (source == sourceEnd)
4979091055eSJustin Lebar return 2;
4989091055eSJustin Lebar
4999091055eSJustin Lebar b3 = *source;
5009091055eSJustin Lebar return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
5019091055eSJustin Lebar }
5029091055eSJustin Lebar return 1;
5039091055eSJustin Lebar }
5049091055eSJustin Lebar if (b1 == 0xF4) {
5059091055eSJustin Lebar if (b2 >= 0x80 && b2 <= 0x8F) {
5069091055eSJustin Lebar if (source == sourceEnd)
5079091055eSJustin Lebar return 2;
5089091055eSJustin Lebar
5099091055eSJustin Lebar b3 = *source;
5109091055eSJustin Lebar return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
5119091055eSJustin Lebar }
5129091055eSJustin Lebar return 1;
5139091055eSJustin Lebar }
5149091055eSJustin Lebar
5159091055eSJustin Lebar assert((b1 >= 0x80 && b1 <= 0xC1) || b1 >= 0xF5);
5169091055eSJustin Lebar /*
5179091055eSJustin Lebar * There are no valid sequences that start with these bytes. Maximal subpart
5189091055eSJustin Lebar * is defined to have length 1 in these cases.
5199091055eSJustin Lebar */
5209091055eSJustin Lebar return 1;
5219091055eSJustin Lebar }
5229091055eSJustin Lebar
5239091055eSJustin Lebar /* --------------------------------------------------------------------- */
5249091055eSJustin Lebar
5259091055eSJustin Lebar /*
5269091055eSJustin Lebar * Exported function to return the total number of bytes in a codepoint
5279091055eSJustin Lebar * represented in UTF-8, given the value of the first byte.
5289091055eSJustin Lebar */
getNumBytesForUTF8(UTF8 first)5299091055eSJustin Lebar unsigned getNumBytesForUTF8(UTF8 first) {
5309091055eSJustin Lebar return trailingBytesForUTF8[first] + 1;
5319091055eSJustin Lebar }
5329091055eSJustin Lebar
5339091055eSJustin Lebar /* --------------------------------------------------------------------- */
5349091055eSJustin Lebar
5359091055eSJustin Lebar /*
5369091055eSJustin Lebar * Exported function to return whether a UTF-8 string is legal or not.
5379091055eSJustin Lebar * This is not used here; it's just exported.
5389091055eSJustin Lebar */
isLegalUTF8String(const UTF8 ** source,const UTF8 * sourceEnd)5399091055eSJustin Lebar Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd) {
5409091055eSJustin Lebar while (*source != sourceEnd) {
5419091055eSJustin Lebar int length = trailingBytesForUTF8[**source] + 1;
5429091055eSJustin Lebar if (length > sourceEnd - *source || !isLegalUTF8(*source, length))
5439091055eSJustin Lebar return false;
5449091055eSJustin Lebar *source += length;
5459091055eSJustin Lebar }
5469091055eSJustin Lebar return true;
5479091055eSJustin Lebar }
5489091055eSJustin Lebar
5499091055eSJustin Lebar /* --------------------------------------------------------------------- */
5509091055eSJustin Lebar
ConvertUTF8toUTF16(const UTF8 ** sourceStart,const UTF8 * sourceEnd,UTF16 ** targetStart,UTF16 * targetEnd,ConversionFlags flags)5519091055eSJustin Lebar ConversionResult ConvertUTF8toUTF16 (
5529091055eSJustin Lebar const UTF8** sourceStart, const UTF8* sourceEnd,
5539091055eSJustin Lebar UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
5549091055eSJustin Lebar ConversionResult result = conversionOK;
5559091055eSJustin Lebar const UTF8* source = *sourceStart;
5569091055eSJustin Lebar UTF16* target = *targetStart;
5579091055eSJustin Lebar while (source < sourceEnd) {
5589091055eSJustin Lebar UTF32 ch = 0;
5599091055eSJustin Lebar unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
5609091055eSJustin Lebar if (extraBytesToRead >= sourceEnd - source) {
5619091055eSJustin Lebar result = sourceExhausted; break;
5629091055eSJustin Lebar }
5639091055eSJustin Lebar /* Do this check whether lenient or strict */
5649091055eSJustin Lebar if (!isLegalUTF8(source, extraBytesToRead+1)) {
5659091055eSJustin Lebar result = sourceIllegal;
5669091055eSJustin Lebar break;
5679091055eSJustin Lebar }
5689091055eSJustin Lebar /*
5699091055eSJustin Lebar * The cases all fall through. See "Note A" below.
5709091055eSJustin Lebar */
5719091055eSJustin Lebar switch (extraBytesToRead) {
5729091055eSJustin Lebar case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
5739091055eSJustin Lebar case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
5749091055eSJustin Lebar case 3: ch += *source++; ch <<= 6;
5759091055eSJustin Lebar case 2: ch += *source++; ch <<= 6;
5769091055eSJustin Lebar case 1: ch += *source++; ch <<= 6;
5779091055eSJustin Lebar case 0: ch += *source++;
5789091055eSJustin Lebar }
5799091055eSJustin Lebar ch -= offsetsFromUTF8[extraBytesToRead];
5809091055eSJustin Lebar
5819091055eSJustin Lebar if (target >= targetEnd) {
5829091055eSJustin Lebar source -= (extraBytesToRead+1); /* Back up source pointer! */
5839091055eSJustin Lebar result = targetExhausted; break;
5849091055eSJustin Lebar }
5859091055eSJustin Lebar if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
5869091055eSJustin Lebar /* UTF-16 surrogate values are illegal in UTF-32 */
5879091055eSJustin Lebar if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
5889091055eSJustin Lebar if (flags == strictConversion) {
5899091055eSJustin Lebar source -= (extraBytesToRead+1); /* return to the illegal value itself */
5909091055eSJustin Lebar result = sourceIllegal;
5919091055eSJustin Lebar break;
5929091055eSJustin Lebar } else {
5939091055eSJustin Lebar *target++ = UNI_REPLACEMENT_CHAR;
5949091055eSJustin Lebar }
5959091055eSJustin Lebar } else {
5969091055eSJustin Lebar *target++ = (UTF16)ch; /* normal case */
5979091055eSJustin Lebar }
5989091055eSJustin Lebar } else if (ch > UNI_MAX_UTF16) {
5999091055eSJustin Lebar if (flags == strictConversion) {
6009091055eSJustin Lebar result = sourceIllegal;
6019091055eSJustin Lebar source -= (extraBytesToRead+1); /* return to the start */
6029091055eSJustin Lebar break; /* Bail out; shouldn't continue */
6039091055eSJustin Lebar } else {
6049091055eSJustin Lebar *target++ = UNI_REPLACEMENT_CHAR;
6059091055eSJustin Lebar }
6069091055eSJustin Lebar } else {
6079091055eSJustin Lebar /* target is a character in range 0xFFFF - 0x10FFFF. */
6089091055eSJustin Lebar if (target + 1 >= targetEnd) {
6099091055eSJustin Lebar source -= (extraBytesToRead+1); /* Back up source pointer! */
6109091055eSJustin Lebar result = targetExhausted; break;
6119091055eSJustin Lebar }
6129091055eSJustin Lebar ch -= halfBase;
6139091055eSJustin Lebar *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
6149091055eSJustin Lebar *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
6159091055eSJustin Lebar }
6169091055eSJustin Lebar }
6179091055eSJustin Lebar *sourceStart = source;
6189091055eSJustin Lebar *targetStart = target;
6199091055eSJustin Lebar return result;
6209091055eSJustin Lebar }
6219091055eSJustin Lebar
6229091055eSJustin Lebar /* --------------------------------------------------------------------- */
6239091055eSJustin Lebar
ConvertUTF8toUTF32Impl(const UTF8 ** sourceStart,const UTF8 * sourceEnd,UTF32 ** targetStart,UTF32 * targetEnd,ConversionFlags flags,Boolean InputIsPartial)6249091055eSJustin Lebar static ConversionResult ConvertUTF8toUTF32Impl(
6259091055eSJustin Lebar const UTF8** sourceStart, const UTF8* sourceEnd,
6269091055eSJustin Lebar UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags,
6279091055eSJustin Lebar Boolean InputIsPartial) {
6289091055eSJustin Lebar ConversionResult result = conversionOK;
6299091055eSJustin Lebar const UTF8* source = *sourceStart;
6309091055eSJustin Lebar UTF32* target = *targetStart;
6319091055eSJustin Lebar while (source < sourceEnd) {
6329091055eSJustin Lebar UTF32 ch = 0;
6339091055eSJustin Lebar unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
6349091055eSJustin Lebar if (extraBytesToRead >= sourceEnd - source) {
6359091055eSJustin Lebar if (flags == strictConversion || InputIsPartial) {
6369091055eSJustin Lebar result = sourceExhausted;
6379091055eSJustin Lebar break;
6389091055eSJustin Lebar } else {
6399091055eSJustin Lebar result = sourceIllegal;
6409091055eSJustin Lebar
6419091055eSJustin Lebar /*
6429091055eSJustin Lebar * Replace the maximal subpart of ill-formed sequence with
6439091055eSJustin Lebar * replacement character.
6449091055eSJustin Lebar */
6459091055eSJustin Lebar source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
6469091055eSJustin Lebar sourceEnd);
6479091055eSJustin Lebar *target++ = UNI_REPLACEMENT_CHAR;
6489091055eSJustin Lebar continue;
6499091055eSJustin Lebar }
6509091055eSJustin Lebar }
6519091055eSJustin Lebar if (target >= targetEnd) {
6529091055eSJustin Lebar result = targetExhausted; break;
6539091055eSJustin Lebar }
6549091055eSJustin Lebar
6559091055eSJustin Lebar /* Do this check whether lenient or strict */
6569091055eSJustin Lebar if (!isLegalUTF8(source, extraBytesToRead+1)) {
6579091055eSJustin Lebar result = sourceIllegal;
6589091055eSJustin Lebar if (flags == strictConversion) {
6599091055eSJustin Lebar /* Abort conversion. */
6609091055eSJustin Lebar break;
6619091055eSJustin Lebar } else {
6629091055eSJustin Lebar /*
6639091055eSJustin Lebar * Replace the maximal subpart of ill-formed sequence with
6649091055eSJustin Lebar * replacement character.
6659091055eSJustin Lebar */
6669091055eSJustin Lebar source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
6679091055eSJustin Lebar sourceEnd);
6689091055eSJustin Lebar *target++ = UNI_REPLACEMENT_CHAR;
6699091055eSJustin Lebar continue;
6709091055eSJustin Lebar }
6719091055eSJustin Lebar }
6729091055eSJustin Lebar /*
6739091055eSJustin Lebar * The cases all fall through. See "Note A" below.
6749091055eSJustin Lebar */
6759091055eSJustin Lebar switch (extraBytesToRead) {
6769091055eSJustin Lebar case 5: ch += *source++; ch <<= 6;
6779091055eSJustin Lebar case 4: ch += *source++; ch <<= 6;
6789091055eSJustin Lebar case 3: ch += *source++; ch <<= 6;
6799091055eSJustin Lebar case 2: ch += *source++; ch <<= 6;
6809091055eSJustin Lebar case 1: ch += *source++; ch <<= 6;
6819091055eSJustin Lebar case 0: ch += *source++;
6829091055eSJustin Lebar }
6839091055eSJustin Lebar ch -= offsetsFromUTF8[extraBytesToRead];
6849091055eSJustin Lebar
6859091055eSJustin Lebar if (ch <= UNI_MAX_LEGAL_UTF32) {
6869091055eSJustin Lebar /*
6879091055eSJustin Lebar * UTF-16 surrogate values are illegal in UTF-32, and anything
6889091055eSJustin Lebar * over Plane 17 (> 0x10FFFF) is illegal.
6899091055eSJustin Lebar */
6909091055eSJustin Lebar if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
6919091055eSJustin Lebar if (flags == strictConversion) {
6929091055eSJustin Lebar source -= (extraBytesToRead+1); /* return to the illegal value itself */
6939091055eSJustin Lebar result = sourceIllegal;
6949091055eSJustin Lebar break;
6959091055eSJustin Lebar } else {
6969091055eSJustin Lebar *target++ = UNI_REPLACEMENT_CHAR;
6979091055eSJustin Lebar }
6989091055eSJustin Lebar } else {
6999091055eSJustin Lebar *target++ = ch;
7009091055eSJustin Lebar }
7019091055eSJustin Lebar } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
7029091055eSJustin Lebar result = sourceIllegal;
7039091055eSJustin Lebar *target++ = UNI_REPLACEMENT_CHAR;
7049091055eSJustin Lebar }
7059091055eSJustin Lebar }
7069091055eSJustin Lebar *sourceStart = source;
7079091055eSJustin Lebar *targetStart = target;
7089091055eSJustin Lebar return result;
7099091055eSJustin Lebar }
7109091055eSJustin Lebar
ConvertUTF8toUTF32Partial(const UTF8 ** sourceStart,const UTF8 * sourceEnd,UTF32 ** targetStart,UTF32 * targetEnd,ConversionFlags flags)7119091055eSJustin Lebar ConversionResult ConvertUTF8toUTF32Partial(const UTF8 **sourceStart,
7129091055eSJustin Lebar const UTF8 *sourceEnd,
7139091055eSJustin Lebar UTF32 **targetStart,
7149091055eSJustin Lebar UTF32 *targetEnd,
7159091055eSJustin Lebar ConversionFlags flags) {
7169091055eSJustin Lebar return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
7179091055eSJustin Lebar flags, /*InputIsPartial=*/true);
7189091055eSJustin Lebar }
7199091055eSJustin Lebar
ConvertUTF8toUTF32(const UTF8 ** sourceStart,const UTF8 * sourceEnd,UTF32 ** targetStart,UTF32 * targetEnd,ConversionFlags flags)7209091055eSJustin Lebar ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart,
7219091055eSJustin Lebar const UTF8 *sourceEnd, UTF32 **targetStart,
7229091055eSJustin Lebar UTF32 *targetEnd, ConversionFlags flags) {
7239091055eSJustin Lebar return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
7249091055eSJustin Lebar flags, /*InputIsPartial=*/false);
7259091055eSJustin Lebar }
7269091055eSJustin Lebar
/* ---------------------------------------------------------------------

    Note A.
    The fall-through switches in the UTF-8 reading code save a
    temporary variable, some decrements and some conditionals. Each
    switch is equivalent to the following loop:
        {
            int tmpBytesToRead = extraBytesToRead+1;
            do {
                ch += *source++;
                --tmpBytesToRead;
                if (tmpBytesToRead) ch <<= 6;
            } while (tmpBytesToRead > 0);
        }
    In the UTF-8 writing code, the switches on "bytesToWrite" are
    similarly unrolled loops.

   --------------------------------------------------------------------- */
7459091055eSJustin Lebar
7469091055eSJustin Lebar } // namespace llvm
747229c9c11SGalina Kistanova
748229c9c11SGalina Kistanova ConvertUTF_RESTORE_WARNINGS
749