1d88c1a5aSDimitry Andric /*===--- ConvertUTF.c - Universal Character Names conversions ---------------===
2d88c1a5aSDimitry Andric  *
3d88c1a5aSDimitry Andric  *                     The LLVM Compiler Infrastructure
4d88c1a5aSDimitry Andric  *
5d88c1a5aSDimitry Andric  * This file is distributed under the University of Illinois Open Source
6d88c1a5aSDimitry Andric  * License. See LICENSE.TXT for details.
7d88c1a5aSDimitry Andric  *
8d88c1a5aSDimitry Andric  *===------------------------------------------------------------------------=*/
9d88c1a5aSDimitry Andric /*
10d88c1a5aSDimitry Andric  * Copyright 2001-2004 Unicode, Inc.
11d88c1a5aSDimitry Andric  *
12d88c1a5aSDimitry Andric  * Disclaimer
13d88c1a5aSDimitry Andric  *
14d88c1a5aSDimitry Andric  * This source code is provided as is by Unicode, Inc. No claims are
15d88c1a5aSDimitry Andric  * made as to fitness for any particular purpose. No warranties of any
16d88c1a5aSDimitry Andric  * kind are expressed or implied. The recipient agrees to determine
17d88c1a5aSDimitry Andric  * applicability of information provided. If this file has been
18d88c1a5aSDimitry Andric  * purchased on magnetic or optical media from Unicode, Inc., the
19d88c1a5aSDimitry Andric  * sole remedy for any claim will be exchange of defective media
20d88c1a5aSDimitry Andric  * within 90 days of receipt.
21d88c1a5aSDimitry Andric  *
22d88c1a5aSDimitry Andric  * Limitations on Rights to Redistribute This Code
23d88c1a5aSDimitry Andric  *
24d88c1a5aSDimitry Andric  * Unicode, Inc. hereby grants the right to freely use the information
25d88c1a5aSDimitry Andric  * supplied in this file in the creation of products supporting the
26d88c1a5aSDimitry Andric  * Unicode Standard, and to make copies of this file in any form
27d88c1a5aSDimitry Andric  * for internal or external distribution as long as this notice
28d88c1a5aSDimitry Andric  * remains attached.
29d88c1a5aSDimitry Andric  */
30d88c1a5aSDimitry Andric 
31d88c1a5aSDimitry Andric /* ---------------------------------------------------------------------
32d88c1a5aSDimitry Andric 
33d88c1a5aSDimitry Andric     Conversions between UTF32, UTF-16, and UTF-8. Source code file.
34d88c1a5aSDimitry Andric     Author: Mark E. Davis, 1994.
35d88c1a5aSDimitry Andric     Rev History: Rick McGowan, fixes & updates May 2001.
36d88c1a5aSDimitry Andric     Sept 2001: fixed const & error conditions per
37d88c1a5aSDimitry Andric         mods suggested by S. Parent & A. Lillich.
38d88c1a5aSDimitry Andric     June 2002: Tim Dodd added detection and handling of incomplete
39d88c1a5aSDimitry Andric         source sequences, enhanced error detection, added casts
40d88c1a5aSDimitry Andric         to eliminate compiler warnings.
41d88c1a5aSDimitry Andric     July 2003: slight mods to back out aggressive FFFE detection.
42d88c1a5aSDimitry Andric     Jan 2004: updated switches in from-UTF8 conversions.
43d88c1a5aSDimitry Andric     Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
44d88c1a5aSDimitry Andric 
45d88c1a5aSDimitry Andric     See the header file "ConvertUTF.h" for complete documentation.
46d88c1a5aSDimitry Andric 
47d88c1a5aSDimitry Andric ------------------------------------------------------------------------ */
48d88c1a5aSDimitry Andric 
49d88c1a5aSDimitry Andric #include "llvm/Support/ConvertUTF.h"
50d88c1a5aSDimitry Andric #ifdef CVTUTF_DEBUG
51d88c1a5aSDimitry Andric #include <stdio.h>
52d88c1a5aSDimitry Andric #endif
53d88c1a5aSDimitry Andric #include <assert.h>
54d88c1a5aSDimitry Andric 
/*
 * The conversion routines in this file are written with switch statements
 * whose cases intentionally fall through.  Two macros bracket that code:
 * ConvertUTF_DISABLE_WARNINGS saves the diagnostic state and turns off
 * -Wimplicit-fallthrough, ConvertUTF_RESTORE_WARNINGS pops the saved state.
 * On compilers not matched below both macros expand to nothing.
 */
#if defined(__clang__) && defined(__has_warning)
/* __has_warning must be tested on its own line: it only exists on clang. */
# if __has_warning("-Wimplicit-fallthrough")
#  define ConvertUTF_RESTORE_WARNINGS \
    _Pragma("clang diagnostic pop")
#  define ConvertUTF_DISABLE_WARNINGS \
    _Pragma("clang diagnostic push")  \
    _Pragma("clang diagnostic ignored \"-Wimplicit-fallthrough\"")
# endif
#elif defined(__GNUC__) && __GNUC__ >= 7
# define ConvertUTF_RESTORE_WARNINGS \
   _Pragma("GCC diagnostic pop")
# define ConvertUTF_DISABLE_WARNINGS \
   _Pragma("GCC diagnostic push")    \
   _Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"")
#endif

/* Fall back to empty definitions when no branch above supplied them. */
#if !defined(ConvertUTF_DISABLE_WARNINGS)
# define ConvertUTF_DISABLE_WARNINGS
#endif
#if !defined(ConvertUTF_RESTORE_WARNINGS)
# define ConvertUTF_RESTORE_WARNINGS
#endif

/* Silence the fall-through diagnostic for the code that follows. */
ConvertUTF_DISABLE_WARNINGS
82302affcbSDimitry Andric 
83d88c1a5aSDimitry Andric namespace llvm {
84d88c1a5aSDimitry Andric 
85d88c1a5aSDimitry Andric static const int halfShift  = 10; /* used for shifting by 10 bits */
86d88c1a5aSDimitry Andric 
87d88c1a5aSDimitry Andric static const UTF32 halfBase = 0x0010000UL;
88d88c1a5aSDimitry Andric static const UTF32 halfMask = 0x3FFUL;
89d88c1a5aSDimitry Andric 
90d88c1a5aSDimitry Andric #define UNI_SUR_HIGH_START  (UTF32)0xD800
91d88c1a5aSDimitry Andric #define UNI_SUR_HIGH_END    (UTF32)0xDBFF
92d88c1a5aSDimitry Andric #define UNI_SUR_LOW_START   (UTF32)0xDC00
93d88c1a5aSDimitry Andric #define UNI_SUR_LOW_END     (UTF32)0xDFFF
94d88c1a5aSDimitry Andric 
95d88c1a5aSDimitry Andric /* --------------------------------------------------------------------- */
96d88c1a5aSDimitry Andric 
/*
 * Index into the table below with the first byte of a UTF-8 sequence to
 * get the number of trailing bytes that are supposed to follow it.
 * Bytes 0x00-0xBF map to 0, 0xC0-0xDF to 1, 0xE0-0xEF to 2, 0xF0-0xF7 to 3,
 * 0xF8-0xFB to 4 and 0xFC-0xFF to 5.
 * Note that *legal* UTF-8 never has 4 or 5 trailing bytes. Those entries are
 * left as-is for anyone who may want to do such conversion, which was
 * allowed in earlier algorithms.
 */
static const char trailingBytesForUTF8[256] = {
    /* 0x00 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x10 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x20 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x30 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x40 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x50 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x60 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x70 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x80 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x90 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xA0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xB0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xD0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xE0 */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xF0 */ 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
114d88c1a5aSDimitry Andric 
115d88c1a5aSDimitry Andric /*
116d88c1a5aSDimitry Andric  * Magic values subtracted from a buffer value during UTF8 conversion.
117d88c1a5aSDimitry Andric  * This table contains as many values as there might be trailing bytes
118d88c1a5aSDimitry Andric  * in a UTF-8 sequence.
119d88c1a5aSDimitry Andric  */
120d88c1a5aSDimitry Andric static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
121d88c1a5aSDimitry Andric                      0x03C82080UL, 0xFA082080UL, 0x82082080UL };
122d88c1a5aSDimitry Andric 
123d88c1a5aSDimitry Andric /*
124d88c1a5aSDimitry Andric  * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
125d88c1a5aSDimitry Andric  * into the first byte, depending on how many bytes follow.  There are
126d88c1a5aSDimitry Andric  * as many entries in this table as there are UTF-8 sequence types.
127d88c1a5aSDimitry Andric  * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
128d88c1a5aSDimitry Andric  * for *legal* UTF-8 will be 4 or fewer bytes total.
129d88c1a5aSDimitry Andric  */
130d88c1a5aSDimitry Andric static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
131d88c1a5aSDimitry Andric 
132d88c1a5aSDimitry Andric /* --------------------------------------------------------------------- */
133d88c1a5aSDimitry Andric 
134d88c1a5aSDimitry Andric /* The interface converts a whole buffer to avoid function-call overhead.
135d88c1a5aSDimitry Andric  * Constants have been gathered. Loops & conditionals have been removed as
136d88c1a5aSDimitry Andric  * much as possible for efficiency, in favor of drop-through switches.
137d88c1a5aSDimitry Andric  * (See "Note A" at the bottom of the file for equivalent code.)
138d88c1a5aSDimitry Andric  * If your compiler supports it, the "isLegalUTF8" call can be turned
139d88c1a5aSDimitry Andric  * into an inline function.
140d88c1a5aSDimitry Andric  */
141d88c1a5aSDimitry Andric 
142d88c1a5aSDimitry Andric 
143d88c1a5aSDimitry Andric /* --------------------------------------------------------------------- */
144d88c1a5aSDimitry Andric 
ConvertUTF32toUTF16(const UTF32 ** sourceStart,const UTF32 * sourceEnd,UTF16 ** targetStart,UTF16 * targetEnd,ConversionFlags flags)145d88c1a5aSDimitry Andric ConversionResult ConvertUTF32toUTF16 (
146d88c1a5aSDimitry Andric         const UTF32** sourceStart, const UTF32* sourceEnd,
147d88c1a5aSDimitry Andric         UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
148d88c1a5aSDimitry Andric     ConversionResult result = conversionOK;
149d88c1a5aSDimitry Andric     const UTF32* source = *sourceStart;
150d88c1a5aSDimitry Andric     UTF16* target = *targetStart;
151d88c1a5aSDimitry Andric     while (source < sourceEnd) {
152d88c1a5aSDimitry Andric         UTF32 ch;
153d88c1a5aSDimitry Andric         if (target >= targetEnd) {
154d88c1a5aSDimitry Andric             result = targetExhausted; break;
155d88c1a5aSDimitry Andric         }
156d88c1a5aSDimitry Andric         ch = *source++;
157d88c1a5aSDimitry Andric         if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
158d88c1a5aSDimitry Andric             /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
159d88c1a5aSDimitry Andric             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
160d88c1a5aSDimitry Andric                 if (flags == strictConversion) {
161d88c1a5aSDimitry Andric                     --source; /* return to the illegal value itself */
162d88c1a5aSDimitry Andric                     result = sourceIllegal;
163d88c1a5aSDimitry Andric                     break;
164d88c1a5aSDimitry Andric                 } else {
165d88c1a5aSDimitry Andric                     *target++ = UNI_REPLACEMENT_CHAR;
166d88c1a5aSDimitry Andric                 }
167d88c1a5aSDimitry Andric             } else {
168d88c1a5aSDimitry Andric                 *target++ = (UTF16)ch; /* normal case */
169d88c1a5aSDimitry Andric             }
170d88c1a5aSDimitry Andric         } else if (ch > UNI_MAX_LEGAL_UTF32) {
171d88c1a5aSDimitry Andric             if (flags == strictConversion) {
172d88c1a5aSDimitry Andric                 result = sourceIllegal;
173d88c1a5aSDimitry Andric             } else {
174d88c1a5aSDimitry Andric                 *target++ = UNI_REPLACEMENT_CHAR;
175d88c1a5aSDimitry Andric             }
176d88c1a5aSDimitry Andric         } else {
177d88c1a5aSDimitry Andric             /* target is a character in range 0xFFFF - 0x10FFFF. */
178d88c1a5aSDimitry Andric             if (target + 1 >= targetEnd) {
179d88c1a5aSDimitry Andric                 --source; /* Back up source pointer! */
180d88c1a5aSDimitry Andric                 result = targetExhausted; break;
181d88c1a5aSDimitry Andric             }
182d88c1a5aSDimitry Andric             ch -= halfBase;
183d88c1a5aSDimitry Andric             *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
184d88c1a5aSDimitry Andric             *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
185d88c1a5aSDimitry Andric         }
186d88c1a5aSDimitry Andric     }
187d88c1a5aSDimitry Andric     *sourceStart = source;
188d88c1a5aSDimitry Andric     *targetStart = target;
189d88c1a5aSDimitry Andric     return result;
190d88c1a5aSDimitry Andric }
191d88c1a5aSDimitry Andric 
192d88c1a5aSDimitry Andric /* --------------------------------------------------------------------- */
193d88c1a5aSDimitry Andric 
ConvertUTF16toUTF32(const UTF16 ** sourceStart,const UTF16 * sourceEnd,UTF32 ** targetStart,UTF32 * targetEnd,ConversionFlags flags)194d88c1a5aSDimitry Andric ConversionResult ConvertUTF16toUTF32 (
195d88c1a5aSDimitry Andric         const UTF16** sourceStart, const UTF16* sourceEnd,
196d88c1a5aSDimitry Andric         UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
197d88c1a5aSDimitry Andric     ConversionResult result = conversionOK;
198d88c1a5aSDimitry Andric     const UTF16* source = *sourceStart;
199d88c1a5aSDimitry Andric     UTF32* target = *targetStart;
200d88c1a5aSDimitry Andric     UTF32 ch, ch2;
201d88c1a5aSDimitry Andric     while (source < sourceEnd) {
202d88c1a5aSDimitry Andric         const UTF16* oldSource = source; /*  In case we have to back up because of target overflow. */
203d88c1a5aSDimitry Andric         ch = *source++;
204d88c1a5aSDimitry Andric         /* If we have a surrogate pair, convert to UTF32 first. */
205d88c1a5aSDimitry Andric         if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
206d88c1a5aSDimitry Andric             /* If the 16 bits following the high surrogate are in the source buffer... */
207d88c1a5aSDimitry Andric             if (source < sourceEnd) {
208d88c1a5aSDimitry Andric                 ch2 = *source;
209d88c1a5aSDimitry Andric                 /* If it's a low surrogate, convert to UTF32. */
210d88c1a5aSDimitry Andric                 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
211d88c1a5aSDimitry Andric                     ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
212d88c1a5aSDimitry Andric                         + (ch2 - UNI_SUR_LOW_START) + halfBase;
213d88c1a5aSDimitry Andric                     ++source;
214d88c1a5aSDimitry Andric                 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
215d88c1a5aSDimitry Andric                     --source; /* return to the illegal value itself */
216d88c1a5aSDimitry Andric                     result = sourceIllegal;
217d88c1a5aSDimitry Andric                     break;
218d88c1a5aSDimitry Andric                 }
219d88c1a5aSDimitry Andric             } else { /* We don't have the 16 bits following the high surrogate. */
220d88c1a5aSDimitry Andric                 --source; /* return to the high surrogate */
221d88c1a5aSDimitry Andric                 result = sourceExhausted;
222d88c1a5aSDimitry Andric                 break;
223d88c1a5aSDimitry Andric             }
224d88c1a5aSDimitry Andric         } else if (flags == strictConversion) {
225d88c1a5aSDimitry Andric             /* UTF-16 surrogate values are illegal in UTF-32 */
226d88c1a5aSDimitry Andric             if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
227d88c1a5aSDimitry Andric                 --source; /* return to the illegal value itself */
228d88c1a5aSDimitry Andric                 result = sourceIllegal;
229d88c1a5aSDimitry Andric                 break;
230d88c1a5aSDimitry Andric             }
231d88c1a5aSDimitry Andric         }
232d88c1a5aSDimitry Andric         if (target >= targetEnd) {
233d88c1a5aSDimitry Andric             source = oldSource; /* Back up source pointer! */
234d88c1a5aSDimitry Andric             result = targetExhausted; break;
235d88c1a5aSDimitry Andric         }
236d88c1a5aSDimitry Andric         *target++ = ch;
237d88c1a5aSDimitry Andric     }
238d88c1a5aSDimitry Andric     *sourceStart = source;
239d88c1a5aSDimitry Andric     *targetStart = target;
240d88c1a5aSDimitry Andric #ifdef CVTUTF_DEBUG
241d88c1a5aSDimitry Andric if (result == sourceIllegal) {
242d88c1a5aSDimitry Andric     fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
243d88c1a5aSDimitry Andric     fflush(stderr);
244d88c1a5aSDimitry Andric }
245d88c1a5aSDimitry Andric #endif
246d88c1a5aSDimitry Andric     return result;
247d88c1a5aSDimitry Andric }
ConvertUTF16toUTF8(const UTF16 ** sourceStart,const UTF16 * sourceEnd,UTF8 ** targetStart,UTF8 * targetEnd,ConversionFlags flags)248d88c1a5aSDimitry Andric ConversionResult ConvertUTF16toUTF8 (
249d88c1a5aSDimitry Andric         const UTF16** sourceStart, const UTF16* sourceEnd,
250d88c1a5aSDimitry Andric         UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
251d88c1a5aSDimitry Andric     ConversionResult result = conversionOK;
252d88c1a5aSDimitry Andric     const UTF16* source = *sourceStart;
253d88c1a5aSDimitry Andric     UTF8* target = *targetStart;
254d88c1a5aSDimitry Andric     while (source < sourceEnd) {
255d88c1a5aSDimitry Andric         UTF32 ch;
256d88c1a5aSDimitry Andric         unsigned short bytesToWrite = 0;
257d88c1a5aSDimitry Andric         const UTF32 byteMask = 0xBF;
258d88c1a5aSDimitry Andric         const UTF32 byteMark = 0x80;
259d88c1a5aSDimitry Andric         const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
260d88c1a5aSDimitry Andric         ch = *source++;
261d88c1a5aSDimitry Andric         /* If we have a surrogate pair, convert to UTF32 first. */
262d88c1a5aSDimitry Andric         if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
263d88c1a5aSDimitry Andric             /* If the 16 bits following the high surrogate are in the source buffer... */
264d88c1a5aSDimitry Andric             if (source < sourceEnd) {
265d88c1a5aSDimitry Andric                 UTF32 ch2 = *source;
266d88c1a5aSDimitry Andric                 /* If it's a low surrogate, convert to UTF32. */
267d88c1a5aSDimitry Andric                 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
268d88c1a5aSDimitry Andric                     ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
269d88c1a5aSDimitry Andric                         + (ch2 - UNI_SUR_LOW_START) + halfBase;
270d88c1a5aSDimitry Andric                     ++source;
271d88c1a5aSDimitry Andric                 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
272d88c1a5aSDimitry Andric                     --source; /* return to the illegal value itself */
273d88c1a5aSDimitry Andric                     result = sourceIllegal;
274d88c1a5aSDimitry Andric                     break;
275d88c1a5aSDimitry Andric                 }
276d88c1a5aSDimitry Andric             } else { /* We don't have the 16 bits following the high surrogate. */
277d88c1a5aSDimitry Andric                 --source; /* return to the high surrogate */
278d88c1a5aSDimitry Andric                 result = sourceExhausted;
279d88c1a5aSDimitry Andric                 break;
280d88c1a5aSDimitry Andric             }
281d88c1a5aSDimitry Andric         } else if (flags == strictConversion) {
282d88c1a5aSDimitry Andric             /* UTF-16 surrogate values are illegal in UTF-32 */
283d88c1a5aSDimitry Andric             if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
284d88c1a5aSDimitry Andric                 --source; /* return to the illegal value itself */
285d88c1a5aSDimitry Andric                 result = sourceIllegal;
286d88c1a5aSDimitry Andric                 break;
287d88c1a5aSDimitry Andric             }
288d88c1a5aSDimitry Andric         }
289d88c1a5aSDimitry Andric         /* Figure out how many bytes the result will require */
290d88c1a5aSDimitry Andric         if (ch < (UTF32)0x80) {      bytesToWrite = 1;
291d88c1a5aSDimitry Andric         } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
292d88c1a5aSDimitry Andric         } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
293d88c1a5aSDimitry Andric         } else if (ch < (UTF32)0x110000) {  bytesToWrite = 4;
294d88c1a5aSDimitry Andric         } else {                            bytesToWrite = 3;
295d88c1a5aSDimitry Andric                                             ch = UNI_REPLACEMENT_CHAR;
296d88c1a5aSDimitry Andric         }
297d88c1a5aSDimitry Andric 
298d88c1a5aSDimitry Andric         target += bytesToWrite;
299d88c1a5aSDimitry Andric         if (target > targetEnd) {
300d88c1a5aSDimitry Andric             source = oldSource; /* Back up source pointer! */
301d88c1a5aSDimitry Andric             target -= bytesToWrite; result = targetExhausted; break;
302d88c1a5aSDimitry Andric         }
303d88c1a5aSDimitry Andric         switch (bytesToWrite) { /* note: everything falls through. */
304d88c1a5aSDimitry Andric             case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
305d88c1a5aSDimitry Andric             case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
306d88c1a5aSDimitry Andric             case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
307d88c1a5aSDimitry Andric             case 1: *--target =  (UTF8)(ch | firstByteMark[bytesToWrite]);
308d88c1a5aSDimitry Andric         }
309d88c1a5aSDimitry Andric         target += bytesToWrite;
310d88c1a5aSDimitry Andric     }
311d88c1a5aSDimitry Andric     *sourceStart = source;
312d88c1a5aSDimitry Andric     *targetStart = target;
313d88c1a5aSDimitry Andric     return result;
314d88c1a5aSDimitry Andric }
315d88c1a5aSDimitry Andric 
316d88c1a5aSDimitry Andric /* --------------------------------------------------------------------- */
317d88c1a5aSDimitry Andric 
ConvertUTF32toUTF8(const UTF32 ** sourceStart,const UTF32 * sourceEnd,UTF8 ** targetStart,UTF8 * targetEnd,ConversionFlags flags)318d88c1a5aSDimitry Andric ConversionResult ConvertUTF32toUTF8 (
319d88c1a5aSDimitry Andric         const UTF32** sourceStart, const UTF32* sourceEnd,
320d88c1a5aSDimitry Andric         UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
321d88c1a5aSDimitry Andric     ConversionResult result = conversionOK;
322d88c1a5aSDimitry Andric     const UTF32* source = *sourceStart;
323d88c1a5aSDimitry Andric     UTF8* target = *targetStart;
324d88c1a5aSDimitry Andric     while (source < sourceEnd) {
325d88c1a5aSDimitry Andric         UTF32 ch;
326d88c1a5aSDimitry Andric         unsigned short bytesToWrite = 0;
327d88c1a5aSDimitry Andric         const UTF32 byteMask = 0xBF;
328d88c1a5aSDimitry Andric         const UTF32 byteMark = 0x80;
329d88c1a5aSDimitry Andric         ch = *source++;
330d88c1a5aSDimitry Andric         if (flags == strictConversion ) {
331d88c1a5aSDimitry Andric             /* UTF-16 surrogate values are illegal in UTF-32 */
332d88c1a5aSDimitry Andric             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
333d88c1a5aSDimitry Andric                 --source; /* return to the illegal value itself */
334d88c1a5aSDimitry Andric                 result = sourceIllegal;
335d88c1a5aSDimitry Andric                 break;
336d88c1a5aSDimitry Andric             }
337d88c1a5aSDimitry Andric         }
338d88c1a5aSDimitry Andric         /*
339d88c1a5aSDimitry Andric          * Figure out how many bytes the result will require. Turn any
340d88c1a5aSDimitry Andric          * illegally large UTF32 things (> Plane 17) into replacement chars.
341d88c1a5aSDimitry Andric          */
342d88c1a5aSDimitry Andric         if (ch < (UTF32)0x80) {      bytesToWrite = 1;
343d88c1a5aSDimitry Andric         } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
344d88c1a5aSDimitry Andric         } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
345d88c1a5aSDimitry Andric         } else if (ch <= UNI_MAX_LEGAL_UTF32) {  bytesToWrite = 4;
346d88c1a5aSDimitry Andric         } else {                            bytesToWrite = 3;
347d88c1a5aSDimitry Andric                                             ch = UNI_REPLACEMENT_CHAR;
348d88c1a5aSDimitry Andric                                             result = sourceIllegal;
349d88c1a5aSDimitry Andric         }
350d88c1a5aSDimitry Andric 
351d88c1a5aSDimitry Andric         target += bytesToWrite;
352d88c1a5aSDimitry Andric         if (target > targetEnd) {
353d88c1a5aSDimitry Andric             --source; /* Back up source pointer! */
354d88c1a5aSDimitry Andric             target -= bytesToWrite; result = targetExhausted; break;
355d88c1a5aSDimitry Andric         }
356d88c1a5aSDimitry Andric         switch (bytesToWrite) { /* note: everything falls through. */
357d88c1a5aSDimitry Andric             case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
358d88c1a5aSDimitry Andric             case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
359d88c1a5aSDimitry Andric             case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
360d88c1a5aSDimitry Andric             case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
361d88c1a5aSDimitry Andric         }
362d88c1a5aSDimitry Andric         target += bytesToWrite;
363d88c1a5aSDimitry Andric     }
364d88c1a5aSDimitry Andric     *sourceStart = source;
365d88c1a5aSDimitry Andric     *targetStart = target;
366d88c1a5aSDimitry Andric     return result;
367d88c1a5aSDimitry Andric }
368d88c1a5aSDimitry Andric 
369d88c1a5aSDimitry Andric /* --------------------------------------------------------------------- */
370d88c1a5aSDimitry Andric 
371d88c1a5aSDimitry Andric /*
372d88c1a5aSDimitry Andric  * Utility routine to tell whether a sequence of bytes is legal UTF-8.
373d88c1a5aSDimitry Andric  * This must be called with the length pre-determined by the first byte.
374d88c1a5aSDimitry Andric  * If not calling this from ConvertUTF8to*, then the length can be set by:
375d88c1a5aSDimitry Andric  *  length = trailingBytesForUTF8[*source]+1;
376d88c1a5aSDimitry Andric  * and the sequence is illegal right away if there aren't that many bytes
377d88c1a5aSDimitry Andric  * available.
378d88c1a5aSDimitry Andric  * If presented with a length > 4, this returns false.  The Unicode
379d88c1a5aSDimitry Andric  * definition of UTF-8 goes up to 4-byte sequences.
380d88c1a5aSDimitry Andric  */
381d88c1a5aSDimitry Andric 
isLegalUTF8(const UTF8 * source,int length)382d88c1a5aSDimitry Andric static Boolean isLegalUTF8(const UTF8 *source, int length) {
383d88c1a5aSDimitry Andric     UTF8 a;
384d88c1a5aSDimitry Andric     const UTF8 *srcptr = source+length;
385d88c1a5aSDimitry Andric     switch (length) {
386d88c1a5aSDimitry Andric     default: return false;
387d88c1a5aSDimitry Andric         /* Everything else falls through when "true"... */
388d88c1a5aSDimitry Andric     case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
389d88c1a5aSDimitry Andric     case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
390d88c1a5aSDimitry Andric     case 2: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
391d88c1a5aSDimitry Andric 
392d88c1a5aSDimitry Andric         switch (*source) {
393d88c1a5aSDimitry Andric             /* no fall-through in this inner switch */
394d88c1a5aSDimitry Andric             case 0xE0: if (a < 0xA0) return false; break;
395d88c1a5aSDimitry Andric             case 0xED: if (a > 0x9F) return false; break;
396d88c1a5aSDimitry Andric             case 0xF0: if (a < 0x90) return false; break;
397d88c1a5aSDimitry Andric             case 0xF4: if (a > 0x8F) return false; break;
398d88c1a5aSDimitry Andric             default:   if (a < 0x80) return false;
399d88c1a5aSDimitry Andric         }
400d88c1a5aSDimitry Andric 
401d88c1a5aSDimitry Andric     case 1: if (*source >= 0x80 && *source < 0xC2) return false;
402d88c1a5aSDimitry Andric     }
403d88c1a5aSDimitry Andric     if (*source > 0xF4) return false;
404d88c1a5aSDimitry Andric     return true;
405d88c1a5aSDimitry Andric }
406d88c1a5aSDimitry Andric 
407d88c1a5aSDimitry Andric /* --------------------------------------------------------------------- */
408d88c1a5aSDimitry Andric 
409d88c1a5aSDimitry Andric /*
410d88c1a5aSDimitry Andric  * Exported function to return whether a UTF-8 sequence is legal or not.
411d88c1a5aSDimitry Andric  * This is not used here; it's just exported.
412d88c1a5aSDimitry Andric  */
isLegalUTF8Sequence(const UTF8 * source,const UTF8 * sourceEnd)413d88c1a5aSDimitry Andric Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
414d88c1a5aSDimitry Andric     int length = trailingBytesForUTF8[*source]+1;
415d88c1a5aSDimitry Andric     if (length > sourceEnd - source) {
416d88c1a5aSDimitry Andric         return false;
417d88c1a5aSDimitry Andric     }
418d88c1a5aSDimitry Andric     return isLegalUTF8(source, length);
419d88c1a5aSDimitry Andric }
420d88c1a5aSDimitry Andric 
421d88c1a5aSDimitry Andric /* --------------------------------------------------------------------- */
422d88c1a5aSDimitry Andric 
423d88c1a5aSDimitry Andric static unsigned
findMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 * source,const UTF8 * sourceEnd)424d88c1a5aSDimitry Andric findMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 *source,
425d88c1a5aSDimitry Andric                                           const UTF8 *sourceEnd) {
426d88c1a5aSDimitry Andric   UTF8 b1, b2, b3;
427d88c1a5aSDimitry Andric 
428d88c1a5aSDimitry Andric   assert(!isLegalUTF8Sequence(source, sourceEnd));
429d88c1a5aSDimitry Andric 
430d88c1a5aSDimitry Andric   /*
431d88c1a5aSDimitry Andric    * Unicode 6.3.0, D93b:
432d88c1a5aSDimitry Andric    *
433d88c1a5aSDimitry Andric    *   Maximal subpart of an ill-formed subsequence: The longest code unit
434d88c1a5aSDimitry Andric    *   subsequence starting at an unconvertible offset that is either:
435d88c1a5aSDimitry Andric    *   a. the initial subsequence of a well-formed code unit sequence, or
436d88c1a5aSDimitry Andric    *   b. a subsequence of length one.
437d88c1a5aSDimitry Andric    */
438d88c1a5aSDimitry Andric 
439d88c1a5aSDimitry Andric   if (source == sourceEnd)
440d88c1a5aSDimitry Andric     return 0;
441d88c1a5aSDimitry Andric 
442d88c1a5aSDimitry Andric   /*
443d88c1a5aSDimitry Andric    * Perform case analysis.  See Unicode 6.3.0, Table 3-7. Well-Formed UTF-8
444d88c1a5aSDimitry Andric    * Byte Sequences.
445d88c1a5aSDimitry Andric    */
446d88c1a5aSDimitry Andric 
447d88c1a5aSDimitry Andric   b1 = *source;
448d88c1a5aSDimitry Andric   ++source;
449d88c1a5aSDimitry Andric   if (b1 >= 0xC2 && b1 <= 0xDF) {
450d88c1a5aSDimitry Andric     /*
451d88c1a5aSDimitry Andric      * First byte is valid, but we know that this code unit sequence is
452d88c1a5aSDimitry Andric      * invalid, so the maximal subpart has to end after the first byte.
453d88c1a5aSDimitry Andric      */
454d88c1a5aSDimitry Andric     return 1;
455d88c1a5aSDimitry Andric   }
456d88c1a5aSDimitry Andric 
457d88c1a5aSDimitry Andric   if (source == sourceEnd)
458d88c1a5aSDimitry Andric     return 1;
459d88c1a5aSDimitry Andric 
460d88c1a5aSDimitry Andric   b2 = *source;
461d88c1a5aSDimitry Andric   ++source;
462d88c1a5aSDimitry Andric 
463d88c1a5aSDimitry Andric   if (b1 == 0xE0) {
464d88c1a5aSDimitry Andric     return (b2 >= 0xA0 && b2 <= 0xBF) ? 2 : 1;
465d88c1a5aSDimitry Andric   }
466d88c1a5aSDimitry Andric   if (b1 >= 0xE1 && b1 <= 0xEC) {
467d88c1a5aSDimitry Andric     return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
468d88c1a5aSDimitry Andric   }
469d88c1a5aSDimitry Andric   if (b1 == 0xED) {
470d88c1a5aSDimitry Andric     return (b2 >= 0x80 && b2 <= 0x9F) ? 2 : 1;
471d88c1a5aSDimitry Andric   }
472d88c1a5aSDimitry Andric   if (b1 >= 0xEE && b1 <= 0xEF) {
473d88c1a5aSDimitry Andric     return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
474d88c1a5aSDimitry Andric   }
475d88c1a5aSDimitry Andric   if (b1 == 0xF0) {
476d88c1a5aSDimitry Andric     if (b2 >= 0x90 && b2 <= 0xBF) {
477d88c1a5aSDimitry Andric       if (source == sourceEnd)
478d88c1a5aSDimitry Andric         return 2;
479d88c1a5aSDimitry Andric 
480d88c1a5aSDimitry Andric       b3 = *source;
481d88c1a5aSDimitry Andric       return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
482d88c1a5aSDimitry Andric     }
483d88c1a5aSDimitry Andric     return 1;
484d88c1a5aSDimitry Andric   }
485d88c1a5aSDimitry Andric   if (b1 >= 0xF1 && b1 <= 0xF3) {
486d88c1a5aSDimitry Andric     if (b2 >= 0x80 && b2 <= 0xBF) {
487d88c1a5aSDimitry Andric       if (source == sourceEnd)
488d88c1a5aSDimitry Andric         return 2;
489d88c1a5aSDimitry Andric 
490d88c1a5aSDimitry Andric       b3 = *source;
491d88c1a5aSDimitry Andric       return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
492d88c1a5aSDimitry Andric     }
493d88c1a5aSDimitry Andric     return 1;
494d88c1a5aSDimitry Andric   }
495d88c1a5aSDimitry Andric   if (b1 == 0xF4) {
496d88c1a5aSDimitry Andric     if (b2 >= 0x80 && b2 <= 0x8F) {
497d88c1a5aSDimitry Andric       if (source == sourceEnd)
498d88c1a5aSDimitry Andric         return 2;
499d88c1a5aSDimitry Andric 
500d88c1a5aSDimitry Andric       b3 = *source;
501d88c1a5aSDimitry Andric       return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
502d88c1a5aSDimitry Andric     }
503d88c1a5aSDimitry Andric     return 1;
504d88c1a5aSDimitry Andric   }
505d88c1a5aSDimitry Andric 
506d88c1a5aSDimitry Andric   assert((b1 >= 0x80 && b1 <= 0xC1) || b1 >= 0xF5);
507d88c1a5aSDimitry Andric   /*
508d88c1a5aSDimitry Andric    * There are no valid sequences that start with these bytes.  Maximal subpart
509d88c1a5aSDimitry Andric    * is defined to have length 1 in these cases.
510d88c1a5aSDimitry Andric    */
511d88c1a5aSDimitry Andric   return 1;
512d88c1a5aSDimitry Andric }
513d88c1a5aSDimitry Andric 
514d88c1a5aSDimitry Andric /* --------------------------------------------------------------------- */
515d88c1a5aSDimitry Andric 
516d88c1a5aSDimitry Andric /*
517d88c1a5aSDimitry Andric  * Exported function to return the total number of bytes in a codepoint
518d88c1a5aSDimitry Andric  * represented in UTF-8, given the value of the first byte.
519d88c1a5aSDimitry Andric  */
getNumBytesForUTF8(UTF8 first)520d88c1a5aSDimitry Andric unsigned getNumBytesForUTF8(UTF8 first) {
521d88c1a5aSDimitry Andric   return trailingBytesForUTF8[first] + 1;
522d88c1a5aSDimitry Andric }
523d88c1a5aSDimitry Andric 
524d88c1a5aSDimitry Andric /* --------------------------------------------------------------------- */
525d88c1a5aSDimitry Andric 
526d88c1a5aSDimitry Andric /*
527d88c1a5aSDimitry Andric  * Exported function to return whether a UTF-8 string is legal or not.
528d88c1a5aSDimitry Andric  * This is not used here; it's just exported.
529d88c1a5aSDimitry Andric  */
isLegalUTF8String(const UTF8 ** source,const UTF8 * sourceEnd)530d88c1a5aSDimitry Andric Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd) {
531d88c1a5aSDimitry Andric     while (*source != sourceEnd) {
532d88c1a5aSDimitry Andric         int length = trailingBytesForUTF8[**source] + 1;
533d88c1a5aSDimitry Andric         if (length > sourceEnd - *source || !isLegalUTF8(*source, length))
534d88c1a5aSDimitry Andric             return false;
535d88c1a5aSDimitry Andric         *source += length;
536d88c1a5aSDimitry Andric     }
537d88c1a5aSDimitry Andric     return true;
538d88c1a5aSDimitry Andric }
539d88c1a5aSDimitry Andric 
540d88c1a5aSDimitry Andric /* --------------------------------------------------------------------- */
541d88c1a5aSDimitry Andric 
ConvertUTF8toUTF16(const UTF8 ** sourceStart,const UTF8 * sourceEnd,UTF16 ** targetStart,UTF16 * targetEnd,ConversionFlags flags)542d88c1a5aSDimitry Andric ConversionResult ConvertUTF8toUTF16 (
543d88c1a5aSDimitry Andric         const UTF8** sourceStart, const UTF8* sourceEnd,
544d88c1a5aSDimitry Andric         UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
545d88c1a5aSDimitry Andric     ConversionResult result = conversionOK;
546d88c1a5aSDimitry Andric     const UTF8* source = *sourceStart;
547d88c1a5aSDimitry Andric     UTF16* target = *targetStart;
548d88c1a5aSDimitry Andric     while (source < sourceEnd) {
549d88c1a5aSDimitry Andric         UTF32 ch = 0;
550d88c1a5aSDimitry Andric         unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
551d88c1a5aSDimitry Andric         if (extraBytesToRead >= sourceEnd - source) {
552d88c1a5aSDimitry Andric             result = sourceExhausted; break;
553d88c1a5aSDimitry Andric         }
554d88c1a5aSDimitry Andric         /* Do this check whether lenient or strict */
555d88c1a5aSDimitry Andric         if (!isLegalUTF8(source, extraBytesToRead+1)) {
556d88c1a5aSDimitry Andric             result = sourceIllegal;
557d88c1a5aSDimitry Andric             break;
558d88c1a5aSDimitry Andric         }
559d88c1a5aSDimitry Andric         /*
560d88c1a5aSDimitry Andric          * The cases all fall through. See "Note A" below.
561d88c1a5aSDimitry Andric          */
562d88c1a5aSDimitry Andric         switch (extraBytesToRead) {
563d88c1a5aSDimitry Andric             case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
564d88c1a5aSDimitry Andric             case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
565d88c1a5aSDimitry Andric             case 3: ch += *source++; ch <<= 6;
566d88c1a5aSDimitry Andric             case 2: ch += *source++; ch <<= 6;
567d88c1a5aSDimitry Andric             case 1: ch += *source++; ch <<= 6;
568d88c1a5aSDimitry Andric             case 0: ch += *source++;
569d88c1a5aSDimitry Andric         }
570d88c1a5aSDimitry Andric         ch -= offsetsFromUTF8[extraBytesToRead];
571d88c1a5aSDimitry Andric 
572d88c1a5aSDimitry Andric         if (target >= targetEnd) {
573d88c1a5aSDimitry Andric             source -= (extraBytesToRead+1); /* Back up source pointer! */
574d88c1a5aSDimitry Andric             result = targetExhausted; break;
575d88c1a5aSDimitry Andric         }
576d88c1a5aSDimitry Andric         if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
577d88c1a5aSDimitry Andric             /* UTF-16 surrogate values are illegal in UTF-32 */
578d88c1a5aSDimitry Andric             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
579d88c1a5aSDimitry Andric                 if (flags == strictConversion) {
580d88c1a5aSDimitry Andric                     source -= (extraBytesToRead+1); /* return to the illegal value itself */
581d88c1a5aSDimitry Andric                     result = sourceIllegal;
582d88c1a5aSDimitry Andric                     break;
583d88c1a5aSDimitry Andric                 } else {
584d88c1a5aSDimitry Andric                     *target++ = UNI_REPLACEMENT_CHAR;
585d88c1a5aSDimitry Andric                 }
586d88c1a5aSDimitry Andric             } else {
587d88c1a5aSDimitry Andric                 *target++ = (UTF16)ch; /* normal case */
588d88c1a5aSDimitry Andric             }
589d88c1a5aSDimitry Andric         } else if (ch > UNI_MAX_UTF16) {
590d88c1a5aSDimitry Andric             if (flags == strictConversion) {
591d88c1a5aSDimitry Andric                 result = sourceIllegal;
592d88c1a5aSDimitry Andric                 source -= (extraBytesToRead+1); /* return to the start */
593d88c1a5aSDimitry Andric                 break; /* Bail out; shouldn't continue */
594d88c1a5aSDimitry Andric             } else {
595d88c1a5aSDimitry Andric                 *target++ = UNI_REPLACEMENT_CHAR;
596d88c1a5aSDimitry Andric             }
597d88c1a5aSDimitry Andric         } else {
598d88c1a5aSDimitry Andric             /* target is a character in range 0xFFFF - 0x10FFFF. */
599d88c1a5aSDimitry Andric             if (target + 1 >= targetEnd) {
600d88c1a5aSDimitry Andric                 source -= (extraBytesToRead+1); /* Back up source pointer! */
601d88c1a5aSDimitry Andric                 result = targetExhausted; break;
602d88c1a5aSDimitry Andric             }
603d88c1a5aSDimitry Andric             ch -= halfBase;
604d88c1a5aSDimitry Andric             *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
605d88c1a5aSDimitry Andric             *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
606d88c1a5aSDimitry Andric         }
607d88c1a5aSDimitry Andric     }
608d88c1a5aSDimitry Andric     *sourceStart = source;
609d88c1a5aSDimitry Andric     *targetStart = target;
610d88c1a5aSDimitry Andric     return result;
611d88c1a5aSDimitry Andric }
612d88c1a5aSDimitry Andric 
613d88c1a5aSDimitry Andric /* --------------------------------------------------------------------- */
614d88c1a5aSDimitry Andric 
ConvertUTF8toUTF32Impl(const UTF8 ** sourceStart,const UTF8 * sourceEnd,UTF32 ** targetStart,UTF32 * targetEnd,ConversionFlags flags,Boolean InputIsPartial)615d88c1a5aSDimitry Andric static ConversionResult ConvertUTF8toUTF32Impl(
616d88c1a5aSDimitry Andric         const UTF8** sourceStart, const UTF8* sourceEnd,
617d88c1a5aSDimitry Andric         UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags,
618d88c1a5aSDimitry Andric         Boolean InputIsPartial) {
619d88c1a5aSDimitry Andric     ConversionResult result = conversionOK;
620d88c1a5aSDimitry Andric     const UTF8* source = *sourceStart;
621d88c1a5aSDimitry Andric     UTF32* target = *targetStart;
622d88c1a5aSDimitry Andric     while (source < sourceEnd) {
623d88c1a5aSDimitry Andric         UTF32 ch = 0;
624d88c1a5aSDimitry Andric         unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
625d88c1a5aSDimitry Andric         if (extraBytesToRead >= sourceEnd - source) {
626d88c1a5aSDimitry Andric             if (flags == strictConversion || InputIsPartial) {
627d88c1a5aSDimitry Andric                 result = sourceExhausted;
628d88c1a5aSDimitry Andric                 break;
629d88c1a5aSDimitry Andric             } else {
630d88c1a5aSDimitry Andric                 result = sourceIllegal;
631d88c1a5aSDimitry Andric 
632d88c1a5aSDimitry Andric                 /*
633d88c1a5aSDimitry Andric                  * Replace the maximal subpart of ill-formed sequence with
634d88c1a5aSDimitry Andric                  * replacement character.
635d88c1a5aSDimitry Andric                  */
636d88c1a5aSDimitry Andric                 source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
637d88c1a5aSDimitry Andric                                                                     sourceEnd);
638d88c1a5aSDimitry Andric                 *target++ = UNI_REPLACEMENT_CHAR;
639d88c1a5aSDimitry Andric                 continue;
640d88c1a5aSDimitry Andric             }
641d88c1a5aSDimitry Andric         }
642d88c1a5aSDimitry Andric         if (target >= targetEnd) {
643d88c1a5aSDimitry Andric             result = targetExhausted; break;
644d88c1a5aSDimitry Andric         }
645d88c1a5aSDimitry Andric 
646d88c1a5aSDimitry Andric         /* Do this check whether lenient or strict */
647d88c1a5aSDimitry Andric         if (!isLegalUTF8(source, extraBytesToRead+1)) {
648d88c1a5aSDimitry Andric             result = sourceIllegal;
649d88c1a5aSDimitry Andric             if (flags == strictConversion) {
650d88c1a5aSDimitry Andric                 /* Abort conversion. */
651d88c1a5aSDimitry Andric                 break;
652d88c1a5aSDimitry Andric             } else {
653d88c1a5aSDimitry Andric                 /*
654d88c1a5aSDimitry Andric                  * Replace the maximal subpart of ill-formed sequence with
655d88c1a5aSDimitry Andric                  * replacement character.
656d88c1a5aSDimitry Andric                  */
657d88c1a5aSDimitry Andric                 source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
658d88c1a5aSDimitry Andric                                                                     sourceEnd);
659d88c1a5aSDimitry Andric                 *target++ = UNI_REPLACEMENT_CHAR;
660d88c1a5aSDimitry Andric                 continue;
661d88c1a5aSDimitry Andric             }
662d88c1a5aSDimitry Andric         }
663d88c1a5aSDimitry Andric         /*
664d88c1a5aSDimitry Andric          * The cases all fall through. See "Note A" below.
665d88c1a5aSDimitry Andric          */
666d88c1a5aSDimitry Andric         switch (extraBytesToRead) {
667d88c1a5aSDimitry Andric             case 5: ch += *source++; ch <<= 6;
668d88c1a5aSDimitry Andric             case 4: ch += *source++; ch <<= 6;
669d88c1a5aSDimitry Andric             case 3: ch += *source++; ch <<= 6;
670d88c1a5aSDimitry Andric             case 2: ch += *source++; ch <<= 6;
671d88c1a5aSDimitry Andric             case 1: ch += *source++; ch <<= 6;
672d88c1a5aSDimitry Andric             case 0: ch += *source++;
673d88c1a5aSDimitry Andric         }
674d88c1a5aSDimitry Andric         ch -= offsetsFromUTF8[extraBytesToRead];
675d88c1a5aSDimitry Andric 
676d88c1a5aSDimitry Andric         if (ch <= UNI_MAX_LEGAL_UTF32) {
677d88c1a5aSDimitry Andric             /*
678d88c1a5aSDimitry Andric              * UTF-16 surrogate values are illegal in UTF-32, and anything
679d88c1a5aSDimitry Andric              * over Plane 17 (> 0x10FFFF) is illegal.
680d88c1a5aSDimitry Andric              */
681d88c1a5aSDimitry Andric             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
682d88c1a5aSDimitry Andric                 if (flags == strictConversion) {
683d88c1a5aSDimitry Andric                     source -= (extraBytesToRead+1); /* return to the illegal value itself */
684d88c1a5aSDimitry Andric                     result = sourceIllegal;
685d88c1a5aSDimitry Andric                     break;
686d88c1a5aSDimitry Andric                 } else {
687d88c1a5aSDimitry Andric                     *target++ = UNI_REPLACEMENT_CHAR;
688d88c1a5aSDimitry Andric                 }
689d88c1a5aSDimitry Andric             } else {
690d88c1a5aSDimitry Andric                 *target++ = ch;
691d88c1a5aSDimitry Andric             }
692d88c1a5aSDimitry Andric         } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
693d88c1a5aSDimitry Andric             result = sourceIllegal;
694d88c1a5aSDimitry Andric             *target++ = UNI_REPLACEMENT_CHAR;
695d88c1a5aSDimitry Andric         }
696d88c1a5aSDimitry Andric     }
697d88c1a5aSDimitry Andric     *sourceStart = source;
698d88c1a5aSDimitry Andric     *targetStart = target;
699d88c1a5aSDimitry Andric     return result;
700d88c1a5aSDimitry Andric }
701d88c1a5aSDimitry Andric 
ConvertUTF8toUTF32Partial(const UTF8 ** sourceStart,const UTF8 * sourceEnd,UTF32 ** targetStart,UTF32 * targetEnd,ConversionFlags flags)702d88c1a5aSDimitry Andric ConversionResult ConvertUTF8toUTF32Partial(const UTF8 **sourceStart,
703d88c1a5aSDimitry Andric                                            const UTF8 *sourceEnd,
704d88c1a5aSDimitry Andric                                            UTF32 **targetStart,
705d88c1a5aSDimitry Andric                                            UTF32 *targetEnd,
706d88c1a5aSDimitry Andric                                            ConversionFlags flags) {
707d88c1a5aSDimitry Andric   return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
708d88c1a5aSDimitry Andric                                 flags, /*InputIsPartial=*/true);
709d88c1a5aSDimitry Andric }
710d88c1a5aSDimitry Andric 
ConvertUTF8toUTF32(const UTF8 ** sourceStart,const UTF8 * sourceEnd,UTF32 ** targetStart,UTF32 * targetEnd,ConversionFlags flags)711d88c1a5aSDimitry Andric ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart,
712d88c1a5aSDimitry Andric                                     const UTF8 *sourceEnd, UTF32 **targetStart,
713d88c1a5aSDimitry Andric                                     UTF32 *targetEnd, ConversionFlags flags) {
714d88c1a5aSDimitry Andric   return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
715d88c1a5aSDimitry Andric                                 flags, /*InputIsPartial=*/false);
716d88c1a5aSDimitry Andric }
717d88c1a5aSDimitry Andric 
718d88c1a5aSDimitry Andric /* ---------------------------------------------------------------------
719d88c1a5aSDimitry Andric 
720d88c1a5aSDimitry Andric     Note A.
721d88c1a5aSDimitry Andric     The fall-through switches in UTF-8 reading code save a
722d88c1a5aSDimitry Andric     temp variable, some decrements & conditionals.  The switches
723d88c1a5aSDimitry Andric     are equivalent to the following loop:
724d88c1a5aSDimitry Andric         {
725d88c1a5aSDimitry Andric             int tmpBytesToRead = extraBytesToRead+1;
726d88c1a5aSDimitry Andric             do {
727d88c1a5aSDimitry Andric                 ch += *source++;
728d88c1a5aSDimitry Andric                 --tmpBytesToRead;
729d88c1a5aSDimitry Andric                 if (tmpBytesToRead) ch <<= 6;
730d88c1a5aSDimitry Andric             } while (tmpBytesToRead > 0);
731d88c1a5aSDimitry Andric         }
732d88c1a5aSDimitry Andric     In UTF-8 writing code, the switches on "bytesToWrite" are
733d88c1a5aSDimitry Andric     similarly unrolled loops.
734d88c1a5aSDimitry Andric 
735d88c1a5aSDimitry Andric    --------------------------------------------------------------------- */
736d88c1a5aSDimitry Andric 
737d88c1a5aSDimitry Andric } // namespace llvm
738302affcbSDimitry Andric 
739302affcbSDimitry Andric ConvertUTF_RESTORE_WARNINGS
740