1d88c1a5aSDimitry Andric /*===--- ConvertUTF.c - Universal Character Names conversions ---------------===
2d88c1a5aSDimitry Andric *
3d88c1a5aSDimitry Andric * The LLVM Compiler Infrastructure
4d88c1a5aSDimitry Andric *
5d88c1a5aSDimitry Andric * This file is distributed under the University of Illinois Open Source
6d88c1a5aSDimitry Andric * License. See LICENSE.TXT for details.
7d88c1a5aSDimitry Andric *
8d88c1a5aSDimitry Andric *===------------------------------------------------------------------------=*/
9d88c1a5aSDimitry Andric /*
10d88c1a5aSDimitry Andric * Copyright 2001-2004 Unicode, Inc.
11d88c1a5aSDimitry Andric *
12d88c1a5aSDimitry Andric * Disclaimer
13d88c1a5aSDimitry Andric *
14d88c1a5aSDimitry Andric * This source code is provided as is by Unicode, Inc. No claims are
15d88c1a5aSDimitry Andric * made as to fitness for any particular purpose. No warranties of any
16d88c1a5aSDimitry Andric * kind are expressed or implied. The recipient agrees to determine
17d88c1a5aSDimitry Andric * applicability of information provided. If this file has been
18d88c1a5aSDimitry Andric * purchased on magnetic or optical media from Unicode, Inc., the
19d88c1a5aSDimitry Andric * sole remedy for any claim will be exchange of defective media
20d88c1a5aSDimitry Andric * within 90 days of receipt.
21d88c1a5aSDimitry Andric *
22d88c1a5aSDimitry Andric * Limitations on Rights to Redistribute This Code
23d88c1a5aSDimitry Andric *
24d88c1a5aSDimitry Andric * Unicode, Inc. hereby grants the right to freely use the information
25d88c1a5aSDimitry Andric * supplied in this file in the creation of products supporting the
26d88c1a5aSDimitry Andric * Unicode Standard, and to make copies of this file in any form
27d88c1a5aSDimitry Andric * for internal or external distribution as long as this notice
28d88c1a5aSDimitry Andric * remains attached.
29d88c1a5aSDimitry Andric */
30d88c1a5aSDimitry Andric
31d88c1a5aSDimitry Andric /* ---------------------------------------------------------------------
32d88c1a5aSDimitry Andric
33d88c1a5aSDimitry Andric Conversions between UTF32, UTF-16, and UTF-8. Source code file.
34d88c1a5aSDimitry Andric Author: Mark E. Davis, 1994.
35d88c1a5aSDimitry Andric Rev History: Rick McGowan, fixes & updates May 2001.
36d88c1a5aSDimitry Andric Sept 2001: fixed const & error conditions per
37d88c1a5aSDimitry Andric mods suggested by S. Parent & A. Lillich.
38d88c1a5aSDimitry Andric June 2002: Tim Dodd added detection and handling of incomplete
39d88c1a5aSDimitry Andric source sequences, enhanced error detection, added casts
40d88c1a5aSDimitry Andric to eliminate compiler warnings.
41d88c1a5aSDimitry Andric July 2003: slight mods to back out aggressive FFFE detection.
42d88c1a5aSDimitry Andric Jan 2004: updated switches in from-UTF8 conversions.
43d88c1a5aSDimitry Andric Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
44d88c1a5aSDimitry Andric
45d88c1a5aSDimitry Andric See the header file "ConvertUTF.h" for complete documentation.
46d88c1a5aSDimitry Andric
47d88c1a5aSDimitry Andric ------------------------------------------------------------------------ */
48d88c1a5aSDimitry Andric
49d88c1a5aSDimitry Andric #include "llvm/Support/ConvertUTF.h"
50d88c1a5aSDimitry Andric #ifdef CVTUTF_DEBUG
51d88c1a5aSDimitry Andric #include <stdio.h>
52d88c1a5aSDimitry Andric #endif
53d88c1a5aSDimitry Andric #include <assert.h>
54d88c1a5aSDimitry Andric
/*
 * The conversion routines in this file are built around switch statements
 * whose cases intentionally fall through.  ConvertUTF_DISABLE_WARNINGS
 * silences -Wimplicit-fallthrough on compilers that are known to support
 * it, and ConvertUTF_RESTORE_WARNINGS undoes that by popping the
 * diagnostic state saved by the matching push.
 */
#if defined(__clang__) && defined(__has_warning)
#  if __has_warning("-Wimplicit-fallthrough")
#    define ConvertUTF_DISABLE_WARNINGS \
        _Pragma("clang diagnostic push") \
        _Pragma("clang diagnostic ignored \"-Wimplicit-fallthrough\"")
#    define ConvertUTF_RESTORE_WARNINGS \
        _Pragma("clang diagnostic pop")
#  endif
#elif defined(__GNUC__) && __GNUC__ > 6
#  define ConvertUTF_DISABLE_WARNINGS \
      _Pragma("GCC diagnostic push") \
      _Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"")
#  define ConvertUTF_RESTORE_WARNINGS \
      _Pragma("GCC diagnostic pop")
#endif

/* Fallback: if neither branch above defined the macros, make them no-ops. */
#if !defined(ConvertUTF_DISABLE_WARNINGS)
#  define ConvertUTF_DISABLE_WARNINGS
#endif
#if !defined(ConvertUTF_RESTORE_WARNINGS)
#  define ConvertUTF_RESTORE_WARNINGS
#endif

ConvertUTF_DISABLE_WARNINGS
82302affcbSDimitry Andric
83d88c1a5aSDimitry Andric namespace llvm {
84d88c1a5aSDimitry Andric
85d88c1a5aSDimitry Andric static const int halfShift = 10; /* used for shifting by 10 bits */
86d88c1a5aSDimitry Andric
87d88c1a5aSDimitry Andric static const UTF32 halfBase = 0x0010000UL;
88d88c1a5aSDimitry Andric static const UTF32 halfMask = 0x3FFUL;
89d88c1a5aSDimitry Andric
90d88c1a5aSDimitry Andric #define UNI_SUR_HIGH_START (UTF32)0xD800
91d88c1a5aSDimitry Andric #define UNI_SUR_HIGH_END (UTF32)0xDBFF
92d88c1a5aSDimitry Andric #define UNI_SUR_LOW_START (UTF32)0xDC00
93d88c1a5aSDimitry Andric #define UNI_SUR_LOW_END (UTF32)0xDFFF
94d88c1a5aSDimitry Andric
95d88c1a5aSDimitry Andric /* --------------------------------------------------------------------- */
96d88c1a5aSDimitry Andric
/*
 * Indexed by the first byte of a UTF-8 sequence; yields the number of
 * trailing bytes that are supposed to follow it.
 * Note that *legal* UTF-8 never uses the lead bytes that map to 4 or 5
 * trailing bytes (0xF8-0xFF).  Those entries are kept as-is for anyone
 * who wants to handle the longer forms allowed by earlier algorithms.
 */
static const char trailingBytesForUTF8[256] = {
    /* 0x00-0x0F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x10-0x1F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x20-0x2F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x30-0x3F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x40-0x4F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x50-0x5F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x60-0x6F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x70-0x7F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x80-0x8F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x90-0x9F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0xA0-0xAF */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0xB0-0xBF */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0xC0-0xCF */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
    /* 0xD0-0xDF */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
    /* 0xE0-0xEF */ 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,
    /* 0xF0-0xFF */ 3,3,3,3, 3,3,3,3, 4,4,4,4, 5,5,5,5
};
114d88c1a5aSDimitry Andric
115d88c1a5aSDimitry Andric /*
116d88c1a5aSDimitry Andric * Magic values subtracted from a buffer value during UTF8 conversion.
117d88c1a5aSDimitry Andric * This table contains as many values as there might be trailing bytes
118d88c1a5aSDimitry Andric * in a UTF-8 sequence.
119d88c1a5aSDimitry Andric */
120d88c1a5aSDimitry Andric static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
121d88c1a5aSDimitry Andric 0x03C82080UL, 0xFA082080UL, 0x82082080UL };
122d88c1a5aSDimitry Andric
123d88c1a5aSDimitry Andric /*
124d88c1a5aSDimitry Andric * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
125d88c1a5aSDimitry Andric * into the first byte, depending on how many bytes follow. There are
126d88c1a5aSDimitry Andric * as many entries in this table as there are UTF-8 sequence types.
127d88c1a5aSDimitry Andric * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
128d88c1a5aSDimitry Andric * for *legal* UTF-8 will be 4 or fewer bytes total.
129d88c1a5aSDimitry Andric */
130d88c1a5aSDimitry Andric static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
131d88c1a5aSDimitry Andric
132d88c1a5aSDimitry Andric /* --------------------------------------------------------------------- */
133d88c1a5aSDimitry Andric
134d88c1a5aSDimitry Andric /* The interface converts a whole buffer to avoid function-call overhead.
135d88c1a5aSDimitry Andric * Constants have been gathered. Loops & conditionals have been removed as
136d88c1a5aSDimitry Andric * much as possible for efficiency, in favor of drop-through switches.
137d88c1a5aSDimitry Andric * (See "Note A" at the bottom of the file for equivalent code.)
138d88c1a5aSDimitry Andric * If your compiler supports it, the "isLegalUTF8" call can be turned
139d88c1a5aSDimitry Andric * into an inline function.
140d88c1a5aSDimitry Andric */
141d88c1a5aSDimitry Andric
142d88c1a5aSDimitry Andric
143d88c1a5aSDimitry Andric /* --------------------------------------------------------------------- */
144d88c1a5aSDimitry Andric
ConvertUTF32toUTF16(const UTF32 ** sourceStart,const UTF32 * sourceEnd,UTF16 ** targetStart,UTF16 * targetEnd,ConversionFlags flags)145d88c1a5aSDimitry Andric ConversionResult ConvertUTF32toUTF16 (
146d88c1a5aSDimitry Andric const UTF32** sourceStart, const UTF32* sourceEnd,
147d88c1a5aSDimitry Andric UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
148d88c1a5aSDimitry Andric ConversionResult result = conversionOK;
149d88c1a5aSDimitry Andric const UTF32* source = *sourceStart;
150d88c1a5aSDimitry Andric UTF16* target = *targetStart;
151d88c1a5aSDimitry Andric while (source < sourceEnd) {
152d88c1a5aSDimitry Andric UTF32 ch;
153d88c1a5aSDimitry Andric if (target >= targetEnd) {
154d88c1a5aSDimitry Andric result = targetExhausted; break;
155d88c1a5aSDimitry Andric }
156d88c1a5aSDimitry Andric ch = *source++;
157d88c1a5aSDimitry Andric if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
158d88c1a5aSDimitry Andric /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
159d88c1a5aSDimitry Andric if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
160d88c1a5aSDimitry Andric if (flags == strictConversion) {
161d88c1a5aSDimitry Andric --source; /* return to the illegal value itself */
162d88c1a5aSDimitry Andric result = sourceIllegal;
163d88c1a5aSDimitry Andric break;
164d88c1a5aSDimitry Andric } else {
165d88c1a5aSDimitry Andric *target++ = UNI_REPLACEMENT_CHAR;
166d88c1a5aSDimitry Andric }
167d88c1a5aSDimitry Andric } else {
168d88c1a5aSDimitry Andric *target++ = (UTF16)ch; /* normal case */
169d88c1a5aSDimitry Andric }
170d88c1a5aSDimitry Andric } else if (ch > UNI_MAX_LEGAL_UTF32) {
171d88c1a5aSDimitry Andric if (flags == strictConversion) {
172d88c1a5aSDimitry Andric result = sourceIllegal;
173d88c1a5aSDimitry Andric } else {
174d88c1a5aSDimitry Andric *target++ = UNI_REPLACEMENT_CHAR;
175d88c1a5aSDimitry Andric }
176d88c1a5aSDimitry Andric } else {
177d88c1a5aSDimitry Andric /* target is a character in range 0xFFFF - 0x10FFFF. */
178d88c1a5aSDimitry Andric if (target + 1 >= targetEnd) {
179d88c1a5aSDimitry Andric --source; /* Back up source pointer! */
180d88c1a5aSDimitry Andric result = targetExhausted; break;
181d88c1a5aSDimitry Andric }
182d88c1a5aSDimitry Andric ch -= halfBase;
183d88c1a5aSDimitry Andric *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
184d88c1a5aSDimitry Andric *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
185d88c1a5aSDimitry Andric }
186d88c1a5aSDimitry Andric }
187d88c1a5aSDimitry Andric *sourceStart = source;
188d88c1a5aSDimitry Andric *targetStart = target;
189d88c1a5aSDimitry Andric return result;
190d88c1a5aSDimitry Andric }
191d88c1a5aSDimitry Andric
192d88c1a5aSDimitry Andric /* --------------------------------------------------------------------- */
193d88c1a5aSDimitry Andric
ConvertUTF16toUTF32(const UTF16 ** sourceStart,const UTF16 * sourceEnd,UTF32 ** targetStart,UTF32 * targetEnd,ConversionFlags flags)194d88c1a5aSDimitry Andric ConversionResult ConvertUTF16toUTF32 (
195d88c1a5aSDimitry Andric const UTF16** sourceStart, const UTF16* sourceEnd,
196d88c1a5aSDimitry Andric UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
197d88c1a5aSDimitry Andric ConversionResult result = conversionOK;
198d88c1a5aSDimitry Andric const UTF16* source = *sourceStart;
199d88c1a5aSDimitry Andric UTF32* target = *targetStart;
200d88c1a5aSDimitry Andric UTF32 ch, ch2;
201d88c1a5aSDimitry Andric while (source < sourceEnd) {
202d88c1a5aSDimitry Andric const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
203d88c1a5aSDimitry Andric ch = *source++;
204d88c1a5aSDimitry Andric /* If we have a surrogate pair, convert to UTF32 first. */
205d88c1a5aSDimitry Andric if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
206d88c1a5aSDimitry Andric /* If the 16 bits following the high surrogate are in the source buffer... */
207d88c1a5aSDimitry Andric if (source < sourceEnd) {
208d88c1a5aSDimitry Andric ch2 = *source;
209d88c1a5aSDimitry Andric /* If it's a low surrogate, convert to UTF32. */
210d88c1a5aSDimitry Andric if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
211d88c1a5aSDimitry Andric ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
212d88c1a5aSDimitry Andric + (ch2 - UNI_SUR_LOW_START) + halfBase;
213d88c1a5aSDimitry Andric ++source;
214d88c1a5aSDimitry Andric } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
215d88c1a5aSDimitry Andric --source; /* return to the illegal value itself */
216d88c1a5aSDimitry Andric result = sourceIllegal;
217d88c1a5aSDimitry Andric break;
218d88c1a5aSDimitry Andric }
219d88c1a5aSDimitry Andric } else { /* We don't have the 16 bits following the high surrogate. */
220d88c1a5aSDimitry Andric --source; /* return to the high surrogate */
221d88c1a5aSDimitry Andric result = sourceExhausted;
222d88c1a5aSDimitry Andric break;
223d88c1a5aSDimitry Andric }
224d88c1a5aSDimitry Andric } else if (flags == strictConversion) {
225d88c1a5aSDimitry Andric /* UTF-16 surrogate values are illegal in UTF-32 */
226d88c1a5aSDimitry Andric if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
227d88c1a5aSDimitry Andric --source; /* return to the illegal value itself */
228d88c1a5aSDimitry Andric result = sourceIllegal;
229d88c1a5aSDimitry Andric break;
230d88c1a5aSDimitry Andric }
231d88c1a5aSDimitry Andric }
232d88c1a5aSDimitry Andric if (target >= targetEnd) {
233d88c1a5aSDimitry Andric source = oldSource; /* Back up source pointer! */
234d88c1a5aSDimitry Andric result = targetExhausted; break;
235d88c1a5aSDimitry Andric }
236d88c1a5aSDimitry Andric *target++ = ch;
237d88c1a5aSDimitry Andric }
238d88c1a5aSDimitry Andric *sourceStart = source;
239d88c1a5aSDimitry Andric *targetStart = target;
240d88c1a5aSDimitry Andric #ifdef CVTUTF_DEBUG
241d88c1a5aSDimitry Andric if (result == sourceIllegal) {
242d88c1a5aSDimitry Andric fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
243d88c1a5aSDimitry Andric fflush(stderr);
244d88c1a5aSDimitry Andric }
245d88c1a5aSDimitry Andric #endif
246d88c1a5aSDimitry Andric return result;
247d88c1a5aSDimitry Andric }
ConvertUTF16toUTF8(const UTF16 ** sourceStart,const UTF16 * sourceEnd,UTF8 ** targetStart,UTF8 * targetEnd,ConversionFlags flags)248d88c1a5aSDimitry Andric ConversionResult ConvertUTF16toUTF8 (
249d88c1a5aSDimitry Andric const UTF16** sourceStart, const UTF16* sourceEnd,
250d88c1a5aSDimitry Andric UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
251d88c1a5aSDimitry Andric ConversionResult result = conversionOK;
252d88c1a5aSDimitry Andric const UTF16* source = *sourceStart;
253d88c1a5aSDimitry Andric UTF8* target = *targetStart;
254d88c1a5aSDimitry Andric while (source < sourceEnd) {
255d88c1a5aSDimitry Andric UTF32 ch;
256d88c1a5aSDimitry Andric unsigned short bytesToWrite = 0;
257d88c1a5aSDimitry Andric const UTF32 byteMask = 0xBF;
258d88c1a5aSDimitry Andric const UTF32 byteMark = 0x80;
259d88c1a5aSDimitry Andric const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
260d88c1a5aSDimitry Andric ch = *source++;
261d88c1a5aSDimitry Andric /* If we have a surrogate pair, convert to UTF32 first. */
262d88c1a5aSDimitry Andric if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
263d88c1a5aSDimitry Andric /* If the 16 bits following the high surrogate are in the source buffer... */
264d88c1a5aSDimitry Andric if (source < sourceEnd) {
265d88c1a5aSDimitry Andric UTF32 ch2 = *source;
266d88c1a5aSDimitry Andric /* If it's a low surrogate, convert to UTF32. */
267d88c1a5aSDimitry Andric if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
268d88c1a5aSDimitry Andric ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
269d88c1a5aSDimitry Andric + (ch2 - UNI_SUR_LOW_START) + halfBase;
270d88c1a5aSDimitry Andric ++source;
271d88c1a5aSDimitry Andric } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
272d88c1a5aSDimitry Andric --source; /* return to the illegal value itself */
273d88c1a5aSDimitry Andric result = sourceIllegal;
274d88c1a5aSDimitry Andric break;
275d88c1a5aSDimitry Andric }
276d88c1a5aSDimitry Andric } else { /* We don't have the 16 bits following the high surrogate. */
277d88c1a5aSDimitry Andric --source; /* return to the high surrogate */
278d88c1a5aSDimitry Andric result = sourceExhausted;
279d88c1a5aSDimitry Andric break;
280d88c1a5aSDimitry Andric }
281d88c1a5aSDimitry Andric } else if (flags == strictConversion) {
282d88c1a5aSDimitry Andric /* UTF-16 surrogate values are illegal in UTF-32 */
283d88c1a5aSDimitry Andric if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
284d88c1a5aSDimitry Andric --source; /* return to the illegal value itself */
285d88c1a5aSDimitry Andric result = sourceIllegal;
286d88c1a5aSDimitry Andric break;
287d88c1a5aSDimitry Andric }
288d88c1a5aSDimitry Andric }
289d88c1a5aSDimitry Andric /* Figure out how many bytes the result will require */
290d88c1a5aSDimitry Andric if (ch < (UTF32)0x80) { bytesToWrite = 1;
291d88c1a5aSDimitry Andric } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
292d88c1a5aSDimitry Andric } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
293d88c1a5aSDimitry Andric } else if (ch < (UTF32)0x110000) { bytesToWrite = 4;
294d88c1a5aSDimitry Andric } else { bytesToWrite = 3;
295d88c1a5aSDimitry Andric ch = UNI_REPLACEMENT_CHAR;
296d88c1a5aSDimitry Andric }
297d88c1a5aSDimitry Andric
298d88c1a5aSDimitry Andric target += bytesToWrite;
299d88c1a5aSDimitry Andric if (target > targetEnd) {
300d88c1a5aSDimitry Andric source = oldSource; /* Back up source pointer! */
301d88c1a5aSDimitry Andric target -= bytesToWrite; result = targetExhausted; break;
302d88c1a5aSDimitry Andric }
303d88c1a5aSDimitry Andric switch (bytesToWrite) { /* note: everything falls through. */
304d88c1a5aSDimitry Andric case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
305d88c1a5aSDimitry Andric case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
306d88c1a5aSDimitry Andric case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
307d88c1a5aSDimitry Andric case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]);
308d88c1a5aSDimitry Andric }
309d88c1a5aSDimitry Andric target += bytesToWrite;
310d88c1a5aSDimitry Andric }
311d88c1a5aSDimitry Andric *sourceStart = source;
312d88c1a5aSDimitry Andric *targetStart = target;
313d88c1a5aSDimitry Andric return result;
314d88c1a5aSDimitry Andric }
315d88c1a5aSDimitry Andric
316d88c1a5aSDimitry Andric /* --------------------------------------------------------------------- */
317d88c1a5aSDimitry Andric
ConvertUTF32toUTF8(const UTF32 ** sourceStart,const UTF32 * sourceEnd,UTF8 ** targetStart,UTF8 * targetEnd,ConversionFlags flags)318d88c1a5aSDimitry Andric ConversionResult ConvertUTF32toUTF8 (
319d88c1a5aSDimitry Andric const UTF32** sourceStart, const UTF32* sourceEnd,
320d88c1a5aSDimitry Andric UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
321d88c1a5aSDimitry Andric ConversionResult result = conversionOK;
322d88c1a5aSDimitry Andric const UTF32* source = *sourceStart;
323d88c1a5aSDimitry Andric UTF8* target = *targetStart;
324d88c1a5aSDimitry Andric while (source < sourceEnd) {
325d88c1a5aSDimitry Andric UTF32 ch;
326d88c1a5aSDimitry Andric unsigned short bytesToWrite = 0;
327d88c1a5aSDimitry Andric const UTF32 byteMask = 0xBF;
328d88c1a5aSDimitry Andric const UTF32 byteMark = 0x80;
329d88c1a5aSDimitry Andric ch = *source++;
330d88c1a5aSDimitry Andric if (flags == strictConversion ) {
331d88c1a5aSDimitry Andric /* UTF-16 surrogate values are illegal in UTF-32 */
332d88c1a5aSDimitry Andric if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
333d88c1a5aSDimitry Andric --source; /* return to the illegal value itself */
334d88c1a5aSDimitry Andric result = sourceIllegal;
335d88c1a5aSDimitry Andric break;
336d88c1a5aSDimitry Andric }
337d88c1a5aSDimitry Andric }
338d88c1a5aSDimitry Andric /*
339d88c1a5aSDimitry Andric * Figure out how many bytes the result will require. Turn any
340d88c1a5aSDimitry Andric * illegally large UTF32 things (> Plane 17) into replacement chars.
341d88c1a5aSDimitry Andric */
342d88c1a5aSDimitry Andric if (ch < (UTF32)0x80) { bytesToWrite = 1;
343d88c1a5aSDimitry Andric } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
344d88c1a5aSDimitry Andric } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
345d88c1a5aSDimitry Andric } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4;
346d88c1a5aSDimitry Andric } else { bytesToWrite = 3;
347d88c1a5aSDimitry Andric ch = UNI_REPLACEMENT_CHAR;
348d88c1a5aSDimitry Andric result = sourceIllegal;
349d88c1a5aSDimitry Andric }
350d88c1a5aSDimitry Andric
351d88c1a5aSDimitry Andric target += bytesToWrite;
352d88c1a5aSDimitry Andric if (target > targetEnd) {
353d88c1a5aSDimitry Andric --source; /* Back up source pointer! */
354d88c1a5aSDimitry Andric target -= bytesToWrite; result = targetExhausted; break;
355d88c1a5aSDimitry Andric }
356d88c1a5aSDimitry Andric switch (bytesToWrite) { /* note: everything falls through. */
357d88c1a5aSDimitry Andric case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
358d88c1a5aSDimitry Andric case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
359d88c1a5aSDimitry Andric case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
360d88c1a5aSDimitry Andric case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
361d88c1a5aSDimitry Andric }
362d88c1a5aSDimitry Andric target += bytesToWrite;
363d88c1a5aSDimitry Andric }
364d88c1a5aSDimitry Andric *sourceStart = source;
365d88c1a5aSDimitry Andric *targetStart = target;
366d88c1a5aSDimitry Andric return result;
367d88c1a5aSDimitry Andric }
368d88c1a5aSDimitry Andric
369d88c1a5aSDimitry Andric /* --------------------------------------------------------------------- */
370d88c1a5aSDimitry Andric
371d88c1a5aSDimitry Andric /*
372d88c1a5aSDimitry Andric * Utility routine to tell whether a sequence of bytes is legal UTF-8.
373d88c1a5aSDimitry Andric * This must be called with the length pre-determined by the first byte.
374d88c1a5aSDimitry Andric * If not calling this from ConvertUTF8to*, then the length can be set by:
375d88c1a5aSDimitry Andric * length = trailingBytesForUTF8[*source]+1;
376d88c1a5aSDimitry Andric * and the sequence is illegal right away if there aren't that many bytes
377d88c1a5aSDimitry Andric * available.
378d88c1a5aSDimitry Andric * If presented with a length > 4, this returns false. The Unicode
379d88c1a5aSDimitry Andric * definition of UTF-8 goes up to 4-byte sequences.
380d88c1a5aSDimitry Andric */
381d88c1a5aSDimitry Andric
isLegalUTF8(const UTF8 * source,int length)382d88c1a5aSDimitry Andric static Boolean isLegalUTF8(const UTF8 *source, int length) {
383d88c1a5aSDimitry Andric UTF8 a;
384d88c1a5aSDimitry Andric const UTF8 *srcptr = source+length;
385d88c1a5aSDimitry Andric switch (length) {
386d88c1a5aSDimitry Andric default: return false;
387d88c1a5aSDimitry Andric /* Everything else falls through when "true"... */
388d88c1a5aSDimitry Andric case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
389d88c1a5aSDimitry Andric case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
390d88c1a5aSDimitry Andric case 2: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
391d88c1a5aSDimitry Andric
392d88c1a5aSDimitry Andric switch (*source) {
393d88c1a5aSDimitry Andric /* no fall-through in this inner switch */
394d88c1a5aSDimitry Andric case 0xE0: if (a < 0xA0) return false; break;
395d88c1a5aSDimitry Andric case 0xED: if (a > 0x9F) return false; break;
396d88c1a5aSDimitry Andric case 0xF0: if (a < 0x90) return false; break;
397d88c1a5aSDimitry Andric case 0xF4: if (a > 0x8F) return false; break;
398d88c1a5aSDimitry Andric default: if (a < 0x80) return false;
399d88c1a5aSDimitry Andric }
400d88c1a5aSDimitry Andric
401d88c1a5aSDimitry Andric case 1: if (*source >= 0x80 && *source < 0xC2) return false;
402d88c1a5aSDimitry Andric }
403d88c1a5aSDimitry Andric if (*source > 0xF4) return false;
404d88c1a5aSDimitry Andric return true;
405d88c1a5aSDimitry Andric }
406d88c1a5aSDimitry Andric
407d88c1a5aSDimitry Andric /* --------------------------------------------------------------------- */
408d88c1a5aSDimitry Andric
409d88c1a5aSDimitry Andric /*
410d88c1a5aSDimitry Andric * Exported function to return whether a UTF-8 sequence is legal or not.
411d88c1a5aSDimitry Andric * This is not used here; it's just exported.
412d88c1a5aSDimitry Andric */
isLegalUTF8Sequence(const UTF8 * source,const UTF8 * sourceEnd)413d88c1a5aSDimitry Andric Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
414d88c1a5aSDimitry Andric int length = trailingBytesForUTF8[*source]+1;
415d88c1a5aSDimitry Andric if (length > sourceEnd - source) {
416d88c1a5aSDimitry Andric return false;
417d88c1a5aSDimitry Andric }
418d88c1a5aSDimitry Andric return isLegalUTF8(source, length);
419d88c1a5aSDimitry Andric }
420d88c1a5aSDimitry Andric
421d88c1a5aSDimitry Andric /* --------------------------------------------------------------------- */
422d88c1a5aSDimitry Andric
423d88c1a5aSDimitry Andric static unsigned
findMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 * source,const UTF8 * sourceEnd)424d88c1a5aSDimitry Andric findMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 *source,
425d88c1a5aSDimitry Andric const UTF8 *sourceEnd) {
426d88c1a5aSDimitry Andric UTF8 b1, b2, b3;
427d88c1a5aSDimitry Andric
428d88c1a5aSDimitry Andric assert(!isLegalUTF8Sequence(source, sourceEnd));
429d88c1a5aSDimitry Andric
430d88c1a5aSDimitry Andric /*
431d88c1a5aSDimitry Andric * Unicode 6.3.0, D93b:
432d88c1a5aSDimitry Andric *
433d88c1a5aSDimitry Andric * Maximal subpart of an ill-formed subsequence: The longest code unit
434d88c1a5aSDimitry Andric * subsequence starting at an unconvertible offset that is either:
435d88c1a5aSDimitry Andric * a. the initial subsequence of a well-formed code unit sequence, or
436d88c1a5aSDimitry Andric * b. a subsequence of length one.
437d88c1a5aSDimitry Andric */
438d88c1a5aSDimitry Andric
439d88c1a5aSDimitry Andric if (source == sourceEnd)
440d88c1a5aSDimitry Andric return 0;
441d88c1a5aSDimitry Andric
442d88c1a5aSDimitry Andric /*
443d88c1a5aSDimitry Andric * Perform case analysis. See Unicode 6.3.0, Table 3-7. Well-Formed UTF-8
444d88c1a5aSDimitry Andric * Byte Sequences.
445d88c1a5aSDimitry Andric */
446d88c1a5aSDimitry Andric
447d88c1a5aSDimitry Andric b1 = *source;
448d88c1a5aSDimitry Andric ++source;
449d88c1a5aSDimitry Andric if (b1 >= 0xC2 && b1 <= 0xDF) {
450d88c1a5aSDimitry Andric /*
451d88c1a5aSDimitry Andric * First byte is valid, but we know that this code unit sequence is
452d88c1a5aSDimitry Andric * invalid, so the maximal subpart has to end after the first byte.
453d88c1a5aSDimitry Andric */
454d88c1a5aSDimitry Andric return 1;
455d88c1a5aSDimitry Andric }
456d88c1a5aSDimitry Andric
457d88c1a5aSDimitry Andric if (source == sourceEnd)
458d88c1a5aSDimitry Andric return 1;
459d88c1a5aSDimitry Andric
460d88c1a5aSDimitry Andric b2 = *source;
461d88c1a5aSDimitry Andric ++source;
462d88c1a5aSDimitry Andric
463d88c1a5aSDimitry Andric if (b1 == 0xE0) {
464d88c1a5aSDimitry Andric return (b2 >= 0xA0 && b2 <= 0xBF) ? 2 : 1;
465d88c1a5aSDimitry Andric }
466d88c1a5aSDimitry Andric if (b1 >= 0xE1 && b1 <= 0xEC) {
467d88c1a5aSDimitry Andric return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
468d88c1a5aSDimitry Andric }
469d88c1a5aSDimitry Andric if (b1 == 0xED) {
470d88c1a5aSDimitry Andric return (b2 >= 0x80 && b2 <= 0x9F) ? 2 : 1;
471d88c1a5aSDimitry Andric }
472d88c1a5aSDimitry Andric if (b1 >= 0xEE && b1 <= 0xEF) {
473d88c1a5aSDimitry Andric return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
474d88c1a5aSDimitry Andric }
475d88c1a5aSDimitry Andric if (b1 == 0xF0) {
476d88c1a5aSDimitry Andric if (b2 >= 0x90 && b2 <= 0xBF) {
477d88c1a5aSDimitry Andric if (source == sourceEnd)
478d88c1a5aSDimitry Andric return 2;
479d88c1a5aSDimitry Andric
480d88c1a5aSDimitry Andric b3 = *source;
481d88c1a5aSDimitry Andric return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
482d88c1a5aSDimitry Andric }
483d88c1a5aSDimitry Andric return 1;
484d88c1a5aSDimitry Andric }
485d88c1a5aSDimitry Andric if (b1 >= 0xF1 && b1 <= 0xF3) {
486d88c1a5aSDimitry Andric if (b2 >= 0x80 && b2 <= 0xBF) {
487d88c1a5aSDimitry Andric if (source == sourceEnd)
488d88c1a5aSDimitry Andric return 2;
489d88c1a5aSDimitry Andric
490d88c1a5aSDimitry Andric b3 = *source;
491d88c1a5aSDimitry Andric return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
492d88c1a5aSDimitry Andric }
493d88c1a5aSDimitry Andric return 1;
494d88c1a5aSDimitry Andric }
495d88c1a5aSDimitry Andric if (b1 == 0xF4) {
496d88c1a5aSDimitry Andric if (b2 >= 0x80 && b2 <= 0x8F) {
497d88c1a5aSDimitry Andric if (source == sourceEnd)
498d88c1a5aSDimitry Andric return 2;
499d88c1a5aSDimitry Andric
500d88c1a5aSDimitry Andric b3 = *source;
501d88c1a5aSDimitry Andric return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
502d88c1a5aSDimitry Andric }
503d88c1a5aSDimitry Andric return 1;
504d88c1a5aSDimitry Andric }
505d88c1a5aSDimitry Andric
506d88c1a5aSDimitry Andric assert((b1 >= 0x80 && b1 <= 0xC1) || b1 >= 0xF5);
507d88c1a5aSDimitry Andric /*
508d88c1a5aSDimitry Andric * There are no valid sequences that start with these bytes. Maximal subpart
509d88c1a5aSDimitry Andric * is defined to have length 1 in these cases.
510d88c1a5aSDimitry Andric */
511d88c1a5aSDimitry Andric return 1;
512d88c1a5aSDimitry Andric }
513d88c1a5aSDimitry Andric
514d88c1a5aSDimitry Andric /* --------------------------------------------------------------------- */
515d88c1a5aSDimitry Andric
516d88c1a5aSDimitry Andric /*
517d88c1a5aSDimitry Andric * Exported function to return the total number of bytes in a codepoint
518d88c1a5aSDimitry Andric * represented in UTF-8, given the value of the first byte.
519d88c1a5aSDimitry Andric */
getNumBytesForUTF8(UTF8 first)520d88c1a5aSDimitry Andric unsigned getNumBytesForUTF8(UTF8 first) {
521d88c1a5aSDimitry Andric return trailingBytesForUTF8[first] + 1;
522d88c1a5aSDimitry Andric }
523d88c1a5aSDimitry Andric
524d88c1a5aSDimitry Andric /* --------------------------------------------------------------------- */
525d88c1a5aSDimitry Andric
526d88c1a5aSDimitry Andric /*
527d88c1a5aSDimitry Andric * Exported function to return whether a UTF-8 string is legal or not.
528d88c1a5aSDimitry Andric * This is not used here; it's just exported.
529d88c1a5aSDimitry Andric */
isLegalUTF8String(const UTF8 ** source,const UTF8 * sourceEnd)530d88c1a5aSDimitry Andric Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd) {
531d88c1a5aSDimitry Andric while (*source != sourceEnd) {
532d88c1a5aSDimitry Andric int length = trailingBytesForUTF8[**source] + 1;
533d88c1a5aSDimitry Andric if (length > sourceEnd - *source || !isLegalUTF8(*source, length))
534d88c1a5aSDimitry Andric return false;
535d88c1a5aSDimitry Andric *source += length;
536d88c1a5aSDimitry Andric }
537d88c1a5aSDimitry Andric return true;
538d88c1a5aSDimitry Andric }
539d88c1a5aSDimitry Andric
540d88c1a5aSDimitry Andric /* --------------------------------------------------------------------- */
541d88c1a5aSDimitry Andric
ConvertUTF8toUTF16(const UTF8 ** sourceStart,const UTF8 * sourceEnd,UTF16 ** targetStart,UTF16 * targetEnd,ConversionFlags flags)542d88c1a5aSDimitry Andric ConversionResult ConvertUTF8toUTF16 (
543d88c1a5aSDimitry Andric const UTF8** sourceStart, const UTF8* sourceEnd,
544d88c1a5aSDimitry Andric UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
545d88c1a5aSDimitry Andric ConversionResult result = conversionOK;
546d88c1a5aSDimitry Andric const UTF8* source = *sourceStart;
547d88c1a5aSDimitry Andric UTF16* target = *targetStart;
548d88c1a5aSDimitry Andric while (source < sourceEnd) {
549d88c1a5aSDimitry Andric UTF32 ch = 0;
550d88c1a5aSDimitry Andric unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
551d88c1a5aSDimitry Andric if (extraBytesToRead >= sourceEnd - source) {
552d88c1a5aSDimitry Andric result = sourceExhausted; break;
553d88c1a5aSDimitry Andric }
554d88c1a5aSDimitry Andric /* Do this check whether lenient or strict */
555d88c1a5aSDimitry Andric if (!isLegalUTF8(source, extraBytesToRead+1)) {
556d88c1a5aSDimitry Andric result = sourceIllegal;
557d88c1a5aSDimitry Andric break;
558d88c1a5aSDimitry Andric }
559d88c1a5aSDimitry Andric /*
560d88c1a5aSDimitry Andric * The cases all fall through. See "Note A" below.
561d88c1a5aSDimitry Andric */
562d88c1a5aSDimitry Andric switch (extraBytesToRead) {
563d88c1a5aSDimitry Andric case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
564d88c1a5aSDimitry Andric case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
565d88c1a5aSDimitry Andric case 3: ch += *source++; ch <<= 6;
566d88c1a5aSDimitry Andric case 2: ch += *source++; ch <<= 6;
567d88c1a5aSDimitry Andric case 1: ch += *source++; ch <<= 6;
568d88c1a5aSDimitry Andric case 0: ch += *source++;
569d88c1a5aSDimitry Andric }
570d88c1a5aSDimitry Andric ch -= offsetsFromUTF8[extraBytesToRead];
571d88c1a5aSDimitry Andric
572d88c1a5aSDimitry Andric if (target >= targetEnd) {
573d88c1a5aSDimitry Andric source -= (extraBytesToRead+1); /* Back up source pointer! */
574d88c1a5aSDimitry Andric result = targetExhausted; break;
575d88c1a5aSDimitry Andric }
576d88c1a5aSDimitry Andric if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
577d88c1a5aSDimitry Andric /* UTF-16 surrogate values are illegal in UTF-32 */
578d88c1a5aSDimitry Andric if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
579d88c1a5aSDimitry Andric if (flags == strictConversion) {
580d88c1a5aSDimitry Andric source -= (extraBytesToRead+1); /* return to the illegal value itself */
581d88c1a5aSDimitry Andric result = sourceIllegal;
582d88c1a5aSDimitry Andric break;
583d88c1a5aSDimitry Andric } else {
584d88c1a5aSDimitry Andric *target++ = UNI_REPLACEMENT_CHAR;
585d88c1a5aSDimitry Andric }
586d88c1a5aSDimitry Andric } else {
587d88c1a5aSDimitry Andric *target++ = (UTF16)ch; /* normal case */
588d88c1a5aSDimitry Andric }
589d88c1a5aSDimitry Andric } else if (ch > UNI_MAX_UTF16) {
590d88c1a5aSDimitry Andric if (flags == strictConversion) {
591d88c1a5aSDimitry Andric result = sourceIllegal;
592d88c1a5aSDimitry Andric source -= (extraBytesToRead+1); /* return to the start */
593d88c1a5aSDimitry Andric break; /* Bail out; shouldn't continue */
594d88c1a5aSDimitry Andric } else {
595d88c1a5aSDimitry Andric *target++ = UNI_REPLACEMENT_CHAR;
596d88c1a5aSDimitry Andric }
597d88c1a5aSDimitry Andric } else {
598d88c1a5aSDimitry Andric /* target is a character in range 0xFFFF - 0x10FFFF. */
599d88c1a5aSDimitry Andric if (target + 1 >= targetEnd) {
600d88c1a5aSDimitry Andric source -= (extraBytesToRead+1); /* Back up source pointer! */
601d88c1a5aSDimitry Andric result = targetExhausted; break;
602d88c1a5aSDimitry Andric }
603d88c1a5aSDimitry Andric ch -= halfBase;
604d88c1a5aSDimitry Andric *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
605d88c1a5aSDimitry Andric *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
606d88c1a5aSDimitry Andric }
607d88c1a5aSDimitry Andric }
608d88c1a5aSDimitry Andric *sourceStart = source;
609d88c1a5aSDimitry Andric *targetStart = target;
610d88c1a5aSDimitry Andric return result;
611d88c1a5aSDimitry Andric }
612d88c1a5aSDimitry Andric
613d88c1a5aSDimitry Andric /* --------------------------------------------------------------------- */
614d88c1a5aSDimitry Andric
ConvertUTF8toUTF32Impl(const UTF8 ** sourceStart,const UTF8 * sourceEnd,UTF32 ** targetStart,UTF32 * targetEnd,ConversionFlags flags,Boolean InputIsPartial)615d88c1a5aSDimitry Andric static ConversionResult ConvertUTF8toUTF32Impl(
616d88c1a5aSDimitry Andric const UTF8** sourceStart, const UTF8* sourceEnd,
617d88c1a5aSDimitry Andric UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags,
618d88c1a5aSDimitry Andric Boolean InputIsPartial) {
619d88c1a5aSDimitry Andric ConversionResult result = conversionOK;
620d88c1a5aSDimitry Andric const UTF8* source = *sourceStart;
621d88c1a5aSDimitry Andric UTF32* target = *targetStart;
622d88c1a5aSDimitry Andric while (source < sourceEnd) {
623d88c1a5aSDimitry Andric UTF32 ch = 0;
624d88c1a5aSDimitry Andric unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
625d88c1a5aSDimitry Andric if (extraBytesToRead >= sourceEnd - source) {
626d88c1a5aSDimitry Andric if (flags == strictConversion || InputIsPartial) {
627d88c1a5aSDimitry Andric result = sourceExhausted;
628d88c1a5aSDimitry Andric break;
629d88c1a5aSDimitry Andric } else {
630d88c1a5aSDimitry Andric result = sourceIllegal;
631d88c1a5aSDimitry Andric
632d88c1a5aSDimitry Andric /*
633d88c1a5aSDimitry Andric * Replace the maximal subpart of ill-formed sequence with
634d88c1a5aSDimitry Andric * replacement character.
635d88c1a5aSDimitry Andric */
636d88c1a5aSDimitry Andric source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
637d88c1a5aSDimitry Andric sourceEnd);
638d88c1a5aSDimitry Andric *target++ = UNI_REPLACEMENT_CHAR;
639d88c1a5aSDimitry Andric continue;
640d88c1a5aSDimitry Andric }
641d88c1a5aSDimitry Andric }
642d88c1a5aSDimitry Andric if (target >= targetEnd) {
643d88c1a5aSDimitry Andric result = targetExhausted; break;
644d88c1a5aSDimitry Andric }
645d88c1a5aSDimitry Andric
646d88c1a5aSDimitry Andric /* Do this check whether lenient or strict */
647d88c1a5aSDimitry Andric if (!isLegalUTF8(source, extraBytesToRead+1)) {
648d88c1a5aSDimitry Andric result = sourceIllegal;
649d88c1a5aSDimitry Andric if (flags == strictConversion) {
650d88c1a5aSDimitry Andric /* Abort conversion. */
651d88c1a5aSDimitry Andric break;
652d88c1a5aSDimitry Andric } else {
653d88c1a5aSDimitry Andric /*
654d88c1a5aSDimitry Andric * Replace the maximal subpart of ill-formed sequence with
655d88c1a5aSDimitry Andric * replacement character.
656d88c1a5aSDimitry Andric */
657d88c1a5aSDimitry Andric source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
658d88c1a5aSDimitry Andric sourceEnd);
659d88c1a5aSDimitry Andric *target++ = UNI_REPLACEMENT_CHAR;
660d88c1a5aSDimitry Andric continue;
661d88c1a5aSDimitry Andric }
662d88c1a5aSDimitry Andric }
663d88c1a5aSDimitry Andric /*
664d88c1a5aSDimitry Andric * The cases all fall through. See "Note A" below.
665d88c1a5aSDimitry Andric */
666d88c1a5aSDimitry Andric switch (extraBytesToRead) {
667d88c1a5aSDimitry Andric case 5: ch += *source++; ch <<= 6;
668d88c1a5aSDimitry Andric case 4: ch += *source++; ch <<= 6;
669d88c1a5aSDimitry Andric case 3: ch += *source++; ch <<= 6;
670d88c1a5aSDimitry Andric case 2: ch += *source++; ch <<= 6;
671d88c1a5aSDimitry Andric case 1: ch += *source++; ch <<= 6;
672d88c1a5aSDimitry Andric case 0: ch += *source++;
673d88c1a5aSDimitry Andric }
674d88c1a5aSDimitry Andric ch -= offsetsFromUTF8[extraBytesToRead];
675d88c1a5aSDimitry Andric
676d88c1a5aSDimitry Andric if (ch <= UNI_MAX_LEGAL_UTF32) {
677d88c1a5aSDimitry Andric /*
678d88c1a5aSDimitry Andric * UTF-16 surrogate values are illegal in UTF-32, and anything
679d88c1a5aSDimitry Andric * over Plane 17 (> 0x10FFFF) is illegal.
680d88c1a5aSDimitry Andric */
681d88c1a5aSDimitry Andric if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
682d88c1a5aSDimitry Andric if (flags == strictConversion) {
683d88c1a5aSDimitry Andric source -= (extraBytesToRead+1); /* return to the illegal value itself */
684d88c1a5aSDimitry Andric result = sourceIllegal;
685d88c1a5aSDimitry Andric break;
686d88c1a5aSDimitry Andric } else {
687d88c1a5aSDimitry Andric *target++ = UNI_REPLACEMENT_CHAR;
688d88c1a5aSDimitry Andric }
689d88c1a5aSDimitry Andric } else {
690d88c1a5aSDimitry Andric *target++ = ch;
691d88c1a5aSDimitry Andric }
692d88c1a5aSDimitry Andric } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
693d88c1a5aSDimitry Andric result = sourceIllegal;
694d88c1a5aSDimitry Andric *target++ = UNI_REPLACEMENT_CHAR;
695d88c1a5aSDimitry Andric }
696d88c1a5aSDimitry Andric }
697d88c1a5aSDimitry Andric *sourceStart = source;
698d88c1a5aSDimitry Andric *targetStart = target;
699d88c1a5aSDimitry Andric return result;
700d88c1a5aSDimitry Andric }
701d88c1a5aSDimitry Andric
ConvertUTF8toUTF32Partial(const UTF8 ** sourceStart,const UTF8 * sourceEnd,UTF32 ** targetStart,UTF32 * targetEnd,ConversionFlags flags)702d88c1a5aSDimitry Andric ConversionResult ConvertUTF8toUTF32Partial(const UTF8 **sourceStart,
703d88c1a5aSDimitry Andric const UTF8 *sourceEnd,
704d88c1a5aSDimitry Andric UTF32 **targetStart,
705d88c1a5aSDimitry Andric UTF32 *targetEnd,
706d88c1a5aSDimitry Andric ConversionFlags flags) {
707d88c1a5aSDimitry Andric return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
708d88c1a5aSDimitry Andric flags, /*InputIsPartial=*/true);
709d88c1a5aSDimitry Andric }
710d88c1a5aSDimitry Andric
ConvertUTF8toUTF32(const UTF8 ** sourceStart,const UTF8 * sourceEnd,UTF32 ** targetStart,UTF32 * targetEnd,ConversionFlags flags)711d88c1a5aSDimitry Andric ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart,
712d88c1a5aSDimitry Andric const UTF8 *sourceEnd, UTF32 **targetStart,
713d88c1a5aSDimitry Andric UTF32 *targetEnd, ConversionFlags flags) {
714d88c1a5aSDimitry Andric return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
715d88c1a5aSDimitry Andric flags, /*InputIsPartial=*/false);
716d88c1a5aSDimitry Andric }
717d88c1a5aSDimitry Andric
718d88c1a5aSDimitry Andric /* ---------------------------------------------------------------------
719d88c1a5aSDimitry Andric
720d88c1a5aSDimitry Andric Note A.
721d88c1a5aSDimitry Andric The fall-through switches in UTF-8 reading code save a
722d88c1a5aSDimitry Andric temp variable, some decrements & conditionals. The switches
723d88c1a5aSDimitry Andric are equivalent to the following loop:
724d88c1a5aSDimitry Andric {
725d88c1a5aSDimitry Andric int tmpBytesToRead = extraBytesToRead+1;
726d88c1a5aSDimitry Andric do {
727d88c1a5aSDimitry Andric ch += *source++;
728d88c1a5aSDimitry Andric --tmpBytesToRead;
729d88c1a5aSDimitry Andric if (tmpBytesToRead) ch <<= 6;
730d88c1a5aSDimitry Andric } while (tmpBytesToRead > 0);
731d88c1a5aSDimitry Andric }
732d88c1a5aSDimitry Andric In UTF-8 writing code, the switches on "bytesToWrite" are
733d88c1a5aSDimitry Andric similarly unrolled loops.
734d88c1a5aSDimitry Andric
735d88c1a5aSDimitry Andric --------------------------------------------------------------------- */
736d88c1a5aSDimitry Andric
737d88c1a5aSDimitry Andric } // namespace llvm
738302affcbSDimitry Andric
739302affcbSDimitry Andric ConvertUTF_RESTORE_WARNINGS
740