lib/Support/ConvertUTF.cpp

9091055eSJustin Lebar/*===--- ConvertUTF.c - Universal Character Names conversions ---------------===
9091055eSJustin Lebar *
2946cd70SChandler Carruth * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
2946cd70SChandler Carruth * See https://llvm.org/LICENSE.txt for license information.
2946cd70SChandler Carruth * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
9091055eSJustin Lebar *
9091055eSJustin Lebar *===------------------------------------------------------------------------=*/
9091055eSJustin Lebar/*
9091055eSJustin Lebar * Copyright 2001-2004 Unicode, Inc.
9091055eSJustin Lebar *
9091055eSJustin Lebar * Disclaimer
9091055eSJustin Lebar *
9091055eSJustin Lebar * This source code is provided as is by Unicode, Inc. No claims are
9091055eSJustin Lebar * made as to fitness for any particular purpose. No warranties of any
9091055eSJustin Lebar * kind are expressed or implied. The recipient agrees to determine
9091055eSJustin Lebar * applicability of information provided. If this file has been
9091055eSJustin Lebar * purchased on magnetic or optical media from Unicode, Inc., the
9091055eSJustin Lebar * sole remedy for any claim will be exchange of defective media
9091055eSJustin Lebar * within 90 days of receipt.
9091055eSJustin Lebar *
9091055eSJustin Lebar * Limitations on Rights to Redistribute This Code
9091055eSJustin Lebar *
9091055eSJustin Lebar * Unicode, Inc. hereby grants the right to freely use the information
9091055eSJustin Lebar * supplied in this file in the creation of products supporting the
9091055eSJustin Lebar * Unicode Standard, and to make copies of this file in any form
9091055eSJustin Lebar * for internal or external distribution as long as this notice
9091055eSJustin Lebar * remains attached.
9091055eSJustin Lebar */
9091055eSJustin Lebar
9091055eSJustin Lebar/* ---------------------------------------------------------------------
9091055eSJustin Lebar
9091055eSJustin Lebar    Conversions between UTF32, UTF-16, and UTF-8. Source code file.
9091055eSJustin Lebar    Author: Mark E. Davis, 1994.
9091055eSJustin Lebar    Rev History: Rick McGowan, fixes & updates May 2001.
9091055eSJustin Lebar    Sept 2001: fixed const & error conditions per
9091055eSJustin Lebar        mods suggested by S. Parent & A. Lillich.
9091055eSJustin Lebar    June 2002: Tim Dodd added detection and handling of incomplete
9091055eSJustin Lebar        source sequences, enhanced error detection, added casts
9091055eSJustin Lebar        to eliminate compiler warnings.
9091055eSJustin Lebar    July 2003: slight mods to back out aggressive FFFE detection.
9091055eSJustin Lebar    Jan 2004: updated switches in from-UTF8 conversions.
9091055eSJustin Lebar    Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
9091055eSJustin Lebar
9091055eSJustin Lebar    See the header file "ConvertUTF.h" for complete documentation.
9091055eSJustin Lebar
9091055eSJustin Lebar------------------------------------------------------------------------ */
9091055eSJustin Lebar
9091055eSJustin Lebar#include "llvm/Support/ConvertUTF.h"
9091055eSJustin Lebar#ifdef CVTUTF_DEBUG
9091055eSJustin Lebar#include <stdio.h>
9091055eSJustin Lebar#endif
9091055eSJustin Lebar#include <assert.h>
9091055eSJustin Lebar
229c9c11SGalina Kistanova/*
229c9c11SGalina Kistanova * This code extensively uses fall-through switches.
229c9c11SGalina Kistanova * Keep the compiler from warning about that.
229c9c11SGalina Kistanova */
229c9c11SGalina Kistanova#if defined(__clang__) && defined(__has_warning)
229c9c11SGalina Kistanova# if __has_warning("-Wimplicit-fallthrough")
229c9c11SGalina Kistanova#  define ConvertUTF_DISABLE_WARNINGS \
229c9c11SGalina Kistanova    _Pragma("clang diagnostic push")  \
229c9c11SGalina Kistanova    _Pragma("clang diagnostic ignored \"-Wimplicit-fallthrough\"")
229c9c11SGalina Kistanova#  define ConvertUTF_RESTORE_WARNINGS \
229c9c11SGalina Kistanova    _Pragma("clang diagnostic pop")
229c9c11SGalina Kistanova# endif
229c9c11SGalina Kistanova#elif defined(__GNUC__) && __GNUC__ > 6
229c9c11SGalina Kistanova# define ConvertUTF_DISABLE_WARNINGS \
229c9c11SGalina Kistanova   _Pragma("GCC diagnostic push")    \
229c9c11SGalina Kistanova   _Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"")
229c9c11SGalina Kistanova# define ConvertUTF_RESTORE_WARNINGS \
229c9c11SGalina Kistanova   _Pragma("GCC diagnostic pop")
229c9c11SGalina Kistanova#endif
229c9c11SGalina Kistanova#ifndef ConvertUTF_DISABLE_WARNINGS
229c9c11SGalina Kistanova# define ConvertUTF_DISABLE_WARNINGS
229c9c11SGalina Kistanova#endif
229c9c11SGalina Kistanova#ifndef ConvertUTF_RESTORE_WARNINGS
229c9c11SGalina Kistanova# define ConvertUTF_RESTORE_WARNINGS
229c9c11SGalina Kistanova#endif
229c9c11SGalina Kistanova
229c9c11SGalina KistanovaConvertUTF_DISABLE_WARNINGS
229c9c11SGalina Kistanova
9091055eSJustin Lebarnamespace llvm {
9091055eSJustin Lebar
9091055eSJustin Lebarstatic const int halfShift  = 10; /* used for shifting by 10 bits */
9091055eSJustin Lebar
9091055eSJustin Lebarstatic const UTF32 halfBase = 0x0010000UL;
9091055eSJustin Lebarstatic const UTF32 halfMask = 0x3FFUL;
9091055eSJustin Lebar
9091055eSJustin Lebar#define UNI_SUR_HIGH_START  (UTF32)0xD800
9091055eSJustin Lebar#define UNI_SUR_HIGH_END    (UTF32)0xDBFF
9091055eSJustin Lebar#define UNI_SUR_LOW_START   (UTF32)0xDC00
9091055eSJustin Lebar#define UNI_SUR_LOW_END     (UTF32)0xDFFF
9091055eSJustin Lebar
9091055eSJustin Lebar/* --------------------------------------------------------------------- */
9091055eSJustin Lebar
/*
 * Index into the table below with the first byte of a UTF-8 sequence to
 * get the number of trailing bytes that are supposed to follow it.
 * Note that *legal* UTF-8 sequences can't have 4 or 5 trailing bytes. Those
 * entries are left as-is for anyone who may want to do such conversion,
 * which was allowed in earlier algorithms.
 */
9091055eSJustin Lebarstatic const char trailingBytesForUTF8[256] = {
9091055eSJustin Lebar    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
9091055eSJustin Lebar    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
9091055eSJustin Lebar    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
9091055eSJustin Lebar    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
9091055eSJustin Lebar    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
9091055eSJustin Lebar    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
9091055eSJustin Lebar    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
9091055eSJustin Lebar    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
9091055eSJustin Lebar};
9091055eSJustin Lebar
9091055eSJustin Lebar/*
9091055eSJustin Lebar * Magic values subtracted from a buffer value during UTF8 conversion.
9091055eSJustin Lebar * This table contains as many values as there might be trailing bytes
9091055eSJustin Lebar * in a UTF-8 sequence.
9091055eSJustin Lebar */
9091055eSJustin Lebarstatic const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
9091055eSJustin Lebar                     0x03C82080UL, 0xFA082080UL, 0x82082080UL };
9091055eSJustin Lebar
/*
 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
 * into the first byte, depending on how many bytes follow.  There are
 * as many entries in this table as there are UTF-8 sequence types
 * (i.e., one-byte sequence, two-byte sequence, etc.). Remember that
 * sequences for *legal* UTF-8 will be 4 or fewer bytes total.
 */
9091055eSJustin Lebarstatic const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
9091055eSJustin Lebar
9091055eSJustin Lebar/* --------------------------------------------------------------------- */
9091055eSJustin Lebar
9091055eSJustin Lebar/* The interface converts a whole buffer to avoid function-call overhead.
9091055eSJustin Lebar * Constants have been gathered. Loops & conditionals have been removed as
9091055eSJustin Lebar * much as possible for efficiency, in favor of drop-through switches.
9091055eSJustin Lebar * (See "Note A" at the bottom of the file for equivalent code.)
9091055eSJustin Lebar * If your compiler supports it, the "isLegalUTF8" call can be turned
9091055eSJustin Lebar * into an inline function.
9091055eSJustin Lebar */
9091055eSJustin Lebar
9091055eSJustin Lebar
9091055eSJustin Lebar/* --------------------------------------------------------------------- */
9091055eSJustin Lebar
9091055eSJustin LebarConversionResult ConvertUTF32toUTF16 (
9091055eSJustin Lebar        const UTF32** sourceStart, const UTF32* sourceEnd,
9091055eSJustin Lebar        UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
9091055eSJustin Lebar    ConversionResult result = conversionOK;
9091055eSJustin Lebar    const UTF32* source = *sourceStart;
9091055eSJustin Lebar    UTF16* target = *targetStart;
9091055eSJustin Lebar    while (source < sourceEnd) {
9091055eSJustin Lebar        UTF32 ch;
9091055eSJustin Lebar        if (target >= targetEnd) {
9091055eSJustin Lebar            result = targetExhausted; break;
9091055eSJustin Lebar        }
9091055eSJustin Lebar        ch = *source++;
9091055eSJustin Lebar        if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
9091055eSJustin Lebar            /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
9091055eSJustin Lebar            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
9091055eSJustin Lebar                if (flags == strictConversion) {
9091055eSJustin Lebar                    --source; /* return to the illegal value itself */
9091055eSJustin Lebar                    result = sourceIllegal;
9091055eSJustin Lebar                    break;
9091055eSJustin Lebar                } else {
9091055eSJustin Lebar                    *target++ = UNI_REPLACEMENT_CHAR;
9091055eSJustin Lebar                }
9091055eSJustin Lebar            } else {
9091055eSJustin Lebar                *target++ = (UTF16)ch; /* normal case */
9091055eSJustin Lebar            }
9091055eSJustin Lebar        } else if (ch > UNI_MAX_LEGAL_UTF32) {
9091055eSJustin Lebar            if (flags == strictConversion) {
9091055eSJustin Lebar                result = sourceIllegal;
9091055eSJustin Lebar            } else {
9091055eSJustin Lebar                *target++ = UNI_REPLACEMENT_CHAR;
9091055eSJustin Lebar            }
9091055eSJustin Lebar        } else {
9091055eSJustin Lebar            /* target is a character in range 0xFFFF - 0x10FFFF. */
9091055eSJustin Lebar            if (target + 1 >= targetEnd) {
9091055eSJustin Lebar                --source; /* Back up source pointer! */
9091055eSJustin Lebar                result = targetExhausted; break;
9091055eSJustin Lebar            }
9091055eSJustin Lebar            ch -= halfBase;
9091055eSJustin Lebar            *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
9091055eSJustin Lebar            *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
9091055eSJustin Lebar        }
9091055eSJustin Lebar    }
9091055eSJustin Lebar    *sourceStart = source;
9091055eSJustin Lebar    *targetStart = target;
9091055eSJustin Lebar    return result;
9091055eSJustin Lebar}
9091055eSJustin Lebar
9091055eSJustin Lebar/* --------------------------------------------------------------------- */
9091055eSJustin Lebar
9091055eSJustin LebarConversionResult ConvertUTF16toUTF32 (
9091055eSJustin Lebar        const UTF16** sourceStart, const UTF16* sourceEnd,
9091055eSJustin Lebar        UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
9091055eSJustin Lebar    ConversionResult result = conversionOK;
9091055eSJustin Lebar    const UTF16* source = *sourceStart;
9091055eSJustin Lebar    UTF32* target = *targetStart;
9091055eSJustin Lebar    UTF32 ch, ch2;
9091055eSJustin Lebar    while (source < sourceEnd) {
9091055eSJustin Lebar        const UTF16* oldSource = source; /*  In case we have to back up because of target overflow. */
9091055eSJustin Lebar        ch = *source++;
9091055eSJustin Lebar        /* If we have a surrogate pair, convert to UTF32 first. */
9091055eSJustin Lebar        if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
9091055eSJustin Lebar            /* If the 16 bits following the high surrogate are in the source buffer... */
9091055eSJustin Lebar            if (source < sourceEnd) {
9091055eSJustin Lebar                ch2 = *source;
9091055eSJustin Lebar                /* If it's a low surrogate, convert to UTF32. */
9091055eSJustin Lebar                if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
9091055eSJustin Lebar                    ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
9091055eSJustin Lebar                        + (ch2 - UNI_SUR_LOW_START) + halfBase;
9091055eSJustin Lebar                    ++source;
9091055eSJustin Lebar                } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
9091055eSJustin Lebar                    --source; /* return to the illegal value itself */
9091055eSJustin Lebar                    result = sourceIllegal;
9091055eSJustin Lebar                    break;
9091055eSJustin Lebar                }
9091055eSJustin Lebar            } else { /* We don't have the 16 bits following the high surrogate. */
9091055eSJustin Lebar                --source; /* return to the high surrogate */
9091055eSJustin Lebar                result = sourceExhausted;
9091055eSJustin Lebar                break;
9091055eSJustin Lebar            }
9091055eSJustin Lebar        } else if (flags == strictConversion) {
9091055eSJustin Lebar            /* UTF-16 surrogate values are illegal in UTF-32 */
9091055eSJustin Lebar            if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
9091055eSJustin Lebar                --source; /* return to the illegal value itself */
9091055eSJustin Lebar                result = sourceIllegal;
9091055eSJustin Lebar                break;
9091055eSJustin Lebar            }
9091055eSJustin Lebar        }
9091055eSJustin Lebar        if (target >= targetEnd) {
9091055eSJustin Lebar            source = oldSource; /* Back up source pointer! */
9091055eSJustin Lebar            result = targetExhausted; break;
9091055eSJustin Lebar        }
9091055eSJustin Lebar        *target++ = ch;
9091055eSJustin Lebar    }
9091055eSJustin Lebar    *sourceStart = source;
9091055eSJustin Lebar    *targetStart = target;
9091055eSJustin Lebar#ifdef CVTUTF_DEBUG
9091055eSJustin Lebarif (result == sourceIllegal) {
9091055eSJustin Lebar    fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
9091055eSJustin Lebar    fflush(stderr);
9091055eSJustin Lebar}
9091055eSJustin Lebar#endif
9091055eSJustin Lebar    return result;
9091055eSJustin Lebar}
9091055eSJustin LebarConversionResult ConvertUTF16toUTF8 (
9091055eSJustin Lebar        const UTF16** sourceStart, const UTF16* sourceEnd,
9091055eSJustin Lebar        UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
9091055eSJustin Lebar    ConversionResult result = conversionOK;
9091055eSJustin Lebar    const UTF16* source = *sourceStart;
9091055eSJustin Lebar    UTF8* target = *targetStart;
9091055eSJustin Lebar    while (source < sourceEnd) {
9091055eSJustin Lebar        UTF32 ch;
9091055eSJustin Lebar        unsigned short bytesToWrite = 0;
9091055eSJustin Lebar        const UTF32 byteMask = 0xBF;
9091055eSJustin Lebar        const UTF32 byteMark = 0x80;
9091055eSJustin Lebar        const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
9091055eSJustin Lebar        ch = *source++;
9091055eSJustin Lebar        /* If we have a surrogate pair, convert to UTF32 first. */
9091055eSJustin Lebar        if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
9091055eSJustin Lebar            /* If the 16 bits following the high surrogate are in the source buffer... */
9091055eSJustin Lebar            if (source < sourceEnd) {
9091055eSJustin Lebar                UTF32 ch2 = *source;
9091055eSJustin Lebar                /* If it's a low surrogate, convert to UTF32. */
9091055eSJustin Lebar                if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
9091055eSJustin Lebar                    ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
9091055eSJustin Lebar                        + (ch2 - UNI_SUR_LOW_START) + halfBase;
9091055eSJustin Lebar                    ++source;
9091055eSJustin Lebar                } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
9091055eSJustin Lebar                    --source; /* return to the illegal value itself */
9091055eSJustin Lebar                    result = sourceIllegal;
9091055eSJustin Lebar                    break;
9091055eSJustin Lebar                }
9091055eSJustin Lebar            } else { /* We don't have the 16 bits following the high surrogate. */
9091055eSJustin Lebar                --source; /* return to the high surrogate */
9091055eSJustin Lebar                result = sourceExhausted;
9091055eSJustin Lebar                break;
9091055eSJustin Lebar            }
9091055eSJustin Lebar        } else if (flags == strictConversion) {
9091055eSJustin Lebar            /* UTF-16 surrogate values are illegal in UTF-32 */
9091055eSJustin Lebar            if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
9091055eSJustin Lebar                --source; /* return to the illegal value itself */
9091055eSJustin Lebar                result = sourceIllegal;
9091055eSJustin Lebar                break;
9091055eSJustin Lebar            }
9091055eSJustin Lebar        }
9091055eSJustin Lebar        /* Figure out how many bytes the result will require */
9091055eSJustin Lebar        if (ch < (UTF32)0x80) {      bytesToWrite = 1;
9091055eSJustin Lebar        } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
9091055eSJustin Lebar        } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
9091055eSJustin Lebar        } else if (ch < (UTF32)0x110000) {  bytesToWrite = 4;
9091055eSJustin Lebar        } else {                            bytesToWrite = 3;
9091055eSJustin Lebar                                            ch = UNI_REPLACEMENT_CHAR;
9091055eSJustin Lebar        }
9091055eSJustin Lebar
9091055eSJustin Lebar        target += bytesToWrite;
9091055eSJustin Lebar        if (target > targetEnd) {
9091055eSJustin Lebar            source = oldSource; /* Back up source pointer! */
9091055eSJustin Lebar            target -= bytesToWrite; result = targetExhausted; break;
9091055eSJustin Lebar        }
9091055eSJustin Lebar        switch (bytesToWrite) { /* note: everything falls through. */
9091055eSJustin Lebar            case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
9091055eSJustin Lebar            case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
9091055eSJustin Lebar            case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
9091055eSJustin Lebar            case 1: *--target =  (UTF8)(ch | firstByteMark[bytesToWrite]);
9091055eSJustin Lebar        }
9091055eSJustin Lebar        target += bytesToWrite;
9091055eSJustin Lebar    }
9091055eSJustin Lebar    *sourceStart = source;
9091055eSJustin Lebar    *targetStart = target;
9091055eSJustin Lebar    return result;
9091055eSJustin Lebar}
9091055eSJustin Lebar
9091055eSJustin Lebar/* --------------------------------------------------------------------- */
9091055eSJustin Lebar
9091055eSJustin LebarConversionResult ConvertUTF32toUTF8 (
9091055eSJustin Lebar        const UTF32** sourceStart, const UTF32* sourceEnd,
9091055eSJustin Lebar        UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
9091055eSJustin Lebar    ConversionResult result = conversionOK;
9091055eSJustin Lebar    const UTF32* source = *sourceStart;
9091055eSJustin Lebar    UTF8* target = *targetStart;
9091055eSJustin Lebar    while (source < sourceEnd) {
9091055eSJustin Lebar        UTF32 ch;
9091055eSJustin Lebar        unsigned short bytesToWrite = 0;
9091055eSJustin Lebar        const UTF32 byteMask = 0xBF;
9091055eSJustin Lebar        const UTF32 byteMark = 0x80;
9091055eSJustin Lebar        ch = *source++;
9091055eSJustin Lebar        if (flags == strictConversion ) {
9091055eSJustin Lebar            /* UTF-16 surrogate values are illegal in UTF-32 */
9091055eSJustin Lebar            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
9091055eSJustin Lebar                --source; /* return to the illegal value itself */
9091055eSJustin Lebar                result = sourceIllegal;
9091055eSJustin Lebar                break;
9091055eSJustin Lebar            }
9091055eSJustin Lebar        }
9091055eSJustin Lebar        /*
9091055eSJustin Lebar         * Figure out how many bytes the result will require. Turn any
9091055eSJustin Lebar         * illegally large UTF32 things (> Plane 17) into replacement chars.
9091055eSJustin Lebar         */
9091055eSJustin Lebar        if (ch < (UTF32)0x80) {      bytesToWrite = 1;
9091055eSJustin Lebar        } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
9091055eSJustin Lebar        } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
9091055eSJustin Lebar        } else if (ch <= UNI_MAX_LEGAL_UTF32) {  bytesToWrite = 4;
9091055eSJustin Lebar        } else {                            bytesToWrite = 3;
9091055eSJustin Lebar                                            ch = UNI_REPLACEMENT_CHAR;
9091055eSJustin Lebar                                            result = sourceIllegal;
9091055eSJustin Lebar        }
9091055eSJustin Lebar
9091055eSJustin Lebar        target += bytesToWrite;
9091055eSJustin Lebar        if (target > targetEnd) {
9091055eSJustin Lebar            --source; /* Back up source pointer! */
9091055eSJustin Lebar            target -= bytesToWrite; result = targetExhausted; break;
9091055eSJustin Lebar        }
9091055eSJustin Lebar        switch (bytesToWrite) { /* note: everything falls through. */
9091055eSJustin Lebar            case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
9091055eSJustin Lebar            case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
9091055eSJustin Lebar            case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
9091055eSJustin Lebar            case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
9091055eSJustin Lebar        }
9091055eSJustin Lebar        target += bytesToWrite;
9091055eSJustin Lebar    }
9091055eSJustin Lebar    *sourceStart = source;
9091055eSJustin Lebar    *targetStart = target;
9091055eSJustin Lebar    return result;
9091055eSJustin Lebar}
9091055eSJustin Lebar
9091055eSJustin Lebar/* --------------------------------------------------------------------- */
9091055eSJustin Lebar
9091055eSJustin Lebar/*
9091055eSJustin Lebar * Utility routine to tell whether a sequence of bytes is legal UTF-8.
9091055eSJustin Lebar * This must be called with the length pre-determined by the first byte.
9091055eSJustin Lebar * If not calling this from ConvertUTF8to*, then the length can be set by:
9091055eSJustin Lebar *  length = trailingBytesForUTF8[*source]+1;
9091055eSJustin Lebar * and the sequence is illegal right away if there aren't that many bytes
9091055eSJustin Lebar * available.
9091055eSJustin Lebar * If presented with a length > 4, this returns false.  The Unicode
9091055eSJustin Lebar * definition of UTF-8 goes up to 4-byte sequences.
9091055eSJustin Lebar */
9091055eSJustin Lebar
9091055eSJustin Lebarstatic Boolean isLegalUTF8(const UTF8 *source, int length) {
9091055eSJustin Lebar    UTF8 a;
9091055eSJustin Lebar    const UTF8 *srcptr = source+length;
9091055eSJustin Lebar    switch (length) {
9091055eSJustin Lebar    default: return false;
9091055eSJustin Lebar        /* Everything else falls through when "true"... */
9091055eSJustin Lebar    case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
9091055eSJustin Lebar    case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
9091055eSJustin Lebar    case 2: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
9091055eSJustin Lebar
9091055eSJustin Lebar        switch (*source) {
9091055eSJustin Lebar            /* no fall-through in this inner switch */
9091055eSJustin Lebar            case 0xE0: if (a < 0xA0) return false; break;
9091055eSJustin Lebar            case 0xED: if (a > 0x9F) return false; break;
9091055eSJustin Lebar            case 0xF0: if (a < 0x90) return false; break;
9091055eSJustin Lebar            case 0xF4: if (a > 0x8F) return false; break;
9091055eSJustin Lebar            default:   if (a < 0x80) return false;
9091055eSJustin Lebar        }
9091055eSJustin Lebar
9091055eSJustin Lebar    case 1: if (*source >= 0x80 && *source < 0xC2) return false;
9091055eSJustin Lebar    }
9091055eSJustin Lebar    if (*source > 0xF4) return false;
9091055eSJustin Lebar    return true;
9091055eSJustin Lebar}
9091055eSJustin Lebar
9091055eSJustin Lebar/* --------------------------------------------------------------------- */
9091055eSJustin Lebar
9091055eSJustin Lebar/*
9091055eSJustin Lebar * Exported function to return whether a UTF-8 sequence is legal or not.
9091055eSJustin Lebar * This is not used here; it's just exported.
9091055eSJustin Lebar */
9091055eSJustin LebarBoolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
9091055eSJustin Lebar    int length = trailingBytesForUTF8[*source]+1;
9091055eSJustin Lebar    if (length > sourceEnd - source) {
9091055eSJustin Lebar        return false;
9091055eSJustin Lebar    }
9091055eSJustin Lebar    return isLegalUTF8(source, length);
9091055eSJustin Lebar}
9091055eSJustin Lebar
*d4892a16SCorentin Jabot/*
*d4892a16SCorentin Jabot * Exported function to return the size of the first utf-8 code unit sequence,
*d4892a16SCorentin Jabot * Or 0 if the sequence is not valid;
*d4892a16SCorentin Jabot */
*d4892a16SCorentin Jabotunsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd) {
*d4892a16SCorentin Jabot  int length = trailingBytesForUTF8[*source] + 1;
*d4892a16SCorentin Jabot  return (length <= sourceEnd - source && isLegalUTF8(source, length)) ? length
*d4892a16SCorentin Jabot                                                                       : 0;
*d4892a16SCorentin Jabot}
*d4892a16SCorentin Jabot
9091055eSJustin Lebar/* --------------------------------------------------------------------- */
9091055eSJustin Lebar
9091055eSJustin Lebarstatic unsigned
9091055eSJustin LebarfindMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 *source,
9091055eSJustin Lebar                                          const UTF8 *sourceEnd) {
9091055eSJustin Lebar  UTF8 b1, b2, b3;
9091055eSJustin Lebar
9091055eSJustin Lebar  assert(!isLegalUTF8Sequence(source, sourceEnd));
9091055eSJustin Lebar
9091055eSJustin Lebar  /*
9091055eSJustin Lebar   * Unicode 6.3.0, D93b:
9091055eSJustin Lebar   *
9091055eSJustin Lebar   *   Maximal subpart of an ill-formed subsequence: The longest code unit
9091055eSJustin Lebar   *   subsequence starting at an unconvertible offset that is either:
9091055eSJustin Lebar   *   a. the initial subsequence of a well-formed code unit sequence, or
9091055eSJustin Lebar   *   b. a subsequence of length one.
9091055eSJustin Lebar   */
9091055eSJustin Lebar
9091055eSJustin Lebar  if (source == sourceEnd)
9091055eSJustin Lebar    return 0;
9091055eSJustin Lebar
9091055eSJustin Lebar  /*
9091055eSJustin Lebar   * Perform case analysis.  See Unicode 6.3.0, Table 3-7. Well-Formed UTF-8
9091055eSJustin Lebar   * Byte Sequences.
9091055eSJustin Lebar   */
9091055eSJustin Lebar
9091055eSJustin Lebar  b1 = *source;
9091055eSJustin Lebar  ++source;
9091055eSJustin Lebar  if (b1 >= 0xC2 && b1 <= 0xDF) {
9091055eSJustin Lebar    /*
9091055eSJustin Lebar     * First byte is valid, but we know that this code unit sequence is
9091055eSJustin Lebar     * invalid, so the maximal subpart has to end after the first byte.
9091055eSJustin Lebar     */
9091055eSJustin Lebar    return 1;
9091055eSJustin Lebar  }
9091055eSJustin Lebar
9091055eSJustin Lebar  if (source == sourceEnd)
9091055eSJustin Lebar    return 1;
9091055eSJustin Lebar
9091055eSJustin Lebar  b2 = *source;
9091055eSJustin Lebar  ++source;
9091055eSJustin Lebar
9091055eSJustin Lebar  if (b1 == 0xE0) {
9091055eSJustin Lebar    return (b2 >= 0xA0 && b2 <= 0xBF) ? 2 : 1;
9091055eSJustin Lebar  }
9091055eSJustin Lebar  if (b1 >= 0xE1 && b1 <= 0xEC) {
9091055eSJustin Lebar    return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
9091055eSJustin Lebar  }
9091055eSJustin Lebar  if (b1 == 0xED) {
9091055eSJustin Lebar    return (b2 >= 0x80 && b2 <= 0x9F) ? 2 : 1;
9091055eSJustin Lebar  }
9091055eSJustin Lebar  if (b1 >= 0xEE && b1 <= 0xEF) {
9091055eSJustin Lebar    return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
9091055eSJustin Lebar  }
9091055eSJustin Lebar  if (b1 == 0xF0) {
9091055eSJustin Lebar    if (b2 >= 0x90 && b2 <= 0xBF) {
9091055eSJustin Lebar      if (source == sourceEnd)
9091055eSJustin Lebar        return 2;
9091055eSJustin Lebar
9091055eSJustin Lebar      b3 = *source;
9091055eSJustin Lebar      return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
9091055eSJustin Lebar    }
9091055eSJustin Lebar    return 1;
9091055eSJustin Lebar  }
9091055eSJustin Lebar  if (b1 >= 0xF1 && b1 <= 0xF3) {
9091055eSJustin Lebar    if (b2 >= 0x80 && b2 <= 0xBF) {
9091055eSJustin Lebar      if (source == sourceEnd)
9091055eSJustin Lebar        return 2;
9091055eSJustin Lebar
9091055eSJustin Lebar      b3 = *source;
9091055eSJustin Lebar      return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
9091055eSJustin Lebar    }
9091055eSJustin Lebar    return 1;
9091055eSJustin Lebar  }
9091055eSJustin Lebar  if (b1 == 0xF4) {
9091055eSJustin Lebar    if (b2 >= 0x80 && b2 <= 0x8F) {
9091055eSJustin Lebar      if (source == sourceEnd)
9091055eSJustin Lebar        return 2;
9091055eSJustin Lebar
9091055eSJustin Lebar      b3 = *source;
9091055eSJustin Lebar      return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
9091055eSJustin Lebar    }
9091055eSJustin Lebar    return 1;
9091055eSJustin Lebar  }
9091055eSJustin Lebar
9091055eSJustin Lebar  assert((b1 >= 0x80 && b1 <= 0xC1) || b1 >= 0xF5);
9091055eSJustin Lebar  /*
9091055eSJustin Lebar   * There are no valid sequences that start with these bytes.  Maximal subpart
9091055eSJustin Lebar   * is defined to have length 1 in these cases.
9091055eSJustin Lebar   */
9091055eSJustin Lebar  return 1;
9091055eSJustin Lebar}
9091055eSJustin Lebar
9091055eSJustin Lebar/* --------------------------------------------------------------------- */
9091055eSJustin Lebar
9091055eSJustin Lebar/*
9091055eSJustin Lebar * Exported function to return the total number of bytes in a codepoint
9091055eSJustin Lebar * represented in UTF-8, given the value of the first byte.
9091055eSJustin Lebar */
9091055eSJustin Lebarunsigned getNumBytesForUTF8(UTF8 first) {
9091055eSJustin Lebar  return trailingBytesForUTF8[first] + 1;
9091055eSJustin Lebar}
9091055eSJustin Lebar
9091055eSJustin Lebar/* --------------------------------------------------------------------- */
9091055eSJustin Lebar
9091055eSJustin Lebar/*
9091055eSJustin Lebar * Exported function to return whether a UTF-8 string is legal or not.
9091055eSJustin Lebar * This is not used here; it's just exported.
9091055eSJustin Lebar */
9091055eSJustin LebarBoolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd) {
9091055eSJustin Lebar    while (*source != sourceEnd) {
9091055eSJustin Lebar        int length = trailingBytesForUTF8[**source] + 1;
9091055eSJustin Lebar        if (length > sourceEnd - *source || !isLegalUTF8(*source, length))
9091055eSJustin Lebar            return false;
9091055eSJustin Lebar        *source += length;
9091055eSJustin Lebar    }
9091055eSJustin Lebar    return true;
9091055eSJustin Lebar}
9091055eSJustin Lebar
9091055eSJustin Lebar/* --------------------------------------------------------------------- */
9091055eSJustin Lebar
9091055eSJustin LebarConversionResult ConvertUTF8toUTF16 (
9091055eSJustin Lebar        const UTF8** sourceStart, const UTF8* sourceEnd,
9091055eSJustin Lebar        UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
9091055eSJustin Lebar    ConversionResult result = conversionOK;
9091055eSJustin Lebar    const UTF8* source = *sourceStart;
9091055eSJustin Lebar    UTF16* target = *targetStart;
9091055eSJustin Lebar    while (source < sourceEnd) {
9091055eSJustin Lebar        UTF32 ch = 0;
9091055eSJustin Lebar        unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
9091055eSJustin Lebar        if (extraBytesToRead >= sourceEnd - source) {
9091055eSJustin Lebar            result = sourceExhausted; break;
9091055eSJustin Lebar        }
9091055eSJustin Lebar        /* Do this check whether lenient or strict */
9091055eSJustin Lebar        if (!isLegalUTF8(source, extraBytesToRead+1)) {
9091055eSJustin Lebar            result = sourceIllegal;
9091055eSJustin Lebar            break;
9091055eSJustin Lebar        }
9091055eSJustin Lebar        /*
9091055eSJustin Lebar         * The cases all fall through. See "Note A" below.
9091055eSJustin Lebar         */
9091055eSJustin Lebar        switch (extraBytesToRead) {
9091055eSJustin Lebar            case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
9091055eSJustin Lebar            case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
9091055eSJustin Lebar            case 3: ch += *source++; ch <<= 6;
9091055eSJustin Lebar            case 2: ch += *source++; ch <<= 6;
9091055eSJustin Lebar            case 1: ch += *source++; ch <<= 6;
9091055eSJustin Lebar            case 0: ch += *source++;
9091055eSJustin Lebar        }
9091055eSJustin Lebar        ch -= offsetsFromUTF8[extraBytesToRead];
9091055eSJustin Lebar
9091055eSJustin Lebar        if (target >= targetEnd) {
9091055eSJustin Lebar            source -= (extraBytesToRead+1); /* Back up source pointer! */
9091055eSJustin Lebar            result = targetExhausted; break;
9091055eSJustin Lebar        }
9091055eSJustin Lebar        if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
9091055eSJustin Lebar            /* UTF-16 surrogate values are illegal in UTF-32 */
9091055eSJustin Lebar            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
9091055eSJustin Lebar                if (flags == strictConversion) {
9091055eSJustin Lebar                    source -= (extraBytesToRead+1); /* return to the illegal value itself */
9091055eSJustin Lebar                    result = sourceIllegal;
9091055eSJustin Lebar                    break;
9091055eSJustin Lebar                } else {
9091055eSJustin Lebar                    *target++ = UNI_REPLACEMENT_CHAR;
9091055eSJustin Lebar                }
9091055eSJustin Lebar            } else {
9091055eSJustin Lebar                *target++ = (UTF16)ch; /* normal case */
9091055eSJustin Lebar            }
9091055eSJustin Lebar        } else if (ch > UNI_MAX_UTF16) {
9091055eSJustin Lebar            if (flags == strictConversion) {
9091055eSJustin Lebar                result = sourceIllegal;
9091055eSJustin Lebar                source -= (extraBytesToRead+1); /* return to the start */
9091055eSJustin Lebar                break; /* Bail out; shouldn't continue */
9091055eSJustin Lebar            } else {
9091055eSJustin Lebar                *target++ = UNI_REPLACEMENT_CHAR;
9091055eSJustin Lebar            }
9091055eSJustin Lebar        } else {
9091055eSJustin Lebar            /* target is a character in range 0xFFFF - 0x10FFFF. */
9091055eSJustin Lebar            if (target + 1 >= targetEnd) {
9091055eSJustin Lebar                source -= (extraBytesToRead+1); /* Back up source pointer! */
9091055eSJustin Lebar                result = targetExhausted; break;
9091055eSJustin Lebar            }
9091055eSJustin Lebar            ch -= halfBase;
9091055eSJustin Lebar            *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
9091055eSJustin Lebar            *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
9091055eSJustin Lebar        }
9091055eSJustin Lebar    }
9091055eSJustin Lebar    *sourceStart = source;
9091055eSJustin Lebar    *targetStart = target;
9091055eSJustin Lebar    return result;
9091055eSJustin Lebar}
9091055eSJustin Lebar
9091055eSJustin Lebar/* --------------------------------------------------------------------- */
9091055eSJustin Lebar
9091055eSJustin Lebarstatic ConversionResult ConvertUTF8toUTF32Impl(
9091055eSJustin Lebar        const UTF8** sourceStart, const UTF8* sourceEnd,
9091055eSJustin Lebar        UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags,
9091055eSJustin Lebar        Boolean InputIsPartial) {
9091055eSJustin Lebar    ConversionResult result = conversionOK;
9091055eSJustin Lebar    const UTF8* source = *sourceStart;
9091055eSJustin Lebar    UTF32* target = *targetStart;
9091055eSJustin Lebar    while (source < sourceEnd) {
9091055eSJustin Lebar        UTF32 ch = 0;
9091055eSJustin Lebar        unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
9091055eSJustin Lebar        if (extraBytesToRead >= sourceEnd - source) {
9091055eSJustin Lebar            if (flags == strictConversion || InputIsPartial) {
9091055eSJustin Lebar                result = sourceExhausted;
9091055eSJustin Lebar                break;
9091055eSJustin Lebar            } else {
9091055eSJustin Lebar                result = sourceIllegal;
9091055eSJustin Lebar
9091055eSJustin Lebar                /*
9091055eSJustin Lebar                 * Replace the maximal subpart of ill-formed sequence with
9091055eSJustin Lebar                 * replacement character.
9091055eSJustin Lebar                 */
9091055eSJustin Lebar                source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
9091055eSJustin Lebar                                                                    sourceEnd);
9091055eSJustin Lebar                *target++ = UNI_REPLACEMENT_CHAR;
9091055eSJustin Lebar                continue;
9091055eSJustin Lebar            }
9091055eSJustin Lebar        }
9091055eSJustin Lebar        if (target >= targetEnd) {
9091055eSJustin Lebar            result = targetExhausted; break;
9091055eSJustin Lebar        }
9091055eSJustin Lebar
9091055eSJustin Lebar        /* Do this check whether lenient or strict */
9091055eSJustin Lebar        if (!isLegalUTF8(source, extraBytesToRead+1)) {
9091055eSJustin Lebar            result = sourceIllegal;
9091055eSJustin Lebar            if (flags == strictConversion) {
9091055eSJustin Lebar                /* Abort conversion. */
9091055eSJustin Lebar                break;
9091055eSJustin Lebar            } else {
9091055eSJustin Lebar                /*
9091055eSJustin Lebar                 * Replace the maximal subpart of ill-formed sequence with
9091055eSJustin Lebar                 * replacement character.
9091055eSJustin Lebar                 */
9091055eSJustin Lebar                source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
9091055eSJustin Lebar                                                                    sourceEnd);
9091055eSJustin Lebar                *target++ = UNI_REPLACEMENT_CHAR;
9091055eSJustin Lebar                continue;
9091055eSJustin Lebar            }
9091055eSJustin Lebar        }
9091055eSJustin Lebar        /*
9091055eSJustin Lebar         * The cases all fall through. See "Note A" below.
9091055eSJustin Lebar         */
9091055eSJustin Lebar        switch (extraBytesToRead) {
9091055eSJustin Lebar            case 5: ch += *source++; ch <<= 6;
9091055eSJustin Lebar            case 4: ch += *source++; ch <<= 6;
9091055eSJustin Lebar            case 3: ch += *source++; ch <<= 6;
9091055eSJustin Lebar            case 2: ch += *source++; ch <<= 6;
9091055eSJustin Lebar            case 1: ch += *source++; ch <<= 6;
9091055eSJustin Lebar            case 0: ch += *source++;
9091055eSJustin Lebar        }
9091055eSJustin Lebar        ch -= offsetsFromUTF8[extraBytesToRead];
9091055eSJustin Lebar
9091055eSJustin Lebar        if (ch <= UNI_MAX_LEGAL_UTF32) {
9091055eSJustin Lebar            /*
9091055eSJustin Lebar             * UTF-16 surrogate values are illegal in UTF-32, and anything
9091055eSJustin Lebar             * over Plane 17 (> 0x10FFFF) is illegal.
9091055eSJustin Lebar             */
9091055eSJustin Lebar            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
9091055eSJustin Lebar                if (flags == strictConversion) {
9091055eSJustin Lebar                    source -= (extraBytesToRead+1); /* return to the illegal value itself */
9091055eSJustin Lebar                    result = sourceIllegal;
9091055eSJustin Lebar                    break;
9091055eSJustin Lebar                } else {
9091055eSJustin Lebar                    *target++ = UNI_REPLACEMENT_CHAR;
9091055eSJustin Lebar                }
9091055eSJustin Lebar            } else {
9091055eSJustin Lebar                *target++ = ch;
9091055eSJustin Lebar            }
9091055eSJustin Lebar        } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
9091055eSJustin Lebar            result = sourceIllegal;
9091055eSJustin Lebar            *target++ = UNI_REPLACEMENT_CHAR;
9091055eSJustin Lebar        }
9091055eSJustin Lebar    }
9091055eSJustin Lebar    *sourceStart = source;
9091055eSJustin Lebar    *targetStart = target;
9091055eSJustin Lebar    return result;
9091055eSJustin Lebar}
9091055eSJustin Lebar
9091055eSJustin LebarConversionResult ConvertUTF8toUTF32Partial(const UTF8 **sourceStart,
9091055eSJustin Lebar                                           const UTF8 *sourceEnd,
9091055eSJustin Lebar                                           UTF32 **targetStart,
9091055eSJustin Lebar                                           UTF32 *targetEnd,
9091055eSJustin Lebar                                           ConversionFlags flags) {
9091055eSJustin Lebar  return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
9091055eSJustin Lebar                                flags, /*InputIsPartial=*/true);
9091055eSJustin Lebar}
9091055eSJustin Lebar
9091055eSJustin LebarConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart,
9091055eSJustin Lebar                                    const UTF8 *sourceEnd, UTF32 **targetStart,
9091055eSJustin Lebar                                    UTF32 *targetEnd, ConversionFlags flags) {
9091055eSJustin Lebar  return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
9091055eSJustin Lebar                                flags, /*InputIsPartial=*/false);
9091055eSJustin Lebar}
9091055eSJustin Lebar
9091055eSJustin Lebar/* ---------------------------------------------------------------------
9091055eSJustin Lebar
9091055eSJustin Lebar    Note A.
9091055eSJustin Lebar    The fall-through switches in UTF-8 reading code save a
9091055eSJustin Lebar    temp variable, some decrements & conditionals.  The switches
9091055eSJustin Lebar    are equivalent to the following loop:
9091055eSJustin Lebar        {
9091055eSJustin Lebar            int tmpBytesToRead = extraBytesToRead+1;
9091055eSJustin Lebar            do {
9091055eSJustin Lebar                ch += *source++;
9091055eSJustin Lebar                --tmpBytesToRead;
9091055eSJustin Lebar                if (tmpBytesToRead) ch <<= 6;
9091055eSJustin Lebar            } while (tmpBytesToRead > 0);
9091055eSJustin Lebar        }
9091055eSJustin Lebar    In UTF-8 writing code, the switches on "bytesToWrite" are
9091055eSJustin Lebar    similarly unrolled loops.
9091055eSJustin Lebar
9091055eSJustin Lebar   --------------------------------------------------------------------- */
9091055eSJustin Lebar
9091055eSJustin Lebar} // namespace llvm
229c9c11SGalina Kistanova
229c9c11SGalina KistanovaConvertUTF_RESTORE_WARNINGS