1 //===-- ConvertUTFWrapper.cpp - Wrap ConvertUTF.h with clang data types -----=== 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 10 #include "llvm/Support/ConvertUTF.h" 11 #include "llvm/Support/ErrorHandling.h" 12 #include "llvm/Support/SwapByteOrder.h" 13 #include <string> 14 #include <vector> 15 16 namespace llvm { 17 18 bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source, 19 char *&ResultPtr, const UTF8 *&ErrorPtr) { 20 assert(WideCharWidth == 1 || WideCharWidth == 2 || WideCharWidth == 4); 21 ConversionResult result = conversionOK; 22 // Copy the character span over. 23 if (WideCharWidth == 1) { 24 const UTF8 *Pos = reinterpret_cast<const UTF8*>(Source.begin()); 25 if (!isLegalUTF8String(&Pos, reinterpret_cast<const UTF8*>(Source.end()))) { 26 result = sourceIllegal; 27 ErrorPtr = Pos; 28 } else { 29 memcpy(ResultPtr, Source.data(), Source.size()); 30 ResultPtr += Source.size(); 31 } 32 } else if (WideCharWidth == 2) { 33 const UTF8 *sourceStart = (const UTF8*)Source.data(); 34 // FIXME: Make the type of the result buffer correct instead of 35 // using reinterpret_cast. 36 UTF16 *targetStart = reinterpret_cast<UTF16*>(ResultPtr); 37 ConversionFlags flags = strictConversion; 38 result = ConvertUTF8toUTF16( 39 &sourceStart, sourceStart + Source.size(), 40 &targetStart, targetStart + Source.size(), flags); 41 if (result == conversionOK) 42 ResultPtr = reinterpret_cast<char*>(targetStart); 43 else 44 ErrorPtr = sourceStart; 45 } else if (WideCharWidth == 4) { 46 const UTF8 *sourceStart = (const UTF8*)Source.data(); 47 // FIXME: Make the type of the result buffer correct instead of 48 // using reinterpret_cast. 49 UTF32 *targetStart = reinterpret_cast<UTF32*>(ResultPtr); 50 ConversionFlags flags = strictConversion; 51 result = ConvertUTF8toUTF32( 52 &sourceStart, sourceStart + Source.size(), 53 &targetStart, targetStart + Source.size(), flags); 54 if (result == conversionOK) 55 ResultPtr = reinterpret_cast<char*>(targetStart); 56 else 57 ErrorPtr = sourceStart; 58 } 59 assert((result != targetExhausted) 60 && "ConvertUTF8toUTFXX exhausted target buffer"); 61 return result == conversionOK; 62 } 63 64 bool ConvertCodePointToUTF8(unsigned Source, char *&ResultPtr) { 65 const UTF32 *SourceStart = &Source; 66 const UTF32 *SourceEnd = SourceStart + 1; 67 UTF8 *TargetStart = reinterpret_cast<UTF8 *>(ResultPtr); 68 UTF8 *TargetEnd = TargetStart + 4; 69 ConversionResult CR = ConvertUTF32toUTF8(&SourceStart, SourceEnd, 70 &TargetStart, TargetEnd, 71 strictConversion); 72 if (CR != conversionOK) 73 return false; 74 75 ResultPtr = reinterpret_cast<char*>(TargetStart); 76 return true; 77 } 78 79 bool hasUTF16ByteOrderMark(ArrayRef<char> S) { 80 return (S.size() >= 2 && 81 ((S[0] == '\xff' && S[1] == '\xfe') || 82 (S[0] == '\xfe' && S[1] == '\xff'))); 83 } 84 85 bool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out) { 86 assert(Out.empty()); 87 88 // Error out on an uneven byte count. 89 if (SrcBytes.size() % 2) 90 return false; 91 92 // Avoid OOB by returning early on empty input. 93 if (SrcBytes.empty()) 94 return true; 95 96 const UTF16 *Src = reinterpret_cast<const UTF16 *>(SrcBytes.begin()); 97 const UTF16 *SrcEnd = reinterpret_cast<const UTF16 *>(SrcBytes.end()); 98 99 // Byteswap if necessary. 100 std::vector<UTF16> ByteSwapped; 101 if (Src[0] == UNI_UTF16_BYTE_ORDER_MARK_SWAPPED) { 102 ByteSwapped.insert(ByteSwapped.end(), Src, SrcEnd); 103 for (unsigned I = 0, E = ByteSwapped.size(); I != E; ++I) 104 ByteSwapped[I] = llvm::sys::SwapByteOrder_16(ByteSwapped[I]); 105 Src = &ByteSwapped[0]; 106 SrcEnd = &ByteSwapped[ByteSwapped.size() - 1] + 1; 107 } 108 109 // Skip the BOM for conversion. 110 if (Src[0] == UNI_UTF16_BYTE_ORDER_MARK_NATIVE) 111 Src++; 112 113 // Just allocate enough space up front. We'll shrink it later. Allocate 114 // enough that we can fit a null terminator without reallocating. 115 Out.resize(SrcBytes.size() * UNI_MAX_UTF8_BYTES_PER_CODE_POINT + 1); 116 UTF8 *Dst = reinterpret_cast<UTF8 *>(&Out[0]); 117 UTF8 *DstEnd = Dst + Out.size(); 118 119 ConversionResult CR = 120 ConvertUTF16toUTF8(&Src, SrcEnd, &Dst, DstEnd, strictConversion); 121 assert(CR != targetExhausted); 122 123 if (CR != conversionOK) { 124 Out.clear(); 125 return false; 126 } 127 128 Out.resize(reinterpret_cast<char *>(Dst) - &Out[0]); 129 Out.push_back(0); 130 Out.pop_back(); 131 return true; 132 } 133 134 bool convertUTF16ToUTF8String(ArrayRef<UTF16> Src, std::string &Out) 135 { 136 return convertUTF16ToUTF8String( 137 llvm::ArrayRef<char>(reinterpret_cast<const char *>(Src.data()), 138 Src.size() * sizeof(UTF16)), Out); 139 } 140 141 bool convertUTF8ToUTF16String(StringRef SrcUTF8, 142 SmallVectorImpl<UTF16> &DstUTF16) { 143 assert(DstUTF16.empty()); 144 145 // Avoid OOB by returning early on empty input. 146 if (SrcUTF8.empty()) { 147 DstUTF16.push_back(0); 148 DstUTF16.pop_back(); 149 return true; 150 } 151 152 const UTF8 *Src = reinterpret_cast<const UTF8 *>(SrcUTF8.begin()); 153 const UTF8 *SrcEnd = reinterpret_cast<const UTF8 *>(SrcUTF8.end()); 154 155 // Allocate the same number of UTF-16 code units as UTF-8 code units. Encoding 156 // as UTF-16 should always require the same amount or less code units than the 157 // UTF-8 encoding. Allocate one extra byte for the null terminator though, 158 // so that someone calling DstUTF16.data() gets a null terminated string. 159 // We resize down later so we don't have to worry that this over allocates. 160 DstUTF16.resize(SrcUTF8.size()+1); 161 UTF16 *Dst = &DstUTF16[0]; 162 UTF16 *DstEnd = Dst + DstUTF16.size(); 163 164 ConversionResult CR = 165 ConvertUTF8toUTF16(&Src, SrcEnd, &Dst, DstEnd, strictConversion); 166 assert(CR != targetExhausted); 167 168 if (CR != conversionOK) { 169 DstUTF16.clear(); 170 return false; 171 } 172 173 DstUTF16.resize(Dst - &DstUTF16[0]); 174 DstUTF16.push_back(0); 175 DstUTF16.pop_back(); 176 return true; 177 } 178 179 static_assert(sizeof(wchar_t) == 1 || sizeof(wchar_t) == 2 || 180 sizeof(wchar_t) == 4, 181 "Expected wchar_t to be 1, 2, or 4 bytes"); 182 183 template <typename TResult> 184 static inline bool ConvertUTF8toWideInternal(llvm::StringRef Source, 185 TResult &Result) { 186 // Even in the case of UTF-16, the number of bytes in a UTF-8 string is 187 // at least as large as the number of elements in the resulting wide 188 // string, because surrogate pairs take at least 4 bytes in UTF-8. 189 Result.resize(Source.size() + 1); 190 char *ResultPtr = reinterpret_cast<char *>(&Result[0]); 191 const UTF8 *ErrorPtr; 192 if (!ConvertUTF8toWide(sizeof(wchar_t), Source, ResultPtr, ErrorPtr)) { 193 Result.clear(); 194 return false; 195 } 196 Result.resize(reinterpret_cast<wchar_t *>(ResultPtr) - &Result[0]); 197 return true; 198 } 199 200 bool ConvertUTF8toWide(llvm::StringRef Source, std::wstring &Result) { 201 return ConvertUTF8toWideInternal(Source, Result); 202 } 203 204 bool ConvertUTF8toWide(const char *Source, std::wstring &Result) { 205 if (!Source) { 206 Result.clear(); 207 return true; 208 } 209 return ConvertUTF8toWide(llvm::StringRef(Source), Result); 210 } 211 212 bool convertWideToUTF8(const std::wstring &Source, std::string &Result) { 213 if (sizeof(wchar_t) == 1) { 214 const UTF8 *Start = reinterpret_cast<const UTF8 *>(Source.data()); 215 const UTF8 *End = 216 reinterpret_cast<const UTF8 *>(Source.data() + Source.size()); 217 if (!isLegalUTF8String(&Start, End)) 218 return false; 219 Result.resize(Source.size()); 220 memcpy(&Result[0], Source.data(), Source.size()); 221 return true; 222 } else if (sizeof(wchar_t) == 2) { 223 return convertUTF16ToUTF8String( 224 llvm::ArrayRef<UTF16>(reinterpret_cast<const UTF16 *>(Source.data()), 225 Source.size()), 226 Result); 227 } else if (sizeof(wchar_t) == 4) { 228 const UTF32 *Start = reinterpret_cast<const UTF32 *>(Source.data()); 229 const UTF32 *End = 230 reinterpret_cast<const UTF32 *>(Source.data() + Source.size()); 231 Result.resize(UNI_MAX_UTF8_BYTES_PER_CODE_POINT * Source.size()); 232 UTF8 *ResultPtr = reinterpret_cast<UTF8 *>(&Result[0]); 233 UTF8 *ResultEnd = reinterpret_cast<UTF8 *>(&Result[0] + Result.size()); 234 if (ConvertUTF32toUTF8(&Start, End, &ResultPtr, ResultEnd, 235 strictConversion) == conversionOK) { 236 Result.resize(reinterpret_cast<char *>(ResultPtr) - &Result[0]); 237 return true; 238 } else { 239 Result.clear(); 240 return false; 241 } 242 } else { 243 llvm_unreachable( 244 "Control should never reach this point; see static_assert further up"); 245 } 246 } 247 248 } // end namespace llvm 249 250