1 //===- llvm/unittest/Support/ConvertUTFTest.cpp - ConvertUTF tests --------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 10 #include "llvm/Support/ConvertUTF.h" 11 #include "llvm/ADT/ArrayRef.h" 12 #include "llvm/Support/Format.h" 13 #include "gtest/gtest.h" 14 #include <string> 15 #include <utility> 16 #include <vector> 17 18 using namespace llvm; 19 20 TEST(ConvertUTFTest, ConvertUTF16LittleEndianToUTF8String) { 21 // Src is the look of disapproval. 22 static const char Src[] = "\xff\xfe\xa0\x0c_\x00\xa0\x0c"; 23 ArrayRef<char> Ref(Src, sizeof(Src) - 1); 24 std::string Result; 25 bool Success = convertUTF16ToUTF8String(Ref, Result); 26 EXPECT_TRUE(Success); 27 std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0"); 28 EXPECT_EQ(Expected, Result); 29 } 30 31 TEST(ConvertUTFTest, ConvertUTF16BigEndianToUTF8String) { 32 // Src is the look of disapproval. 33 static const char Src[] = "\xfe\xff\x0c\xa0\x00_\x0c\xa0"; 34 ArrayRef<char> Ref(Src, sizeof(Src) - 1); 35 std::string Result; 36 bool Success = convertUTF16ToUTF8String(Ref, Result); 37 EXPECT_TRUE(Success); 38 std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0"); 39 EXPECT_EQ(Expected, Result); 40 } 41 42 TEST(ConvertUTFTest, ConvertUTF8ToUTF16String) { 43 // Src is the look of disapproval. 
44 static const char Src[] = "\xe0\xb2\xa0_\xe0\xb2\xa0"; 45 StringRef Ref(Src, sizeof(Src) - 1); 46 SmallVector<UTF16, 5> Result; 47 bool Success = convertUTF8ToUTF16String(Ref, Result); 48 EXPECT_TRUE(Success); 49 static const UTF16 Expected[] = {0x0CA0, 0x005f, 0x0CA0, 0}; 50 ASSERT_EQ(3u, Result.size()); 51 for (int I = 0, E = 3; I != E; ++I) 52 EXPECT_EQ(Expected[I], Result[I]); 53 } 54 55 TEST(ConvertUTFTest, OddLengthInput) { 56 std::string Result; 57 bool Success = convertUTF16ToUTF8String(makeArrayRef("xxxxx", 5), Result); 58 EXPECT_FALSE(Success); 59 } 60 61 TEST(ConvertUTFTest, Empty) { 62 std::string Result; 63 bool Success = convertUTF16ToUTF8String(llvm::ArrayRef<char>(None), Result); 64 EXPECT_TRUE(Success); 65 EXPECT_TRUE(Result.empty()); 66 } 67 68 TEST(ConvertUTFTest, HasUTF16BOM) { 69 bool HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xff\xfe", 2)); 70 EXPECT_TRUE(HasBOM); 71 HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff", 2)); 72 EXPECT_TRUE(HasBOM); 73 HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff ", 3)); 74 EXPECT_TRUE(HasBOM); // Don't care about odd lengths. 75 HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff\x00asdf", 6)); 76 EXPECT_TRUE(HasBOM); 77 78 HasBOM = hasUTF16ByteOrderMark(None); 79 EXPECT_FALSE(HasBOM); 80 HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe", 1)); 81 EXPECT_FALSE(HasBOM); 82 } 83 84 TEST(ConvertUTFTest, UTF16WrappersForConvertUTF16ToUTF8String) { 85 // Src is the look of disapproval. 86 static const char Src[] = "\xff\xfe\xa0\x0c_\x00\xa0\x0c"; 87 ArrayRef<UTF16> SrcRef = makeArrayRef((const UTF16 *)Src, 4); 88 std::string Result; 89 bool Success = convertUTF16ToUTF8String(SrcRef, Result); 90 EXPECT_TRUE(Success); 91 std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0"); 92 EXPECT_EQ(Expected, Result); 93 } 94 95 TEST(ConvertUTFTest, ConvertUTF8toWide) { 96 // Src is the look of disapproval. 
97 static const char Src[] = "\xe0\xb2\xa0_\xe0\xb2\xa0"; 98 std::wstring Result; 99 bool Success = ConvertUTF8toWide((const char*)Src, Result); 100 EXPECT_TRUE(Success); 101 std::wstring Expected(L"\x0ca0_\x0ca0"); 102 EXPECT_EQ(Expected, Result); 103 Result.clear(); 104 Success = ConvertUTF8toWide(StringRef(Src, 7), Result); 105 EXPECT_TRUE(Success); 106 EXPECT_EQ(Expected, Result); 107 } 108 109 TEST(ConvertUTFTest, convertWideToUTF8) { 110 // Src is the look of disapproval. 111 static const wchar_t Src[] = L"\x0ca0_\x0ca0"; 112 std::string Result; 113 bool Success = convertWideToUTF8(Src, Result); 114 EXPECT_TRUE(Success); 115 std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0"); 116 EXPECT_EQ(Expected, Result); 117 } 118 119 struct ConvertUTFResultContainer { 120 ConversionResult ErrorCode; 121 std::vector<unsigned> UnicodeScalars; 122 123 ConvertUTFResultContainer(ConversionResult ErrorCode) 124 : ErrorCode(ErrorCode) {} 125 126 ConvertUTFResultContainer 127 withScalars(unsigned US0 = 0x110000, unsigned US1 = 0x110000, 128 unsigned US2 = 0x110000, unsigned US3 = 0x110000, 129 unsigned US4 = 0x110000, unsigned US5 = 0x110000, 130 unsigned US6 = 0x110000, unsigned US7 = 0x110000) { 131 ConvertUTFResultContainer Result(*this); 132 if (US0 != 0x110000) 133 Result.UnicodeScalars.push_back(US0); 134 if (US1 != 0x110000) 135 Result.UnicodeScalars.push_back(US1); 136 if (US2 != 0x110000) 137 Result.UnicodeScalars.push_back(US2); 138 if (US3 != 0x110000) 139 Result.UnicodeScalars.push_back(US3); 140 if (US4 != 0x110000) 141 Result.UnicodeScalars.push_back(US4); 142 if (US5 != 0x110000) 143 Result.UnicodeScalars.push_back(US5); 144 if (US6 != 0x110000) 145 Result.UnicodeScalars.push_back(US6); 146 if (US7 != 0x110000) 147 Result.UnicodeScalars.push_back(US7); 148 return Result; 149 } 150 }; 151 152 std::pair<ConversionResult, std::vector<unsigned>> 153 ConvertUTF8ToUnicodeScalarsLenient(StringRef S) { 154 const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data()); 
155 156 const UTF8 *SourceNext = SourceStart; 157 std::vector<UTF32> Decoded(S.size(), 0); 158 UTF32 *TargetStart = Decoded.data(); 159 160 auto ErrorCode = 161 ConvertUTF8toUTF32(&SourceNext, SourceStart + S.size(), &TargetStart, 162 Decoded.data() + Decoded.size(), lenientConversion); 163 164 Decoded.resize(TargetStart - Decoded.data()); 165 166 return std::make_pair(ErrorCode, Decoded); 167 } 168 169 std::pair<ConversionResult, std::vector<unsigned>> 170 ConvertUTF8ToUnicodeScalarsPartialLenient(StringRef S) { 171 const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data()); 172 173 const UTF8 *SourceNext = SourceStart; 174 std::vector<UTF32> Decoded(S.size(), 0); 175 UTF32 *TargetStart = Decoded.data(); 176 177 auto ErrorCode = ConvertUTF8toUTF32Partial( 178 &SourceNext, SourceStart + S.size(), &TargetStart, 179 Decoded.data() + Decoded.size(), lenientConversion); 180 181 Decoded.resize(TargetStart - Decoded.data()); 182 183 return std::make_pair(ErrorCode, Decoded); 184 } 185 186 ::testing::AssertionResult 187 CheckConvertUTF8ToUnicodeScalars(ConvertUTFResultContainer Expected, 188 StringRef S, bool Partial = false) { 189 ConversionResult ErrorCode; 190 std::vector<unsigned> Decoded; 191 if (!Partial) 192 std::tie(ErrorCode, Decoded) = ConvertUTF8ToUnicodeScalarsLenient(S); 193 else 194 std::tie(ErrorCode, Decoded) = ConvertUTF8ToUnicodeScalarsPartialLenient(S); 195 196 if (Expected.ErrorCode != ErrorCode) 197 return ::testing::AssertionFailure() << "Expected error code " 198 << Expected.ErrorCode << ", actual " 199 << ErrorCode; 200 201 if (Expected.UnicodeScalars != Decoded) 202 return ::testing::AssertionFailure() 203 << "Expected lenient decoded result:\n" 204 << ::testing::PrintToString(Expected.UnicodeScalars) << "\n" 205 << "Actual result:\n" << ::testing::PrintToString(Decoded); 206 207 return ::testing::AssertionSuccess(); 208 } 209 210 TEST(ConvertUTFTest, UTF8ToUTF32Lenient) { 211 212 // 213 // 1-byte sequences 214 // 215 216 // U+0041 LATIN 
CAPITAL LETTER A 217 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 218 ConvertUTFResultContainer(conversionOK).withScalars(0x0041), "\x41")); 219 220 // 221 // 2-byte sequences 222 // 223 224 // U+0283 LATIN SMALL LETTER ESH 225 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 226 ConvertUTFResultContainer(conversionOK).withScalars(0x0283), 227 "\xca\x83")); 228 229 // U+03BA GREEK SMALL LETTER KAPPA 230 // U+1F79 GREEK SMALL LETTER OMICRON WITH OXIA 231 // U+03C3 GREEK SMALL LETTER SIGMA 232 // U+03BC GREEK SMALL LETTER MU 233 // U+03B5 GREEK SMALL LETTER EPSILON 234 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 235 ConvertUTFResultContainer(conversionOK) 236 .withScalars(0x03ba, 0x1f79, 0x03c3, 0x03bc, 0x03b5), 237 "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5")); 238 239 // 240 // 3-byte sequences 241 // 242 243 // U+4F8B CJK UNIFIED IDEOGRAPH-4F8B 244 // U+6587 CJK UNIFIED IDEOGRAPH-6587 245 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 246 ConvertUTFResultContainer(conversionOK).withScalars(0x4f8b, 0x6587), 247 "\xe4\xbe\x8b\xe6\x96\x87")); 248 249 // U+D55C HANGUL SYLLABLE HAN 250 // U+AE00 HANGUL SYLLABLE GEUL 251 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 252 ConvertUTFResultContainer(conversionOK).withScalars(0xd55c, 0xae00), 253 "\xed\x95\x9c\xea\xb8\x80")); 254 255 // U+1112 HANGUL CHOSEONG HIEUH 256 // U+1161 HANGUL JUNGSEONG A 257 // U+11AB HANGUL JONGSEONG NIEUN 258 // U+1100 HANGUL CHOSEONG KIYEOK 259 // U+1173 HANGUL JUNGSEONG EU 260 // U+11AF HANGUL JONGSEONG RIEUL 261 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 262 ConvertUTFResultContainer(conversionOK) 263 .withScalars(0x1112, 0x1161, 0x11ab, 0x1100, 0x1173, 0x11af), 264 "\xe1\x84\x92\xe1\x85\xa1\xe1\x86\xab\xe1\x84\x80\xe1\x85\xb3" 265 "\xe1\x86\xaf")); 266 267 // 268 // 4-byte sequences 269 // 270 271 // U+E0100 VARIATION SELECTOR-17 272 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 273 ConvertUTFResultContainer(conversionOK).withScalars(0x000E0100), 274 "\xf3\xa0\x84\x80")); 275 276 // 277 
// First possible sequence of a certain length 278 // 279 280 // U+0000 NULL 281 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 282 ConvertUTFResultContainer(conversionOK).withScalars(0x0000), 283 StringRef("\x00", 1))); 284 285 // U+0080 PADDING CHARACTER 286 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 287 ConvertUTFResultContainer(conversionOK).withScalars(0x0080), 288 "\xc2\x80")); 289 290 // U+0800 SAMARITAN LETTER ALAF 291 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 292 ConvertUTFResultContainer(conversionOK).withScalars(0x0800), 293 "\xe0\xa0\x80")); 294 295 // U+10000 LINEAR B SYLLABLE B008 A 296 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 297 ConvertUTFResultContainer(conversionOK).withScalars(0x10000), 298 "\xf0\x90\x80\x80")); 299 300 // U+200000 (invalid) 301 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 302 ConvertUTFResultContainer(sourceIllegal) 303 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 304 "\xf8\x88\x80\x80\x80")); 305 306 // U+4000000 (invalid) 307 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 308 ConvertUTFResultContainer(sourceIllegal) 309 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 310 "\xfc\x84\x80\x80\x80\x80")); 311 312 // 313 // Last possible sequence of a certain length 314 // 315 316 // U+007F DELETE 317 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 318 ConvertUTFResultContainer(conversionOK).withScalars(0x007f), "\x7f")); 319 320 // U+07FF (unassigned) 321 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 322 ConvertUTFResultContainer(conversionOK).withScalars(0x07ff), 323 "\xdf\xbf")); 324 325 // U+FFFF (noncharacter) 326 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 327 ConvertUTFResultContainer(conversionOK).withScalars(0xffff), 328 "\xef\xbf\xbf")); 329 330 // U+1FFFFF (invalid) 331 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 332 ConvertUTFResultContainer(sourceIllegal) 333 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 334 "\xf7\xbf\xbf\xbf")); 335 336 // U+3FFFFFF (invalid) 337 
EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 338 ConvertUTFResultContainer(sourceIllegal) 339 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 340 "\xfb\xbf\xbf\xbf\xbf")); 341 342 // U+7FFFFFFF (invalid) 343 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 344 ConvertUTFResultContainer(sourceIllegal) 345 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 346 "\xfd\xbf\xbf\xbf\xbf\xbf")); 347 348 // 349 // Other boundary conditions 350 // 351 352 // U+D7FF (unassigned) 353 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 354 ConvertUTFResultContainer(conversionOK).withScalars(0xd7ff), 355 "\xed\x9f\xbf")); 356 357 // U+E000 (private use) 358 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 359 ConvertUTFResultContainer(conversionOK).withScalars(0xe000), 360 "\xee\x80\x80")); 361 362 // U+FFFD REPLACEMENT CHARACTER 363 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 364 ConvertUTFResultContainer(conversionOK).withScalars(0xfffd), 365 "\xef\xbf\xbd")); 366 367 // U+10FFFF (noncharacter) 368 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 369 ConvertUTFResultContainer(conversionOK).withScalars(0x10ffff), 370 "\xf4\x8f\xbf\xbf")); 371 372 // U+110000 (invalid) 373 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 374 ConvertUTFResultContainer(sourceIllegal) 375 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 376 "\xf4\x90\x80\x80")); 377 378 // 379 // Unexpected continuation bytes 380 // 381 382 // A sequence of unexpected continuation bytes that don't follow a first 383 // byte, every byte is a maximal subpart. 
384 385 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 386 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\x80")); 387 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 388 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xbf")); 389 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 390 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 391 "\x80\x80")); 392 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 393 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 394 "\x80\xbf")); 395 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 396 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 397 "\xbf\x80")); 398 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 399 ConvertUTFResultContainer(sourceIllegal) 400 .withScalars(0xfffd, 0xfffd, 0xfffd), 401 "\x80\xbf\x80")); 402 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 403 ConvertUTFResultContainer(sourceIllegal) 404 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 405 "\x80\xbf\x80\xbf")); 406 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 407 ConvertUTFResultContainer(sourceIllegal) 408 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 409 "\x80\xbf\x82\xbf\xaa")); 410 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 411 ConvertUTFResultContainer(sourceIllegal) 412 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 413 "\xaa\xb0\xbb\xbf\xaa\xa0")); 414 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 415 ConvertUTFResultContainer(sourceIllegal) 416 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 417 "\xaa\xb0\xbb\xbf\xaa\xa0\x8f")); 418 419 // All continuation bytes (0x80--0xbf). 
420 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 421 ConvertUTFResultContainer(sourceIllegal) 422 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 423 0xfffd, 0xfffd, 0xfffd, 0xfffd) 424 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 425 0xfffd, 0xfffd, 0xfffd, 0xfffd) 426 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 427 0xfffd, 0xfffd, 0xfffd, 0xfffd) 428 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 429 0xfffd, 0xfffd, 0xfffd, 0xfffd) 430 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 431 0xfffd, 0xfffd, 0xfffd, 0xfffd) 432 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 433 0xfffd, 0xfffd, 0xfffd, 0xfffd) 434 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 435 0xfffd, 0xfffd, 0xfffd, 0xfffd) 436 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 437 0xfffd, 0xfffd, 0xfffd, 0xfffd), 438 "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f" 439 "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f" 440 "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf" 441 "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf")); 442 443 // 444 // Lonely start bytes 445 // 446 447 // Start bytes of 2-byte sequences (0xc0--0xdf). 
448 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 449 ConvertUTFResultContainer(sourceIllegal) 450 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 451 0xfffd, 0xfffd, 0xfffd, 0xfffd) 452 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 453 0xfffd, 0xfffd, 0xfffd, 0xfffd) 454 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 455 0xfffd, 0xfffd, 0xfffd, 0xfffd) 456 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 457 0xfffd, 0xfffd, 0xfffd, 0xfffd), 458 "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf" 459 "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf")); 460 461 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 462 ConvertUTFResultContainer(sourceIllegal) 463 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 464 0xfffd, 0x0020, 0xfffd, 0x0020) 465 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 466 0xfffd, 0x0020, 0xfffd, 0x0020) 467 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 468 0xfffd, 0x0020, 0xfffd, 0x0020) 469 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 470 0xfffd, 0x0020, 0xfffd, 0x0020) 471 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 472 0xfffd, 0x0020, 0xfffd, 0x0020) 473 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 474 0xfffd, 0x0020, 0xfffd, 0x0020) 475 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 476 0xfffd, 0x0020, 0xfffd, 0x0020) 477 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 478 0xfffd, 0x0020, 0xfffd, 0x0020), 479 "\xc0\x20\xc1\x20\xc2\x20\xc3\x20\xc4\x20\xc5\x20\xc6\x20\xc7\x20" 480 "\xc8\x20\xc9\x20\xca\x20\xcb\x20\xcc\x20\xcd\x20\xce\x20\xcf\x20" 481 "\xd0\x20\xd1\x20\xd2\x20\xd3\x20\xd4\x20\xd5\x20\xd6\x20\xd7\x20" 482 "\xd8\x20\xd9\x20\xda\x20\xdb\x20\xdc\x20\xdd\x20\xde\x20\xdf\x20")); 483 484 // Start bytes of 3-byte sequences (0xe0--0xef). 
485 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 486 ConvertUTFResultContainer(sourceIllegal) 487 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 488 0xfffd, 0xfffd, 0xfffd, 0xfffd) 489 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 490 0xfffd, 0xfffd, 0xfffd, 0xfffd), 491 "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef")); 492 493 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 494 ConvertUTFResultContainer(sourceIllegal) 495 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 496 0xfffd, 0x0020, 0xfffd, 0x0020) 497 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 498 0xfffd, 0x0020, 0xfffd, 0x0020) 499 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 500 0xfffd, 0x0020, 0xfffd, 0x0020) 501 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 502 0xfffd, 0x0020, 0xfffd, 0x0020), 503 "\xe0\x20\xe1\x20\xe2\x20\xe3\x20\xe4\x20\xe5\x20\xe6\x20\xe7\x20" 504 "\xe8\x20\xe9\x20\xea\x20\xeb\x20\xec\x20\xed\x20\xee\x20\xef\x20")); 505 506 // Start bytes of 4-byte sequences (0xf0--0xf7). 507 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 508 ConvertUTFResultContainer(sourceIllegal) 509 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 510 0xfffd, 0xfffd, 0xfffd, 0xfffd), 511 "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7")); 512 513 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 514 ConvertUTFResultContainer(sourceIllegal) 515 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 516 0xfffd, 0x0020, 0xfffd, 0x0020) 517 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 518 0xfffd, 0x0020, 0xfffd, 0x0020), 519 "\xf0\x20\xf1\x20\xf2\x20\xf3\x20\xf4\x20\xf5\x20\xf6\x20\xf7\x20")); 520 521 // Start bytes of 5-byte sequences (0xf8--0xfb). 
522 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 523 ConvertUTFResultContainer(sourceIllegal) 524 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 525 "\xf8\xf9\xfa\xfb")); 526 527 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 528 ConvertUTFResultContainer(sourceIllegal) 529 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 530 0xfffd, 0x0020, 0xfffd, 0x0020), 531 "\xf8\x20\xf9\x20\xfa\x20\xfb\x20")); 532 533 // Start bytes of 6-byte sequences (0xfc--0xfd). 534 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 535 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 536 "\xfc\xfd")); 537 538 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 539 ConvertUTFResultContainer(sourceIllegal) 540 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020), 541 "\xfc\x20\xfd\x20")); 542 543 // 544 // Other bytes (0xc0--0xc1, 0xfe--0xff). 545 // 546 547 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 548 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc0")); 549 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 550 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc1")); 551 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 552 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfe")); 553 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 554 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xff")); 555 556 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 557 ConvertUTFResultContainer(sourceIllegal) 558 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 559 "\xc0\xc1\xfe\xff")); 560 561 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 562 ConvertUTFResultContainer(sourceIllegal) 563 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 564 "\xfe\xfe\xff\xff")); 565 566 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 567 ConvertUTFResultContainer(sourceIllegal) 568 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 569 "\xfe\x80\x80\x80\x80\x80")); 570 571 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 572 ConvertUTFResultContainer(sourceIllegal) 573 
.withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 574 "\xff\x80\x80\x80\x80\x80")); 575 576 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 577 ConvertUTFResultContainer(sourceIllegal) 578 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 579 0xfffd, 0x0020, 0xfffd, 0x0020), 580 "\xc0\x20\xc1\x20\xfe\x20\xff\x20")); 581 582 // 583 // Sequences with one continuation byte missing 584 // 585 586 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 587 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc2")); 588 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 589 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xdf")); 590 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 591 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 592 "\xe0\xa0")); 593 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 594 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 595 "\xe0\xbf")); 596 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 597 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 598 "\xe1\x80")); 599 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 600 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 601 "\xec\xbf")); 602 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 603 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 604 "\xed\x80")); 605 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 606 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 607 "\xed\x9f")); 608 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 609 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 610 "\xee\x80")); 611 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 612 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 613 "\xef\xbf")); 614 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 615 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 616 "\xf0\x90\x80")); 617 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 618 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 619 "\xf0\xbf\xbf")); 
620 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 621 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 622 "\xf1\x80\x80")); 623 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 624 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 625 "\xf3\xbf\xbf")); 626 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 627 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 628 "\xf4\x80\x80")); 629 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 630 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 631 "\xf4\x8f\xbf")); 632 633 // Overlong sequences with one trailing byte missing. 634 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 635 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 636 "\xc0")); 637 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 638 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 639 "\xc1")); 640 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 641 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 642 "\xe0\x80")); 643 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 644 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 645 "\xe0\x9f")); 646 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 647 ConvertUTFResultContainer(sourceIllegal) 648 .withScalars(0xfffd, 0xfffd, 0xfffd), 649 "\xf0\x80\x80")); 650 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 651 ConvertUTFResultContainer(sourceIllegal) 652 .withScalars(0xfffd, 0xfffd, 0xfffd), 653 "\xf0\x8f\x80")); 654 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 655 ConvertUTFResultContainer(sourceIllegal) 656 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 657 "\xf8\x80\x80\x80")); 658 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 659 ConvertUTFResultContainer(sourceIllegal) 660 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 661 "\xfc\x80\x80\x80\x80")); 662 663 // Sequences that represent surrogates with one trailing byte missing. 
664 // High surrogates 665 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 666 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 667 "\xed\xa0")); 668 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 669 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 670 "\xed\xac")); 671 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 672 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 673 "\xed\xaf")); 674 // Low surrogates 675 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 676 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 677 "\xed\xb0")); 678 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 679 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 680 "\xed\xb4")); 681 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 682 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 683 "\xed\xbf")); 684 685 // Ill-formed 4-byte sequences. 686 // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx 687 // U+1100xx (invalid) 688 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 689 ConvertUTFResultContainer(sourceIllegal) 690 .withScalars(0xfffd, 0xfffd, 0xfffd), 691 "\xf4\x90\x80")); 692 // U+13FBxx (invalid) 693 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 694 ConvertUTFResultContainer(sourceIllegal) 695 .withScalars(0xfffd, 0xfffd, 0xfffd), 696 "\xf4\xbf\xbf")); 697 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 698 ConvertUTFResultContainer(sourceIllegal) 699 .withScalars(0xfffd, 0xfffd, 0xfffd), 700 "\xf5\x80\x80")); 701 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 702 ConvertUTFResultContainer(sourceIllegal) 703 .withScalars(0xfffd, 0xfffd, 0xfffd), 704 "\xf6\x80\x80")); 705 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 706 ConvertUTFResultContainer(sourceIllegal) 707 .withScalars(0xfffd, 0xfffd, 0xfffd), 708 "\xf7\x80\x80")); 709 // U+1FFBxx (invalid) 710 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 711 ConvertUTFResultContainer(sourceIllegal) 712 .withScalars(0xfffd, 0xfffd, 0xfffd), 713 
"\xf7\xbf\xbf")); 714 715 // Ill-formed 5-byte sequences. 716 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx 717 // U+2000xx (invalid) 718 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 719 ConvertUTFResultContainer(sourceIllegal) 720 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 721 "\xf8\x88\x80\x80")); 722 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 723 ConvertUTFResultContainer(sourceIllegal) 724 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 725 "\xf8\xbf\xbf\xbf")); 726 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 727 ConvertUTFResultContainer(sourceIllegal) 728 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 729 "\xf9\x80\x80\x80")); 730 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 731 ConvertUTFResultContainer(sourceIllegal) 732 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 733 "\xfa\x80\x80\x80")); 734 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 735 ConvertUTFResultContainer(sourceIllegal) 736 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 737 "\xfb\x80\x80\x80")); 738 // U+3FFFFxx (invalid) 739 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 740 ConvertUTFResultContainer(sourceIllegal) 741 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 742 "\xfb\xbf\xbf\xbf")); 743 744 // Ill-formed 6-byte sequences. 
745 // 1111110u 10uuuuuu 10uzzzzz 10zzzyyyy 10yyyyxx 10xxxxxx 746 // U+40000xx (invalid) 747 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 748 ConvertUTFResultContainer(sourceIllegal) 749 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 750 "\xfc\x84\x80\x80\x80")); 751 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 752 ConvertUTFResultContainer(sourceIllegal) 753 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 754 "\xfc\xbf\xbf\xbf\xbf")); 755 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 756 ConvertUTFResultContainer(sourceIllegal) 757 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 758 "\xfd\x80\x80\x80\x80")); 759 // U+7FFFFFxx (invalid) 760 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 761 ConvertUTFResultContainer(sourceIllegal) 762 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 763 "\xfd\xbf\xbf\xbf\xbf")); 764 765 // 766 // Sequences with two continuation bytes missing 767 // 768 769 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 770 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 771 "\xf0\x90")); 772 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 773 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 774 "\xf0\xbf")); 775 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 776 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 777 "\xf1\x80")); 778 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 779 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 780 "\xf3\xbf")); 781 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 782 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 783 "\xf4\x80")); 784 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 785 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 786 "\xf4\x8f")); 787 788 // Overlong sequences with two trailing byte missing. 
789 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 790 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xe0")); 791 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 792 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 793 "\xf0\x80")); 794 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 795 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 796 "\xf0\x8f")); 797 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 798 ConvertUTFResultContainer(sourceIllegal) 799 .withScalars(0xfffd, 0xfffd, 0xfffd), 800 "\xf8\x80\x80")); 801 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 802 ConvertUTFResultContainer(sourceIllegal) 803 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 804 "\xfc\x80\x80\x80")); 805 806 // Sequences that represent surrogates with two trailing bytes missing. 807 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 808 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xed")); 809 810 // Ill-formed 4-byte sequences. 811 // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx 812 // U+110yxx (invalid) 813 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 814 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 815 "\xf4\x90")); 816 // U+13Fyxx (invalid) 817 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 818 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 819 "\xf4\xbf")); 820 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 821 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 822 "\xf5\x80")); 823 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 824 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 825 "\xf6\x80")); 826 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 827 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 828 "\xf7\x80")); 829 // U+1FFyxx (invalid) 830 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 831 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 832 "\xf7\xbf")); 833 834 // Ill-formed 5-byte sequences. 
835 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx 836 // U+200yxx (invalid) 837 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 838 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 839 "\xf8\x88\x80")); 840 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 841 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 842 "\xf8\xbf\xbf")); 843 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 844 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 845 "\xf9\x80\x80")); 846 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 847 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 848 "\xfa\x80\x80")); 849 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 850 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 851 "\xfb\x80\x80")); 852 // U+3FFFyxx (invalid) 853 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 854 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 855 "\xfb\xbf\xbf")); 856 857 // Ill-formed 6-byte sequences. 
858 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx 859 // U+4000yxx (invalid) 860 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 861 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 862 "\xfc\x84\x80\x80")); 863 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 864 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 865 "\xfc\xbf\xbf\xbf")); 866 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 867 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 868 "\xfd\x80\x80\x80")); 869 // U+7FFFFyxx (invalid) 870 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 871 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 872 "\xfd\xbf\xbf\xbf")); 873 874 // 875 // Sequences with three continuation bytes missing 876 // 877 878 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 879 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf0")); 880 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 881 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf1")); 882 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 883 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf2")); 884 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 885 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf3")); 886 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 887 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf4")); 888 889 // Broken overlong sequences. 890 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 891 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf0")); 892 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 893 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 894 "\xf8\x80")); 895 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 896 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 897 "\xfc\x80\x80")); 898 899 // Ill-formed 4-byte sequences. 
900 // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx 901 // U+14yyxx (invalid) 902 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 903 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf5")); 904 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 905 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf6")); 906 // U+1Cyyxx (invalid) 907 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 908 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf7")); 909 910 // Ill-formed 5-byte sequences. 911 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx 912 // U+20yyxx (invalid) 913 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 914 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 915 "\xf8\x88")); 916 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 917 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 918 "\xf8\xbf")); 919 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 920 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 921 "\xf9\x80")); 922 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 923 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 924 "\xfa\x80")); 925 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 926 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 927 "\xfb\x80")); 928 // U+3FCyyxx (invalid) 929 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 930 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 931 "\xfb\xbf")); 932 933 // Ill-formed 6-byte sequences. 
934 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx 935 // U+400yyxx (invalid) 936 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 937 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 938 "\xfc\x84\x80")); 939 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 940 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 941 "\xfc\xbf\xbf")); 942 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 943 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 944 "\xfd\x80\x80")); 945 // U+7FFCyyxx (invalid) 946 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 947 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 948 "\xfd\xbf\xbf")); 949 950 // 951 // Sequences with four continuation bytes missing 952 // 953 954 // Ill-formed 5-byte sequences. 955 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx 956 // U+uzyyxx (invalid) 957 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 958 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf8")); 959 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 960 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf9")); 961 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 962 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfa")); 963 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 964 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfb")); 965 // U+3zyyxx (invalid) 966 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 967 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfb")); 968 969 // Broken overlong sequences. 970 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 971 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf8")); 972 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 973 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 974 "\xfc\x80")); 975 976 // Ill-formed 6-byte sequences. 
977 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx 978 // U+uzzyyxx (invalid) 979 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 980 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 981 "\xfc\x84")); 982 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 983 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 984 "\xfc\xbf")); 985 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 986 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 987 "\xfd\x80")); 988 // U+7Fzzyyxx (invalid) 989 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 990 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 991 "\xfd\xbf")); 992 993 // 994 // Sequences with five continuation bytes missing 995 // 996 997 // Ill-formed 6-byte sequences. 998 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx 999 // U+uzzyyxx (invalid) 1000 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1001 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfc")); 1002 // U+uuzzyyxx (invalid) 1003 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1004 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfd")); 1005 1006 // 1007 // Consecutive sequences with trailing bytes missing 1008 // 1009 1010 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1011 ConvertUTFResultContainer(sourceIllegal) 1012 .withScalars(0xfffd, /**/ 0xfffd, 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd) 1013 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd) 1014 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd) 1015 .withScalars(0xfffd, /**/ 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd) 1016 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd) 1017 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1018 "\xc0" "\xe0\x80" "\xf0\x80\x80" 1019 "\xf8\x80\x80\x80" 1020 "\xfc\x80\x80\x80\x80" 1021 "\xdf" "\xef\xbf" "\xf7\xbf\xbf" 1022 "\xfb\xbf\xbf\xbf" 1023 "\xfd\xbf\xbf\xbf\xbf")); 1024 1025 // 1026 // Overlong UTF-8 sequences 1027 // 1028 1029 // U+002F SOLIDUS 1030 
EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1031 ConvertUTFResultContainer(conversionOK).withScalars(0x002f), "\x2f")); 1032 1033 // Overlong sequences of the above. 1034 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1035 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 1036 "\xc0\xaf")); 1037 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1038 ConvertUTFResultContainer(sourceIllegal) 1039 .withScalars(0xfffd, 0xfffd, 0xfffd), 1040 "\xe0\x80\xaf")); 1041 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1042 ConvertUTFResultContainer(sourceIllegal) 1043 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 1044 "\xf0\x80\x80\xaf")); 1045 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1046 ConvertUTFResultContainer(sourceIllegal) 1047 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1048 "\xf8\x80\x80\x80\xaf")); 1049 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1050 ConvertUTFResultContainer(sourceIllegal) 1051 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1052 "\xfc\x80\x80\x80\x80\xaf")); 1053 1054 // U+0000 NULL 1055 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1056 ConvertUTFResultContainer(conversionOK).withScalars(0x0000), 1057 StringRef("\x00", 1))); 1058 1059 // Overlong sequences of the above. 
1060 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1061 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 1062 "\xc0\x80")); 1063 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1064 ConvertUTFResultContainer(sourceIllegal) 1065 .withScalars(0xfffd, 0xfffd, 0xfffd), 1066 "\xe0\x80\x80")); 1067 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1068 ConvertUTFResultContainer(sourceIllegal) 1069 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 1070 "\xf0\x80\x80\x80")); 1071 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1072 ConvertUTFResultContainer(sourceIllegal) 1073 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1074 "\xf8\x80\x80\x80\x80")); 1075 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1076 ConvertUTFResultContainer(sourceIllegal) 1077 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1078 "\xfc\x80\x80\x80\x80\x80")); 1079 1080 // Other overlong sequences. 1081 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1082 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 1083 "\xc0\xbf")); 1084 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1085 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 1086 "\xc1\x80")); 1087 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1088 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 1089 "\xc1\xbf")); 1090 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1091 ConvertUTFResultContainer(sourceIllegal) 1092 .withScalars(0xfffd, 0xfffd, 0xfffd), 1093 "\xe0\x9f\xbf")); 1094 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1095 ConvertUTFResultContainer(sourceIllegal) 1096 .withScalars(0xfffd, 0xfffd, 0xfffd), 1097 "\xed\xa0\x80")); 1098 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1099 ConvertUTFResultContainer(sourceIllegal) 1100 .withScalars(0xfffd, 0xfffd, 0xfffd), 1101 "\xed\xbf\xbf")); 1102 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1103 ConvertUTFResultContainer(sourceIllegal) 1104 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 1105 
"\xf0\x8f\x80\x80")); 1106 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1107 ConvertUTFResultContainer(sourceIllegal) 1108 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 1109 "\xf0\x8f\xbf\xbf")); 1110 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1111 ConvertUTFResultContainer(sourceIllegal) 1112 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1113 "\xf8\x87\xbf\xbf\xbf")); 1114 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1115 ConvertUTFResultContainer(sourceIllegal) 1116 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1117 "\xfc\x83\xbf\xbf\xbf\xbf")); 1118 1119 // 1120 // Isolated surrogates 1121 // 1122 1123 // Unicode 6.3.0: 1124 // 1125 // D71. High-surrogate code point: A Unicode code point in the range 1126 // U+D800 to U+DBFF. 1127 // 1128 // D73. Low-surrogate code point: A Unicode code point in the range 1129 // U+DC00 to U+DFFF. 1130 1131 // Note: U+E0100 is <DB40 DD00> in UTF16. 1132 1133 // High surrogates 1134 1135 // U+D800 1136 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1137 ConvertUTFResultContainer(sourceIllegal) 1138 .withScalars(0xfffd, 0xfffd, 0xfffd), 1139 "\xed\xa0\x80")); 1140 1141 // U+DB40 1142 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1143 ConvertUTFResultContainer(sourceIllegal) 1144 .withScalars(0xfffd, 0xfffd, 0xfffd), 1145 "\xed\xac\xa0")); 1146 1147 // U+DBFF 1148 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1149 ConvertUTFResultContainer(sourceIllegal) 1150 .withScalars(0xfffd, 0xfffd, 0xfffd), 1151 "\xed\xaf\xbf")); 1152 1153 // Low surrogates 1154 1155 // U+DC00 1156 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1157 ConvertUTFResultContainer(sourceIllegal) 1158 .withScalars(0xfffd, 0xfffd, 0xfffd), 1159 "\xed\xb0\x80")); 1160 1161 // U+DD00 1162 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1163 ConvertUTFResultContainer(sourceIllegal) 1164 .withScalars(0xfffd, 0xfffd, 0xfffd), 1165 "\xed\xb4\x80")); 1166 1167 // U+DFFF 1168 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1169 
ConvertUTFResultContainer(sourceIllegal) 1170 .withScalars(0xfffd, 0xfffd, 0xfffd), 1171 "\xed\xbf\xbf")); 1172 1173 // Surrogate pairs 1174 1175 // U+D800 U+DC00 1176 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1177 ConvertUTFResultContainer(sourceIllegal) 1178 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1179 "\xed\xa0\x80\xed\xb0\x80")); 1180 1181 // U+D800 U+DD00 1182 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1183 ConvertUTFResultContainer(sourceIllegal) 1184 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1185 "\xed\xa0\x80\xed\xb4\x80")); 1186 1187 // U+D800 U+DFFF 1188 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1189 ConvertUTFResultContainer(sourceIllegal) 1190 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1191 "\xed\xa0\x80\xed\xbf\xbf")); 1192 1193 // U+DB40 U+DC00 1194 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1195 ConvertUTFResultContainer(sourceIllegal) 1196 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1197 "\xed\xac\xa0\xed\xb0\x80")); 1198 1199 // U+DB40 U+DD00 1200 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1201 ConvertUTFResultContainer(sourceIllegal) 1202 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1203 "\xed\xac\xa0\xed\xb4\x80")); 1204 1205 // U+DB40 U+DFFF 1206 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1207 ConvertUTFResultContainer(sourceIllegal) 1208 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1209 "\xed\xac\xa0\xed\xbf\xbf")); 1210 1211 // U+DBFF U+DC00 1212 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1213 ConvertUTFResultContainer(sourceIllegal) 1214 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1215 "\xed\xaf\xbf\xed\xb0\x80")); 1216 1217 // U+DBFF U+DD00 1218 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1219 ConvertUTFResultContainer(sourceIllegal) 1220 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1221 "\xed\xaf\xbf\xed\xb4\x80")); 1222 1223 // U+DBFF U+DFFF 1224 
EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1225 ConvertUTFResultContainer(sourceIllegal) 1226 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1227 "\xed\xaf\xbf\xed\xbf\xbf")); 1228 1229 // 1230 // Noncharacters 1231 // 1232 1233 // Unicode 6.3.0: 1234 // 1235 // D14. Noncharacter: A code point that is permanently reserved for 1236 // internal use and that should never be interchanged. Noncharacters 1237 // consist of the values U+nFFFE and U+nFFFF (where n is from 0 to 1016) 1238 // and the values U+FDD0..U+FDEF. 1239 1240 // U+FFFE 1241 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1242 ConvertUTFResultContainer(conversionOK).withScalars(0xfffe), 1243 "\xef\xbf\xbe")); 1244 1245 // U+FFFF 1246 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1247 ConvertUTFResultContainer(conversionOK).withScalars(0xffff), 1248 "\xef\xbf\xbf")); 1249 1250 // U+1FFFE 1251 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1252 ConvertUTFResultContainer(conversionOK).withScalars(0x1fffe), 1253 "\xf0\x9f\xbf\xbe")); 1254 1255 // U+1FFFF 1256 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1257 ConvertUTFResultContainer(conversionOK).withScalars(0x1ffff), 1258 "\xf0\x9f\xbf\xbf")); 1259 1260 // U+2FFFE 1261 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1262 ConvertUTFResultContainer(conversionOK).withScalars(0x2fffe), 1263 "\xf0\xaf\xbf\xbe")); 1264 1265 // U+2FFFF 1266 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1267 ConvertUTFResultContainer(conversionOK).withScalars(0x2ffff), 1268 "\xf0\xaf\xbf\xbf")); 1269 1270 // U+3FFFE 1271 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1272 ConvertUTFResultContainer(conversionOK).withScalars(0x3fffe), 1273 "\xf0\xbf\xbf\xbe")); 1274 1275 // U+3FFFF 1276 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1277 ConvertUTFResultContainer(conversionOK).withScalars(0x3ffff), 1278 "\xf0\xbf\xbf\xbf")); 1279 1280 // U+4FFFE 1281 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1282 ConvertUTFResultContainer(conversionOK).withScalars(0x4fffe), 1283 
"\xf1\x8f\xbf\xbe")); 1284 1285 // U+4FFFF 1286 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1287 ConvertUTFResultContainer(conversionOK).withScalars(0x4ffff), 1288 "\xf1\x8f\xbf\xbf")); 1289 1290 // U+5FFFE 1291 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1292 ConvertUTFResultContainer(conversionOK).withScalars(0x5fffe), 1293 "\xf1\x9f\xbf\xbe")); 1294 1295 // U+5FFFF 1296 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1297 ConvertUTFResultContainer(conversionOK).withScalars(0x5ffff), 1298 "\xf1\x9f\xbf\xbf")); 1299 1300 // U+6FFFE 1301 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1302 ConvertUTFResultContainer(conversionOK).withScalars(0x6fffe), 1303 "\xf1\xaf\xbf\xbe")); 1304 1305 // U+6FFFF 1306 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1307 ConvertUTFResultContainer(conversionOK).withScalars(0x6ffff), 1308 "\xf1\xaf\xbf\xbf")); 1309 1310 // U+7FFFE 1311 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1312 ConvertUTFResultContainer(conversionOK).withScalars(0x7fffe), 1313 "\xf1\xbf\xbf\xbe")); 1314 1315 // U+7FFFF 1316 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1317 ConvertUTFResultContainer(conversionOK).withScalars(0x7ffff), 1318 "\xf1\xbf\xbf\xbf")); 1319 1320 // U+8FFFE 1321 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1322 ConvertUTFResultContainer(conversionOK).withScalars(0x8fffe), 1323 "\xf2\x8f\xbf\xbe")); 1324 1325 // U+8FFFF 1326 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1327 ConvertUTFResultContainer(conversionOK).withScalars(0x8ffff), 1328 "\xf2\x8f\xbf\xbf")); 1329 1330 // U+9FFFE 1331 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1332 ConvertUTFResultContainer(conversionOK).withScalars(0x9fffe), 1333 "\xf2\x9f\xbf\xbe")); 1334 1335 // U+9FFFF 1336 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1337 ConvertUTFResultContainer(conversionOK).withScalars(0x9ffff), 1338 "\xf2\x9f\xbf\xbf")); 1339 1340 // U+AFFFE 1341 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1342 ConvertUTFResultContainer(conversionOK).withScalars(0xafffe), 1343 
"\xf2\xaf\xbf\xbe")); 1344 1345 // U+AFFFF 1346 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1347 ConvertUTFResultContainer(conversionOK).withScalars(0xaffff), 1348 "\xf2\xaf\xbf\xbf")); 1349 1350 // U+BFFFE 1351 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1352 ConvertUTFResultContainer(conversionOK).withScalars(0xbfffe), 1353 "\xf2\xbf\xbf\xbe")); 1354 1355 // U+BFFFF 1356 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1357 ConvertUTFResultContainer(conversionOK).withScalars(0xbffff), 1358 "\xf2\xbf\xbf\xbf")); 1359 1360 // U+CFFFE 1361 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1362 ConvertUTFResultContainer(conversionOK).withScalars(0xcfffe), 1363 "\xf3\x8f\xbf\xbe")); 1364 1365 // U+CFFFF 1366 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1367 ConvertUTFResultContainer(conversionOK).withScalars(0xcfffF), 1368 "\xf3\x8f\xbf\xbf")); 1369 1370 // U+DFFFE 1371 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1372 ConvertUTFResultContainer(conversionOK).withScalars(0xdfffe), 1373 "\xf3\x9f\xbf\xbe")); 1374 1375 // U+DFFFF 1376 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1377 ConvertUTFResultContainer(conversionOK).withScalars(0xdffff), 1378 "\xf3\x9f\xbf\xbf")); 1379 1380 // U+EFFFE 1381 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1382 ConvertUTFResultContainer(conversionOK).withScalars(0xefffe), 1383 "\xf3\xaf\xbf\xbe")); 1384 1385 // U+EFFFF 1386 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1387 ConvertUTFResultContainer(conversionOK).withScalars(0xeffff), 1388 "\xf3\xaf\xbf\xbf")); 1389 1390 // U+FFFFE 1391 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1392 ConvertUTFResultContainer(conversionOK).withScalars(0xffffe), 1393 "\xf3\xbf\xbf\xbe")); 1394 1395 // U+FFFFF 1396 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1397 ConvertUTFResultContainer(conversionOK).withScalars(0xfffff), 1398 "\xf3\xbf\xbf\xbf")); 1399 1400 // U+10FFFE 1401 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1402 ConvertUTFResultContainer(conversionOK).withScalars(0x10fffe), 1403 
"\xf4\x8f\xbf\xbe")); 1404 1405 // U+10FFFF 1406 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1407 ConvertUTFResultContainer(conversionOK).withScalars(0x10ffff), 1408 "\xf4\x8f\xbf\xbf")); 1409 1410 // U+FDD0 1411 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1412 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd0), 1413 "\xef\xb7\x90")); 1414 1415 // U+FDD1 1416 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1417 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd1), 1418 "\xef\xb7\x91")); 1419 1420 // U+FDD2 1421 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1422 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd2), 1423 "\xef\xb7\x92")); 1424 1425 // U+FDD3 1426 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1427 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd3), 1428 "\xef\xb7\x93")); 1429 1430 // U+FDD4 1431 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1432 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd4), 1433 "\xef\xb7\x94")); 1434 1435 // U+FDD5 1436 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1437 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd5), 1438 "\xef\xb7\x95")); 1439 1440 // U+FDD6 1441 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1442 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd6), 1443 "\xef\xb7\x96")); 1444 1445 // U+FDD7 1446 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1447 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd7), 1448 "\xef\xb7\x97")); 1449 1450 // U+FDD8 1451 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1452 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd8), 1453 "\xef\xb7\x98")); 1454 1455 // U+FDD9 1456 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1457 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd9), 1458 "\xef\xb7\x99")); 1459 1460 // U+FDDA 1461 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1462 ConvertUTFResultContainer(conversionOK).withScalars(0xfdda), 1463 "\xef\xb7\x9a")); 1464 1465 // U+FDDB 1466 
EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1467 ConvertUTFResultContainer(conversionOK).withScalars(0xfddb), 1468 "\xef\xb7\x9b")); 1469 1470 // U+FDDC 1471 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1472 ConvertUTFResultContainer(conversionOK).withScalars(0xfddc), 1473 "\xef\xb7\x9c")); 1474 1475 // U+FDDD 1476 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1477 ConvertUTFResultContainer(conversionOK).withScalars(0xfddd), 1478 "\xef\xb7\x9d")); 1479 1480 // U+FDDE 1481 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1482 ConvertUTFResultContainer(conversionOK).withScalars(0xfdde), 1483 "\xef\xb7\x9e")); 1484 1485 // U+FDDF 1486 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1487 ConvertUTFResultContainer(conversionOK).withScalars(0xfddf), 1488 "\xef\xb7\x9f")); 1489 1490 // U+FDE0 1491 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1492 ConvertUTFResultContainer(conversionOK).withScalars(0xfde0), 1493 "\xef\xb7\xa0")); 1494 1495 // U+FDE1 1496 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1497 ConvertUTFResultContainer(conversionOK).withScalars(0xfde1), 1498 "\xef\xb7\xa1")); 1499 1500 // U+FDE2 1501 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1502 ConvertUTFResultContainer(conversionOK).withScalars(0xfde2), 1503 "\xef\xb7\xa2")); 1504 1505 // U+FDE3 1506 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1507 ConvertUTFResultContainer(conversionOK).withScalars(0xfde3), 1508 "\xef\xb7\xa3")); 1509 1510 // U+FDE4 1511 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1512 ConvertUTFResultContainer(conversionOK).withScalars(0xfde4), 1513 "\xef\xb7\xa4")); 1514 1515 // U+FDE5 1516 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1517 ConvertUTFResultContainer(conversionOK).withScalars(0xfde5), 1518 "\xef\xb7\xa5")); 1519 1520 // U+FDE6 1521 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1522 ConvertUTFResultContainer(conversionOK).withScalars(0xfde6), 1523 "\xef\xb7\xa6")); 1524 1525 // U+FDE7 1526 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1527 
ConvertUTFResultContainer(conversionOK).withScalars(0xfde7), 1528 "\xef\xb7\xa7")); 1529 1530 // U+FDE8 1531 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1532 ConvertUTFResultContainer(conversionOK).withScalars(0xfde8), 1533 "\xef\xb7\xa8")); 1534 1535 // U+FDE9 1536 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1537 ConvertUTFResultContainer(conversionOK).withScalars(0xfde9), 1538 "\xef\xb7\xa9")); 1539 1540 // U+FDEA 1541 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1542 ConvertUTFResultContainer(conversionOK).withScalars(0xfdea), 1543 "\xef\xb7\xaa")); 1544 1545 // U+FDEB 1546 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1547 ConvertUTFResultContainer(conversionOK).withScalars(0xfdeb), 1548 "\xef\xb7\xab")); 1549 1550 // U+FDEC 1551 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1552 ConvertUTFResultContainer(conversionOK).withScalars(0xfdec), 1553 "\xef\xb7\xac")); 1554 1555 // U+FDED 1556 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1557 ConvertUTFResultContainer(conversionOK).withScalars(0xfded), 1558 "\xef\xb7\xad")); 1559 1560 // U+FDEE 1561 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1562 ConvertUTFResultContainer(conversionOK).withScalars(0xfdee), 1563 "\xef\xb7\xae")); 1564 1565 // U+FDEF 1566 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1567 ConvertUTFResultContainer(conversionOK).withScalars(0xfdef), 1568 "\xef\xb7\xaf")); 1569 1570 // U+FDF0 1571 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1572 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf0), 1573 "\xef\xb7\xb0")); 1574 1575 // U+FDF1 1576 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1577 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf1), 1578 "\xef\xb7\xb1")); 1579 1580 // U+FDF2 1581 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1582 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf2), 1583 "\xef\xb7\xb2")); 1584 1585 // U+FDF3 1586 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1587 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf3), 1588 
"\xef\xb7\xb3")); 1589 1590 // U+FDF4 1591 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1592 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf4), 1593 "\xef\xb7\xb4")); 1594 1595 // U+FDF5 1596 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1597 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf5), 1598 "\xef\xb7\xb5")); 1599 1600 // U+FDF6 1601 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1602 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf6), 1603 "\xef\xb7\xb6")); 1604 1605 // U+FDF7 1606 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1607 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf7), 1608 "\xef\xb7\xb7")); 1609 1610 // U+FDF8 1611 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1612 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf8), 1613 "\xef\xb7\xb8")); 1614 1615 // U+FDF9 1616 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1617 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf9), 1618 "\xef\xb7\xb9")); 1619 1620 // U+FDFA 1621 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1622 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfa), 1623 "\xef\xb7\xba")); 1624 1625 // U+FDFB 1626 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1627 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfb), 1628 "\xef\xb7\xbb")); 1629 1630 // U+FDFC 1631 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1632 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfc), 1633 "\xef\xb7\xbc")); 1634 1635 // U+FDFD 1636 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1637 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfd), 1638 "\xef\xb7\xbd")); 1639 1640 // U+FDFE 1641 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1642 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfe), 1643 "\xef\xb7\xbe")); 1644 1645 // U+FDFF 1646 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1647 ConvertUTFResultContainer(conversionOK).withScalars(0xfdff), 1648 "\xef\xb7\xbf")); 1649 } 1650 1651 TEST(ConvertUTFTest, 
UTF8ToUTF32PartialLenient) {
  // Third argument handed to every CheckConvertUTF8ToUnicodeScalars call in
  // this test. NOTE(review): presumably it switches the helper to
  // partial-input decoding, as the test name suggests; confirm against the
  // helper's definition.
  const bool Partial = true;

  // A complete one-byte sequence is expected to convert normally.
  // U+0041 LATIN CAPITAL LETTER A
  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
      ConvertUTFResultContainer(conversionOK).withScalars(0x0041), "\x41",
      Partial));

  //
  // Sequences with one continuation byte missing
  //

  // Every entry is expected to yield sourceExhausted with no scalars. The
  // entries are checked in the order listed.
  static const char *const Truncated[] = {
      // Lead byte of a 2-byte sequence, nothing after it.
      "\xc2",
      "\xdf",
      // Lead byte of a 3-byte sequence plus one continuation byte.
      "\xe0\xa0",
      "\xe0\xbf",
      "\xe1\x80",
      "\xec\xbf",
      "\xed\x80",
      "\xed\x9f",
      "\xee\x80",
      "\xef\xbf",
      // Lead byte of a 4-byte sequence plus two continuation bytes.
      "\xf0\x90\x80",
      "\xf0\xbf\xbf",
      "\xf1\x80\x80",
      "\xf3\xbf\xbf",
      "\xf4\x80\x80",
      "\xf4\x8f\xbf",
  };
  const unsigned NumTruncated = sizeof(Truncated) / sizeof(Truncated[0]);

  for (unsigned Index = 0; Index != NumTruncated; ++Index) {
    // All iterations share one source line, so record which entry is being
    // checked in case the expectation below fails.
    SCOPED_TRACE(::testing::Message() << "Truncated[" << Index << "]");
    EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
        ConvertUTFResultContainer(sourceExhausted), Truncated[Index],
        Partial));
  }

  // A complete scalar followed by a truncated sequence: the expected result
  // is sourceExhausted, with the leading U+0041 still present in the output.
  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
      ConvertUTFResultContainer(sourceExhausted).withScalars(0x0041),
      "\x41\xc2", Partial));
}