1 //===- llvm/unittest/Support/ConvertUTFTest.cpp - ConvertUTF tests --------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 10 #include "llvm/Support/ConvertUTF.h" 11 #include "llvm/ADT/ArrayRef.h" 12 #include "gtest/gtest.h" 13 #include <string> 14 #include <vector> 15 16 using namespace llvm; 17 18 TEST(ConvertUTFTest, ConvertUTF16LittleEndianToUTF8String) { 19 // Src is the look of disapproval. 20 static const char Src[] = "\xff\xfe\xa0\x0c_\x00\xa0\x0c"; 21 ArrayRef<char> Ref(Src, sizeof(Src) - 1); 22 std::string Result; 23 bool Success = convertUTF16ToUTF8String(Ref, Result); 24 EXPECT_TRUE(Success); 25 std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0"); 26 EXPECT_EQ(Expected, Result); 27 } 28 29 TEST(ConvertUTFTest, ConvertUTF16BigEndianToUTF8String) { 30 // Src is the look of disapproval. 31 static const char Src[] = "\xfe\xff\x0c\xa0\x00_\x0c\xa0"; 32 ArrayRef<char> Ref(Src, sizeof(Src) - 1); 33 std::string Result; 34 bool Success = convertUTF16ToUTF8String(Ref, Result); 35 EXPECT_TRUE(Success); 36 std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0"); 37 EXPECT_EQ(Expected, Result); 38 } 39 40 TEST(ConvertUTFTest, ConvertUTF8ToUTF16String) { 41 // Src is the look of disapproval. 
42 static const char Src[] = "\xe0\xb2\xa0_\xe0\xb2\xa0"; 43 StringRef Ref(Src, sizeof(Src) - 1); 44 SmallVector<UTF16, 5> Result; 45 bool Success = convertUTF8ToUTF16String(Ref, Result); 46 EXPECT_TRUE(Success); 47 static const UTF16 Expected[] = {0x0CA0, 0x005f, 0x0CA0, 0}; 48 ASSERT_EQ(3u, Result.size()); 49 for (int I = 0, E = 3; I != E; ++I) 50 EXPECT_EQ(Expected[I], Result[I]); 51 } 52 53 TEST(ConvertUTFTest, OddLengthInput) { 54 std::string Result; 55 bool Success = convertUTF16ToUTF8String(makeArrayRef("xxxxx", 5), Result); 56 EXPECT_FALSE(Success); 57 } 58 59 TEST(ConvertUTFTest, Empty) { 60 std::string Result; 61 bool Success = convertUTF16ToUTF8String(llvm::ArrayRef<char>(None), Result); 62 EXPECT_TRUE(Success); 63 EXPECT_TRUE(Result.empty()); 64 } 65 66 TEST(ConvertUTFTest, HasUTF16BOM) { 67 bool HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xff\xfe", 2)); 68 EXPECT_TRUE(HasBOM); 69 HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff", 2)); 70 EXPECT_TRUE(HasBOM); 71 HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff ", 3)); 72 EXPECT_TRUE(HasBOM); // Don't care about odd lengths. 73 HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff\x00asdf", 6)); 74 EXPECT_TRUE(HasBOM); 75 76 HasBOM = hasUTF16ByteOrderMark(None); 77 EXPECT_FALSE(HasBOM); 78 HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe", 1)); 79 EXPECT_FALSE(HasBOM); 80 } 81 82 TEST(ConvertUTFTest, UTF16WrappersForConvertUTF16ToUTF8String) { 83 // Src is the look of disapproval. 84 static const char Src[] = "\xff\xfe\xa0\x0c_\x00\xa0\x0c"; 85 ArrayRef<UTF16> SrcRef = makeArrayRef((const UTF16 *)Src, 4); 86 std::string Result; 87 bool Success = convertUTF16ToUTF8String(SrcRef, Result); 88 EXPECT_TRUE(Success); 89 std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0"); 90 EXPECT_EQ(Expected, Result); 91 } 92 93 TEST(ConvertUTFTest, ConvertUTF8toWide) { 94 // Src is the look of disapproval. 
95 static const char Src[] = "\xe0\xb2\xa0_\xe0\xb2\xa0"; 96 std::wstring Result; 97 bool Success = ConvertUTF8toWide((const char*)Src, Result); 98 EXPECT_TRUE(Success); 99 std::wstring Expected(L"\x0ca0_\x0ca0"); 100 EXPECT_EQ(Expected, Result); 101 Result.clear(); 102 Success = ConvertUTF8toWide(StringRef(Src, 7), Result); 103 EXPECT_TRUE(Success); 104 EXPECT_EQ(Expected, Result); 105 } 106 107 TEST(ConvertUTFTest, convertWideToUTF8) { 108 // Src is the look of disapproval. 109 static const wchar_t Src[] = L"\x0ca0_\x0ca0"; 110 std::string Result; 111 bool Success = convertWideToUTF8(Src, Result); 112 EXPECT_TRUE(Success); 113 std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0"); 114 EXPECT_EQ(Expected, Result); 115 } 116 117 struct ConvertUTFResultContainer { 118 ConversionResult ErrorCode; 119 std::vector<unsigned> UnicodeScalars; 120 121 ConvertUTFResultContainer(ConversionResult ErrorCode) 122 : ErrorCode(ErrorCode) {} 123 124 ConvertUTFResultContainer 125 withScalars(unsigned US0 = 0x110000, unsigned US1 = 0x110000, 126 unsigned US2 = 0x110000, unsigned US3 = 0x110000, 127 unsigned US4 = 0x110000, unsigned US5 = 0x110000, 128 unsigned US6 = 0x110000, unsigned US7 = 0x110000) { 129 ConvertUTFResultContainer Result(*this); 130 if (US0 != 0x110000) 131 Result.UnicodeScalars.push_back(US0); 132 if (US1 != 0x110000) 133 Result.UnicodeScalars.push_back(US1); 134 if (US2 != 0x110000) 135 Result.UnicodeScalars.push_back(US2); 136 if (US3 != 0x110000) 137 Result.UnicodeScalars.push_back(US3); 138 if (US4 != 0x110000) 139 Result.UnicodeScalars.push_back(US4); 140 if (US5 != 0x110000) 141 Result.UnicodeScalars.push_back(US5); 142 if (US6 != 0x110000) 143 Result.UnicodeScalars.push_back(US6); 144 if (US7 != 0x110000) 145 Result.UnicodeScalars.push_back(US7); 146 return Result; 147 } 148 }; 149 150 std::pair<ConversionResult, std::vector<unsigned>> 151 ConvertUTF8ToUnicodeScalarsLenient(StringRef S) { 152 const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data()); 
153 154 const UTF8 *SourceNext = SourceStart; 155 std::vector<UTF32> Decoded(S.size(), 0); 156 UTF32 *TargetStart = Decoded.data(); 157 158 auto ErrorCode = 159 ConvertUTF8toUTF32(&SourceNext, SourceStart + S.size(), &TargetStart, 160 Decoded.data() + Decoded.size(), lenientConversion); 161 162 Decoded.resize(TargetStart - Decoded.data()); 163 164 return std::make_pair(ErrorCode, Decoded); 165 } 166 167 std::pair<ConversionResult, std::vector<unsigned>> 168 ConvertUTF8ToUnicodeScalarsPartialLenient(StringRef S) { 169 const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data()); 170 171 const UTF8 *SourceNext = SourceStart; 172 std::vector<UTF32> Decoded(S.size(), 0); 173 UTF32 *TargetStart = Decoded.data(); 174 175 auto ErrorCode = ConvertUTF8toUTF32Partial( 176 &SourceNext, SourceStart + S.size(), &TargetStart, 177 Decoded.data() + Decoded.size(), lenientConversion); 178 179 Decoded.resize(TargetStart - Decoded.data()); 180 181 return std::make_pair(ErrorCode, Decoded); 182 } 183 184 ::testing::AssertionResult 185 CheckConvertUTF8ToUnicodeScalars(ConvertUTFResultContainer Expected, 186 StringRef S, bool Partial = false) { 187 ConversionResult ErrorCode; 188 std::vector<unsigned> Decoded; 189 if (!Partial) 190 std::tie(ErrorCode, Decoded) = ConvertUTF8ToUnicodeScalarsLenient(S); 191 else 192 std::tie(ErrorCode, Decoded) = ConvertUTF8ToUnicodeScalarsPartialLenient(S); 193 194 if (Expected.ErrorCode != ErrorCode) 195 return ::testing::AssertionFailure() << "Expected error code " 196 << Expected.ErrorCode << ", actual " 197 << ErrorCode; 198 199 if (Expected.UnicodeScalars != Decoded) 200 return ::testing::AssertionFailure() 201 << "Expected lenient decoded result:\n" 202 << ::testing::PrintToString(Expected.UnicodeScalars) << "\n" 203 << "Actual result:\n" << ::testing::PrintToString(Decoded); 204 205 return ::testing::AssertionSuccess(); 206 } 207 208 TEST(ConvertUTFTest, UTF8ToUTF32Lenient) { 209 210 // 211 // 1-byte sequences 212 // 213 214 // U+0041 LATIN 
CAPITAL LETTER A 215 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 216 ConvertUTFResultContainer(conversionOK).withScalars(0x0041), "\x41")); 217 218 // 219 // 2-byte sequences 220 // 221 222 // U+0283 LATIN SMALL LETTER ESH 223 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 224 ConvertUTFResultContainer(conversionOK).withScalars(0x0283), 225 "\xca\x83")); 226 227 // U+03BA GREEK SMALL LETTER KAPPA 228 // U+1F79 GREEK SMALL LETTER OMICRON WITH OXIA 229 // U+03C3 GREEK SMALL LETTER SIGMA 230 // U+03BC GREEK SMALL LETTER MU 231 // U+03B5 GREEK SMALL LETTER EPSILON 232 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 233 ConvertUTFResultContainer(conversionOK) 234 .withScalars(0x03ba, 0x1f79, 0x03c3, 0x03bc, 0x03b5), 235 "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5")); 236 237 // 238 // 3-byte sequences 239 // 240 241 // U+4F8B CJK UNIFIED IDEOGRAPH-4F8B 242 // U+6587 CJK UNIFIED IDEOGRAPH-6587 243 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 244 ConvertUTFResultContainer(conversionOK).withScalars(0x4f8b, 0x6587), 245 "\xe4\xbe\x8b\xe6\x96\x87")); 246 247 // U+D55C HANGUL SYLLABLE HAN 248 // U+AE00 HANGUL SYLLABLE GEUL 249 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 250 ConvertUTFResultContainer(conversionOK).withScalars(0xd55c, 0xae00), 251 "\xed\x95\x9c\xea\xb8\x80")); 252 253 // U+1112 HANGUL CHOSEONG HIEUH 254 // U+1161 HANGUL JUNGSEONG A 255 // U+11AB HANGUL JONGSEONG NIEUN 256 // U+1100 HANGUL CHOSEONG KIYEOK 257 // U+1173 HANGUL JUNGSEONG EU 258 // U+11AF HANGUL JONGSEONG RIEUL 259 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 260 ConvertUTFResultContainer(conversionOK) 261 .withScalars(0x1112, 0x1161, 0x11ab, 0x1100, 0x1173, 0x11af), 262 "\xe1\x84\x92\xe1\x85\xa1\xe1\x86\xab\xe1\x84\x80\xe1\x85\xb3" 263 "\xe1\x86\xaf")); 264 265 // 266 // 4-byte sequences 267 // 268 269 // U+E0100 VARIATION SELECTOR-17 270 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 271 ConvertUTFResultContainer(conversionOK).withScalars(0x000E0100), 272 "\xf3\xa0\x84\x80")); 273 274 // 275 
// First possible sequence of a certain length 276 // 277 278 // U+0000 NULL 279 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 280 ConvertUTFResultContainer(conversionOK).withScalars(0x0000), 281 StringRef("\x00", 1))); 282 283 // U+0080 PADDING CHARACTER 284 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 285 ConvertUTFResultContainer(conversionOK).withScalars(0x0080), 286 "\xc2\x80")); 287 288 // U+0800 SAMARITAN LETTER ALAF 289 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 290 ConvertUTFResultContainer(conversionOK).withScalars(0x0800), 291 "\xe0\xa0\x80")); 292 293 // U+10000 LINEAR B SYLLABLE B008 A 294 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 295 ConvertUTFResultContainer(conversionOK).withScalars(0x10000), 296 "\xf0\x90\x80\x80")); 297 298 // U+200000 (invalid) 299 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 300 ConvertUTFResultContainer(sourceIllegal) 301 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 302 "\xf8\x88\x80\x80\x80")); 303 304 // U+4000000 (invalid) 305 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 306 ConvertUTFResultContainer(sourceIllegal) 307 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 308 "\xfc\x84\x80\x80\x80\x80")); 309 310 // 311 // Last possible sequence of a certain length 312 // 313 314 // U+007F DELETE 315 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 316 ConvertUTFResultContainer(conversionOK).withScalars(0x007f), "\x7f")); 317 318 // U+07FF (unassigned) 319 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 320 ConvertUTFResultContainer(conversionOK).withScalars(0x07ff), 321 "\xdf\xbf")); 322 323 // U+FFFF (noncharacter) 324 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 325 ConvertUTFResultContainer(conversionOK).withScalars(0xffff), 326 "\xef\xbf\xbf")); 327 328 // U+1FFFFF (invalid) 329 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 330 ConvertUTFResultContainer(sourceIllegal) 331 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 332 "\xf7\xbf\xbf\xbf")); 333 334 // U+3FFFFFF (invalid) 335 
EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 336 ConvertUTFResultContainer(sourceIllegal) 337 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 338 "\xfb\xbf\xbf\xbf\xbf")); 339 340 // U+7FFFFFFF (invalid) 341 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 342 ConvertUTFResultContainer(sourceIllegal) 343 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 344 "\xfd\xbf\xbf\xbf\xbf\xbf")); 345 346 // 347 // Other boundary conditions 348 // 349 350 // U+D7FF (unassigned) 351 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 352 ConvertUTFResultContainer(conversionOK).withScalars(0xd7ff), 353 "\xed\x9f\xbf")); 354 355 // U+E000 (private use) 356 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 357 ConvertUTFResultContainer(conversionOK).withScalars(0xe000), 358 "\xee\x80\x80")); 359 360 // U+FFFD REPLACEMENT CHARACTER 361 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 362 ConvertUTFResultContainer(conversionOK).withScalars(0xfffd), 363 "\xef\xbf\xbd")); 364 365 // U+10FFFF (noncharacter) 366 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 367 ConvertUTFResultContainer(conversionOK).withScalars(0x10ffff), 368 "\xf4\x8f\xbf\xbf")); 369 370 // U+110000 (invalid) 371 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 372 ConvertUTFResultContainer(sourceIllegal) 373 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 374 "\xf4\x90\x80\x80")); 375 376 // 377 // Unexpected continuation bytes 378 // 379 380 // A sequence of unexpected continuation bytes that don't follow a first 381 // byte, every byte is a maximal subpart. 
382 383 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 384 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\x80")); 385 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 386 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xbf")); 387 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 388 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 389 "\x80\x80")); 390 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 391 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 392 "\x80\xbf")); 393 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 394 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 395 "\xbf\x80")); 396 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 397 ConvertUTFResultContainer(sourceIllegal) 398 .withScalars(0xfffd, 0xfffd, 0xfffd), 399 "\x80\xbf\x80")); 400 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 401 ConvertUTFResultContainer(sourceIllegal) 402 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 403 "\x80\xbf\x80\xbf")); 404 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 405 ConvertUTFResultContainer(sourceIllegal) 406 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 407 "\x80\xbf\x82\xbf\xaa")); 408 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 409 ConvertUTFResultContainer(sourceIllegal) 410 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 411 "\xaa\xb0\xbb\xbf\xaa\xa0")); 412 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 413 ConvertUTFResultContainer(sourceIllegal) 414 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 415 "\xaa\xb0\xbb\xbf\xaa\xa0\x8f")); 416 417 // All continuation bytes (0x80--0xbf). 
418 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 419 ConvertUTFResultContainer(sourceIllegal) 420 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 421 0xfffd, 0xfffd, 0xfffd, 0xfffd) 422 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 423 0xfffd, 0xfffd, 0xfffd, 0xfffd) 424 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 425 0xfffd, 0xfffd, 0xfffd, 0xfffd) 426 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 427 0xfffd, 0xfffd, 0xfffd, 0xfffd) 428 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 429 0xfffd, 0xfffd, 0xfffd, 0xfffd) 430 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 431 0xfffd, 0xfffd, 0xfffd, 0xfffd) 432 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 433 0xfffd, 0xfffd, 0xfffd, 0xfffd) 434 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 435 0xfffd, 0xfffd, 0xfffd, 0xfffd), 436 "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f" 437 "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f" 438 "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf" 439 "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf")); 440 441 // 442 // Lonely start bytes 443 // 444 445 // Start bytes of 2-byte sequences (0xc0--0xdf). 
446 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 447 ConvertUTFResultContainer(sourceIllegal) 448 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 449 0xfffd, 0xfffd, 0xfffd, 0xfffd) 450 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 451 0xfffd, 0xfffd, 0xfffd, 0xfffd) 452 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 453 0xfffd, 0xfffd, 0xfffd, 0xfffd) 454 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 455 0xfffd, 0xfffd, 0xfffd, 0xfffd), 456 "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf" 457 "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf")); 458 459 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 460 ConvertUTFResultContainer(sourceIllegal) 461 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 462 0xfffd, 0x0020, 0xfffd, 0x0020) 463 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 464 0xfffd, 0x0020, 0xfffd, 0x0020) 465 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 466 0xfffd, 0x0020, 0xfffd, 0x0020) 467 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 468 0xfffd, 0x0020, 0xfffd, 0x0020) 469 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 470 0xfffd, 0x0020, 0xfffd, 0x0020) 471 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 472 0xfffd, 0x0020, 0xfffd, 0x0020) 473 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 474 0xfffd, 0x0020, 0xfffd, 0x0020) 475 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 476 0xfffd, 0x0020, 0xfffd, 0x0020), 477 "\xc0\x20\xc1\x20\xc2\x20\xc3\x20\xc4\x20\xc5\x20\xc6\x20\xc7\x20" 478 "\xc8\x20\xc9\x20\xca\x20\xcb\x20\xcc\x20\xcd\x20\xce\x20\xcf\x20" 479 "\xd0\x20\xd1\x20\xd2\x20\xd3\x20\xd4\x20\xd5\x20\xd6\x20\xd7\x20" 480 "\xd8\x20\xd9\x20\xda\x20\xdb\x20\xdc\x20\xdd\x20\xde\x20\xdf\x20")); 481 482 // Start bytes of 3-byte sequences (0xe0--0xef). 
483 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 484 ConvertUTFResultContainer(sourceIllegal) 485 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 486 0xfffd, 0xfffd, 0xfffd, 0xfffd) 487 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 488 0xfffd, 0xfffd, 0xfffd, 0xfffd), 489 "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef")); 490 491 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 492 ConvertUTFResultContainer(sourceIllegal) 493 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 494 0xfffd, 0x0020, 0xfffd, 0x0020) 495 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 496 0xfffd, 0x0020, 0xfffd, 0x0020) 497 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 498 0xfffd, 0x0020, 0xfffd, 0x0020) 499 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 500 0xfffd, 0x0020, 0xfffd, 0x0020), 501 "\xe0\x20\xe1\x20\xe2\x20\xe3\x20\xe4\x20\xe5\x20\xe6\x20\xe7\x20" 502 "\xe8\x20\xe9\x20\xea\x20\xeb\x20\xec\x20\xed\x20\xee\x20\xef\x20")); 503 504 // Start bytes of 4-byte sequences (0xf0--0xf7). 505 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 506 ConvertUTFResultContainer(sourceIllegal) 507 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 508 0xfffd, 0xfffd, 0xfffd, 0xfffd), 509 "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7")); 510 511 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 512 ConvertUTFResultContainer(sourceIllegal) 513 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 514 0xfffd, 0x0020, 0xfffd, 0x0020) 515 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 516 0xfffd, 0x0020, 0xfffd, 0x0020), 517 "\xf0\x20\xf1\x20\xf2\x20\xf3\x20\xf4\x20\xf5\x20\xf6\x20\xf7\x20")); 518 519 // Start bytes of 5-byte sequences (0xf8--0xfb). 
520 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 521 ConvertUTFResultContainer(sourceIllegal) 522 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 523 "\xf8\xf9\xfa\xfb")); 524 525 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 526 ConvertUTFResultContainer(sourceIllegal) 527 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 528 0xfffd, 0x0020, 0xfffd, 0x0020), 529 "\xf8\x20\xf9\x20\xfa\x20\xfb\x20")); 530 531 // Start bytes of 6-byte sequences (0xfc--0xfd). 532 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 533 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 534 "\xfc\xfd")); 535 536 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 537 ConvertUTFResultContainer(sourceIllegal) 538 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020), 539 "\xfc\x20\xfd\x20")); 540 541 // 542 // Other bytes (0xc0--0xc1, 0xfe--0xff). 543 // 544 545 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 546 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc0")); 547 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 548 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc1")); 549 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 550 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfe")); 551 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 552 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xff")); 553 554 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 555 ConvertUTFResultContainer(sourceIllegal) 556 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 557 "\xc0\xc1\xfe\xff")); 558 559 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 560 ConvertUTFResultContainer(sourceIllegal) 561 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 562 "\xfe\xfe\xff\xff")); 563 564 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 565 ConvertUTFResultContainer(sourceIllegal) 566 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 567 "\xfe\x80\x80\x80\x80\x80")); 568 569 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 570 ConvertUTFResultContainer(sourceIllegal) 571 
.withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 572 "\xff\x80\x80\x80\x80\x80")); 573 574 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 575 ConvertUTFResultContainer(sourceIllegal) 576 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 577 0xfffd, 0x0020, 0xfffd, 0x0020), 578 "\xc0\x20\xc1\x20\xfe\x20\xff\x20")); 579 580 // 581 // Sequences with one continuation byte missing 582 // 583 584 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 585 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc2")); 586 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 587 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xdf")); 588 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 589 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 590 "\xe0\xa0")); 591 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 592 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 593 "\xe0\xbf")); 594 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 595 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 596 "\xe1\x80")); 597 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 598 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 599 "\xec\xbf")); 600 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 601 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 602 "\xed\x80")); 603 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 604 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 605 "\xed\x9f")); 606 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 607 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 608 "\xee\x80")); 609 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 610 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 611 "\xef\xbf")); 612 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 613 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 614 "\xf0\x90\x80")); 615 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 616 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 617 "\xf0\xbf\xbf")); 
618 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 619 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 620 "\xf1\x80\x80")); 621 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 622 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 623 "\xf3\xbf\xbf")); 624 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 625 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 626 "\xf4\x80\x80")); 627 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 628 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 629 "\xf4\x8f\xbf")); 630 631 // Overlong sequences with one trailing byte missing. 632 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 633 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 634 "\xc0")); 635 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 636 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 637 "\xc1")); 638 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 639 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 640 "\xe0\x80")); 641 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 642 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 643 "\xe0\x9f")); 644 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 645 ConvertUTFResultContainer(sourceIllegal) 646 .withScalars(0xfffd, 0xfffd, 0xfffd), 647 "\xf0\x80\x80")); 648 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 649 ConvertUTFResultContainer(sourceIllegal) 650 .withScalars(0xfffd, 0xfffd, 0xfffd), 651 "\xf0\x8f\x80")); 652 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 653 ConvertUTFResultContainer(sourceIllegal) 654 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 655 "\xf8\x80\x80\x80")); 656 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 657 ConvertUTFResultContainer(sourceIllegal) 658 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 659 "\xfc\x80\x80\x80\x80")); 660 661 // Sequences that represent surrogates with one trailing byte missing. 
662 // High surrogates 663 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 664 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 665 "\xed\xa0")); 666 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 667 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 668 "\xed\xac")); 669 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 670 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 671 "\xed\xaf")); 672 // Low surrogates 673 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 674 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 675 "\xed\xb0")); 676 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 677 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 678 "\xed\xb4")); 679 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 680 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 681 "\xed\xbf")); 682 683 // Ill-formed 4-byte sequences. 684 // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx 685 // U+1100xx (invalid) 686 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 687 ConvertUTFResultContainer(sourceIllegal) 688 .withScalars(0xfffd, 0xfffd, 0xfffd), 689 "\xf4\x90\x80")); 690 // U+13FBxx (invalid) 691 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 692 ConvertUTFResultContainer(sourceIllegal) 693 .withScalars(0xfffd, 0xfffd, 0xfffd), 694 "\xf4\xbf\xbf")); 695 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 696 ConvertUTFResultContainer(sourceIllegal) 697 .withScalars(0xfffd, 0xfffd, 0xfffd), 698 "\xf5\x80\x80")); 699 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 700 ConvertUTFResultContainer(sourceIllegal) 701 .withScalars(0xfffd, 0xfffd, 0xfffd), 702 "\xf6\x80\x80")); 703 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 704 ConvertUTFResultContainer(sourceIllegal) 705 .withScalars(0xfffd, 0xfffd, 0xfffd), 706 "\xf7\x80\x80")); 707 // U+1FFBxx (invalid) 708 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 709 ConvertUTFResultContainer(sourceIllegal) 710 .withScalars(0xfffd, 0xfffd, 0xfffd), 711 
"\xf7\xbf\xbf")); 712 713 // Ill-formed 5-byte sequences. 714 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx 715 // U+2000xx (invalid) 716 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 717 ConvertUTFResultContainer(sourceIllegal) 718 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 719 "\xf8\x88\x80\x80")); 720 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 721 ConvertUTFResultContainer(sourceIllegal) 722 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 723 "\xf8\xbf\xbf\xbf")); 724 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 725 ConvertUTFResultContainer(sourceIllegal) 726 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 727 "\xf9\x80\x80\x80")); 728 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 729 ConvertUTFResultContainer(sourceIllegal) 730 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 731 "\xfa\x80\x80\x80")); 732 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 733 ConvertUTFResultContainer(sourceIllegal) 734 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 735 "\xfb\x80\x80\x80")); 736 // U+3FFFFxx (invalid) 737 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 738 ConvertUTFResultContainer(sourceIllegal) 739 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 740 "\xfb\xbf\xbf\xbf")); 741 742 // Ill-formed 6-byte sequences. 
743 // 1111110u 10uuuuuu 10uzzzzz 10zzzyyyy 10yyyyxx 10xxxxxx 744 // U+40000xx (invalid) 745 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 746 ConvertUTFResultContainer(sourceIllegal) 747 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 748 "\xfc\x84\x80\x80\x80")); 749 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 750 ConvertUTFResultContainer(sourceIllegal) 751 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 752 "\xfc\xbf\xbf\xbf\xbf")); 753 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 754 ConvertUTFResultContainer(sourceIllegal) 755 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 756 "\xfd\x80\x80\x80\x80")); 757 // U+7FFFFFxx (invalid) 758 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 759 ConvertUTFResultContainer(sourceIllegal) 760 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 761 "\xfd\xbf\xbf\xbf\xbf")); 762 763 // 764 // Sequences with two continuation bytes missing 765 // 766 767 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 768 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 769 "\xf0\x90")); 770 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 771 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 772 "\xf0\xbf")); 773 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 774 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 775 "\xf1\x80")); 776 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 777 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 778 "\xf3\xbf")); 779 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 780 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 781 "\xf4\x80")); 782 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 783 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 784 "\xf4\x8f")); 785 786 // Overlong sequences with two trailing byte missing. 
787 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 788 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xe0")); 789 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 790 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 791 "\xf0\x80")); 792 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 793 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 794 "\xf0\x8f")); 795 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 796 ConvertUTFResultContainer(sourceIllegal) 797 .withScalars(0xfffd, 0xfffd, 0xfffd), 798 "\xf8\x80\x80")); 799 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 800 ConvertUTFResultContainer(sourceIllegal) 801 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 802 "\xfc\x80\x80\x80")); 803 804 // Sequences that represent surrogates with two trailing bytes missing. 805 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 806 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xed")); 807 808 // Ill-formed 4-byte sequences. 809 // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx 810 // U+110yxx (invalid) 811 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 812 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 813 "\xf4\x90")); 814 // U+13Fyxx (invalid) 815 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 816 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 817 "\xf4\xbf")); 818 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 819 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 820 "\xf5\x80")); 821 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 822 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 823 "\xf6\x80")); 824 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 825 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 826 "\xf7\x80")); 827 // U+1FFyxx (invalid) 828 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 829 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 830 "\xf7\xbf")); 831 832 // Ill-formed 5-byte sequences. 
833 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx 834 // U+200yxx (invalid) 835 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 836 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 837 "\xf8\x88\x80")); 838 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 839 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 840 "\xf8\xbf\xbf")); 841 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 842 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 843 "\xf9\x80\x80")); 844 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 845 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 846 "\xfa\x80\x80")); 847 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 848 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 849 "\xfb\x80\x80")); 850 // U+3FFFyxx (invalid) 851 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 852 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 853 "\xfb\xbf\xbf")); 854 855 // Ill-formed 6-byte sequences. 
856 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx 857 // U+4000yxx (invalid) 858 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 859 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 860 "\xfc\x84\x80\x80")); 861 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 862 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 863 "\xfc\xbf\xbf\xbf")); 864 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 865 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 866 "\xfd\x80\x80\x80")); 867 // U+7FFFFyxx (invalid) 868 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 869 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 870 "\xfd\xbf\xbf\xbf")); 871 872 // 873 // Sequences with three continuation bytes missing 874 // 875 876 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 877 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf0")); 878 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 879 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf1")); 880 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 881 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf2")); 882 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 883 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf3")); 884 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 885 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf4")); 886 887 // Broken overlong sequences. 888 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 889 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf0")); 890 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 891 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 892 "\xf8\x80")); 893 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 894 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 895 "\xfc\x80\x80")); 896 897 // Ill-formed 4-byte sequences. 
898 // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx 899 // U+14yyxx (invalid) 900 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 901 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf5")); 902 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 903 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf6")); 904 // U+1Cyyxx (invalid) 905 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 906 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf7")); 907 908 // Ill-formed 5-byte sequences. 909 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx 910 // U+20yyxx (invalid) 911 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 912 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 913 "\xf8\x88")); 914 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 915 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 916 "\xf8\xbf")); 917 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 918 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 919 "\xf9\x80")); 920 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 921 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 922 "\xfa\x80")); 923 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 924 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 925 "\xfb\x80")); 926 // U+3FCyyxx (invalid) 927 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 928 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 929 "\xfb\xbf")); 930 931 // Ill-formed 6-byte sequences. 
932 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx 933 // U+400yyxx (invalid) 934 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 935 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 936 "\xfc\x84\x80")); 937 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 938 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 939 "\xfc\xbf\xbf")); 940 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 941 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 942 "\xfd\x80\x80")); 943 // U+7FFCyyxx (invalid) 944 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 945 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 946 "\xfd\xbf\xbf")); 947 948 // 949 // Sequences with four continuation bytes missing 950 // 951 952 // Ill-formed 5-byte sequences. 953 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx 954 // U+uzyyxx (invalid) 955 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 956 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf8")); 957 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 958 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf9")); 959 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 960 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfa")); 961 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 962 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfb")); 963 // U+3zyyxx (invalid) 964 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 965 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfb")); 966 967 // Broken overlong sequences. 968 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 969 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf8")); 970 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 971 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 972 "\xfc\x80")); 973 974 // Ill-formed 6-byte sequences. 
975 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx 976 // U+uzzyyxx (invalid) 977 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 978 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 979 "\xfc\x84")); 980 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 981 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 982 "\xfc\xbf")); 983 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 984 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 985 "\xfd\x80")); 986 // U+7Fzzyyxx (invalid) 987 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 988 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 989 "\xfd\xbf")); 990 991 // 992 // Sequences with five continuation bytes missing 993 // 994 995 // Ill-formed 6-byte sequences. 996 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx 997 // U+uzzyyxx (invalid) 998 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 999 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfc")); 1000 // U+uuzzyyxx (invalid) 1001 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1002 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfd")); 1003 1004 // 1005 // Consecutive sequences with trailing bytes missing 1006 // 1007 1008 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1009 ConvertUTFResultContainer(sourceIllegal) 1010 .withScalars(0xfffd, /**/ 0xfffd, 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd) 1011 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd) 1012 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd) 1013 .withScalars(0xfffd, /**/ 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd) 1014 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd) 1015 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1016 "\xc0" "\xe0\x80" "\xf0\x80\x80" 1017 "\xf8\x80\x80\x80" 1018 "\xfc\x80\x80\x80\x80" 1019 "\xdf" "\xef\xbf" "\xf7\xbf\xbf" 1020 "\xfb\xbf\xbf\xbf" 1021 "\xfd\xbf\xbf\xbf\xbf")); 1022 1023 // 1024 // Overlong UTF-8 sequences 1025 // 1026 1027 // U+002F SOLIDUS 1028 
EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1029 ConvertUTFResultContainer(conversionOK).withScalars(0x002f), "\x2f")); 1030 1031 // Overlong sequences of the above. 1032 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1033 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 1034 "\xc0\xaf")); 1035 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1036 ConvertUTFResultContainer(sourceIllegal) 1037 .withScalars(0xfffd, 0xfffd, 0xfffd), 1038 "\xe0\x80\xaf")); 1039 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1040 ConvertUTFResultContainer(sourceIllegal) 1041 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 1042 "\xf0\x80\x80\xaf")); 1043 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1044 ConvertUTFResultContainer(sourceIllegal) 1045 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1046 "\xf8\x80\x80\x80\xaf")); 1047 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1048 ConvertUTFResultContainer(sourceIllegal) 1049 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1050 "\xfc\x80\x80\x80\x80\xaf")); 1051 1052 // U+0000 NULL 1053 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1054 ConvertUTFResultContainer(conversionOK).withScalars(0x0000), 1055 StringRef("\x00", 1))); 1056 1057 // Overlong sequences of the above. 
1058 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1059 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 1060 "\xc0\x80")); 1061 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1062 ConvertUTFResultContainer(sourceIllegal) 1063 .withScalars(0xfffd, 0xfffd, 0xfffd), 1064 "\xe0\x80\x80")); 1065 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1066 ConvertUTFResultContainer(sourceIllegal) 1067 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 1068 "\xf0\x80\x80\x80")); 1069 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1070 ConvertUTFResultContainer(sourceIllegal) 1071 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1072 "\xf8\x80\x80\x80\x80")); 1073 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1074 ConvertUTFResultContainer(sourceIllegal) 1075 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1076 "\xfc\x80\x80\x80\x80\x80")); 1077 1078 // Other overlong sequences. 1079 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1080 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 1081 "\xc0\xbf")); 1082 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1083 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 1084 "\xc1\x80")); 1085 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1086 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 1087 "\xc1\xbf")); 1088 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1089 ConvertUTFResultContainer(sourceIllegal) 1090 .withScalars(0xfffd, 0xfffd, 0xfffd), 1091 "\xe0\x9f\xbf")); 1092 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1093 ConvertUTFResultContainer(sourceIllegal) 1094 .withScalars(0xfffd, 0xfffd, 0xfffd), 1095 "\xed\xa0\x80")); 1096 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1097 ConvertUTFResultContainer(sourceIllegal) 1098 .withScalars(0xfffd, 0xfffd, 0xfffd), 1099 "\xed\xbf\xbf")); 1100 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1101 ConvertUTFResultContainer(sourceIllegal) 1102 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 1103 
"\xf0\x8f\x80\x80")); 1104 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1105 ConvertUTFResultContainer(sourceIllegal) 1106 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 1107 "\xf0\x8f\xbf\xbf")); 1108 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1109 ConvertUTFResultContainer(sourceIllegal) 1110 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1111 "\xf8\x87\xbf\xbf\xbf")); 1112 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1113 ConvertUTFResultContainer(sourceIllegal) 1114 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1115 "\xfc\x83\xbf\xbf\xbf\xbf")); 1116 1117 // 1118 // Isolated surrogates 1119 // 1120 1121 // Unicode 6.3.0: 1122 // 1123 // D71. High-surrogate code point: A Unicode code point in the range 1124 // U+D800 to U+DBFF. 1125 // 1126 // D73. Low-surrogate code point: A Unicode code point in the range 1127 // U+DC00 to U+DFFF. 1128 1129 // Note: U+E0100 is <DB40 DD00> in UTF16. 1130 1131 // High surrogates 1132 1133 // U+D800 1134 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1135 ConvertUTFResultContainer(sourceIllegal) 1136 .withScalars(0xfffd, 0xfffd, 0xfffd), 1137 "\xed\xa0\x80")); 1138 1139 // U+DB40 1140 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1141 ConvertUTFResultContainer(sourceIllegal) 1142 .withScalars(0xfffd, 0xfffd, 0xfffd), 1143 "\xed\xac\xa0")); 1144 1145 // U+DBFF 1146 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1147 ConvertUTFResultContainer(sourceIllegal) 1148 .withScalars(0xfffd, 0xfffd, 0xfffd), 1149 "\xed\xaf\xbf")); 1150 1151 // Low surrogates 1152 1153 // U+DC00 1154 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1155 ConvertUTFResultContainer(sourceIllegal) 1156 .withScalars(0xfffd, 0xfffd, 0xfffd), 1157 "\xed\xb0\x80")); 1158 1159 // U+DD00 1160 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1161 ConvertUTFResultContainer(sourceIllegal) 1162 .withScalars(0xfffd, 0xfffd, 0xfffd), 1163 "\xed\xb4\x80")); 1164 1165 // U+DFFF 1166 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1167 
ConvertUTFResultContainer(sourceIllegal) 1168 .withScalars(0xfffd, 0xfffd, 0xfffd), 1169 "\xed\xbf\xbf")); 1170 1171 // Surrogate pairs 1172 1173 // U+D800 U+DC00 1174 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1175 ConvertUTFResultContainer(sourceIllegal) 1176 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1177 "\xed\xa0\x80\xed\xb0\x80")); 1178 1179 // U+D800 U+DD00 1180 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1181 ConvertUTFResultContainer(sourceIllegal) 1182 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1183 "\xed\xa0\x80\xed\xb4\x80")); 1184 1185 // U+D800 U+DFFF 1186 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1187 ConvertUTFResultContainer(sourceIllegal) 1188 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1189 "\xed\xa0\x80\xed\xbf\xbf")); 1190 1191 // U+DB40 U+DC00 1192 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1193 ConvertUTFResultContainer(sourceIllegal) 1194 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1195 "\xed\xac\xa0\xed\xb0\x80")); 1196 1197 // U+DB40 U+DD00 1198 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1199 ConvertUTFResultContainer(sourceIllegal) 1200 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1201 "\xed\xac\xa0\xed\xb4\x80")); 1202 1203 // U+DB40 U+DFFF 1204 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1205 ConvertUTFResultContainer(sourceIllegal) 1206 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1207 "\xed\xac\xa0\xed\xbf\xbf")); 1208 1209 // U+DBFF U+DC00 1210 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1211 ConvertUTFResultContainer(sourceIllegal) 1212 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1213 "\xed\xaf\xbf\xed\xb0\x80")); 1214 1215 // U+DBFF U+DD00 1216 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1217 ConvertUTFResultContainer(sourceIllegal) 1218 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1219 "\xed\xaf\xbf\xed\xb4\x80")); 1220 1221 // U+DBFF U+DFFF 1222 
EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1223 ConvertUTFResultContainer(sourceIllegal) 1224 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1225 "\xed\xaf\xbf\xed\xbf\xbf")); 1226 1227 // 1228 // Noncharacters 1229 // 1230 1231 // Unicode 6.3.0: 1232 // 1233 // D14. Noncharacter: A code point that is permanently reserved for 1234 // internal use and that should never be interchanged. Noncharacters 1235 // consist of the values U+nFFFE and U+nFFFF (where n is from 0 to 1016) 1236 // and the values U+FDD0..U+FDEF. 1237 1238 // U+FFFE 1239 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1240 ConvertUTFResultContainer(conversionOK).withScalars(0xfffe), 1241 "\xef\xbf\xbe")); 1242 1243 // U+FFFF 1244 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1245 ConvertUTFResultContainer(conversionOK).withScalars(0xffff), 1246 "\xef\xbf\xbf")); 1247 1248 // U+1FFFE 1249 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1250 ConvertUTFResultContainer(conversionOK).withScalars(0x1fffe), 1251 "\xf0\x9f\xbf\xbe")); 1252 1253 // U+1FFFF 1254 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1255 ConvertUTFResultContainer(conversionOK).withScalars(0x1ffff), 1256 "\xf0\x9f\xbf\xbf")); 1257 1258 // U+2FFFE 1259 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1260 ConvertUTFResultContainer(conversionOK).withScalars(0x2fffe), 1261 "\xf0\xaf\xbf\xbe")); 1262 1263 // U+2FFFF 1264 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1265 ConvertUTFResultContainer(conversionOK).withScalars(0x2ffff), 1266 "\xf0\xaf\xbf\xbf")); 1267 1268 // U+3FFFE 1269 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1270 ConvertUTFResultContainer(conversionOK).withScalars(0x3fffe), 1271 "\xf0\xbf\xbf\xbe")); 1272 1273 // U+3FFFF 1274 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1275 ConvertUTFResultContainer(conversionOK).withScalars(0x3ffff), 1276 "\xf0\xbf\xbf\xbf")); 1277 1278 // U+4FFFE 1279 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1280 ConvertUTFResultContainer(conversionOK).withScalars(0x4fffe), 1281 
"\xf1\x8f\xbf\xbe")); 1282 1283 // U+4FFFF 1284 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1285 ConvertUTFResultContainer(conversionOK).withScalars(0x4ffff), 1286 "\xf1\x8f\xbf\xbf")); 1287 1288 // U+5FFFE 1289 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1290 ConvertUTFResultContainer(conversionOK).withScalars(0x5fffe), 1291 "\xf1\x9f\xbf\xbe")); 1292 1293 // U+5FFFF 1294 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1295 ConvertUTFResultContainer(conversionOK).withScalars(0x5ffff), 1296 "\xf1\x9f\xbf\xbf")); 1297 1298 // U+6FFFE 1299 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1300 ConvertUTFResultContainer(conversionOK).withScalars(0x6fffe), 1301 "\xf1\xaf\xbf\xbe")); 1302 1303 // U+6FFFF 1304 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1305 ConvertUTFResultContainer(conversionOK).withScalars(0x6ffff), 1306 "\xf1\xaf\xbf\xbf")); 1307 1308 // U+7FFFE 1309 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1310 ConvertUTFResultContainer(conversionOK).withScalars(0x7fffe), 1311 "\xf1\xbf\xbf\xbe")); 1312 1313 // U+7FFFF 1314 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1315 ConvertUTFResultContainer(conversionOK).withScalars(0x7ffff), 1316 "\xf1\xbf\xbf\xbf")); 1317 1318 // U+8FFFE 1319 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1320 ConvertUTFResultContainer(conversionOK).withScalars(0x8fffe), 1321 "\xf2\x8f\xbf\xbe")); 1322 1323 // U+8FFFF 1324 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1325 ConvertUTFResultContainer(conversionOK).withScalars(0x8ffff), 1326 "\xf2\x8f\xbf\xbf")); 1327 1328 // U+9FFFE 1329 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1330 ConvertUTFResultContainer(conversionOK).withScalars(0x9fffe), 1331 "\xf2\x9f\xbf\xbe")); 1332 1333 // U+9FFFF 1334 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1335 ConvertUTFResultContainer(conversionOK).withScalars(0x9ffff), 1336 "\xf2\x9f\xbf\xbf")); 1337 1338 // U+AFFFE 1339 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1340 ConvertUTFResultContainer(conversionOK).withScalars(0xafffe), 1341 
"\xf2\xaf\xbf\xbe")); 1342 1343 // U+AFFFF 1344 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1345 ConvertUTFResultContainer(conversionOK).withScalars(0xaffff), 1346 "\xf2\xaf\xbf\xbf")); 1347 1348 // U+BFFFE 1349 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1350 ConvertUTFResultContainer(conversionOK).withScalars(0xbfffe), 1351 "\xf2\xbf\xbf\xbe")); 1352 1353 // U+BFFFF 1354 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1355 ConvertUTFResultContainer(conversionOK).withScalars(0xbffff), 1356 "\xf2\xbf\xbf\xbf")); 1357 1358 // U+CFFFE 1359 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1360 ConvertUTFResultContainer(conversionOK).withScalars(0xcfffe), 1361 "\xf3\x8f\xbf\xbe")); 1362 1363 // U+CFFFF 1364 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1365 ConvertUTFResultContainer(conversionOK).withScalars(0xcfffF), 1366 "\xf3\x8f\xbf\xbf")); 1367 1368 // U+DFFFE 1369 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1370 ConvertUTFResultContainer(conversionOK).withScalars(0xdfffe), 1371 "\xf3\x9f\xbf\xbe")); 1372 1373 // U+DFFFF 1374 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1375 ConvertUTFResultContainer(conversionOK).withScalars(0xdffff), 1376 "\xf3\x9f\xbf\xbf")); 1377 1378 // U+EFFFE 1379 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1380 ConvertUTFResultContainer(conversionOK).withScalars(0xefffe), 1381 "\xf3\xaf\xbf\xbe")); 1382 1383 // U+EFFFF 1384 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1385 ConvertUTFResultContainer(conversionOK).withScalars(0xeffff), 1386 "\xf3\xaf\xbf\xbf")); 1387 1388 // U+FFFFE 1389 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1390 ConvertUTFResultContainer(conversionOK).withScalars(0xffffe), 1391 "\xf3\xbf\xbf\xbe")); 1392 1393 // U+FFFFF 1394 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1395 ConvertUTFResultContainer(conversionOK).withScalars(0xfffff), 1396 "\xf3\xbf\xbf\xbf")); 1397 1398 // U+10FFFE 1399 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1400 ConvertUTFResultContainer(conversionOK).withScalars(0x10fffe), 1401 
"\xf4\x8f\xbf\xbe")); 1402 1403 // U+10FFFF 1404 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1405 ConvertUTFResultContainer(conversionOK).withScalars(0x10ffff), 1406 "\xf4\x8f\xbf\xbf")); 1407 1408 // U+FDD0 1409 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1410 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd0), 1411 "\xef\xb7\x90")); 1412 1413 // U+FDD1 1414 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1415 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd1), 1416 "\xef\xb7\x91")); 1417 1418 // U+FDD2 1419 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1420 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd2), 1421 "\xef\xb7\x92")); 1422 1423 // U+FDD3 1424 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1425 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd3), 1426 "\xef\xb7\x93")); 1427 1428 // U+FDD4 1429 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1430 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd4), 1431 "\xef\xb7\x94")); 1432 1433 // U+FDD5 1434 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1435 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd5), 1436 "\xef\xb7\x95")); 1437 1438 // U+FDD6 1439 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1440 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd6), 1441 "\xef\xb7\x96")); 1442 1443 // U+FDD7 1444 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1445 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd7), 1446 "\xef\xb7\x97")); 1447 1448 // U+FDD8 1449 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1450 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd8), 1451 "\xef\xb7\x98")); 1452 1453 // U+FDD9 1454 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1455 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd9), 1456 "\xef\xb7\x99")); 1457 1458 // U+FDDA 1459 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1460 ConvertUTFResultContainer(conversionOK).withScalars(0xfdda), 1461 "\xef\xb7\x9a")); 1462 1463 // U+FDDB 1464 
EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1465 ConvertUTFResultContainer(conversionOK).withScalars(0xfddb), 1466 "\xef\xb7\x9b")); 1467 1468 // U+FDDC 1469 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1470 ConvertUTFResultContainer(conversionOK).withScalars(0xfddc), 1471 "\xef\xb7\x9c")); 1472 1473 // U+FDDD 1474 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1475 ConvertUTFResultContainer(conversionOK).withScalars(0xfddd), 1476 "\xef\xb7\x9d")); 1477 1478 // U+FDDE 1479 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1480 ConvertUTFResultContainer(conversionOK).withScalars(0xfdde), 1481 "\xef\xb7\x9e")); 1482 1483 // U+FDDF 1484 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1485 ConvertUTFResultContainer(conversionOK).withScalars(0xfddf), 1486 "\xef\xb7\x9f")); 1487 1488 // U+FDE0 1489 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1490 ConvertUTFResultContainer(conversionOK).withScalars(0xfde0), 1491 "\xef\xb7\xa0")); 1492 1493 // U+FDE1 1494 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1495 ConvertUTFResultContainer(conversionOK).withScalars(0xfde1), 1496 "\xef\xb7\xa1")); 1497 1498 // U+FDE2 1499 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1500 ConvertUTFResultContainer(conversionOK).withScalars(0xfde2), 1501 "\xef\xb7\xa2")); 1502 1503 // U+FDE3 1504 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1505 ConvertUTFResultContainer(conversionOK).withScalars(0xfde3), 1506 "\xef\xb7\xa3")); 1507 1508 // U+FDE4 1509 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1510 ConvertUTFResultContainer(conversionOK).withScalars(0xfde4), 1511 "\xef\xb7\xa4")); 1512 1513 // U+FDE5 1514 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1515 ConvertUTFResultContainer(conversionOK).withScalars(0xfde5), 1516 "\xef\xb7\xa5")); 1517 1518 // U+FDE6 1519 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1520 ConvertUTFResultContainer(conversionOK).withScalars(0xfde6), 1521 "\xef\xb7\xa6")); 1522 1523 // U+FDE7 1524 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1525 
ConvertUTFResultContainer(conversionOK).withScalars(0xfde7), 1526 "\xef\xb7\xa7")); 1527 1528 // U+FDE8 1529 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1530 ConvertUTFResultContainer(conversionOK).withScalars(0xfde8), 1531 "\xef\xb7\xa8")); 1532 1533 // U+FDE9 1534 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1535 ConvertUTFResultContainer(conversionOK).withScalars(0xfde9), 1536 "\xef\xb7\xa9")); 1537 1538 // U+FDEA 1539 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1540 ConvertUTFResultContainer(conversionOK).withScalars(0xfdea), 1541 "\xef\xb7\xaa")); 1542 1543 // U+FDEB 1544 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1545 ConvertUTFResultContainer(conversionOK).withScalars(0xfdeb), 1546 "\xef\xb7\xab")); 1547 1548 // U+FDEC 1549 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1550 ConvertUTFResultContainer(conversionOK).withScalars(0xfdec), 1551 "\xef\xb7\xac")); 1552 1553 // U+FDED 1554 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1555 ConvertUTFResultContainer(conversionOK).withScalars(0xfded), 1556 "\xef\xb7\xad")); 1557 1558 // U+FDEE 1559 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1560 ConvertUTFResultContainer(conversionOK).withScalars(0xfdee), 1561 "\xef\xb7\xae")); 1562 1563 // U+FDEF 1564 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1565 ConvertUTFResultContainer(conversionOK).withScalars(0xfdef), 1566 "\xef\xb7\xaf")); 1567 1568 // U+FDF0 1569 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1570 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf0), 1571 "\xef\xb7\xb0")); 1572 1573 // U+FDF1 1574 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1575 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf1), 1576 "\xef\xb7\xb1")); 1577 1578 // U+FDF2 1579 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1580 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf2), 1581 "\xef\xb7\xb2")); 1582 1583 // U+FDF3 1584 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1585 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf3), 1586 
"\xef\xb7\xb3")); 1587 1588 // U+FDF4 1589 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1590 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf4), 1591 "\xef\xb7\xb4")); 1592 1593 // U+FDF5 1594 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1595 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf5), 1596 "\xef\xb7\xb5")); 1597 1598 // U+FDF6 1599 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1600 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf6), 1601 "\xef\xb7\xb6")); 1602 1603 // U+FDF7 1604 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1605 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf7), 1606 "\xef\xb7\xb7")); 1607 1608 // U+FDF8 1609 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1610 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf8), 1611 "\xef\xb7\xb8")); 1612 1613 // U+FDF9 1614 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1615 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf9), 1616 "\xef\xb7\xb9")); 1617 1618 // U+FDFA 1619 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1620 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfa), 1621 "\xef\xb7\xba")); 1622 1623 // U+FDFB 1624 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1625 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfb), 1626 "\xef\xb7\xbb")); 1627 1628 // U+FDFC 1629 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1630 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfc), 1631 "\xef\xb7\xbc")); 1632 1633 // U+FDFD 1634 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1635 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfd), 1636 "\xef\xb7\xbd")); 1637 1638 // U+FDFE 1639 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1640 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfe), 1641 "\xef\xb7\xbe")); 1642 1643 // U+FDFF 1644 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1645 ConvertUTFResultContainer(conversionOK).withScalars(0xfdff), 1646 "\xef\xb7\xbf")); 1647 } 1648 1649 TEST(ConvertUTFTest, 
UTF8ToUTF32PartialLenient) {
  // Every check in this test passes `true` as the third argument of
  // CheckConvertUTF8ToUnicodeScalars.
  // NOTE(review): going by the test name this presumably selects
  // partial-input conversion; the helper is defined elsewhere in this file,
  // so confirm against its declaration.

  // U+0041 LATIN CAPITAL LETTER A
  // A complete one-byte sequence is expected to convert successfully.
  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
      ConvertUTFResultContainer(conversionOK).withScalars(0x0041), "\x41",
      true));

  //
  // Sequences with one continuation byte missing
  //
  // Each entry stops one byte short of a complete sequence. The expected
  // outcome for all of them is sourceExhausted with no scalars produced.
  static const char *const TruncatedSequences[] = {
      // Lead bytes of 2-byte sequences.
      "\xc2",
      "\xdf",
      // First two bytes of 3-byte sequences.
      "\xe0\xa0",
      "\xe0\xbf",
      "\xe1\x80",
      "\xec\xbf",
      "\xed\x80",
      "\xed\x9f",
      "\xee\x80",
      "\xef\xbf",
      // First three bytes of 4-byte sequences.
      "\xf0\x90\x80",
      "\xf0\xbf\xbf",
      "\xf1\x80\x80",
      "\xf3\xbf\xbf",
      "\xf4\x80\x80",
      "\xf4\x8f\xbf",
  };
  for (const char *Truncated : TruncatedSequences)
    EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
        ConvertUTFResultContainer(sourceExhausted), Truncated, true));

  // A complete scalar followed by a truncated sequence: the scalar decoded
  // before the input ran out (U+0041) is still expected in the output,
  // alongside the sourceExhausted result.
  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
      ConvertUTFResultContainer(sourceExhausted).withScalars(0x0041),
      "\x41\xc2", true));
}