1 //===- llvm/unittest/Support/ConvertUTFTest.cpp - ConvertUTF tests --------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "llvm/Support/ConvertUTF.h" 10 #include "llvm/ADT/ArrayRef.h" 11 #include "gtest/gtest.h" 12 #include <string> 13 #include <vector> 14 15 using namespace llvm; 16 17 TEST(ConvertUTFTest, ConvertUTF16LittleEndianToUTF8String) { 18 // Src is the look of disapproval. 19 static const char Src[] = "\xff\xfe\xa0\x0c_\x00\xa0\x0c"; 20 ArrayRef<char> Ref(Src, sizeof(Src) - 1); 21 std::string Result; 22 bool Success = convertUTF16ToUTF8String(Ref, Result); 23 EXPECT_TRUE(Success); 24 std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0"); 25 EXPECT_EQ(Expected, Result); 26 } 27 28 TEST(ConvertUTFTest, ConvertUTF16BigEndianToUTF8String) { 29 // Src is the look of disapproval. 30 static const char Src[] = "\xfe\xff\x0c\xa0\x00_\x0c\xa0"; 31 ArrayRef<char> Ref(Src, sizeof(Src) - 1); 32 std::string Result; 33 bool Success = convertUTF16ToUTF8String(Ref, Result); 34 EXPECT_TRUE(Success); 35 std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0"); 36 EXPECT_EQ(Expected, Result); 37 } 38 39 TEST(ConvertUTFTest, ConvertUTF8ToUTF16String) { 40 // Src is the look of disapproval. 
41 static const char Src[] = "\xe0\xb2\xa0_\xe0\xb2\xa0"; 42 StringRef Ref(Src, sizeof(Src) - 1); 43 SmallVector<UTF16, 5> Result; 44 bool Success = convertUTF8ToUTF16String(Ref, Result); 45 EXPECT_TRUE(Success); 46 static const UTF16 Expected[] = {0x0CA0, 0x005f, 0x0CA0, 0}; 47 ASSERT_EQ(3u, Result.size()); 48 for (int I = 0, E = 3; I != E; ++I) 49 EXPECT_EQ(Expected[I], Result[I]); 50 } 51 52 TEST(ConvertUTFTest, OddLengthInput) { 53 std::string Result; 54 bool Success = convertUTF16ToUTF8String(makeArrayRef("xxxxx", 5), Result); 55 EXPECT_FALSE(Success); 56 } 57 58 TEST(ConvertUTFTest, Empty) { 59 std::string Result; 60 bool Success = convertUTF16ToUTF8String(llvm::ArrayRef<char>(None), Result); 61 EXPECT_TRUE(Success); 62 EXPECT_TRUE(Result.empty()); 63 } 64 65 TEST(ConvertUTFTest, HasUTF16BOM) { 66 bool HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xff\xfe", 2)); 67 EXPECT_TRUE(HasBOM); 68 HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff", 2)); 69 EXPECT_TRUE(HasBOM); 70 HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff ", 3)); 71 EXPECT_TRUE(HasBOM); // Don't care about odd lengths. 72 HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff\x00asdf", 6)); 73 EXPECT_TRUE(HasBOM); 74 75 HasBOM = hasUTF16ByteOrderMark(None); 76 EXPECT_FALSE(HasBOM); 77 HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe", 1)); 78 EXPECT_FALSE(HasBOM); 79 } 80 81 TEST(ConvertUTFTest, UTF16WrappersForConvertUTF16ToUTF8String) { 82 // Src is the look of disapproval. 83 static const char Src[] = "\xff\xfe\xa0\x0c_\x00\xa0\x0c"; 84 ArrayRef<UTF16> SrcRef = makeArrayRef((const UTF16 *)Src, 4); 85 std::string Result; 86 bool Success = convertUTF16ToUTF8String(SrcRef, Result); 87 EXPECT_TRUE(Success); 88 std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0"); 89 EXPECT_EQ(Expected, Result); 90 } 91 92 TEST(ConvertUTFTest, ConvertUTF8toWide) { 93 // Src is the look of disapproval. 
94 static const char Src[] = "\xe0\xb2\xa0_\xe0\xb2\xa0"; 95 std::wstring Result; 96 bool Success = ConvertUTF8toWide((const char*)Src, Result); 97 EXPECT_TRUE(Success); 98 std::wstring Expected(L"\x0ca0_\x0ca0"); 99 EXPECT_EQ(Expected, Result); 100 Result.clear(); 101 Success = ConvertUTF8toWide(StringRef(Src, 7), Result); 102 EXPECT_TRUE(Success); 103 EXPECT_EQ(Expected, Result); 104 } 105 106 TEST(ConvertUTFTest, convertWideToUTF8) { 107 // Src is the look of disapproval. 108 static const wchar_t Src[] = L"\x0ca0_\x0ca0"; 109 std::string Result; 110 bool Success = convertWideToUTF8(Src, Result); 111 EXPECT_TRUE(Success); 112 std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0"); 113 EXPECT_EQ(Expected, Result); 114 } 115 116 struct ConvertUTFResultContainer { 117 ConversionResult ErrorCode; 118 std::vector<unsigned> UnicodeScalars; 119 120 ConvertUTFResultContainer(ConversionResult ErrorCode) 121 : ErrorCode(ErrorCode) {} 122 123 ConvertUTFResultContainer 124 withScalars(unsigned US0 = 0x110000, unsigned US1 = 0x110000, 125 unsigned US2 = 0x110000, unsigned US3 = 0x110000, 126 unsigned US4 = 0x110000, unsigned US5 = 0x110000, 127 unsigned US6 = 0x110000, unsigned US7 = 0x110000) { 128 ConvertUTFResultContainer Result(*this); 129 if (US0 != 0x110000) 130 Result.UnicodeScalars.push_back(US0); 131 if (US1 != 0x110000) 132 Result.UnicodeScalars.push_back(US1); 133 if (US2 != 0x110000) 134 Result.UnicodeScalars.push_back(US2); 135 if (US3 != 0x110000) 136 Result.UnicodeScalars.push_back(US3); 137 if (US4 != 0x110000) 138 Result.UnicodeScalars.push_back(US4); 139 if (US5 != 0x110000) 140 Result.UnicodeScalars.push_back(US5); 141 if (US6 != 0x110000) 142 Result.UnicodeScalars.push_back(US6); 143 if (US7 != 0x110000) 144 Result.UnicodeScalars.push_back(US7); 145 return Result; 146 } 147 }; 148 149 std::pair<ConversionResult, std::vector<unsigned>> 150 ConvertUTF8ToUnicodeScalarsLenient(StringRef S) { 151 const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data()); 
152 153 const UTF8 *SourceNext = SourceStart; 154 std::vector<UTF32> Decoded(S.size(), 0); 155 UTF32 *TargetStart = Decoded.data(); 156 157 auto ErrorCode = 158 ConvertUTF8toUTF32(&SourceNext, SourceStart + S.size(), &TargetStart, 159 Decoded.data() + Decoded.size(), lenientConversion); 160 161 Decoded.resize(TargetStart - Decoded.data()); 162 163 return std::make_pair(ErrorCode, Decoded); 164 } 165 166 std::pair<ConversionResult, std::vector<unsigned>> 167 ConvertUTF8ToUnicodeScalarsPartialLenient(StringRef S) { 168 const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data()); 169 170 const UTF8 *SourceNext = SourceStart; 171 std::vector<UTF32> Decoded(S.size(), 0); 172 UTF32 *TargetStart = Decoded.data(); 173 174 auto ErrorCode = ConvertUTF8toUTF32Partial( 175 &SourceNext, SourceStart + S.size(), &TargetStart, 176 Decoded.data() + Decoded.size(), lenientConversion); 177 178 Decoded.resize(TargetStart - Decoded.data()); 179 180 return std::make_pair(ErrorCode, Decoded); 181 } 182 183 ::testing::AssertionResult 184 CheckConvertUTF8ToUnicodeScalars(ConvertUTFResultContainer Expected, 185 StringRef S, bool Partial = false) { 186 ConversionResult ErrorCode; 187 std::vector<unsigned> Decoded; 188 if (!Partial) 189 std::tie(ErrorCode, Decoded) = ConvertUTF8ToUnicodeScalarsLenient(S); 190 else 191 std::tie(ErrorCode, Decoded) = ConvertUTF8ToUnicodeScalarsPartialLenient(S); 192 193 if (Expected.ErrorCode != ErrorCode) 194 return ::testing::AssertionFailure() << "Expected error code " 195 << Expected.ErrorCode << ", actual " 196 << ErrorCode; 197 198 if (Expected.UnicodeScalars != Decoded) 199 return ::testing::AssertionFailure() 200 << "Expected lenient decoded result:\n" 201 << ::testing::PrintToString(Expected.UnicodeScalars) << "\n" 202 << "Actual result:\n" << ::testing::PrintToString(Decoded); 203 204 return ::testing::AssertionSuccess(); 205 } 206 207 TEST(ConvertUTFTest, UTF8ToUTF32Lenient) { 208 209 // 210 // 1-byte sequences 211 // 212 213 // U+0041 LATIN 
CAPITAL LETTER A 214 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 215 ConvertUTFResultContainer(conversionOK).withScalars(0x0041), "\x41")); 216 217 // 218 // 2-byte sequences 219 // 220 221 // U+0283 LATIN SMALL LETTER ESH 222 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 223 ConvertUTFResultContainer(conversionOK).withScalars(0x0283), 224 "\xca\x83")); 225 226 // U+03BA GREEK SMALL LETTER KAPPA 227 // U+1F79 GREEK SMALL LETTER OMICRON WITH OXIA 228 // U+03C3 GREEK SMALL LETTER SIGMA 229 // U+03BC GREEK SMALL LETTER MU 230 // U+03B5 GREEK SMALL LETTER EPSILON 231 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 232 ConvertUTFResultContainer(conversionOK) 233 .withScalars(0x03ba, 0x1f79, 0x03c3, 0x03bc, 0x03b5), 234 "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5")); 235 236 // 237 // 3-byte sequences 238 // 239 240 // U+4F8B CJK UNIFIED IDEOGRAPH-4F8B 241 // U+6587 CJK UNIFIED IDEOGRAPH-6587 242 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 243 ConvertUTFResultContainer(conversionOK).withScalars(0x4f8b, 0x6587), 244 "\xe4\xbe\x8b\xe6\x96\x87")); 245 246 // U+D55C HANGUL SYLLABLE HAN 247 // U+AE00 HANGUL SYLLABLE GEUL 248 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 249 ConvertUTFResultContainer(conversionOK).withScalars(0xd55c, 0xae00), 250 "\xed\x95\x9c\xea\xb8\x80")); 251 252 // U+1112 HANGUL CHOSEONG HIEUH 253 // U+1161 HANGUL JUNGSEONG A 254 // U+11AB HANGUL JONGSEONG NIEUN 255 // U+1100 HANGUL CHOSEONG KIYEOK 256 // U+1173 HANGUL JUNGSEONG EU 257 // U+11AF HANGUL JONGSEONG RIEUL 258 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 259 ConvertUTFResultContainer(conversionOK) 260 .withScalars(0x1112, 0x1161, 0x11ab, 0x1100, 0x1173, 0x11af), 261 "\xe1\x84\x92\xe1\x85\xa1\xe1\x86\xab\xe1\x84\x80\xe1\x85\xb3" 262 "\xe1\x86\xaf")); 263 264 // 265 // 4-byte sequences 266 // 267 268 // U+E0100 VARIATION SELECTOR-17 269 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 270 ConvertUTFResultContainer(conversionOK).withScalars(0x000E0100), 271 "\xf3\xa0\x84\x80")); 272 273 // 274 
// First possible sequence of a certain length 275 // 276 277 // U+0000 NULL 278 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 279 ConvertUTFResultContainer(conversionOK).withScalars(0x0000), 280 StringRef("\x00", 1))); 281 282 // U+0080 PADDING CHARACTER 283 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 284 ConvertUTFResultContainer(conversionOK).withScalars(0x0080), 285 "\xc2\x80")); 286 287 // U+0800 SAMARITAN LETTER ALAF 288 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 289 ConvertUTFResultContainer(conversionOK).withScalars(0x0800), 290 "\xe0\xa0\x80")); 291 292 // U+10000 LINEAR B SYLLABLE B008 A 293 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 294 ConvertUTFResultContainer(conversionOK).withScalars(0x10000), 295 "\xf0\x90\x80\x80")); 296 297 // U+200000 (invalid) 298 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 299 ConvertUTFResultContainer(sourceIllegal) 300 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 301 "\xf8\x88\x80\x80\x80")); 302 303 // U+4000000 (invalid) 304 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 305 ConvertUTFResultContainer(sourceIllegal) 306 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 307 "\xfc\x84\x80\x80\x80\x80")); 308 309 // 310 // Last possible sequence of a certain length 311 // 312 313 // U+007F DELETE 314 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 315 ConvertUTFResultContainer(conversionOK).withScalars(0x007f), "\x7f")); 316 317 // U+07FF (unassigned) 318 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 319 ConvertUTFResultContainer(conversionOK).withScalars(0x07ff), 320 "\xdf\xbf")); 321 322 // U+FFFF (noncharacter) 323 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 324 ConvertUTFResultContainer(conversionOK).withScalars(0xffff), 325 "\xef\xbf\xbf")); 326 327 // U+1FFFFF (invalid) 328 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 329 ConvertUTFResultContainer(sourceIllegal) 330 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 331 "\xf7\xbf\xbf\xbf")); 332 333 // U+3FFFFFF (invalid) 334 
EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 335 ConvertUTFResultContainer(sourceIllegal) 336 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 337 "\xfb\xbf\xbf\xbf\xbf")); 338 339 // U+7FFFFFFF (invalid) 340 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 341 ConvertUTFResultContainer(sourceIllegal) 342 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 343 "\xfd\xbf\xbf\xbf\xbf\xbf")); 344 345 // 346 // Other boundary conditions 347 // 348 349 // U+D7FF (unassigned) 350 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 351 ConvertUTFResultContainer(conversionOK).withScalars(0xd7ff), 352 "\xed\x9f\xbf")); 353 354 // U+E000 (private use) 355 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 356 ConvertUTFResultContainer(conversionOK).withScalars(0xe000), 357 "\xee\x80\x80")); 358 359 // U+FFFD REPLACEMENT CHARACTER 360 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 361 ConvertUTFResultContainer(conversionOK).withScalars(0xfffd), 362 "\xef\xbf\xbd")); 363 364 // U+10FFFF (noncharacter) 365 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 366 ConvertUTFResultContainer(conversionOK).withScalars(0x10ffff), 367 "\xf4\x8f\xbf\xbf")); 368 369 // U+110000 (invalid) 370 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 371 ConvertUTFResultContainer(sourceIllegal) 372 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 373 "\xf4\x90\x80\x80")); 374 375 // 376 // Unexpected continuation bytes 377 // 378 379 // A sequence of unexpected continuation bytes that don't follow a first 380 // byte, every byte is a maximal subpart. 
381 382 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 383 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\x80")); 384 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 385 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xbf")); 386 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 387 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 388 "\x80\x80")); 389 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 390 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 391 "\x80\xbf")); 392 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 393 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 394 "\xbf\x80")); 395 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 396 ConvertUTFResultContainer(sourceIllegal) 397 .withScalars(0xfffd, 0xfffd, 0xfffd), 398 "\x80\xbf\x80")); 399 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 400 ConvertUTFResultContainer(sourceIllegal) 401 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 402 "\x80\xbf\x80\xbf")); 403 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 404 ConvertUTFResultContainer(sourceIllegal) 405 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 406 "\x80\xbf\x82\xbf\xaa")); 407 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 408 ConvertUTFResultContainer(sourceIllegal) 409 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 410 "\xaa\xb0\xbb\xbf\xaa\xa0")); 411 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 412 ConvertUTFResultContainer(sourceIllegal) 413 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 414 "\xaa\xb0\xbb\xbf\xaa\xa0\x8f")); 415 416 // All continuation bytes (0x80--0xbf). 
417 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 418 ConvertUTFResultContainer(sourceIllegal) 419 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 420 0xfffd, 0xfffd, 0xfffd, 0xfffd) 421 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 422 0xfffd, 0xfffd, 0xfffd, 0xfffd) 423 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 424 0xfffd, 0xfffd, 0xfffd, 0xfffd) 425 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 426 0xfffd, 0xfffd, 0xfffd, 0xfffd) 427 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 428 0xfffd, 0xfffd, 0xfffd, 0xfffd) 429 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 430 0xfffd, 0xfffd, 0xfffd, 0xfffd) 431 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 432 0xfffd, 0xfffd, 0xfffd, 0xfffd) 433 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 434 0xfffd, 0xfffd, 0xfffd, 0xfffd), 435 "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f" 436 "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f" 437 "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf" 438 "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf")); 439 440 // 441 // Lonely start bytes 442 // 443 444 // Start bytes of 2-byte sequences (0xc0--0xdf). 
445 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 446 ConvertUTFResultContainer(sourceIllegal) 447 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 448 0xfffd, 0xfffd, 0xfffd, 0xfffd) 449 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 450 0xfffd, 0xfffd, 0xfffd, 0xfffd) 451 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 452 0xfffd, 0xfffd, 0xfffd, 0xfffd) 453 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 454 0xfffd, 0xfffd, 0xfffd, 0xfffd), 455 "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf" 456 "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf")); 457 458 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 459 ConvertUTFResultContainer(sourceIllegal) 460 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 461 0xfffd, 0x0020, 0xfffd, 0x0020) 462 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 463 0xfffd, 0x0020, 0xfffd, 0x0020) 464 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 465 0xfffd, 0x0020, 0xfffd, 0x0020) 466 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 467 0xfffd, 0x0020, 0xfffd, 0x0020) 468 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 469 0xfffd, 0x0020, 0xfffd, 0x0020) 470 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 471 0xfffd, 0x0020, 0xfffd, 0x0020) 472 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 473 0xfffd, 0x0020, 0xfffd, 0x0020) 474 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 475 0xfffd, 0x0020, 0xfffd, 0x0020), 476 "\xc0\x20\xc1\x20\xc2\x20\xc3\x20\xc4\x20\xc5\x20\xc6\x20\xc7\x20" 477 "\xc8\x20\xc9\x20\xca\x20\xcb\x20\xcc\x20\xcd\x20\xce\x20\xcf\x20" 478 "\xd0\x20\xd1\x20\xd2\x20\xd3\x20\xd4\x20\xd5\x20\xd6\x20\xd7\x20" 479 "\xd8\x20\xd9\x20\xda\x20\xdb\x20\xdc\x20\xdd\x20\xde\x20\xdf\x20")); 480 481 // Start bytes of 3-byte sequences (0xe0--0xef). 
482 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 483 ConvertUTFResultContainer(sourceIllegal) 484 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 485 0xfffd, 0xfffd, 0xfffd, 0xfffd) 486 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 487 0xfffd, 0xfffd, 0xfffd, 0xfffd), 488 "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef")); 489 490 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 491 ConvertUTFResultContainer(sourceIllegal) 492 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 493 0xfffd, 0x0020, 0xfffd, 0x0020) 494 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 495 0xfffd, 0x0020, 0xfffd, 0x0020) 496 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 497 0xfffd, 0x0020, 0xfffd, 0x0020) 498 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 499 0xfffd, 0x0020, 0xfffd, 0x0020), 500 "\xe0\x20\xe1\x20\xe2\x20\xe3\x20\xe4\x20\xe5\x20\xe6\x20\xe7\x20" 501 "\xe8\x20\xe9\x20\xea\x20\xeb\x20\xec\x20\xed\x20\xee\x20\xef\x20")); 502 503 // Start bytes of 4-byte sequences (0xf0--0xf7). 504 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 505 ConvertUTFResultContainer(sourceIllegal) 506 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 507 0xfffd, 0xfffd, 0xfffd, 0xfffd), 508 "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7")); 509 510 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 511 ConvertUTFResultContainer(sourceIllegal) 512 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 513 0xfffd, 0x0020, 0xfffd, 0x0020) 514 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 515 0xfffd, 0x0020, 0xfffd, 0x0020), 516 "\xf0\x20\xf1\x20\xf2\x20\xf3\x20\xf4\x20\xf5\x20\xf6\x20\xf7\x20")); 517 518 // Start bytes of 5-byte sequences (0xf8--0xfb). 
519 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 520 ConvertUTFResultContainer(sourceIllegal) 521 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 522 "\xf8\xf9\xfa\xfb")); 523 524 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 525 ConvertUTFResultContainer(sourceIllegal) 526 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 527 0xfffd, 0x0020, 0xfffd, 0x0020), 528 "\xf8\x20\xf9\x20\xfa\x20\xfb\x20")); 529 530 // Start bytes of 6-byte sequences (0xfc--0xfd). 531 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 532 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 533 "\xfc\xfd")); 534 535 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 536 ConvertUTFResultContainer(sourceIllegal) 537 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020), 538 "\xfc\x20\xfd\x20")); 539 540 // 541 // Other bytes (0xc0--0xc1, 0xfe--0xff). 542 // 543 544 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 545 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc0")); 546 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 547 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc1")); 548 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 549 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfe")); 550 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 551 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xff")); 552 553 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 554 ConvertUTFResultContainer(sourceIllegal) 555 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 556 "\xc0\xc1\xfe\xff")); 557 558 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 559 ConvertUTFResultContainer(sourceIllegal) 560 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 561 "\xfe\xfe\xff\xff")); 562 563 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 564 ConvertUTFResultContainer(sourceIllegal) 565 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 566 "\xfe\x80\x80\x80\x80\x80")); 567 568 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 569 ConvertUTFResultContainer(sourceIllegal) 570 
.withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 571 "\xff\x80\x80\x80\x80\x80")); 572 573 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 574 ConvertUTFResultContainer(sourceIllegal) 575 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 576 0xfffd, 0x0020, 0xfffd, 0x0020), 577 "\xc0\x20\xc1\x20\xfe\x20\xff\x20")); 578 579 // 580 // Sequences with one continuation byte missing 581 // 582 583 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 584 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc2")); 585 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 586 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xdf")); 587 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 588 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 589 "\xe0\xa0")); 590 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 591 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 592 "\xe0\xbf")); 593 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 594 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 595 "\xe1\x80")); 596 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 597 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 598 "\xec\xbf")); 599 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 600 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 601 "\xed\x80")); 602 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 603 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 604 "\xed\x9f")); 605 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 606 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 607 "\xee\x80")); 608 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 609 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 610 "\xef\xbf")); 611 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 612 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 613 "\xf0\x90\x80")); 614 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 615 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 616 "\xf0\xbf\xbf")); 
617 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 618 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 619 "\xf1\x80\x80")); 620 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 621 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 622 "\xf3\xbf\xbf")); 623 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 624 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 625 "\xf4\x80\x80")); 626 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 627 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 628 "\xf4\x8f\xbf")); 629 630 // Overlong sequences with one trailing byte missing. 631 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 632 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 633 "\xc0")); 634 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 635 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 636 "\xc1")); 637 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 638 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 639 "\xe0\x80")); 640 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 641 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 642 "\xe0\x9f")); 643 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 644 ConvertUTFResultContainer(sourceIllegal) 645 .withScalars(0xfffd, 0xfffd, 0xfffd), 646 "\xf0\x80\x80")); 647 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 648 ConvertUTFResultContainer(sourceIllegal) 649 .withScalars(0xfffd, 0xfffd, 0xfffd), 650 "\xf0\x8f\x80")); 651 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 652 ConvertUTFResultContainer(sourceIllegal) 653 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 654 "\xf8\x80\x80\x80")); 655 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 656 ConvertUTFResultContainer(sourceIllegal) 657 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 658 "\xfc\x80\x80\x80\x80")); 659 660 // Sequences that represent surrogates with one trailing byte missing. 
661 // High surrogates 662 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 663 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 664 "\xed\xa0")); 665 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 666 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 667 "\xed\xac")); 668 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 669 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 670 "\xed\xaf")); 671 // Low surrogates 672 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 673 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 674 "\xed\xb0")); 675 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 676 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 677 "\xed\xb4")); 678 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 679 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 680 "\xed\xbf")); 681 682 // Ill-formed 4-byte sequences. 683 // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx 684 // U+1100xx (invalid) 685 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 686 ConvertUTFResultContainer(sourceIllegal) 687 .withScalars(0xfffd, 0xfffd, 0xfffd), 688 "\xf4\x90\x80")); 689 // U+13FBxx (invalid) 690 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 691 ConvertUTFResultContainer(sourceIllegal) 692 .withScalars(0xfffd, 0xfffd, 0xfffd), 693 "\xf4\xbf\xbf")); 694 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 695 ConvertUTFResultContainer(sourceIllegal) 696 .withScalars(0xfffd, 0xfffd, 0xfffd), 697 "\xf5\x80\x80")); 698 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 699 ConvertUTFResultContainer(sourceIllegal) 700 .withScalars(0xfffd, 0xfffd, 0xfffd), 701 "\xf6\x80\x80")); 702 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 703 ConvertUTFResultContainer(sourceIllegal) 704 .withScalars(0xfffd, 0xfffd, 0xfffd), 705 "\xf7\x80\x80")); 706 // U+1FFBxx (invalid) 707 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 708 ConvertUTFResultContainer(sourceIllegal) 709 .withScalars(0xfffd, 0xfffd, 0xfffd), 710 
"\xf7\xbf\xbf")); 711 712 // Ill-formed 5-byte sequences. 713 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx 714 // U+2000xx (invalid) 715 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 716 ConvertUTFResultContainer(sourceIllegal) 717 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 718 "\xf8\x88\x80\x80")); 719 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 720 ConvertUTFResultContainer(sourceIllegal) 721 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 722 "\xf8\xbf\xbf\xbf")); 723 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 724 ConvertUTFResultContainer(sourceIllegal) 725 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 726 "\xf9\x80\x80\x80")); 727 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 728 ConvertUTFResultContainer(sourceIllegal) 729 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 730 "\xfa\x80\x80\x80")); 731 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 732 ConvertUTFResultContainer(sourceIllegal) 733 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 734 "\xfb\x80\x80\x80")); 735 // U+3FFFFxx (invalid) 736 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 737 ConvertUTFResultContainer(sourceIllegal) 738 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 739 "\xfb\xbf\xbf\xbf")); 740 741 // Ill-formed 6-byte sequences. 
742 // 1111110u 10uuuuuu 10uzzzzz 10zzzyyyy 10yyyyxx 10xxxxxx 743 // U+40000xx (invalid) 744 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 745 ConvertUTFResultContainer(sourceIllegal) 746 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 747 "\xfc\x84\x80\x80\x80")); 748 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 749 ConvertUTFResultContainer(sourceIllegal) 750 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 751 "\xfc\xbf\xbf\xbf\xbf")); 752 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 753 ConvertUTFResultContainer(sourceIllegal) 754 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 755 "\xfd\x80\x80\x80\x80")); 756 // U+7FFFFFxx (invalid) 757 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 758 ConvertUTFResultContainer(sourceIllegal) 759 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 760 "\xfd\xbf\xbf\xbf\xbf")); 761 762 // 763 // Sequences with two continuation bytes missing 764 // 765 766 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 767 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 768 "\xf0\x90")); 769 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 770 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 771 "\xf0\xbf")); 772 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 773 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 774 "\xf1\x80")); 775 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 776 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 777 "\xf3\xbf")); 778 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 779 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 780 "\xf4\x80")); 781 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 782 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 783 "\xf4\x8f")); 784 785 // Overlong sequences with two trailing byte missing. 
786 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 787 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xe0")); 788 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 789 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 790 "\xf0\x80")); 791 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 792 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 793 "\xf0\x8f")); 794 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 795 ConvertUTFResultContainer(sourceIllegal) 796 .withScalars(0xfffd, 0xfffd, 0xfffd), 797 "\xf8\x80\x80")); 798 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 799 ConvertUTFResultContainer(sourceIllegal) 800 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 801 "\xfc\x80\x80\x80")); 802 803 // Sequences that represent surrogates with two trailing bytes missing. 804 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 805 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xed")); 806 807 // Ill-formed 4-byte sequences. 808 // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx 809 // U+110yxx (invalid) 810 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 811 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 812 "\xf4\x90")); 813 // U+13Fyxx (invalid) 814 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 815 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 816 "\xf4\xbf")); 817 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 818 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 819 "\xf5\x80")); 820 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 821 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 822 "\xf6\x80")); 823 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 824 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 825 "\xf7\x80")); 826 // U+1FFyxx (invalid) 827 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 828 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 829 "\xf7\xbf")); 830 831 // Ill-formed 5-byte sequences. 
832 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx 833 // U+200yxx (invalid) 834 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 835 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 836 "\xf8\x88\x80")); 837 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 838 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 839 "\xf8\xbf\xbf")); 840 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 841 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 842 "\xf9\x80\x80")); 843 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 844 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 845 "\xfa\x80\x80")); 846 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 847 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 848 "\xfb\x80\x80")); 849 // U+3FFFyxx (invalid) 850 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 851 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 852 "\xfb\xbf\xbf")); 853 854 // Ill-formed 6-byte sequences. 
855 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx 856 // U+4000yxx (invalid) 857 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 858 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 859 "\xfc\x84\x80\x80")); 860 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 861 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 862 "\xfc\xbf\xbf\xbf")); 863 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 864 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 865 "\xfd\x80\x80\x80")); 866 // U+7FFFFyxx (invalid) 867 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 868 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 869 "\xfd\xbf\xbf\xbf")); 870 871 // 872 // Sequences with three continuation bytes missing 873 // 874 875 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 876 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf0")); 877 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 878 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf1")); 879 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 880 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf2")); 881 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 882 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf3")); 883 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 884 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf4")); 885 886 // Broken overlong sequences. 887 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 888 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf0")); 889 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 890 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 891 "\xf8\x80")); 892 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 893 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 894 "\xfc\x80\x80")); 895 896 // Ill-formed 4-byte sequences. 
897 // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx 898 // U+14yyxx (invalid) 899 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 900 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf5")); 901 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 902 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf6")); 903 // U+1Cyyxx (invalid) 904 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 905 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf7")); 906 907 // Ill-formed 5-byte sequences. 908 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx 909 // U+20yyxx (invalid) 910 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 911 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 912 "\xf8\x88")); 913 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 914 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 915 "\xf8\xbf")); 916 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 917 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 918 "\xf9\x80")); 919 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 920 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 921 "\xfa\x80")); 922 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 923 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 924 "\xfb\x80")); 925 // U+3FCyyxx (invalid) 926 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 927 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 928 "\xfb\xbf")); 929 930 // Ill-formed 6-byte sequences. 
931 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx 932 // U+400yyxx (invalid) 933 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 934 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 935 "\xfc\x84\x80")); 936 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 937 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 938 "\xfc\xbf\xbf")); 939 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 940 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 941 "\xfd\x80\x80")); 942 // U+7FFCyyxx (invalid) 943 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 944 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 945 "\xfd\xbf\xbf")); 946 947 // 948 // Sequences with four continuation bytes missing 949 // 950 951 // Ill-formed 5-byte sequences. 952 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx 953 // U+uzyyxx (invalid) 954 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 955 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf8")); 956 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 957 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf9")); 958 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 959 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfa")); 960 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 961 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfb")); 962 // U+3zyyxx (invalid) 963 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 964 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfb")); 965 966 // Broken overlong sequences. 967 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 968 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf8")); 969 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 970 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 971 "\xfc\x80")); 972 973 // Ill-formed 6-byte sequences. 
974 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx 975 // U+uzzyyxx (invalid) 976 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 977 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 978 "\xfc\x84")); 979 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 980 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 981 "\xfc\xbf")); 982 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 983 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 984 "\xfd\x80")); 985 // U+7Fzzyyxx (invalid) 986 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 987 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 988 "\xfd\xbf")); 989 990 // 991 // Sequences with five continuation bytes missing 992 // 993 994 // Ill-formed 6-byte sequences. 995 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx 996 // U+uzzyyxx (invalid) 997 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 998 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfc")); 999 // U+uuzzyyxx (invalid) 1000 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1001 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfd")); 1002 1003 // 1004 // Consecutive sequences with trailing bytes missing 1005 // 1006 1007 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1008 ConvertUTFResultContainer(sourceIllegal) 1009 .withScalars(0xfffd, /**/ 0xfffd, 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd) 1010 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd) 1011 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd) 1012 .withScalars(0xfffd, /**/ 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd) 1013 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd) 1014 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1015 "\xc0" "\xe0\x80" "\xf0\x80\x80" 1016 "\xf8\x80\x80\x80" 1017 "\xfc\x80\x80\x80\x80" 1018 "\xdf" "\xef\xbf" "\xf7\xbf\xbf" 1019 "\xfb\xbf\xbf\xbf" 1020 "\xfd\xbf\xbf\xbf\xbf")); 1021 1022 // 1023 // Overlong UTF-8 sequences 1024 // 1025 1026 // U+002F SOLIDUS 1027 
EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1028 ConvertUTFResultContainer(conversionOK).withScalars(0x002f), "\x2f")); 1029 1030 // Overlong sequences of the above. 1031 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1032 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 1033 "\xc0\xaf")); 1034 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1035 ConvertUTFResultContainer(sourceIllegal) 1036 .withScalars(0xfffd, 0xfffd, 0xfffd), 1037 "\xe0\x80\xaf")); 1038 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1039 ConvertUTFResultContainer(sourceIllegal) 1040 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 1041 "\xf0\x80\x80\xaf")); 1042 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1043 ConvertUTFResultContainer(sourceIllegal) 1044 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1045 "\xf8\x80\x80\x80\xaf")); 1046 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1047 ConvertUTFResultContainer(sourceIllegal) 1048 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1049 "\xfc\x80\x80\x80\x80\xaf")); 1050 1051 // U+0000 NULL 1052 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1053 ConvertUTFResultContainer(conversionOK).withScalars(0x0000), 1054 StringRef("\x00", 1))); 1055 1056 // Overlong sequences of the above. 
1057 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1058 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 1059 "\xc0\x80")); 1060 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1061 ConvertUTFResultContainer(sourceIllegal) 1062 .withScalars(0xfffd, 0xfffd, 0xfffd), 1063 "\xe0\x80\x80")); 1064 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1065 ConvertUTFResultContainer(sourceIllegal) 1066 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 1067 "\xf0\x80\x80\x80")); 1068 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1069 ConvertUTFResultContainer(sourceIllegal) 1070 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1071 "\xf8\x80\x80\x80\x80")); 1072 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1073 ConvertUTFResultContainer(sourceIllegal) 1074 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1075 "\xfc\x80\x80\x80\x80\x80")); 1076 1077 // Other overlong sequences. 1078 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1079 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 1080 "\xc0\xbf")); 1081 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1082 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 1083 "\xc1\x80")); 1084 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1085 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 1086 "\xc1\xbf")); 1087 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1088 ConvertUTFResultContainer(sourceIllegal) 1089 .withScalars(0xfffd, 0xfffd, 0xfffd), 1090 "\xe0\x9f\xbf")); 1091 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1092 ConvertUTFResultContainer(sourceIllegal) 1093 .withScalars(0xfffd, 0xfffd, 0xfffd), 1094 "\xed\xa0\x80")); 1095 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1096 ConvertUTFResultContainer(sourceIllegal) 1097 .withScalars(0xfffd, 0xfffd, 0xfffd), 1098 "\xed\xbf\xbf")); 1099 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1100 ConvertUTFResultContainer(sourceIllegal) 1101 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 1102 
"\xf0\x8f\x80\x80")); 1103 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1104 ConvertUTFResultContainer(sourceIllegal) 1105 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 1106 "\xf0\x8f\xbf\xbf")); 1107 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1108 ConvertUTFResultContainer(sourceIllegal) 1109 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1110 "\xf8\x87\xbf\xbf\xbf")); 1111 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1112 ConvertUTFResultContainer(sourceIllegal) 1113 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1114 "\xfc\x83\xbf\xbf\xbf\xbf")); 1115 1116 // 1117 // Isolated surrogates 1118 // 1119 1120 // Unicode 6.3.0: 1121 // 1122 // D71. High-surrogate code point: A Unicode code point in the range 1123 // U+D800 to U+DBFF. 1124 // 1125 // D73. Low-surrogate code point: A Unicode code point in the range 1126 // U+DC00 to U+DFFF. 1127 1128 // Note: U+E0100 is <DB40 DD00> in UTF16. 1129 1130 // High surrogates 1131 1132 // U+D800 1133 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1134 ConvertUTFResultContainer(sourceIllegal) 1135 .withScalars(0xfffd, 0xfffd, 0xfffd), 1136 "\xed\xa0\x80")); 1137 1138 // U+DB40 1139 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1140 ConvertUTFResultContainer(sourceIllegal) 1141 .withScalars(0xfffd, 0xfffd, 0xfffd), 1142 "\xed\xac\xa0")); 1143 1144 // U+DBFF 1145 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1146 ConvertUTFResultContainer(sourceIllegal) 1147 .withScalars(0xfffd, 0xfffd, 0xfffd), 1148 "\xed\xaf\xbf")); 1149 1150 // Low surrogates 1151 1152 // U+DC00 1153 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1154 ConvertUTFResultContainer(sourceIllegal) 1155 .withScalars(0xfffd, 0xfffd, 0xfffd), 1156 "\xed\xb0\x80")); 1157 1158 // U+DD00 1159 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1160 ConvertUTFResultContainer(sourceIllegal) 1161 .withScalars(0xfffd, 0xfffd, 0xfffd), 1162 "\xed\xb4\x80")); 1163 1164 // U+DFFF 1165 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1166 
ConvertUTFResultContainer(sourceIllegal) 1167 .withScalars(0xfffd, 0xfffd, 0xfffd), 1168 "\xed\xbf\xbf")); 1169 1170 // Surrogate pairs 1171 1172 // U+D800 U+DC00 1173 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1174 ConvertUTFResultContainer(sourceIllegal) 1175 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1176 "\xed\xa0\x80\xed\xb0\x80")); 1177 1178 // U+D800 U+DD00 1179 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1180 ConvertUTFResultContainer(sourceIllegal) 1181 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1182 "\xed\xa0\x80\xed\xb4\x80")); 1183 1184 // U+D800 U+DFFF 1185 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1186 ConvertUTFResultContainer(sourceIllegal) 1187 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1188 "\xed\xa0\x80\xed\xbf\xbf")); 1189 1190 // U+DB40 U+DC00 1191 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1192 ConvertUTFResultContainer(sourceIllegal) 1193 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1194 "\xed\xac\xa0\xed\xb0\x80")); 1195 1196 // U+DB40 U+DD00 1197 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1198 ConvertUTFResultContainer(sourceIllegal) 1199 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1200 "\xed\xac\xa0\xed\xb4\x80")); 1201 1202 // U+DB40 U+DFFF 1203 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1204 ConvertUTFResultContainer(sourceIllegal) 1205 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1206 "\xed\xac\xa0\xed\xbf\xbf")); 1207 1208 // U+DBFF U+DC00 1209 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1210 ConvertUTFResultContainer(sourceIllegal) 1211 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1212 "\xed\xaf\xbf\xed\xb0\x80")); 1213 1214 // U+DBFF U+DD00 1215 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1216 ConvertUTFResultContainer(sourceIllegal) 1217 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1218 "\xed\xaf\xbf\xed\xb4\x80")); 1219 1220 // U+DBFF U+DFFF 1221 
EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1222 ConvertUTFResultContainer(sourceIllegal) 1223 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1224 "\xed\xaf\xbf\xed\xbf\xbf")); 1225 1226 // 1227 // Noncharacters 1228 // 1229 1230 // Unicode 6.3.0: 1231 // 1232 // D14. Noncharacter: A code point that is permanently reserved for 1233 // internal use and that should never be interchanged. Noncharacters 1234 // consist of the values U+nFFFE and U+nFFFF (where n is from 0 to 1016) 1235 // and the values U+FDD0..U+FDEF. 1236 1237 // U+FFFE 1238 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1239 ConvertUTFResultContainer(conversionOK).withScalars(0xfffe), 1240 "\xef\xbf\xbe")); 1241 1242 // U+FFFF 1243 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1244 ConvertUTFResultContainer(conversionOK).withScalars(0xffff), 1245 "\xef\xbf\xbf")); 1246 1247 // U+1FFFE 1248 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1249 ConvertUTFResultContainer(conversionOK).withScalars(0x1fffe), 1250 "\xf0\x9f\xbf\xbe")); 1251 1252 // U+1FFFF 1253 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1254 ConvertUTFResultContainer(conversionOK).withScalars(0x1ffff), 1255 "\xf0\x9f\xbf\xbf")); 1256 1257 // U+2FFFE 1258 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1259 ConvertUTFResultContainer(conversionOK).withScalars(0x2fffe), 1260 "\xf0\xaf\xbf\xbe")); 1261 1262 // U+2FFFF 1263 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1264 ConvertUTFResultContainer(conversionOK).withScalars(0x2ffff), 1265 "\xf0\xaf\xbf\xbf")); 1266 1267 // U+3FFFE 1268 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1269 ConvertUTFResultContainer(conversionOK).withScalars(0x3fffe), 1270 "\xf0\xbf\xbf\xbe")); 1271 1272 // U+3FFFF 1273 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1274 ConvertUTFResultContainer(conversionOK).withScalars(0x3ffff), 1275 "\xf0\xbf\xbf\xbf")); 1276 1277 // U+4FFFE 1278 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1279 ConvertUTFResultContainer(conversionOK).withScalars(0x4fffe), 1280 
"\xf1\x8f\xbf\xbe")); 1281 1282 // U+4FFFF 1283 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1284 ConvertUTFResultContainer(conversionOK).withScalars(0x4ffff), 1285 "\xf1\x8f\xbf\xbf")); 1286 1287 // U+5FFFE 1288 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1289 ConvertUTFResultContainer(conversionOK).withScalars(0x5fffe), 1290 "\xf1\x9f\xbf\xbe")); 1291 1292 // U+5FFFF 1293 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1294 ConvertUTFResultContainer(conversionOK).withScalars(0x5ffff), 1295 "\xf1\x9f\xbf\xbf")); 1296 1297 // U+6FFFE 1298 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1299 ConvertUTFResultContainer(conversionOK).withScalars(0x6fffe), 1300 "\xf1\xaf\xbf\xbe")); 1301 1302 // U+6FFFF 1303 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1304 ConvertUTFResultContainer(conversionOK).withScalars(0x6ffff), 1305 "\xf1\xaf\xbf\xbf")); 1306 1307 // U+7FFFE 1308 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1309 ConvertUTFResultContainer(conversionOK).withScalars(0x7fffe), 1310 "\xf1\xbf\xbf\xbe")); 1311 1312 // U+7FFFF 1313 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1314 ConvertUTFResultContainer(conversionOK).withScalars(0x7ffff), 1315 "\xf1\xbf\xbf\xbf")); 1316 1317 // U+8FFFE 1318 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1319 ConvertUTFResultContainer(conversionOK).withScalars(0x8fffe), 1320 "\xf2\x8f\xbf\xbe")); 1321 1322 // U+8FFFF 1323 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1324 ConvertUTFResultContainer(conversionOK).withScalars(0x8ffff), 1325 "\xf2\x8f\xbf\xbf")); 1326 1327 // U+9FFFE 1328 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1329 ConvertUTFResultContainer(conversionOK).withScalars(0x9fffe), 1330 "\xf2\x9f\xbf\xbe")); 1331 1332 // U+9FFFF 1333 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1334 ConvertUTFResultContainer(conversionOK).withScalars(0x9ffff), 1335 "\xf2\x9f\xbf\xbf")); 1336 1337 // U+AFFFE 1338 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1339 ConvertUTFResultContainer(conversionOK).withScalars(0xafffe), 1340 
"\xf2\xaf\xbf\xbe")); 1341 1342 // U+AFFFF 1343 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1344 ConvertUTFResultContainer(conversionOK).withScalars(0xaffff), 1345 "\xf2\xaf\xbf\xbf")); 1346 1347 // U+BFFFE 1348 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1349 ConvertUTFResultContainer(conversionOK).withScalars(0xbfffe), 1350 "\xf2\xbf\xbf\xbe")); 1351 1352 // U+BFFFF 1353 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1354 ConvertUTFResultContainer(conversionOK).withScalars(0xbffff), 1355 "\xf2\xbf\xbf\xbf")); 1356 1357 // U+CFFFE 1358 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1359 ConvertUTFResultContainer(conversionOK).withScalars(0xcfffe), 1360 "\xf3\x8f\xbf\xbe")); 1361 1362 // U+CFFFF 1363 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1364 ConvertUTFResultContainer(conversionOK).withScalars(0xcfffF), 1365 "\xf3\x8f\xbf\xbf")); 1366 1367 // U+DFFFE 1368 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1369 ConvertUTFResultContainer(conversionOK).withScalars(0xdfffe), 1370 "\xf3\x9f\xbf\xbe")); 1371 1372 // U+DFFFF 1373 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1374 ConvertUTFResultContainer(conversionOK).withScalars(0xdffff), 1375 "\xf3\x9f\xbf\xbf")); 1376 1377 // U+EFFFE 1378 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1379 ConvertUTFResultContainer(conversionOK).withScalars(0xefffe), 1380 "\xf3\xaf\xbf\xbe")); 1381 1382 // U+EFFFF 1383 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1384 ConvertUTFResultContainer(conversionOK).withScalars(0xeffff), 1385 "\xf3\xaf\xbf\xbf")); 1386 1387 // U+FFFFE 1388 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1389 ConvertUTFResultContainer(conversionOK).withScalars(0xffffe), 1390 "\xf3\xbf\xbf\xbe")); 1391 1392 // U+FFFFF 1393 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1394 ConvertUTFResultContainer(conversionOK).withScalars(0xfffff), 1395 "\xf3\xbf\xbf\xbf")); 1396 1397 // U+10FFFE 1398 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1399 ConvertUTFResultContainer(conversionOK).withScalars(0x10fffe), 1400 
"\xf4\x8f\xbf\xbe")); 1401 1402 // U+10FFFF 1403 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1404 ConvertUTFResultContainer(conversionOK).withScalars(0x10ffff), 1405 "\xf4\x8f\xbf\xbf")); 1406 1407 // U+FDD0 1408 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1409 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd0), 1410 "\xef\xb7\x90")); 1411 1412 // U+FDD1 1413 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1414 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd1), 1415 "\xef\xb7\x91")); 1416 1417 // U+FDD2 1418 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1419 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd2), 1420 "\xef\xb7\x92")); 1421 1422 // U+FDD3 1423 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1424 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd3), 1425 "\xef\xb7\x93")); 1426 1427 // U+FDD4 1428 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1429 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd4), 1430 "\xef\xb7\x94")); 1431 1432 // U+FDD5 1433 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1434 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd5), 1435 "\xef\xb7\x95")); 1436 1437 // U+FDD6 1438 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1439 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd6), 1440 "\xef\xb7\x96")); 1441 1442 // U+FDD7 1443 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1444 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd7), 1445 "\xef\xb7\x97")); 1446 1447 // U+FDD8 1448 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1449 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd8), 1450 "\xef\xb7\x98")); 1451 1452 // U+FDD9 1453 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1454 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd9), 1455 "\xef\xb7\x99")); 1456 1457 // U+FDDA 1458 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1459 ConvertUTFResultContainer(conversionOK).withScalars(0xfdda), 1460 "\xef\xb7\x9a")); 1461 1462 // U+FDDB 1463 
EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1464 ConvertUTFResultContainer(conversionOK).withScalars(0xfddb), 1465 "\xef\xb7\x9b")); 1466 1467 // U+FDDC 1468 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1469 ConvertUTFResultContainer(conversionOK).withScalars(0xfddc), 1470 "\xef\xb7\x9c")); 1471 1472 // U+FDDD 1473 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1474 ConvertUTFResultContainer(conversionOK).withScalars(0xfddd), 1475 "\xef\xb7\x9d")); 1476 1477 // U+FDDE 1478 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1479 ConvertUTFResultContainer(conversionOK).withScalars(0xfdde), 1480 "\xef\xb7\x9e")); 1481 1482 // U+FDDF 1483 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1484 ConvertUTFResultContainer(conversionOK).withScalars(0xfddf), 1485 "\xef\xb7\x9f")); 1486 1487 // U+FDE0 1488 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1489 ConvertUTFResultContainer(conversionOK).withScalars(0xfde0), 1490 "\xef\xb7\xa0")); 1491 1492 // U+FDE1 1493 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1494 ConvertUTFResultContainer(conversionOK).withScalars(0xfde1), 1495 "\xef\xb7\xa1")); 1496 1497 // U+FDE2 1498 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1499 ConvertUTFResultContainer(conversionOK).withScalars(0xfde2), 1500 "\xef\xb7\xa2")); 1501 1502 // U+FDE3 1503 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1504 ConvertUTFResultContainer(conversionOK).withScalars(0xfde3), 1505 "\xef\xb7\xa3")); 1506 1507 // U+FDE4 1508 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1509 ConvertUTFResultContainer(conversionOK).withScalars(0xfde4), 1510 "\xef\xb7\xa4")); 1511 1512 // U+FDE5 1513 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1514 ConvertUTFResultContainer(conversionOK).withScalars(0xfde5), 1515 "\xef\xb7\xa5")); 1516 1517 // U+FDE6 1518 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1519 ConvertUTFResultContainer(conversionOK).withScalars(0xfde6), 1520 "\xef\xb7\xa6")); 1521 1522 // U+FDE7 1523 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1524 
ConvertUTFResultContainer(conversionOK).withScalars(0xfde7), 1525 "\xef\xb7\xa7")); 1526 1527 // U+FDE8 1528 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1529 ConvertUTFResultContainer(conversionOK).withScalars(0xfde8), 1530 "\xef\xb7\xa8")); 1531 1532 // U+FDE9 1533 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1534 ConvertUTFResultContainer(conversionOK).withScalars(0xfde9), 1535 "\xef\xb7\xa9")); 1536 1537 // U+FDEA 1538 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1539 ConvertUTFResultContainer(conversionOK).withScalars(0xfdea), 1540 "\xef\xb7\xaa")); 1541 1542 // U+FDEB 1543 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1544 ConvertUTFResultContainer(conversionOK).withScalars(0xfdeb), 1545 "\xef\xb7\xab")); 1546 1547 // U+FDEC 1548 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1549 ConvertUTFResultContainer(conversionOK).withScalars(0xfdec), 1550 "\xef\xb7\xac")); 1551 1552 // U+FDED 1553 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1554 ConvertUTFResultContainer(conversionOK).withScalars(0xfded), 1555 "\xef\xb7\xad")); 1556 1557 // U+FDEE 1558 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1559 ConvertUTFResultContainer(conversionOK).withScalars(0xfdee), 1560 "\xef\xb7\xae")); 1561 1562 // U+FDEF 1563 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1564 ConvertUTFResultContainer(conversionOK).withScalars(0xfdef), 1565 "\xef\xb7\xaf")); 1566 1567 // U+FDF0 1568 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1569 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf0), 1570 "\xef\xb7\xb0")); 1571 1572 // U+FDF1 1573 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1574 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf1), 1575 "\xef\xb7\xb1")); 1576 1577 // U+FDF2 1578 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1579 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf2), 1580 "\xef\xb7\xb2")); 1581 1582 // U+FDF3 1583 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1584 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf3), 1585 
"\xef\xb7\xb3")); 1586 1587 // U+FDF4 1588 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1589 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf4), 1590 "\xef\xb7\xb4")); 1591 1592 // U+FDF5 1593 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1594 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf5), 1595 "\xef\xb7\xb5")); 1596 1597 // U+FDF6 1598 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1599 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf6), 1600 "\xef\xb7\xb6")); 1601 1602 // U+FDF7 1603 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1604 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf7), 1605 "\xef\xb7\xb7")); 1606 1607 // U+FDF8 1608 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1609 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf8), 1610 "\xef\xb7\xb8")); 1611 1612 // U+FDF9 1613 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1614 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf9), 1615 "\xef\xb7\xb9")); 1616 1617 // U+FDFA 1618 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1619 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfa), 1620 "\xef\xb7\xba")); 1621 1622 // U+FDFB 1623 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1624 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfb), 1625 "\xef\xb7\xbb")); 1626 1627 // U+FDFC 1628 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1629 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfc), 1630 "\xef\xb7\xbc")); 1631 1632 // U+FDFD 1633 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1634 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfd), 1635 "\xef\xb7\xbd")); 1636 1637 // U+FDFE 1638 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1639 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfe), 1640 "\xef\xb7\xbe")); 1641 1642 // U+FDFF 1643 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1644 ConvertUTFResultContainer(conversionOK).withScalars(0xfdff), 1645 "\xef\xb7\xbf")); 1646 } 1647 1648 TEST(ConvertUTFTest, 
UTF8ToUTF32PartialLenient) {
  // Every check in this test passes `true` as the third argument of
  // CheckConvertUTF8ToUnicodeScalars.
  // NOTE(review): that flag presumably selects partial-input decoding, where
  // an input ending in the middle of a sequence is reported as
  // sourceExhausted -- confirm against the helper's declaration.

  // Runs the helper on Input with the flag set, expecting sourceExhausted and
  // an expectation container that has no scalars attached.
  auto ExpectExhausted = [](StringRef Input) {
    return CheckConvertUTF8ToUnicodeScalars(
        ConvertUTFResultContainer(sourceExhausted), Input, true);
  };

  // Baseline: a complete one-byte sequence, U+0041 LATIN CAPITAL LETTER A,
  // converts successfully.
  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
      ConvertUTFResultContainer(conversionOK).withScalars(0x0041), "\x41",
      true));

  //
  // Inputs that are exactly one continuation byte short of a full sequence
  //

  // Two-byte lead bytes with nothing after them.
  EXPECT_TRUE(ExpectExhausted("\xc2"));
  EXPECT_TRUE(ExpectExhausted("\xdf"));

  // Three-byte lead bytes followed by a single continuation byte.
  EXPECT_TRUE(ExpectExhausted("\xe0\xa0"));
  EXPECT_TRUE(ExpectExhausted("\xe0\xbf"));
  EXPECT_TRUE(ExpectExhausted("\xe1\x80"));
  EXPECT_TRUE(ExpectExhausted("\xec\xbf"));
  EXPECT_TRUE(ExpectExhausted("\xed\x80"));
  EXPECT_TRUE(ExpectExhausted("\xed\x9f"));
  EXPECT_TRUE(ExpectExhausted("\xee\x80"));
  EXPECT_TRUE(ExpectExhausted("\xef\xbf"));

  // Four-byte lead bytes followed by two continuation bytes.
  EXPECT_TRUE(ExpectExhausted("\xf0\x90\x80"));
  EXPECT_TRUE(ExpectExhausted("\xf0\xbf\xbf"));
  EXPECT_TRUE(ExpectExhausted("\xf1\x80\x80"));
  EXPECT_TRUE(ExpectExhausted("\xf3\xbf\xbf"));
  EXPECT_TRUE(ExpectExhausted("\xf4\x80\x80"));
  EXPECT_TRUE(ExpectExhausted("\xf4\x8f\xbf"));

  // A complete character followed by a truncated two-byte sequence: the
  // expectation carries U+0041 together with sourceExhausted.
  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
      ConvertUTFResultContainer(sourceExhausted).withScalars(0x0041),
      "\x41\xc2", true));
}
