1 //===- llvm/unittest/Support/ConvertUTFTest.cpp - ConvertUTF tests --------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8
9 #include "llvm/Support/ConvertUTF.h"
10 #include "llvm/ADT/ArrayRef.h"
11 #include "gtest/gtest.h"
12 #include <string>
13 #include <vector>
14
15 using namespace llvm;
16
TEST(ConvertUTFTest, ConvertUTF16LittleEndianToUTF8String) {
  // Input bytes: a little-endian BOM (FF FE) followed by the code units
  // 0x0CA0, '_' and 0x0CA0 in little-endian order (the "look of
  // disapproval"). The buffer is aligned as UTF16 since it holds UTF-16 data.
  alignas(UTF16) static const char Utf16LE[] = "\xff\xfe\xa0\x0c_\x00\xa0\x0c";
  const std::string WantUTF8("\xe0\xb2\xa0_\xe0\xb2\xa0");

  // sizeof - 1 excludes the string literal's implicit NUL terminator.
  ArrayRef<char> InputBytes(Utf16LE, sizeof(Utf16LE) - 1);
  std::string GotUTF8;
  const bool Converted = convertUTF16ToUTF8String(InputBytes, GotUTF8);

  EXPECT_TRUE(Converted);
  EXPECT_EQ(WantUTF8, GotUTF8);
}
27
TEST(ConvertUTFTest, ConvertUTF32LittleEndianToUTF8String) {
  // Input bytes: a little-endian UTF-32 BOM (FF FE 00 00) followed by the
  // 4-byte units 0x0CA0, 0x5F ('_') and 0x0CA0 in little-endian order (the
  // "look of disapproval").
  alignas(UTF32) static const char Utf32LE[] =
      "\xFF\xFE\x00\x00\xA0\x0C\x00\x00\x5F\x00\x00\x00\xA0\x0C\x00\x00";
  const std::string WantUTF8("\xE0\xB2\xA0_\xE0\xB2\xA0");

  // sizeof - 1 excludes the string literal's implicit NUL terminator.
  ArrayRef<char> InputBytes(Utf32LE, sizeof(Utf32LE) - 1);
  std::string GotUTF8;
  const bool Converted = convertUTF32ToUTF8String(InputBytes, GotUTF8);

  EXPECT_TRUE(Converted);
  EXPECT_EQ(WantUTF8, GotUTF8);
}
39
TEST(ConvertUTFTest, ConvertUTF16BigEndianToUTF8String) {
  // Input bytes: a big-endian BOM (FE FF) followed by the code units 0x0CA0,
  // '_' and 0x0CA0 in big-endian order (the "look of disapproval").
  alignas(UTF16) static const char Utf16BE[] = "\xfe\xff\x0c\xa0\x00_\x0c\xa0";
  const std::string WantUTF8("\xe0\xb2\xa0_\xe0\xb2\xa0");

  // sizeof - 1 excludes the string literal's implicit NUL terminator.
  ArrayRef<char> InputBytes(Utf16BE, sizeof(Utf16BE) - 1);
  std::string GotUTF8;
  const bool Converted = convertUTF16ToUTF8String(InputBytes, GotUTF8);

  EXPECT_TRUE(Converted);
  EXPECT_EQ(WantUTF8, GotUTF8);
}
50
TEST(ConvertUTFTest, ConvertUTF32BigEndianToUTF8String) {
  // Input bytes: a big-endian UTF-32 BOM (00 00 FE FF) followed by the 4-byte
  // units 0x0CA0, 0x5F ('_') and 0x0CA0 in big-endian order (the "look of
  // disapproval").
  alignas(UTF32) static const char Utf32BE[] =
      "\x00\x00\xFE\xFF\x00\x00\x0C\xA0\x00\x00\x00\x5F\x00\x00\x0C\xA0";
  const std::string WantUTF8("\xE0\xB2\xA0_\xE0\xB2\xA0");

  // sizeof - 1 excludes the string literal's implicit NUL terminator.
  ArrayRef<char> InputBytes(Utf32BE, sizeof(Utf32BE) - 1);
  std::string GotUTF8;
  const bool Converted = convertUTF32ToUTF8String(InputBytes, GotUTF8);

  EXPECT_TRUE(Converted);
  EXPECT_EQ(WantUTF8, GotUTF8);
}
62
TEST(ConvertUTFTest, ConvertUTF8ToUTF16String) {
  // UTF-8 input for the "look of disapproval" and the three UTF-16 code units
  // it is expected to produce. The trailing 0 in the expected array is never
  // compared: only the first three elements are checked below.
  static const char Utf8Input[] = "\xe0\xb2\xa0_\xe0\xb2\xa0";
  static const UTF16 WantUnits[] = {0x0CA0, 0x005f, 0x0CA0, 0};

  // sizeof - 1 excludes the string literal's implicit NUL terminator.
  StringRef InputRef(Utf8Input, sizeof(Utf8Input) - 1);
  SmallVector<UTF16, 5> GotUnits;
  const bool Converted = convertUTF8ToUTF16String(InputRef, GotUnits);
  EXPECT_TRUE(Converted);

  // Abort the test on a size mismatch so the element loop cannot read out of
  // range.
  ASSERT_EQ(3u, GotUnits.size());
  for (unsigned Idx = 0; Idx < 3u; ++Idx)
    EXPECT_EQ(WantUnits[Idx], GotUnits[Idx]);
}
75
TEST(ConvertUTFTest, OddLengthInput) {
  // A 5-byte buffer is not a whole number of 2-byte UTF-16 code units; the
  // conversion is expected to report failure.
  ArrayRef<char> FiveBytes = makeArrayRef("xxxxx", 5);
  std::string Converted;
  const bool Ok = convertUTF16ToUTF8String(FiveBytes, Converted);
  EXPECT_FALSE(Ok);
}
81
TEST(ConvertUTFTest, Empty) {
  // Converting an empty byte range is expected to succeed and leave the
  // output string empty.
  const llvm::ArrayRef<char> NoBytes(None);
  std::string Converted;
  const bool Ok = convertUTF16ToUTF8String(NoBytes, Converted);
  EXPECT_TRUE(Ok);
  EXPECT_TRUE(Converted.empty());
}
88
TEST(ConvertUTFTest, HasUTF16BOM) {
  // Inputs that start with a byte order mark, in either byte order.
  EXPECT_TRUE(hasUTF16ByteOrderMark(makeArrayRef("\xff\xfe", 2)));
  EXPECT_TRUE(hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff", 2)));
  // An odd total length does not matter for BOM detection.
  EXPECT_TRUE(hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff ", 3)));
  // Additional bytes after the mark do not matter either.
  EXPECT_TRUE(hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff\x00asdf", 6)));

  // Inputs shorter than two bytes are expected to report no mark.
  EXPECT_FALSE(hasUTF16ByteOrderMark(None));
  EXPECT_FALSE(hasUTF16ByteOrderMark(makeArrayRef("\xfe", 1)));
}
104
TEST(ConvertUTFTest, UTF16WrappersForConvertUTF16ToUTF8String) {
  // Same little-endian "look of disapproval" bytes as the byte-oriented test,
  // but handed to the conversion as an ArrayRef of UTF16 code units.
  alignas(UTF16) static const char Utf16LE[] = "\xff\xfe\xa0\x0c_\x00\xa0\x0c";
  const std::string WantUTF8("\xe0\xb2\xa0_\xe0\xb2\xa0");

  // The 8 payload bytes are viewed as 4 code units: the BOM plus three
  // characters. The buffer is declared alignas(UTF16) for this reinterpreting
  // view.
  const UTF16 *FirstUnit = reinterpret_cast<const UTF16 *>(Utf16LE);
  ArrayRef<UTF16> Units = makeArrayRef(FirstUnit, 4);

  std::string GotUTF8;
  const bool Converted = convertUTF16ToUTF8String(Units, GotUTF8);
  EXPECT_TRUE(Converted);
  EXPECT_EQ(WantUTF8, GotUTF8);
}
115
TEST(ConvertUTFTest, ConvertUTF8toWide) {
  // UTF-8 input for the "look of disapproval" and the wide string both calls
  // below are expected to produce.
  static const char Utf8Input[] = "\xe0\xb2\xa0_\xe0\xb2\xa0";
  const std::wstring WantWide(L"\x0ca0_\x0ca0");

  // First call: pass a plain `const char *` (no explicit length).
  {
    std::wstring GotWide;
    const bool Converted =
        ConvertUTF8toWide(static_cast<const char *>(Utf8Input), GotWide);
    EXPECT_TRUE(Converted);
    EXPECT_EQ(WantWide, GotWide);
  }

  // Second call: pass a StringRef covering the 7 bytes of the input.
  {
    std::wstring GotWide;
    const bool Converted = ConvertUTF8toWide(StringRef(Utf8Input, 7), GotWide);
    EXPECT_TRUE(Converted);
    EXPECT_EQ(WantWide, GotWide);
  }
}
129
TEST(ConvertUTFTest, convertWideToUTF8) {
  // Wide-character "look of disapproval" and the UTF-8 bytes it is expected
  // to convert to.
  static const wchar_t WideInput[] = L"\x0ca0_\x0ca0";
  const std::string WantUTF8("\xe0\xb2\xa0_\xe0\xb2\xa0");

  std::string GotUTF8;
  const bool Converted = convertWideToUTF8(WideInput, GotUTF8);
  EXPECT_TRUE(Converted);
  EXPECT_EQ(WantUTF8, GotUTF8);
}
139
140 struct ConvertUTFResultContainer {
141 ConversionResult ErrorCode;
142 std::vector<unsigned> UnicodeScalars;
143
ConvertUTFResultContainerConvertUTFResultContainer144 ConvertUTFResultContainer(ConversionResult ErrorCode)
145 : ErrorCode(ErrorCode) {}
146
147 ConvertUTFResultContainer
withScalarsConvertUTFResultContainer148 withScalars(unsigned US0 = 0x110000, unsigned US1 = 0x110000,
149 unsigned US2 = 0x110000, unsigned US3 = 0x110000,
150 unsigned US4 = 0x110000, unsigned US5 = 0x110000,
151 unsigned US6 = 0x110000, unsigned US7 = 0x110000) {
152 ConvertUTFResultContainer Result(*this);
153 if (US0 != 0x110000)
154 Result.UnicodeScalars.push_back(US0);
155 if (US1 != 0x110000)
156 Result.UnicodeScalars.push_back(US1);
157 if (US2 != 0x110000)
158 Result.UnicodeScalars.push_back(US2);
159 if (US3 != 0x110000)
160 Result.UnicodeScalars.push_back(US3);
161 if (US4 != 0x110000)
162 Result.UnicodeScalars.push_back(US4);
163 if (US5 != 0x110000)
164 Result.UnicodeScalars.push_back(US5);
165 if (US6 != 0x110000)
166 Result.UnicodeScalars.push_back(US6);
167 if (US7 != 0x110000)
168 Result.UnicodeScalars.push_back(US7);
169 return Result;
170 }
171 };
172
173 std::pair<ConversionResult, std::vector<unsigned>>
ConvertUTF8ToUnicodeScalarsLenient(StringRef S)174 ConvertUTF8ToUnicodeScalarsLenient(StringRef S) {
175 const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data());
176
177 const UTF8 *SourceNext = SourceStart;
178 std::vector<UTF32> Decoded(S.size(), 0);
179 UTF32 *TargetStart = Decoded.data();
180
181 auto ErrorCode =
182 ConvertUTF8toUTF32(&SourceNext, SourceStart + S.size(), &TargetStart,
183 Decoded.data() + Decoded.size(), lenientConversion);
184
185 Decoded.resize(TargetStart - Decoded.data());
186
187 return std::make_pair(ErrorCode, Decoded);
188 }
189
190 std::pair<ConversionResult, std::vector<unsigned>>
ConvertUTF8ToUnicodeScalarsPartialLenient(StringRef S)191 ConvertUTF8ToUnicodeScalarsPartialLenient(StringRef S) {
192 const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data());
193
194 const UTF8 *SourceNext = SourceStart;
195 std::vector<UTF32> Decoded(S.size(), 0);
196 UTF32 *TargetStart = Decoded.data();
197
198 auto ErrorCode = ConvertUTF8toUTF32Partial(
199 &SourceNext, SourceStart + S.size(), &TargetStart,
200 Decoded.data() + Decoded.size(), lenientConversion);
201
202 Decoded.resize(TargetStart - Decoded.data());
203
204 return std::make_pair(ErrorCode, Decoded);
205 }
206
207 ::testing::AssertionResult
CheckConvertUTF8ToUnicodeScalars(ConvertUTFResultContainer Expected,StringRef S,bool Partial=false)208 CheckConvertUTF8ToUnicodeScalars(ConvertUTFResultContainer Expected,
209 StringRef S, bool Partial = false) {
210 ConversionResult ErrorCode;
211 std::vector<unsigned> Decoded;
212 if (!Partial)
213 std::tie(ErrorCode, Decoded) = ConvertUTF8ToUnicodeScalarsLenient(S);
214 else
215 std::tie(ErrorCode, Decoded) = ConvertUTF8ToUnicodeScalarsPartialLenient(S);
216
217 if (Expected.ErrorCode != ErrorCode)
218 return ::testing::AssertionFailure() << "Expected error code "
219 << Expected.ErrorCode << ", actual "
220 << ErrorCode;
221
222 if (Expected.UnicodeScalars != Decoded)
223 return ::testing::AssertionFailure()
224 << "Expected lenient decoded result:\n"
225 << ::testing::PrintToString(Expected.UnicodeScalars) << "\n"
226 << "Actual result:\n" << ::testing::PrintToString(Decoded);
227
228 return ::testing::AssertionSuccess();
229 }
230
TEST(ConvertUTFTest,UTF8ToUTF32Lenient)231 TEST(ConvertUTFTest, UTF8ToUTF32Lenient) {
232
233 //
234 // 1-byte sequences
235 //
236
237 // U+0041 LATIN CAPITAL LETTER A
238 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
239 ConvertUTFResultContainer(conversionOK).withScalars(0x0041), "\x41"));
240
241 //
242 // 2-byte sequences
243 //
244
245 // U+0283 LATIN SMALL LETTER ESH
246 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
247 ConvertUTFResultContainer(conversionOK).withScalars(0x0283),
248 "\xca\x83"));
249
250 // U+03BA GREEK SMALL LETTER KAPPA
251 // U+1F79 GREEK SMALL LETTER OMICRON WITH OXIA
252 // U+03C3 GREEK SMALL LETTER SIGMA
253 // U+03BC GREEK SMALL LETTER MU
254 // U+03B5 GREEK SMALL LETTER EPSILON
255 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
256 ConvertUTFResultContainer(conversionOK)
257 .withScalars(0x03ba, 0x1f79, 0x03c3, 0x03bc, 0x03b5),
258 "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5"));
259
260 //
261 // 3-byte sequences
262 //
263
264 // U+4F8B CJK UNIFIED IDEOGRAPH-4F8B
265 // U+6587 CJK UNIFIED IDEOGRAPH-6587
266 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
267 ConvertUTFResultContainer(conversionOK).withScalars(0x4f8b, 0x6587),
268 "\xe4\xbe\x8b\xe6\x96\x87"));
269
270 // U+D55C HANGUL SYLLABLE HAN
271 // U+AE00 HANGUL SYLLABLE GEUL
272 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
273 ConvertUTFResultContainer(conversionOK).withScalars(0xd55c, 0xae00),
274 "\xed\x95\x9c\xea\xb8\x80"));
275
276 // U+1112 HANGUL CHOSEONG HIEUH
277 // U+1161 HANGUL JUNGSEONG A
278 // U+11AB HANGUL JONGSEONG NIEUN
279 // U+1100 HANGUL CHOSEONG KIYEOK
280 // U+1173 HANGUL JUNGSEONG EU
281 // U+11AF HANGUL JONGSEONG RIEUL
282 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
283 ConvertUTFResultContainer(conversionOK)
284 .withScalars(0x1112, 0x1161, 0x11ab, 0x1100, 0x1173, 0x11af),
285 "\xe1\x84\x92\xe1\x85\xa1\xe1\x86\xab\xe1\x84\x80\xe1\x85\xb3"
286 "\xe1\x86\xaf"));
287
288 //
289 // 4-byte sequences
290 //
291
292 // U+E0100 VARIATION SELECTOR-17
293 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
294 ConvertUTFResultContainer(conversionOK).withScalars(0x000E0100),
295 "\xf3\xa0\x84\x80"));
296
297 //
298 // First possible sequence of a certain length
299 //
300
301 // U+0000 NULL
302 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
303 ConvertUTFResultContainer(conversionOK).withScalars(0x0000),
304 StringRef("\x00", 1)));
305
306 // U+0080 PADDING CHARACTER
307 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
308 ConvertUTFResultContainer(conversionOK).withScalars(0x0080),
309 "\xc2\x80"));
310
311 // U+0800 SAMARITAN LETTER ALAF
312 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
313 ConvertUTFResultContainer(conversionOK).withScalars(0x0800),
314 "\xe0\xa0\x80"));
315
316 // U+10000 LINEAR B SYLLABLE B008 A
317 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
318 ConvertUTFResultContainer(conversionOK).withScalars(0x10000),
319 "\xf0\x90\x80\x80"));
320
321 // U+200000 (invalid)
322 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
323 ConvertUTFResultContainer(sourceIllegal)
324 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
325 "\xf8\x88\x80\x80\x80"));
326
327 // U+4000000 (invalid)
328 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
329 ConvertUTFResultContainer(sourceIllegal)
330 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
331 "\xfc\x84\x80\x80\x80\x80"));
332
333 //
334 // Last possible sequence of a certain length
335 //
336
337 // U+007F DELETE
338 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
339 ConvertUTFResultContainer(conversionOK).withScalars(0x007f), "\x7f"));
340
341 // U+07FF (unassigned)
342 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
343 ConvertUTFResultContainer(conversionOK).withScalars(0x07ff),
344 "\xdf\xbf"));
345
346 // U+FFFF (noncharacter)
347 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
348 ConvertUTFResultContainer(conversionOK).withScalars(0xffff),
349 "\xef\xbf\xbf"));
350
351 // U+1FFFFF (invalid)
352 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
353 ConvertUTFResultContainer(sourceIllegal)
354 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
355 "\xf7\xbf\xbf\xbf"));
356
357 // U+3FFFFFF (invalid)
358 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
359 ConvertUTFResultContainer(sourceIllegal)
360 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
361 "\xfb\xbf\xbf\xbf\xbf"));
362
363 // U+7FFFFFFF (invalid)
364 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
365 ConvertUTFResultContainer(sourceIllegal)
366 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
367 "\xfd\xbf\xbf\xbf\xbf\xbf"));
368
369 //
370 // Other boundary conditions
371 //
372
373 // U+D7FF (unassigned)
374 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
375 ConvertUTFResultContainer(conversionOK).withScalars(0xd7ff),
376 "\xed\x9f\xbf"));
377
378 // U+E000 (private use)
379 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
380 ConvertUTFResultContainer(conversionOK).withScalars(0xe000),
381 "\xee\x80\x80"));
382
383 // U+FFFD REPLACEMENT CHARACTER
384 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
385 ConvertUTFResultContainer(conversionOK).withScalars(0xfffd),
386 "\xef\xbf\xbd"));
387
388 // U+10FFFF (noncharacter)
389 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
390 ConvertUTFResultContainer(conversionOK).withScalars(0x10ffff),
391 "\xf4\x8f\xbf\xbf"));
392
393 // U+110000 (invalid)
394 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
395 ConvertUTFResultContainer(sourceIllegal)
396 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
397 "\xf4\x90\x80\x80"));
398
399 //
400 // Unexpected continuation bytes
401 //
402
403 // A sequence of unexpected continuation bytes that don't follow a first
404 // byte, every byte is a maximal subpart.
405
406 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
407 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\x80"));
408 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
409 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xbf"));
410 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
411 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
412 "\x80\x80"));
413 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
414 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
415 "\x80\xbf"));
416 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
417 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
418 "\xbf\x80"));
419 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
420 ConvertUTFResultContainer(sourceIllegal)
421 .withScalars(0xfffd, 0xfffd, 0xfffd),
422 "\x80\xbf\x80"));
423 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
424 ConvertUTFResultContainer(sourceIllegal)
425 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
426 "\x80\xbf\x80\xbf"));
427 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
428 ConvertUTFResultContainer(sourceIllegal)
429 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
430 "\x80\xbf\x82\xbf\xaa"));
431 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
432 ConvertUTFResultContainer(sourceIllegal)
433 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
434 "\xaa\xb0\xbb\xbf\xaa\xa0"));
435 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
436 ConvertUTFResultContainer(sourceIllegal)
437 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
438 "\xaa\xb0\xbb\xbf\xaa\xa0\x8f"));
439
440 // All continuation bytes (0x80--0xbf).
441 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
442 ConvertUTFResultContainer(sourceIllegal)
443 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
444 0xfffd, 0xfffd, 0xfffd, 0xfffd)
445 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
446 0xfffd, 0xfffd, 0xfffd, 0xfffd)
447 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
448 0xfffd, 0xfffd, 0xfffd, 0xfffd)
449 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
450 0xfffd, 0xfffd, 0xfffd, 0xfffd)
451 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
452 0xfffd, 0xfffd, 0xfffd, 0xfffd)
453 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
454 0xfffd, 0xfffd, 0xfffd, 0xfffd)
455 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
456 0xfffd, 0xfffd, 0xfffd, 0xfffd)
457 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
458 0xfffd, 0xfffd, 0xfffd, 0xfffd),
459 "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
460 "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
461 "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf"
462 "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"));
463
464 //
465 // Lonely start bytes
466 //
467
468 // Start bytes of 2-byte sequences (0xc0--0xdf).
469 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
470 ConvertUTFResultContainer(sourceIllegal)
471 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
472 0xfffd, 0xfffd, 0xfffd, 0xfffd)
473 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
474 0xfffd, 0xfffd, 0xfffd, 0xfffd)
475 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
476 0xfffd, 0xfffd, 0xfffd, 0xfffd)
477 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
478 0xfffd, 0xfffd, 0xfffd, 0xfffd),
479 "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf"
480 "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf"));
481
482 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
483 ConvertUTFResultContainer(sourceIllegal)
484 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
485 0xfffd, 0x0020, 0xfffd, 0x0020)
486 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
487 0xfffd, 0x0020, 0xfffd, 0x0020)
488 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
489 0xfffd, 0x0020, 0xfffd, 0x0020)
490 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
491 0xfffd, 0x0020, 0xfffd, 0x0020)
492 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
493 0xfffd, 0x0020, 0xfffd, 0x0020)
494 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
495 0xfffd, 0x0020, 0xfffd, 0x0020)
496 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
497 0xfffd, 0x0020, 0xfffd, 0x0020)
498 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
499 0xfffd, 0x0020, 0xfffd, 0x0020),
500 "\xc0\x20\xc1\x20\xc2\x20\xc3\x20\xc4\x20\xc5\x20\xc6\x20\xc7\x20"
501 "\xc8\x20\xc9\x20\xca\x20\xcb\x20\xcc\x20\xcd\x20\xce\x20\xcf\x20"
502 "\xd0\x20\xd1\x20\xd2\x20\xd3\x20\xd4\x20\xd5\x20\xd6\x20\xd7\x20"
503 "\xd8\x20\xd9\x20\xda\x20\xdb\x20\xdc\x20\xdd\x20\xde\x20\xdf\x20"));
504
505 // Start bytes of 3-byte sequences (0xe0--0xef).
506 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
507 ConvertUTFResultContainer(sourceIllegal)
508 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
509 0xfffd, 0xfffd, 0xfffd, 0xfffd)
510 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
511 0xfffd, 0xfffd, 0xfffd, 0xfffd),
512 "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"));
513
514 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
515 ConvertUTFResultContainer(sourceIllegal)
516 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
517 0xfffd, 0x0020, 0xfffd, 0x0020)
518 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
519 0xfffd, 0x0020, 0xfffd, 0x0020)
520 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
521 0xfffd, 0x0020, 0xfffd, 0x0020)
522 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
523 0xfffd, 0x0020, 0xfffd, 0x0020),
524 "\xe0\x20\xe1\x20\xe2\x20\xe3\x20\xe4\x20\xe5\x20\xe6\x20\xe7\x20"
525 "\xe8\x20\xe9\x20\xea\x20\xeb\x20\xec\x20\xed\x20\xee\x20\xef\x20"));
526
527 // Start bytes of 4-byte sequences (0xf0--0xf7).
528 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
529 ConvertUTFResultContainer(sourceIllegal)
530 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
531 0xfffd, 0xfffd, 0xfffd, 0xfffd),
532 "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7"));
533
534 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
535 ConvertUTFResultContainer(sourceIllegal)
536 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
537 0xfffd, 0x0020, 0xfffd, 0x0020)
538 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
539 0xfffd, 0x0020, 0xfffd, 0x0020),
540 "\xf0\x20\xf1\x20\xf2\x20\xf3\x20\xf4\x20\xf5\x20\xf6\x20\xf7\x20"));
541
542 // Start bytes of 5-byte sequences (0xf8--0xfb).
543 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
544 ConvertUTFResultContainer(sourceIllegal)
545 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
546 "\xf8\xf9\xfa\xfb"));
547
548 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
549 ConvertUTFResultContainer(sourceIllegal)
550 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
551 0xfffd, 0x0020, 0xfffd, 0x0020),
552 "\xf8\x20\xf9\x20\xfa\x20\xfb\x20"));
553
554 // Start bytes of 6-byte sequences (0xfc--0xfd).
555 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
556 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
557 "\xfc\xfd"));
558
559 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
560 ConvertUTFResultContainer(sourceIllegal)
561 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020),
562 "\xfc\x20\xfd\x20"));
563
564 //
565 // Other bytes (0xc0--0xc1, 0xfe--0xff).
566 //
567
568 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
569 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc0"));
570 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
571 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc1"));
572 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
573 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfe"));
574 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
575 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xff"));
576
577 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
578 ConvertUTFResultContainer(sourceIllegal)
579 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
580 "\xc0\xc1\xfe\xff"));
581
582 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
583 ConvertUTFResultContainer(sourceIllegal)
584 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
585 "\xfe\xfe\xff\xff"));
586
587 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
588 ConvertUTFResultContainer(sourceIllegal)
589 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
590 "\xfe\x80\x80\x80\x80\x80"));
591
592 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
593 ConvertUTFResultContainer(sourceIllegal)
594 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
595 "\xff\x80\x80\x80\x80\x80"));
596
597 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
598 ConvertUTFResultContainer(sourceIllegal)
599 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
600 0xfffd, 0x0020, 0xfffd, 0x0020),
601 "\xc0\x20\xc1\x20\xfe\x20\xff\x20"));
602
603 //
604 // Sequences with one continuation byte missing
605 //
606
607 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
608 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc2"));
609 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
610 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xdf"));
611 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
612 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
613 "\xe0\xa0"));
614 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
615 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
616 "\xe0\xbf"));
617 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
618 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
619 "\xe1\x80"));
620 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
621 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
622 "\xec\xbf"));
623 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
624 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
625 "\xed\x80"));
626 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
627 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
628 "\xed\x9f"));
629 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
630 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
631 "\xee\x80"));
632 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
633 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
634 "\xef\xbf"));
635 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
636 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
637 "\xf0\x90\x80"));
638 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
639 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
640 "\xf0\xbf\xbf"));
641 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
642 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
643 "\xf1\x80\x80"));
644 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
645 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
646 "\xf3\xbf\xbf"));
647 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
648 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
649 "\xf4\x80\x80"));
650 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
651 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
652 "\xf4\x8f\xbf"));
653
654 // Overlong sequences with one trailing byte missing.
655 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
656 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
657 "\xc0"));
658 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
659 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
660 "\xc1"));
661 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
662 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
663 "\xe0\x80"));
664 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
665 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
666 "\xe0\x9f"));
667 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
668 ConvertUTFResultContainer(sourceIllegal)
669 .withScalars(0xfffd, 0xfffd, 0xfffd),
670 "\xf0\x80\x80"));
671 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
672 ConvertUTFResultContainer(sourceIllegal)
673 .withScalars(0xfffd, 0xfffd, 0xfffd),
674 "\xf0\x8f\x80"));
675 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
676 ConvertUTFResultContainer(sourceIllegal)
677 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
678 "\xf8\x80\x80\x80"));
679 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
680 ConvertUTFResultContainer(sourceIllegal)
681 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
682 "\xfc\x80\x80\x80\x80"));
683
684 // Sequences that represent surrogates with one trailing byte missing.
685 // High surrogates
686 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
687 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
688 "\xed\xa0"));
689 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
690 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
691 "\xed\xac"));
692 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
693 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
694 "\xed\xaf"));
695 // Low surrogates
696 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
697 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
698 "\xed\xb0"));
699 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
700 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
701 "\xed\xb4"));
702 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
703 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
704 "\xed\xbf"));
705
706 // Ill-formed 4-byte sequences.
707 // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
708 // U+1100xx (invalid)
709 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
710 ConvertUTFResultContainer(sourceIllegal)
711 .withScalars(0xfffd, 0xfffd, 0xfffd),
712 "\xf4\x90\x80"));
713 // U+13FBxx (invalid)
714 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
715 ConvertUTFResultContainer(sourceIllegal)
716 .withScalars(0xfffd, 0xfffd, 0xfffd),
717 "\xf4\xbf\xbf"));
718 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
719 ConvertUTFResultContainer(sourceIllegal)
720 .withScalars(0xfffd, 0xfffd, 0xfffd),
721 "\xf5\x80\x80"));
722 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
723 ConvertUTFResultContainer(sourceIllegal)
724 .withScalars(0xfffd, 0xfffd, 0xfffd),
725 "\xf6\x80\x80"));
726 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
727 ConvertUTFResultContainer(sourceIllegal)
728 .withScalars(0xfffd, 0xfffd, 0xfffd),
729 "\xf7\x80\x80"));
730 // U+1FFBxx (invalid)
731 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
732 ConvertUTFResultContainer(sourceIllegal)
733 .withScalars(0xfffd, 0xfffd, 0xfffd),
734 "\xf7\xbf\xbf"));
735
736 // Ill-formed 5-byte sequences.
737 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
738 // U+2000xx (invalid)
739 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
740 ConvertUTFResultContainer(sourceIllegal)
741 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
742 "\xf8\x88\x80\x80"));
743 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
744 ConvertUTFResultContainer(sourceIllegal)
745 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
746 "\xf8\xbf\xbf\xbf"));
747 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
748 ConvertUTFResultContainer(sourceIllegal)
749 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
750 "\xf9\x80\x80\x80"));
751 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
752 ConvertUTFResultContainer(sourceIllegal)
753 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
754 "\xfa\x80\x80\x80"));
755 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
756 ConvertUTFResultContainer(sourceIllegal)
757 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
758 "\xfb\x80\x80\x80"));
759 // U+3FFFFxx (invalid)
760 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
761 ConvertUTFResultContainer(sourceIllegal)
762 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
763 "\xfb\xbf\xbf\xbf"));
764
765 // Ill-formed 6-byte sequences.
766 // 1111110u 10uuuuuu 10uzzzzz 10zzzyyyy 10yyyyxx 10xxxxxx
767 // U+40000xx (invalid)
768 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
769 ConvertUTFResultContainer(sourceIllegal)
770 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
771 "\xfc\x84\x80\x80\x80"));
772 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
773 ConvertUTFResultContainer(sourceIllegal)
774 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
775 "\xfc\xbf\xbf\xbf\xbf"));
776 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
777 ConvertUTFResultContainer(sourceIllegal)
778 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
779 "\xfd\x80\x80\x80\x80"));
780 // U+7FFFFFxx (invalid)
781 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
782 ConvertUTFResultContainer(sourceIllegal)
783 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
784 "\xfd\xbf\xbf\xbf\xbf"));
785
786 //
787 // Sequences with two continuation bytes missing
788 //
789
790 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
791 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
792 "\xf0\x90"));
793 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
794 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
795 "\xf0\xbf"));
796 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
797 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
798 "\xf1\x80"));
799 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
800 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
801 "\xf3\xbf"));
802 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
803 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
804 "\xf4\x80"));
805 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
806 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
807 "\xf4\x8f"));
808
  // Overlong sequences with two trailing bytes missing.
810 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
811 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xe0"));
812 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
813 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
814 "\xf0\x80"));
815 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
816 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
817 "\xf0\x8f"));
818 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
819 ConvertUTFResultContainer(sourceIllegal)
820 .withScalars(0xfffd, 0xfffd, 0xfffd),
821 "\xf8\x80\x80"));
822 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
823 ConvertUTFResultContainer(sourceIllegal)
824 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
825 "\xfc\x80\x80\x80"));
826
827 // Sequences that represent surrogates with two trailing bytes missing.
828 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
829 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xed"));
830
831 // Ill-formed 4-byte sequences.
832 // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
833 // U+110yxx (invalid)
834 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
835 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
836 "\xf4\x90"));
837 // U+13Fyxx (invalid)
838 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
839 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
840 "\xf4\xbf"));
841 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
842 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
843 "\xf5\x80"));
844 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
845 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
846 "\xf6\x80"));
847 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
848 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
849 "\xf7\x80"));
850 // U+1FFyxx (invalid)
851 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
852 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
853 "\xf7\xbf"));
854
855 // Ill-formed 5-byte sequences.
856 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
857 // U+200yxx (invalid)
858 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
859 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
860 "\xf8\x88\x80"));
861 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
862 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
863 "\xf8\xbf\xbf"));
864 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
865 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
866 "\xf9\x80\x80"));
867 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
868 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
869 "\xfa\x80\x80"));
870 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
871 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
872 "\xfb\x80\x80"));
873 // U+3FFFyxx (invalid)
874 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
875 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
876 "\xfb\xbf\xbf"));
877
878 // Ill-formed 6-byte sequences.
879 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
880 // U+4000yxx (invalid)
881 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
882 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
883 "\xfc\x84\x80\x80"));
884 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
885 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
886 "\xfc\xbf\xbf\xbf"));
887 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
888 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
889 "\xfd\x80\x80\x80"));
890 // U+7FFFFyxx (invalid)
891 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
892 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
893 "\xfd\xbf\xbf\xbf"));
894
895 //
896 // Sequences with three continuation bytes missing
897 //
898
899 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
900 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf0"));
901 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
902 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf1"));
903 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
904 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf2"));
905 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
906 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf3"));
907 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
908 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf4"));
909
910 // Broken overlong sequences.
911 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
912 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf0"));
913 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
914 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
915 "\xf8\x80"));
916 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
917 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
918 "\xfc\x80\x80"));
919
920 // Ill-formed 4-byte sequences.
921 // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
922 // U+14yyxx (invalid)
923 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
924 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf5"));
925 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
926 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf6"));
927 // U+1Cyyxx (invalid)
928 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
929 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf7"));
930
931 // Ill-formed 5-byte sequences.
932 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
933 // U+20yyxx (invalid)
934 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
935 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
936 "\xf8\x88"));
937 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
938 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
939 "\xf8\xbf"));
940 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
941 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
942 "\xf9\x80"));
943 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
944 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
945 "\xfa\x80"));
946 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
947 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
948 "\xfb\x80"));
949 // U+3FCyyxx (invalid)
950 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
951 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
952 "\xfb\xbf"));
953
954 // Ill-formed 6-byte sequences.
955 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
956 // U+400yyxx (invalid)
957 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
958 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
959 "\xfc\x84\x80"));
960 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
961 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
962 "\xfc\xbf\xbf"));
963 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
964 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
965 "\xfd\x80\x80"));
966 // U+7FFCyyxx (invalid)
967 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
968 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
969 "\xfd\xbf\xbf"));
970
971 //
972 // Sequences with four continuation bytes missing
973 //
974
975 // Ill-formed 5-byte sequences.
976 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
977 // U+uzyyxx (invalid)
978 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
979 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf8"));
980 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
981 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf9"));
982 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
983 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfa"));
984 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
985 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfb"));
986 // U+3zyyxx (invalid)
987 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
988 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfb"));
989
990 // Broken overlong sequences.
991 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
992 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf8"));
993 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
994 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
995 "\xfc\x80"));
996
997 // Ill-formed 6-byte sequences.
998 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
999 // U+uzzyyxx (invalid)
1000 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1001 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1002 "\xfc\x84"));
1003 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1004 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1005 "\xfc\xbf"));
1006 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1007 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1008 "\xfd\x80"));
1009 // U+7Fzzyyxx (invalid)
1010 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1011 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1012 "\xfd\xbf"));
1013
1014 //
1015 // Sequences with five continuation bytes missing
1016 //
1017
1018 // Ill-formed 6-byte sequences.
1019 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
1020 // U+uzzyyxx (invalid)
1021 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1022 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfc"));
1023 // U+uuzzyyxx (invalid)
1024 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1025 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfd"));
1026
1027 //
1028 // Consecutive sequences with trailing bytes missing
1029 //
1030
1031 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1032 ConvertUTFResultContainer(sourceIllegal)
1033 .withScalars(0xfffd, /**/ 0xfffd, 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd)
1034 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd)
1035 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd)
1036 .withScalars(0xfffd, /**/ 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd)
1037 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd)
1038 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1039 "\xc0" "\xe0\x80" "\xf0\x80\x80"
1040 "\xf8\x80\x80\x80"
1041 "\xfc\x80\x80\x80\x80"
1042 "\xdf" "\xef\xbf" "\xf7\xbf\xbf"
1043 "\xfb\xbf\xbf\xbf"
1044 "\xfd\xbf\xbf\xbf\xbf"));
1045
1046 //
1047 // Overlong UTF-8 sequences
1048 //
1049
1050 // U+002F SOLIDUS
1051 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1052 ConvertUTFResultContainer(conversionOK).withScalars(0x002f), "\x2f"));
1053
1054 // Overlong sequences of the above.
1055 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1056 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1057 "\xc0\xaf"));
1058 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1059 ConvertUTFResultContainer(sourceIllegal)
1060 .withScalars(0xfffd, 0xfffd, 0xfffd),
1061 "\xe0\x80\xaf"));
1062 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1063 ConvertUTFResultContainer(sourceIllegal)
1064 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1065 "\xf0\x80\x80\xaf"));
1066 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1067 ConvertUTFResultContainer(sourceIllegal)
1068 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1069 "\xf8\x80\x80\x80\xaf"));
1070 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1071 ConvertUTFResultContainer(sourceIllegal)
1072 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1073 "\xfc\x80\x80\x80\x80\xaf"));
1074
1075 // U+0000 NULL
1076 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1077 ConvertUTFResultContainer(conversionOK).withScalars(0x0000),
1078 StringRef("\x00", 1)));
1079
1080 // Overlong sequences of the above.
1081 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1082 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1083 "\xc0\x80"));
1084 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1085 ConvertUTFResultContainer(sourceIllegal)
1086 .withScalars(0xfffd, 0xfffd, 0xfffd),
1087 "\xe0\x80\x80"));
1088 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1089 ConvertUTFResultContainer(sourceIllegal)
1090 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1091 "\xf0\x80\x80\x80"));
1092 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1093 ConvertUTFResultContainer(sourceIllegal)
1094 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1095 "\xf8\x80\x80\x80\x80"));
1096 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1097 ConvertUTFResultContainer(sourceIllegal)
1098 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1099 "\xfc\x80\x80\x80\x80\x80"));
1100
1101 // Other overlong sequences.
1102 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1103 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1104 "\xc0\xbf"));
1105 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1106 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1107 "\xc1\x80"));
1108 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1109 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1110 "\xc1\xbf"));
1111 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1112 ConvertUTFResultContainer(sourceIllegal)
1113 .withScalars(0xfffd, 0xfffd, 0xfffd),
1114 "\xe0\x9f\xbf"));
1115 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1116 ConvertUTFResultContainer(sourceIllegal)
1117 .withScalars(0xfffd, 0xfffd, 0xfffd),
1118 "\xed\xa0\x80"));
1119 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1120 ConvertUTFResultContainer(sourceIllegal)
1121 .withScalars(0xfffd, 0xfffd, 0xfffd),
1122 "\xed\xbf\xbf"));
1123 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1124 ConvertUTFResultContainer(sourceIllegal)
1125 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1126 "\xf0\x8f\x80\x80"));
1127 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1128 ConvertUTFResultContainer(sourceIllegal)
1129 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1130 "\xf0\x8f\xbf\xbf"));
1131 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1132 ConvertUTFResultContainer(sourceIllegal)
1133 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1134 "\xf8\x87\xbf\xbf\xbf"));
1135 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1136 ConvertUTFResultContainer(sourceIllegal)
1137 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1138 "\xfc\x83\xbf\xbf\xbf\xbf"));
1139
1140 //
1141 // Isolated surrogates
1142 //
1143
1144 // Unicode 6.3.0:
1145 //
1146 // D71. High-surrogate code point: A Unicode code point in the range
1147 // U+D800 to U+DBFF.
1148 //
1149 // D73. Low-surrogate code point: A Unicode code point in the range
1150 // U+DC00 to U+DFFF.
1151
1152 // Note: U+E0100 is <DB40 DD00> in UTF16.
1153
1154 // High surrogates
1155
1156 // U+D800
1157 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1158 ConvertUTFResultContainer(sourceIllegal)
1159 .withScalars(0xfffd, 0xfffd, 0xfffd),
1160 "\xed\xa0\x80"));
1161
1162 // U+DB40
1163 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1164 ConvertUTFResultContainer(sourceIllegal)
1165 .withScalars(0xfffd, 0xfffd, 0xfffd),
1166 "\xed\xac\xa0"));
1167
1168 // U+DBFF
1169 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1170 ConvertUTFResultContainer(sourceIllegal)
1171 .withScalars(0xfffd, 0xfffd, 0xfffd),
1172 "\xed\xaf\xbf"));
1173
1174 // Low surrogates
1175
1176 // U+DC00
1177 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1178 ConvertUTFResultContainer(sourceIllegal)
1179 .withScalars(0xfffd, 0xfffd, 0xfffd),
1180 "\xed\xb0\x80"));
1181
1182 // U+DD00
1183 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1184 ConvertUTFResultContainer(sourceIllegal)
1185 .withScalars(0xfffd, 0xfffd, 0xfffd),
1186 "\xed\xb4\x80"));
1187
1188 // U+DFFF
1189 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1190 ConvertUTFResultContainer(sourceIllegal)
1191 .withScalars(0xfffd, 0xfffd, 0xfffd),
1192 "\xed\xbf\xbf"));
1193
1194 // Surrogate pairs
1195
1196 // U+D800 U+DC00
1197 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1198 ConvertUTFResultContainer(sourceIllegal)
1199 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1200 "\xed\xa0\x80\xed\xb0\x80"));
1201
1202 // U+D800 U+DD00
1203 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1204 ConvertUTFResultContainer(sourceIllegal)
1205 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1206 "\xed\xa0\x80\xed\xb4\x80"));
1207
1208 // U+D800 U+DFFF
1209 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1210 ConvertUTFResultContainer(sourceIllegal)
1211 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1212 "\xed\xa0\x80\xed\xbf\xbf"));
1213
1214 // U+DB40 U+DC00
1215 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1216 ConvertUTFResultContainer(sourceIllegal)
1217 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1218 "\xed\xac\xa0\xed\xb0\x80"));
1219
1220 // U+DB40 U+DD00
1221 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1222 ConvertUTFResultContainer(sourceIllegal)
1223 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1224 "\xed\xac\xa0\xed\xb4\x80"));
1225
1226 // U+DB40 U+DFFF
1227 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1228 ConvertUTFResultContainer(sourceIllegal)
1229 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1230 "\xed\xac\xa0\xed\xbf\xbf"));
1231
1232 // U+DBFF U+DC00
1233 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1234 ConvertUTFResultContainer(sourceIllegal)
1235 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1236 "\xed\xaf\xbf\xed\xb0\x80"));
1237
1238 // U+DBFF U+DD00
1239 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1240 ConvertUTFResultContainer(sourceIllegal)
1241 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1242 "\xed\xaf\xbf\xed\xb4\x80"));
1243
1244 // U+DBFF U+DFFF
1245 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1246 ConvertUTFResultContainer(sourceIllegal)
1247 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1248 "\xed\xaf\xbf\xed\xbf\xbf"));
1249
1250 //
1251 // Noncharacters
1252 //
1253
1254 // Unicode 6.3.0:
1255 //
1256 // D14. Noncharacter: A code point that is permanently reserved for
1257 // internal use and that should never be interchanged. Noncharacters
// consist of the values U+nFFFE and U+nFFFF (where n is from 0 to 0x10)
1259 // and the values U+FDD0..U+FDEF.
1260
1261 // U+FFFE
1262 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1263 ConvertUTFResultContainer(conversionOK).withScalars(0xfffe),
1264 "\xef\xbf\xbe"));
1265
1266 // U+FFFF
1267 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1268 ConvertUTFResultContainer(conversionOK).withScalars(0xffff),
1269 "\xef\xbf\xbf"));
1270
1271 // U+1FFFE
1272 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1273 ConvertUTFResultContainer(conversionOK).withScalars(0x1fffe),
1274 "\xf0\x9f\xbf\xbe"));
1275
1276 // U+1FFFF
1277 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1278 ConvertUTFResultContainer(conversionOK).withScalars(0x1ffff),
1279 "\xf0\x9f\xbf\xbf"));
1280
1281 // U+2FFFE
1282 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1283 ConvertUTFResultContainer(conversionOK).withScalars(0x2fffe),
1284 "\xf0\xaf\xbf\xbe"));
1285
1286 // U+2FFFF
1287 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1288 ConvertUTFResultContainer(conversionOK).withScalars(0x2ffff),
1289 "\xf0\xaf\xbf\xbf"));
1290
1291 // U+3FFFE
1292 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1293 ConvertUTFResultContainer(conversionOK).withScalars(0x3fffe),
1294 "\xf0\xbf\xbf\xbe"));
1295
1296 // U+3FFFF
1297 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1298 ConvertUTFResultContainer(conversionOK).withScalars(0x3ffff),
1299 "\xf0\xbf\xbf\xbf"));
1300
1301 // U+4FFFE
1302 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1303 ConvertUTFResultContainer(conversionOK).withScalars(0x4fffe),
1304 "\xf1\x8f\xbf\xbe"));
1305
1306 // U+4FFFF
1307 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1308 ConvertUTFResultContainer(conversionOK).withScalars(0x4ffff),
1309 "\xf1\x8f\xbf\xbf"));
1310
1311 // U+5FFFE
1312 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1313 ConvertUTFResultContainer(conversionOK).withScalars(0x5fffe),
1314 "\xf1\x9f\xbf\xbe"));
1315
1316 // U+5FFFF
1317 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1318 ConvertUTFResultContainer(conversionOK).withScalars(0x5ffff),
1319 "\xf1\x9f\xbf\xbf"));
1320
1321 // U+6FFFE
1322 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1323 ConvertUTFResultContainer(conversionOK).withScalars(0x6fffe),
1324 "\xf1\xaf\xbf\xbe"));
1325
1326 // U+6FFFF
1327 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1328 ConvertUTFResultContainer(conversionOK).withScalars(0x6ffff),
1329 "\xf1\xaf\xbf\xbf"));
1330
1331 // U+7FFFE
1332 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1333 ConvertUTFResultContainer(conversionOK).withScalars(0x7fffe),
1334 "\xf1\xbf\xbf\xbe"));
1335
1336 // U+7FFFF
1337 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1338 ConvertUTFResultContainer(conversionOK).withScalars(0x7ffff),
1339 "\xf1\xbf\xbf\xbf"));
1340
1341 // U+8FFFE
1342 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1343 ConvertUTFResultContainer(conversionOK).withScalars(0x8fffe),
1344 "\xf2\x8f\xbf\xbe"));
1345
1346 // U+8FFFF
1347 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1348 ConvertUTFResultContainer(conversionOK).withScalars(0x8ffff),
1349 "\xf2\x8f\xbf\xbf"));
1350
1351 // U+9FFFE
1352 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1353 ConvertUTFResultContainer(conversionOK).withScalars(0x9fffe),
1354 "\xf2\x9f\xbf\xbe"));
1355
1356 // U+9FFFF
1357 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1358 ConvertUTFResultContainer(conversionOK).withScalars(0x9ffff),
1359 "\xf2\x9f\xbf\xbf"));
1360
1361 // U+AFFFE
1362 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1363 ConvertUTFResultContainer(conversionOK).withScalars(0xafffe),
1364 "\xf2\xaf\xbf\xbe"));
1365
1366 // U+AFFFF
1367 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1368 ConvertUTFResultContainer(conversionOK).withScalars(0xaffff),
1369 "\xf2\xaf\xbf\xbf"));
1370
1371 // U+BFFFE
1372 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1373 ConvertUTFResultContainer(conversionOK).withScalars(0xbfffe),
1374 "\xf2\xbf\xbf\xbe"));
1375
1376 // U+BFFFF
1377 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1378 ConvertUTFResultContainer(conversionOK).withScalars(0xbffff),
1379 "\xf2\xbf\xbf\xbf"));
1380
1381 // U+CFFFE
1382 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1383 ConvertUTFResultContainer(conversionOK).withScalars(0xcfffe),
1384 "\xf3\x8f\xbf\xbe"));
1385
1386 // U+CFFFF
1387 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1388 ConvertUTFResultContainer(conversionOK).withScalars(0xcfffF),
1389 "\xf3\x8f\xbf\xbf"));
1390
1391 // U+DFFFE
1392 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1393 ConvertUTFResultContainer(conversionOK).withScalars(0xdfffe),
1394 "\xf3\x9f\xbf\xbe"));
1395
1396 // U+DFFFF
1397 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1398 ConvertUTFResultContainer(conversionOK).withScalars(0xdffff),
1399 "\xf3\x9f\xbf\xbf"));
1400
1401 // U+EFFFE
1402 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1403 ConvertUTFResultContainer(conversionOK).withScalars(0xefffe),
1404 "\xf3\xaf\xbf\xbe"));
1405
1406 // U+EFFFF
1407 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1408 ConvertUTFResultContainer(conversionOK).withScalars(0xeffff),
1409 "\xf3\xaf\xbf\xbf"));
1410
1411 // U+FFFFE
1412 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1413 ConvertUTFResultContainer(conversionOK).withScalars(0xffffe),
1414 "\xf3\xbf\xbf\xbe"));
1415
1416 // U+FFFFF
1417 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1418 ConvertUTFResultContainer(conversionOK).withScalars(0xfffff),
1419 "\xf3\xbf\xbf\xbf"));
1420
1421 // U+10FFFE
1422 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1423 ConvertUTFResultContainer(conversionOK).withScalars(0x10fffe),
1424 "\xf4\x8f\xbf\xbe"));
1425
1426 // U+10FFFF
1427 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1428 ConvertUTFResultContainer(conversionOK).withScalars(0x10ffff),
1429 "\xf4\x8f\xbf\xbf"));
1430
1431 // U+FDD0
1432 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1433 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd0),
1434 "\xef\xb7\x90"));
1435
1436 // U+FDD1
1437 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1438 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd1),
1439 "\xef\xb7\x91"));
1440
1441 // U+FDD2
1442 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1443 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd2),
1444 "\xef\xb7\x92"));
1445
1446 // U+FDD3
1447 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1448 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd3),
1449 "\xef\xb7\x93"));
1450
1451 // U+FDD4
1452 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1453 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd4),
1454 "\xef\xb7\x94"));
1455
1456 // U+FDD5
1457 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1458 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd5),
1459 "\xef\xb7\x95"));
1460
1461 // U+FDD6
1462 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1463 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd6),
1464 "\xef\xb7\x96"));
1465
1466 // U+FDD7
1467 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1468 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd7),
1469 "\xef\xb7\x97"));
1470
1471 // U+FDD8
1472 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1473 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd8),
1474 "\xef\xb7\x98"));
1475
1476 // U+FDD9
1477 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1478 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd9),
1479 "\xef\xb7\x99"));
1480
1481 // U+FDDA
1482 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1483 ConvertUTFResultContainer(conversionOK).withScalars(0xfdda),
1484 "\xef\xb7\x9a"));
1485
1486 // U+FDDB
1487 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1488 ConvertUTFResultContainer(conversionOK).withScalars(0xfddb),
1489 "\xef\xb7\x9b"));
1490
1491 // U+FDDC
1492 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1493 ConvertUTFResultContainer(conversionOK).withScalars(0xfddc),
1494 "\xef\xb7\x9c"));
1495
1496 // U+FDDD
1497 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1498 ConvertUTFResultContainer(conversionOK).withScalars(0xfddd),
1499 "\xef\xb7\x9d"));
1500
1501 // U+FDDE
1502 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1503 ConvertUTFResultContainer(conversionOK).withScalars(0xfdde),
1504 "\xef\xb7\x9e"));
1505
1506 // U+FDDF
1507 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1508 ConvertUTFResultContainer(conversionOK).withScalars(0xfddf),
1509 "\xef\xb7\x9f"));
1510
1511 // U+FDE0
1512 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1513 ConvertUTFResultContainer(conversionOK).withScalars(0xfde0),
1514 "\xef\xb7\xa0"));
1515
1516 // U+FDE1
1517 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1518 ConvertUTFResultContainer(conversionOK).withScalars(0xfde1),
1519 "\xef\xb7\xa1"));
1520
1521 // U+FDE2
1522 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1523 ConvertUTFResultContainer(conversionOK).withScalars(0xfde2),
1524 "\xef\xb7\xa2"));
1525
1526 // U+FDE3
1527 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1528 ConvertUTFResultContainer(conversionOK).withScalars(0xfde3),
1529 "\xef\xb7\xa3"));
1530
1531 // U+FDE4
1532 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1533 ConvertUTFResultContainer(conversionOK).withScalars(0xfde4),
1534 "\xef\xb7\xa4"));
1535
1536 // U+FDE5
1537 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1538 ConvertUTFResultContainer(conversionOK).withScalars(0xfde5),
1539 "\xef\xb7\xa5"));
1540
1541 // U+FDE6
1542 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1543 ConvertUTFResultContainer(conversionOK).withScalars(0xfde6),
1544 "\xef\xb7\xa6"));
1545
1546 // U+FDE7
1547 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1548 ConvertUTFResultContainer(conversionOK).withScalars(0xfde7),
1549 "\xef\xb7\xa7"));
1550
1551 // U+FDE8
1552 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1553 ConvertUTFResultContainer(conversionOK).withScalars(0xfde8),
1554 "\xef\xb7\xa8"));
1555
1556 // U+FDE9
1557 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1558 ConvertUTFResultContainer(conversionOK).withScalars(0xfde9),
1559 "\xef\xb7\xa9"));
1560
1561 // U+FDEA
1562 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1563 ConvertUTFResultContainer(conversionOK).withScalars(0xfdea),
1564 "\xef\xb7\xaa"));
1565
1566 // U+FDEB
1567 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1568 ConvertUTFResultContainer(conversionOK).withScalars(0xfdeb),
1569 "\xef\xb7\xab"));
1570
1571 // U+FDEC
1572 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1573 ConvertUTFResultContainer(conversionOK).withScalars(0xfdec),
1574 "\xef\xb7\xac"));
1575
1576 // U+FDED
1577 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1578 ConvertUTFResultContainer(conversionOK).withScalars(0xfded),
1579 "\xef\xb7\xad"));
1580
1581 // U+FDEE
1582 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1583 ConvertUTFResultContainer(conversionOK).withScalars(0xfdee),
1584 "\xef\xb7\xae"));
1585
1586 // U+FDEF
1587 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1588 ConvertUTFResultContainer(conversionOK).withScalars(0xfdef),
1589 "\xef\xb7\xaf"));
1590
1591 // U+FDF0
1592 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1593 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf0),
1594 "\xef\xb7\xb0"));
1595
1596 // U+FDF1
1597 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1598 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf1),
1599 "\xef\xb7\xb1"));
1600
1601 // U+FDF2
1602 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1603 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf2),
1604 "\xef\xb7\xb2"));
1605
1606 // U+FDF3
1607 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1608 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf3),
1609 "\xef\xb7\xb3"));
1610
1611 // U+FDF4
1612 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1613 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf4),
1614 "\xef\xb7\xb4"));
1615
1616 // U+FDF5
1617 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1618 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf5),
1619 "\xef\xb7\xb5"));
1620
1621 // U+FDF6
1622 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1623 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf6),
1624 "\xef\xb7\xb6"));
1625
1626 // U+FDF7
1627 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1628 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf7),
1629 "\xef\xb7\xb7"));
1630
1631 // U+FDF8
1632 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1633 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf8),
1634 "\xef\xb7\xb8"));
1635
1636 // U+FDF9
1637 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1638 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf9),
1639 "\xef\xb7\xb9"));
1640
1641 // U+FDFA
1642 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1643 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfa),
1644 "\xef\xb7\xba"));
1645
1646 // U+FDFB
1647 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1648 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfb),
1649 "\xef\xb7\xbb"));
1650
1651 // U+FDFC
1652 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1653 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfc),
1654 "\xef\xb7\xbc"));
1655
1656 // U+FDFD
1657 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1658 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfd),
1659 "\xef\xb7\xbd"));
1660
1661 // U+FDFE
1662 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1663 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfe),
1664 "\xef\xb7\xbe"));
1665
1666 // U+FDFF
1667 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1668 ConvertUTFResultContainer(conversionOK).withScalars(0xfdff),
1669 "\xef\xb7\xbf"));
1670 }
1671
TEST(ConvertUTFTest, UTF8ToUTF32PartialLenient) {
  // Every check in this test passes `true` as the helper's trailing argument
  // (presumably the partial-input switch, going by the test name -- confirm
  // against the helper's declaration).
  const bool Partial = true;

  // A complete one-byte sequence is converted as usual.
  // U+0041 LATIN CAPITAL LETTER A
  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
      ConvertUTFResultContainer(conversionOK).withScalars(0x0041), "\x41",
      Partial));

  //
  // Sequences with one continuation byte missing
  //
  // Each input below stops one byte short of a complete sequence. The
  // expected outcome for all of them is sourceExhausted with no scalars.
  static const char *const MissingLastByte[] = {
      // Lead bytes of two-byte sequences.
      "\xc2",
      "\xdf",
      // First two bytes of three-byte sequences.
      "\xe0\xa0",
      "\xe0\xbf",
      "\xe1\x80",
      "\xec\xbf",
      "\xed\x80",
      "\xed\x9f",
      "\xee\x80",
      "\xef\xbf",
      // First three bytes of four-byte sequences.
      "\xf0\x90\x80",
      "\xf0\xbf\xbf",
      "\xf1\x80\x80",
      "\xf3\xbf\xbf",
      "\xf4\x80\x80",
      "\xf4\x8f\xbf",
  };
  for (const char *Input : MissingLastByte)
    EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
        ConvertUTFResultContainer(sourceExhausted), Input, Partial));

  // A complete scalar followed by a truncated sequence: the scalar is still
  // expected in the output, and the result is sourceExhausted.
  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
      ConvertUTFResultContainer(sourceExhausted).withScalars(0x0041),
      "\x41\xc2", Partial));
}
1735
1736