efb2f106250ac23877990fd4c08390161bf5e66a
[oota-llvm.git] / unittests / Support / ConvertUTFTest.cpp
1 //===- llvm/unittest/Support/ConvertUTFTest.cpp - ConvertUTF tests --------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9
10 #include "llvm/Support/ConvertUTF.h"
11 #include "gtest/gtest.h"
12 #include <string>
13 #include <vector>
14 #include <utility>
15
16 using namespace llvm;
17
18 TEST(ConvertUTFTest, ConvertUTF16LittleEndianToUTF8String) {
19   // Src is the look of disapproval.
20   static const char Src[] = "\xff\xfe\xa0\x0c_\x00\xa0\x0c";
21   ArrayRef<char> Ref(Src, sizeof(Src) - 1);
22   std::string Result;
23   bool Success = convertUTF16ToUTF8String(Ref, Result);
24   EXPECT_TRUE(Success);
25   std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
26   EXPECT_EQ(Expected, Result);
27 }
28
29 TEST(ConvertUTFTest, ConvertUTF16BigEndianToUTF8String) {
30   // Src is the look of disapproval.
31   static const char Src[] = "\xfe\xff\x0c\xa0\x00_\x0c\xa0";
32   ArrayRef<char> Ref(Src, sizeof(Src) - 1);
33   std::string Result;
34   bool Success = convertUTF16ToUTF8String(Ref, Result);
35   EXPECT_TRUE(Success);
36   std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
37   EXPECT_EQ(Expected, Result);
38 }
39
40 TEST(ConvertUTFTest, OddLengthInput) {
41   std::string Result;
42   bool Success = convertUTF16ToUTF8String(makeArrayRef("xxxxx", 5), Result);
43   EXPECT_FALSE(Success);
44 }
45
46 TEST(ConvertUTFTest, Empty) {
47   std::string Result;
48   bool Success = convertUTF16ToUTF8String(None, Result);
49   EXPECT_TRUE(Success);
50   EXPECT_TRUE(Result.empty());
51 }
52
53 TEST(ConvertUTFTest, HasUTF16BOM) {
54   bool HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xff\xfe", 2));
55   EXPECT_TRUE(HasBOM);
56   HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff", 2));
57   EXPECT_TRUE(HasBOM);
58   HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff ", 3));
59   EXPECT_TRUE(HasBOM); // Don't care about odd lengths.
60   HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff\x00asdf", 6));
61   EXPECT_TRUE(HasBOM);
62
63   HasBOM = hasUTF16ByteOrderMark(None);
64   EXPECT_FALSE(HasBOM);
65   HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe", 1));
66   EXPECT_FALSE(HasBOM);
67 }
68
69 struct ConvertUTFResultContainer {
70   ConversionResult ErrorCode;
71   std::vector<unsigned> UnicodeScalars;
72
73   ConvertUTFResultContainer(ConversionResult ErrorCode)
74       : ErrorCode(ErrorCode) {}
75
76   ConvertUTFResultContainer
77   withScalars(unsigned US0 = 0x110000, unsigned US1 = 0x110000,
78               unsigned US2 = 0x110000, unsigned US3 = 0x110000,
79               unsigned US4 = 0x110000, unsigned US5 = 0x110000,
80               unsigned US6 = 0x110000, unsigned US7 = 0x110000) {
81     ConvertUTFResultContainer Result(*this);
82     if (US0 != 0x110000)
83       Result.UnicodeScalars.push_back(US0);
84     if (US1 != 0x110000)
85       Result.UnicodeScalars.push_back(US1);
86     if (US2 != 0x110000)
87       Result.UnicodeScalars.push_back(US2);
88     if (US3 != 0x110000)
89       Result.UnicodeScalars.push_back(US3);
90     if (US4 != 0x110000)
91       Result.UnicodeScalars.push_back(US4);
92     if (US5 != 0x110000)
93       Result.UnicodeScalars.push_back(US5);
94     if (US6 != 0x110000)
95       Result.UnicodeScalars.push_back(US6);
96     if (US7 != 0x110000)
97       Result.UnicodeScalars.push_back(US7);
98     return Result;
99   }
100 };
101
102 std::pair<ConversionResult, std::vector<unsigned>>
103 ConvertUTF8ToUnicodeScalarsLenient(StringRef S) {
104   const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data());
105
106   const UTF8 *SourceNext = SourceStart;
107   std::vector<UTF32> Decoded(S.size(), 0);
108   UTF32 *TargetStart = Decoded.data();
109
110   auto ErrorCode =
111       ConvertUTF8toUTF32(&SourceNext, SourceStart + S.size(), &TargetStart,
112                          Decoded.data() + Decoded.size(), lenientConversion);
113
114   Decoded.resize(TargetStart - Decoded.data());
115
116   return std::make_pair(ErrorCode, Decoded);
117 }
118
119 std::pair<ConversionResult, std::vector<unsigned>>
120 ConvertUTF8ToUnicodeScalarsPartialLenient(StringRef S) {
121   const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data());
122
123   const UTF8 *SourceNext = SourceStart;
124   std::vector<UTF32> Decoded(S.size(), 0);
125   UTF32 *TargetStart = Decoded.data();
126
127   auto ErrorCode = ConvertUTF8toUTF32Partial(
128       &SourceNext, SourceStart + S.size(), &TargetStart,
129       Decoded.data() + Decoded.size(), lenientConversion);
130
131   Decoded.resize(TargetStart - Decoded.data());
132
133   return std::make_pair(ErrorCode, Decoded);
134 }
135
136 ::testing::AssertionResult
137 CheckConvertUTF8ToUnicodeScalars(ConvertUTFResultContainer Expected,
138                                  StringRef S, bool Partial = false) {
139   ConversionResult ErrorCode;
140   std::vector<unsigned> Decoded;
141   if (!Partial)
142     std::tie(ErrorCode, Decoded) = ConvertUTF8ToUnicodeScalarsLenient(S);
143   else
144     std::tie(ErrorCode, Decoded) = ConvertUTF8ToUnicodeScalarsPartialLenient(S);
145
146   if (Expected.ErrorCode != ErrorCode)
147     return ::testing::AssertionFailure() << "Expected error code "
148                                          << Expected.ErrorCode << ", actual "
149                                          << ErrorCode;
150
151   if (Expected.UnicodeScalars != Decoded)
152     return ::testing::AssertionFailure()
153            << "Expected lenient decoded result:\n"
154            << ::testing::PrintToString(Expected.UnicodeScalars) << "\n"
155            << "Actual result:\n" << ::testing::PrintToString(Decoded);
156
157   return ::testing::AssertionSuccess();
158 }
159
160 TEST(ConvertUTFTest, UTF8ToUTF32Lenient) {
161
162   //
163   // 1-byte sequences
164   //
165
166   // U+0041 LATIN CAPITAL LETTER A
167   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
168       ConvertUTFResultContainer(conversionOK).withScalars(0x0041), "\x41"));
169
170   //
171   // 2-byte sequences
172   //
173
174   // U+0283 LATIN SMALL LETTER ESH
175   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
176       ConvertUTFResultContainer(conversionOK).withScalars(0x0283),
177       "\xca\x83"));
178
179   // U+03BA GREEK SMALL LETTER KAPPA
180   // U+1F79 GREEK SMALL LETTER OMICRON WITH OXIA
181   // U+03C3 GREEK SMALL LETTER SIGMA
182   // U+03BC GREEK SMALL LETTER MU
183   // U+03B5 GREEK SMALL LETTER EPSILON
184   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
185       ConvertUTFResultContainer(conversionOK)
186           .withScalars(0x03ba, 0x1f79, 0x03c3, 0x03bc, 0x03b5),
187       "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5"));
188
189   //
190   // 3-byte sequences
191   //
192
193   // U+4F8B CJK UNIFIED IDEOGRAPH-4F8B
194   // U+6587 CJK UNIFIED IDEOGRAPH-6587
195   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
196       ConvertUTFResultContainer(conversionOK).withScalars(0x4f8b, 0x6587),
197       "\xe4\xbe\x8b\xe6\x96\x87"));
198
199   // U+D55C HANGUL SYLLABLE HAN
200   // U+AE00 HANGUL SYLLABLE GEUL
201   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
202       ConvertUTFResultContainer(conversionOK).withScalars(0xd55c, 0xae00),
203       "\xed\x95\x9c\xea\xb8\x80"));
204
205   // U+1112 HANGUL CHOSEONG HIEUH
206   // U+1161 HANGUL JUNGSEONG A
207   // U+11AB HANGUL JONGSEONG NIEUN
208   // U+1100 HANGUL CHOSEONG KIYEOK
209   // U+1173 HANGUL JUNGSEONG EU
210   // U+11AF HANGUL JONGSEONG RIEUL
211   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
212       ConvertUTFResultContainer(conversionOK)
213           .withScalars(0x1112, 0x1161, 0x11ab, 0x1100, 0x1173, 0x11af),
214       "\xe1\x84\x92\xe1\x85\xa1\xe1\x86\xab\xe1\x84\x80\xe1\x85\xb3"
215       "\xe1\x86\xaf"));
216
217   //
218   // 4-byte sequences
219   //
220
221   // U+E0100 VARIATION SELECTOR-17
222   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
223       ConvertUTFResultContainer(conversionOK).withScalars(0x000E0100),
224       "\xf3\xa0\x84\x80"));
225
226   //
227   // First possible sequence of a certain length
228   //
229
230   // U+0000 NULL
231   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
232       ConvertUTFResultContainer(conversionOK).withScalars(0x0000),
233       StringRef("\x00", 1)));
234
235   // U+0080 PADDING CHARACTER
236   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
237       ConvertUTFResultContainer(conversionOK).withScalars(0x0080),
238       "\xc2\x80"));
239
240   // U+0800 SAMARITAN LETTER ALAF
241   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
242       ConvertUTFResultContainer(conversionOK).withScalars(0x0800),
243       "\xe0\xa0\x80"));
244
245   // U+10000 LINEAR B SYLLABLE B008 A
246   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
247       ConvertUTFResultContainer(conversionOK).withScalars(0x10000),
248       "\xf0\x90\x80\x80"));
249
250   // U+200000 (invalid)
251   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
252       ConvertUTFResultContainer(sourceIllegal)
253           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
254       "\xf8\x88\x80\x80\x80"));
255
256   // U+4000000 (invalid)
257   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
258       ConvertUTFResultContainer(sourceIllegal)
259           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
260       "\xfc\x84\x80\x80\x80\x80"));
261
262   //
263   // Last possible sequence of a certain length
264   //
265
266   // U+007F DELETE
267   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
268       ConvertUTFResultContainer(conversionOK).withScalars(0x007f), "\x7f"));
269
270   // U+07FF (unassigned)
271   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
272       ConvertUTFResultContainer(conversionOK).withScalars(0x07ff),
273       "\xdf\xbf"));
274
275   // U+FFFF (noncharacter)
276   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
277       ConvertUTFResultContainer(conversionOK).withScalars(0xffff),
278       "\xef\xbf\xbf"));
279
280   // U+1FFFFF (invalid)
281   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
282       ConvertUTFResultContainer(sourceIllegal)
283           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
284       "\xf7\xbf\xbf\xbf"));
285
286   // U+3FFFFFF (invalid)
287   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
288       ConvertUTFResultContainer(sourceIllegal)
289           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
290       "\xfb\xbf\xbf\xbf\xbf"));
291
292   // U+7FFFFFFF (invalid)
293   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
294       ConvertUTFResultContainer(sourceIllegal)
295           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
296       "\xfd\xbf\xbf\xbf\xbf\xbf"));
297
298   //
299   // Other boundary conditions
300   //
301
302   // U+D7FF (unassigned)
303   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
304       ConvertUTFResultContainer(conversionOK).withScalars(0xd7ff),
305       "\xed\x9f\xbf"));
306
307   // U+E000 (private use)
308   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
309       ConvertUTFResultContainer(conversionOK).withScalars(0xe000),
310       "\xee\x80\x80"));
311
312   // U+FFFD REPLACEMENT CHARACTER
313   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
314       ConvertUTFResultContainer(conversionOK).withScalars(0xfffd),
315       "\xef\xbf\xbd"));
316
317   // U+10FFFF (noncharacter)
318   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
319       ConvertUTFResultContainer(conversionOK).withScalars(0x10ffff),
320       "\xf4\x8f\xbf\xbf"));
321
322   // U+110000 (invalid)
323   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
324       ConvertUTFResultContainer(sourceIllegal)
325           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
326       "\xf4\x90\x80\x80"));
327
328   //
329   // Unexpected continuation bytes
330   //
331
332   // A sequence of unexpected continuation bytes that don't follow a first
333   // byte, every byte is a maximal subpart.
334
335   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
336       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\x80"));
337   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
338       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xbf"));
339   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
340       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
341       "\x80\x80"));
342   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
343       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
344       "\x80\xbf"));
345   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
346       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
347       "\xbf\x80"));
348   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
349       ConvertUTFResultContainer(sourceIllegal)
350           .withScalars(0xfffd, 0xfffd, 0xfffd),
351       "\x80\xbf\x80"));
352   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
353       ConvertUTFResultContainer(sourceIllegal)
354           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
355       "\x80\xbf\x80\xbf"));
356   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
357       ConvertUTFResultContainer(sourceIllegal)
358           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
359       "\x80\xbf\x82\xbf\xaa"));
360   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
361       ConvertUTFResultContainer(sourceIllegal)
362           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
363       "\xaa\xb0\xbb\xbf\xaa\xa0"));
364   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
365       ConvertUTFResultContainer(sourceIllegal)
366           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
367       "\xaa\xb0\xbb\xbf\xaa\xa0\x8f"));
368
369   // All continuation bytes (0x80--0xbf).
370   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
371       ConvertUTFResultContainer(sourceIllegal)
372           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
373                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
374           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
375                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
376           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
377                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
378           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
379                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
380           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
381                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
382           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
383                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
384           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
385                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
386           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
387                        0xfffd, 0xfffd, 0xfffd, 0xfffd),
388       "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
389       "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
390       "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf"
391       "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"));
392
393   //
394   // Lonely start bytes
395   //
396
397   // Start bytes of 2-byte sequences (0xc0--0xdf).
398   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
399       ConvertUTFResultContainer(sourceIllegal)
400           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
401                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
402           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
403                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
404           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
405                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
406           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
407                        0xfffd, 0xfffd, 0xfffd, 0xfffd),
408       "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf"
409       "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf"));
410
411   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
412       ConvertUTFResultContainer(sourceIllegal)
413           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
414                        0xfffd, 0x0020, 0xfffd, 0x0020)
415           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
416                        0xfffd, 0x0020, 0xfffd, 0x0020)
417           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
418                        0xfffd, 0x0020, 0xfffd, 0x0020)
419           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
420                        0xfffd, 0x0020, 0xfffd, 0x0020)
421           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
422                        0xfffd, 0x0020, 0xfffd, 0x0020)
423           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
424                        0xfffd, 0x0020, 0xfffd, 0x0020)
425           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
426                        0xfffd, 0x0020, 0xfffd, 0x0020)
427           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
428                        0xfffd, 0x0020, 0xfffd, 0x0020),
429       "\xc0\x20\xc1\x20\xc2\x20\xc3\x20\xc4\x20\xc5\x20\xc6\x20\xc7\x20"
430       "\xc8\x20\xc9\x20\xca\x20\xcb\x20\xcc\x20\xcd\x20\xce\x20\xcf\x20"
431       "\xd0\x20\xd1\x20\xd2\x20\xd3\x20\xd4\x20\xd5\x20\xd6\x20\xd7\x20"
432       "\xd8\x20\xd9\x20\xda\x20\xdb\x20\xdc\x20\xdd\x20\xde\x20\xdf\x20"));
433
434   // Start bytes of 3-byte sequences (0xe0--0xef).
435   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
436       ConvertUTFResultContainer(sourceIllegal)
437           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
438                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
439           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
440                        0xfffd, 0xfffd, 0xfffd, 0xfffd),
441       "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"));
442
443   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
444       ConvertUTFResultContainer(sourceIllegal)
445           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
446                        0xfffd, 0x0020, 0xfffd, 0x0020)
447           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
448                        0xfffd, 0x0020, 0xfffd, 0x0020)
449           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
450                        0xfffd, 0x0020, 0xfffd, 0x0020)
451           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
452                        0xfffd, 0x0020, 0xfffd, 0x0020),
453       "\xe0\x20\xe1\x20\xe2\x20\xe3\x20\xe4\x20\xe5\x20\xe6\x20\xe7\x20"
454       "\xe8\x20\xe9\x20\xea\x20\xeb\x20\xec\x20\xed\x20\xee\x20\xef\x20"));
455
456   // Start bytes of 4-byte sequences (0xf0--0xf7).
457   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
458       ConvertUTFResultContainer(sourceIllegal)
459           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
460                        0xfffd, 0xfffd, 0xfffd, 0xfffd),
461       "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7"));
462
463   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
464       ConvertUTFResultContainer(sourceIllegal)
465           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
466                        0xfffd, 0x0020, 0xfffd, 0x0020)
467           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
468                        0xfffd, 0x0020, 0xfffd, 0x0020),
469       "\xf0\x20\xf1\x20\xf2\x20\xf3\x20\xf4\x20\xf5\x20\xf6\x20\xf7\x20"));
470
471   // Start bytes of 5-byte sequences (0xf8--0xfb).
472   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
473       ConvertUTFResultContainer(sourceIllegal)
474           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
475       "\xf8\xf9\xfa\xfb"));
476
477   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
478       ConvertUTFResultContainer(sourceIllegal)
479           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
480                        0xfffd, 0x0020, 0xfffd, 0x0020),
481       "\xf8\x20\xf9\x20\xfa\x20\xfb\x20"));
482
483   // Start bytes of 6-byte sequences (0xfc--0xfd).
484   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
485       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
486       "\xfc\xfd"));
487
488   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
489       ConvertUTFResultContainer(sourceIllegal)
490           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020),
491       "\xfc\x20\xfd\x20"));
492
493   //
494   // Other bytes (0xc0--0xc1, 0xfe--0xff).
495   //
496
497   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
498       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc0"));
499   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
500       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc1"));
501   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
502       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfe"));
503   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
504       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xff"));
505
506   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
507       ConvertUTFResultContainer(sourceIllegal)
508           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
509       "\xc0\xc1\xfe\xff"));
510
511   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
512       ConvertUTFResultContainer(sourceIllegal)
513           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
514       "\xfe\xfe\xff\xff"));
515
516   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
517       ConvertUTFResultContainer(sourceIllegal)
518           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
519       "\xfe\x80\x80\x80\x80\x80"));
520
521   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
522       ConvertUTFResultContainer(sourceIllegal)
523           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
524       "\xff\x80\x80\x80\x80\x80"));
525
526   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
527       ConvertUTFResultContainer(sourceIllegal)
528           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
529                        0xfffd, 0x0020, 0xfffd, 0x0020),
530       "\xc0\x20\xc1\x20\xfe\x20\xff\x20"));
531
532   //
533   // Sequences with one continuation byte missing
534   //
535
536   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
537       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc2"));
538   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
539       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xdf"));
540   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
541       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
542       "\xe0\xa0"));
543   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
544       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
545       "\xe0\xbf"));
546   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
547       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
548       "\xe1\x80"));
549   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
550       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
551       "\xec\xbf"));
552   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
553       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
554       "\xed\x80"));
555   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
556       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
557       "\xed\x9f"));
558   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
559       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
560       "\xee\x80"));
561   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
562       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
563       "\xef\xbf"));
564   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
565       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
566       "\xf0\x90\x80"));
567   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
568       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
569       "\xf0\xbf\xbf"));
570   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
571       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
572       "\xf1\x80\x80"));
573   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
574       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
575       "\xf3\xbf\xbf"));
576   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
577       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
578       "\xf4\x80\x80"));
579   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
580       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
581       "\xf4\x8f\xbf"));
582
583   // Overlong sequences with one trailing byte missing.
584   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
585       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
586       "\xc0"));
587   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
588       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
589       "\xc1"));
590   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
591       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
592       "\xe0\x80"));
593   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
594       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
595       "\xe0\x9f"));
596   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
597       ConvertUTFResultContainer(sourceIllegal)
598           .withScalars(0xfffd, 0xfffd, 0xfffd),
599       "\xf0\x80\x80"));
600   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
601       ConvertUTFResultContainer(sourceIllegal)
602           .withScalars(0xfffd, 0xfffd, 0xfffd),
603       "\xf0\x8f\x80"));
604   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
605       ConvertUTFResultContainer(sourceIllegal)
606           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
607       "\xf8\x80\x80\x80"));
608   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
609       ConvertUTFResultContainer(sourceIllegal)
610           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
611       "\xfc\x80\x80\x80\x80"));
612
613   // Sequences that represent surrogates with one trailing byte missing.
614   // High surrogates
615   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
616       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
617       "\xed\xa0"));
618   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
619       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
620       "\xed\xac"));
621   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
622       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
623       "\xed\xaf"));
624   // Low surrogates
625   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
626       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
627       "\xed\xb0"));
628   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
629       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
630       "\xed\xb4"));
631   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
632       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
633       "\xed\xbf"));
634
635   // Ill-formed 4-byte sequences.
636   // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
637   // U+1100xx (invalid)
638   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
639       ConvertUTFResultContainer(sourceIllegal)
640           .withScalars(0xfffd, 0xfffd, 0xfffd),
641       "\xf4\x90\x80"));
642   // U+13FBxx (invalid)
643   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
644       ConvertUTFResultContainer(sourceIllegal)
645           .withScalars(0xfffd, 0xfffd, 0xfffd),
646       "\xf4\xbf\xbf"));
647   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
648       ConvertUTFResultContainer(sourceIllegal)
649           .withScalars(0xfffd, 0xfffd, 0xfffd),
650       "\xf5\x80\x80"));
651   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
652       ConvertUTFResultContainer(sourceIllegal)
653           .withScalars(0xfffd, 0xfffd, 0xfffd),
654       "\xf6\x80\x80"));
655   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
656       ConvertUTFResultContainer(sourceIllegal)
657           .withScalars(0xfffd, 0xfffd, 0xfffd),
658       "\xf7\x80\x80"));
659   // U+1FFBxx (invalid)
660   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
661       ConvertUTFResultContainer(sourceIllegal)
662           .withScalars(0xfffd, 0xfffd, 0xfffd),
663       "\xf7\xbf\xbf"));
664
665   // Ill-formed 5-byte sequences.
666   // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
667   // U+2000xx (invalid)
668   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
669       ConvertUTFResultContainer(sourceIllegal)
670           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
671       "\xf8\x88\x80\x80"));
672   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
673       ConvertUTFResultContainer(sourceIllegal)
674           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
675       "\xf8\xbf\xbf\xbf"));
676   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
677       ConvertUTFResultContainer(sourceIllegal)
678           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
679       "\xf9\x80\x80\x80"));
680   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
681       ConvertUTFResultContainer(sourceIllegal)
682           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
683       "\xfa\x80\x80\x80"));
684   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
685       ConvertUTFResultContainer(sourceIllegal)
686           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
687       "\xfb\x80\x80\x80"));
688   // U+3FFFFxx (invalid)
689   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
690       ConvertUTFResultContainer(sourceIllegal)
691           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
692       "\xfb\xbf\xbf\xbf"));
693
694   // Ill-formed 6-byte sequences.
695   // 1111110u 10uuuuuu 10uzzzzz 10zzzyyyy 10yyyyxx 10xxxxxx
696   // U+40000xx (invalid)
697   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
698       ConvertUTFResultContainer(sourceIllegal)
699           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
700       "\xfc\x84\x80\x80\x80"));
701   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
702       ConvertUTFResultContainer(sourceIllegal)
703           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
704       "\xfc\xbf\xbf\xbf\xbf"));
705   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
706       ConvertUTFResultContainer(sourceIllegal)
707           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
708       "\xfd\x80\x80\x80\x80"));
709   // U+7FFFFFxx (invalid)
710   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
711       ConvertUTFResultContainer(sourceIllegal)
712           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
713       "\xfd\xbf\xbf\xbf\xbf"));
714
715   //
716   // Sequences with two continuation bytes missing
717   //
718
719   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
720       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
721       "\xf0\x90"));
722   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
723       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
724       "\xf0\xbf"));
725   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
726       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
727       "\xf1\x80"));
728   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
729       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
730       "\xf3\xbf"));
731   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
732       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
733       "\xf4\x80"));
734   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
735       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
736       "\xf4\x8f"));
737
738   // Overlong sequences with two trailing bytes missing.
739   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
740       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xe0"));
741   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
742       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
743       "\xf0\x80"));
744   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
745       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
746       "\xf0\x8f"));
747   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
748       ConvertUTFResultContainer(sourceIllegal)
749           .withScalars(0xfffd, 0xfffd, 0xfffd),
750       "\xf8\x80\x80"));
751   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
752       ConvertUTFResultContainer(sourceIllegal)
753           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
754       "\xfc\x80\x80\x80"));
755
756   // Sequences that represent surrogates with two trailing bytes missing.
757   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
758       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xed"));
759
760   // Ill-formed 4-byte sequences.
761   // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
762   // U+110yxx (invalid)
763   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
764       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
765       "\xf4\x90"));
766   // U+13Fyxx (invalid)
767   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
768       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
769       "\xf4\xbf"));
770   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
771       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
772       "\xf5\x80"));
773   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
774       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
775       "\xf6\x80"));
776   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
777       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
778       "\xf7\x80"));
779   // U+1FFyxx (invalid)
780   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
781       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
782       "\xf7\xbf"));
783
784   // Ill-formed 5-byte sequences.
785   // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
786   // U+200yxx (invalid)
787   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
788       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
789       "\xf8\x88\x80"));
790   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
791       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
792       "\xf8\xbf\xbf"));
793   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
794       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
795       "\xf9\x80\x80"));
796   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
797       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
798       "\xfa\x80\x80"));
799   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
800       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
801       "\xfb\x80\x80"));
802   // U+3FFFyxx (invalid)
803   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
804       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
805       "\xfb\xbf\xbf"));
806
807   // Ill-formed 6-byte sequences.
808   // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
809   // U+4000yxx (invalid)
810   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
811       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
812       "\xfc\x84\x80\x80"));
813   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
814       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
815       "\xfc\xbf\xbf\xbf"));
816   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
817       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
818       "\xfd\x80\x80\x80"));
819   // U+7FFFFyxx (invalid)
820   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
821       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
822       "\xfd\xbf\xbf\xbf"));
823
824   //
825   // Sequences with three continuation bytes missing
826   //
827
828   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
829       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf0"));
830   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
831       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf1"));
832   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
833       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf2"));
834   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
835       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf3"));
836   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
837       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf4"));
838
839   // Broken overlong sequences.
840   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
841       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf0"));
842   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
843       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
844       "\xf8\x80"));
845   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
846       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
847       "\xfc\x80\x80"));
848
849   // Ill-formed 4-byte sequences.
850   // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
851   // U+14yyxx (invalid)
852   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
853       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf5"));
854   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
855       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf6"));
856   // U+1Cyyxx (invalid)
857   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
858       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf7"));
859
860   // Ill-formed 5-byte sequences.
861   // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
862   // U+20yyxx (invalid)
863   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
864       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
865       "\xf8\x88"));
866   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
867       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
868       "\xf8\xbf"));
869   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
870       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
871       "\xf9\x80"));
872   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
873       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
874       "\xfa\x80"));
875   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
876       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
877       "\xfb\x80"));
878   // U+3FCyyxx (invalid)
879   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
880       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
881       "\xfb\xbf"));
882
883   // Ill-formed 6-byte sequences.
884   // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
885   // U+400yyxx (invalid)
886   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
887       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
888       "\xfc\x84\x80"));
889   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
890       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
891       "\xfc\xbf\xbf"));
892   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
893       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
894       "\xfd\x80\x80"));
895   // U+7FFCyyxx (invalid)
896   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
897       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
898       "\xfd\xbf\xbf"));
899
900   //
901   // Sequences with four continuation bytes missing
902   //
903
904   // Ill-formed 5-byte sequences.
905   // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
906   // U+uzyyxx (invalid)
907   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
908       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf8"));
909   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
910       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf9"));
911   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
912       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfa"));
913   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
914       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfb"));
915   // U+3zyyxx (invalid)
916   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
917       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfb"));
918
919   // Broken overlong sequences.
920   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
921       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf8"));
922   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
923       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
924       "\xfc\x80"));
925
926   // Ill-formed 6-byte sequences.
927   // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
928   // U+uzzyyxx (invalid)
929   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
930       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
931       "\xfc\x84"));
932   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
933       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
934       "\xfc\xbf"));
935   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
936       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
937       "\xfd\x80"));
938   // U+7Fzzyyxx (invalid)
939   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
940       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
941       "\xfd\xbf"));
942
943   //
944   // Sequences with five continuation bytes missing
945   //
946
947   // Ill-formed 6-byte sequences.
948   // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
949   // U+uzzyyxx (invalid)
950   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
951       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfc"));
952   // U+uuzzyyxx (invalid)
953   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
954       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfd"));
955
956   //
957   // Consecutive sequences with trailing bytes missing
958   //
959
960   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
961       ConvertUTFResultContainer(sourceIllegal)
962           .withScalars(0xfffd, /**/ 0xfffd, 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd)
963           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd)
964           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd)
965           .withScalars(0xfffd, /**/ 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd)
966           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd)
967           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
968       "\xc0" "\xe0\x80" "\xf0\x80\x80"
969       "\xf8\x80\x80\x80"
970       "\xfc\x80\x80\x80\x80"
971       "\xdf" "\xef\xbf" "\xf7\xbf\xbf"
972       "\xfb\xbf\xbf\xbf"
973       "\xfd\xbf\xbf\xbf\xbf"));
974
975   //
976   // Overlong UTF-8 sequences
977   //
978
979   // U+002F SOLIDUS
980   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
981       ConvertUTFResultContainer(conversionOK).withScalars(0x002f), "\x2f"));
982
983   // Overlong sequences of the above.
984   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
985       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
986       "\xc0\xaf"));
987   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
988       ConvertUTFResultContainer(sourceIllegal)
989           .withScalars(0xfffd, 0xfffd, 0xfffd),
990       "\xe0\x80\xaf"));
991   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
992       ConvertUTFResultContainer(sourceIllegal)
993           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
994       "\xf0\x80\x80\xaf"));
995   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
996       ConvertUTFResultContainer(sourceIllegal)
997           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
998       "\xf8\x80\x80\x80\xaf"));
999   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1000       ConvertUTFResultContainer(sourceIllegal)
1001           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1002       "\xfc\x80\x80\x80\x80\xaf"));
1003
1004   // U+0000 NULL
1005   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1006       ConvertUTFResultContainer(conversionOK).withScalars(0x0000),
1007       StringRef("\x00", 1)));
1008
1009   // Overlong sequences of the above.
1010   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1011       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1012       "\xc0\x80"));
1013   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1014       ConvertUTFResultContainer(sourceIllegal)
1015           .withScalars(0xfffd, 0xfffd, 0xfffd),
1016       "\xe0\x80\x80"));
1017   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1018       ConvertUTFResultContainer(sourceIllegal)
1019           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1020       "\xf0\x80\x80\x80"));
1021   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1022       ConvertUTFResultContainer(sourceIllegal)
1023           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1024       "\xf8\x80\x80\x80\x80"));
1025   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1026       ConvertUTFResultContainer(sourceIllegal)
1027           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1028       "\xfc\x80\x80\x80\x80\x80"));
1029
1030   // Other overlong sequences.
1031   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1032       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1033       "\xc0\xbf"));
1034   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1035       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1036       "\xc1\x80"));
1037   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1038       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1039       "\xc1\xbf"));
1040   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1041       ConvertUTFResultContainer(sourceIllegal)
1042           .withScalars(0xfffd, 0xfffd, 0xfffd),
1043       "\xe0\x9f\xbf"));
1044   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1045       ConvertUTFResultContainer(sourceIllegal)
1046           .withScalars(0xfffd, 0xfffd, 0xfffd),
1047       "\xed\xa0\x80"));
1048   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1049       ConvertUTFResultContainer(sourceIllegal)
1050           .withScalars(0xfffd, 0xfffd, 0xfffd),
1051       "\xed\xbf\xbf"));
1052   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1053       ConvertUTFResultContainer(sourceIllegal)
1054           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1055       "\xf0\x8f\x80\x80"));
1056   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1057       ConvertUTFResultContainer(sourceIllegal)
1058           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1059       "\xf0\x8f\xbf\xbf"));
1060   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1061       ConvertUTFResultContainer(sourceIllegal)
1062           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1063       "\xf8\x87\xbf\xbf\xbf"));
1064   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1065       ConvertUTFResultContainer(sourceIllegal)
1066           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1067       "\xfc\x83\xbf\xbf\xbf\xbf"));
1068
1069   //
1070   // Isolated surrogates
1071   //
1072
1073   // Unicode 6.3.0:
1074   //
1075   //    D71.  High-surrogate code point: A Unicode code point in the range
1076   //    U+D800 to U+DBFF.
1077   //
1078   //    D73.  Low-surrogate code point: A Unicode code point in the range
1079   //    U+DC00 to U+DFFF.
1080
1081   // Note: U+E0100 is <DB40 DD00> in UTF-16.
1082
1083   // High surrogates
1084
1085   // U+D800
1086   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1087       ConvertUTFResultContainer(sourceIllegal)
1088           .withScalars(0xfffd, 0xfffd, 0xfffd),
1089       "\xed\xa0\x80"));
1090
1091   // U+DB40
1092   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1093       ConvertUTFResultContainer(sourceIllegal)
1094           .withScalars(0xfffd, 0xfffd, 0xfffd),
1095       "\xed\xac\xa0"));
1096
1097   // U+DBFF
1098   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1099       ConvertUTFResultContainer(sourceIllegal)
1100           .withScalars(0xfffd, 0xfffd, 0xfffd),
1101       "\xed\xaf\xbf"));
1102
1103   // Low surrogates
1104
1105   // U+DC00
1106   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1107       ConvertUTFResultContainer(sourceIllegal)
1108           .withScalars(0xfffd, 0xfffd, 0xfffd),
1109       "\xed\xb0\x80"));
1110
1111   // U+DD00
1112   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1113       ConvertUTFResultContainer(sourceIllegal)
1114           .withScalars(0xfffd, 0xfffd, 0xfffd),
1115       "\xed\xb4\x80"));
1116
1117   // U+DFFF
1118   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1119       ConvertUTFResultContainer(sourceIllegal)
1120           .withScalars(0xfffd, 0xfffd, 0xfffd),
1121       "\xed\xbf\xbf"));
1122
1123   // Surrogate pairs
1124
1125   // U+D800 U+DC00
1126   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1127       ConvertUTFResultContainer(sourceIllegal)
1128           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1129       "\xed\xa0\x80\xed\xb0\x80"));
1130
1131   // U+D800 U+DD00
1132   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1133       ConvertUTFResultContainer(sourceIllegal)
1134           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1135       "\xed\xa0\x80\xed\xb4\x80"));
1136
1137   // U+D800 U+DFFF
1138   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1139       ConvertUTFResultContainer(sourceIllegal)
1140           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1141       "\xed\xa0\x80\xed\xbf\xbf"));
1142
1143   // U+DB40 U+DC00
1144   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1145       ConvertUTFResultContainer(sourceIllegal)
1146           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1147       "\xed\xac\xa0\xed\xb0\x80"));
1148
1149   // U+DB40 U+DD00
1150   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1151       ConvertUTFResultContainer(sourceIllegal)
1152           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1153       "\xed\xac\xa0\xed\xb4\x80"));
1154
1155   // U+DB40 U+DFFF
1156   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1157       ConvertUTFResultContainer(sourceIllegal)
1158           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1159       "\xed\xac\xa0\xed\xbf\xbf"));
1160
1161   // U+DBFF U+DC00
1162   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1163       ConvertUTFResultContainer(sourceIllegal)
1164           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1165       "\xed\xaf\xbf\xed\xb0\x80"));
1166
1167   // U+DBFF U+DD00
1168   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1169       ConvertUTFResultContainer(sourceIllegal)
1170           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1171       "\xed\xaf\xbf\xed\xb4\x80"));
1172
1173   // U+DBFF U+DFFF
1174   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1175       ConvertUTFResultContainer(sourceIllegal)
1176           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1177       "\xed\xaf\xbf\xed\xbf\xbf"));
1178
1179   //
1180   // Noncharacters
1181   //
1182
1183   // Unicode 6.3.0:
1184   //
1185   //    D14.  Noncharacter: A code point that is permanently reserved for
1186   //    internal use and that should never be interchanged. Noncharacters
1187   //    consist of the values U+nFFFE and U+nFFFF (where n is from 0 to 0x10)
1188   //    and the values U+FDD0..U+FDEF.
1189
1190   // U+FFFE
1191   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1192       ConvertUTFResultContainer(conversionOK).withScalars(0xfffe),
1193       "\xef\xbf\xbe"));
1194
1195   // U+FFFF
1196   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1197       ConvertUTFResultContainer(conversionOK).withScalars(0xffff),
1198       "\xef\xbf\xbf"));
1199
1200   // U+1FFFE
1201   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1202       ConvertUTFResultContainer(conversionOK).withScalars(0x1fffe),
1203       "\xf0\x9f\xbf\xbe"));
1204
1205   // U+1FFFF
1206   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1207       ConvertUTFResultContainer(conversionOK).withScalars(0x1ffff),
1208       "\xf0\x9f\xbf\xbf"));
1209
1210   // U+2FFFE
1211   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1212       ConvertUTFResultContainer(conversionOK).withScalars(0x2fffe),
1213       "\xf0\xaf\xbf\xbe"));
1214
1215   // U+2FFFF
1216   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1217       ConvertUTFResultContainer(conversionOK).withScalars(0x2ffff),
1218       "\xf0\xaf\xbf\xbf"));
1219
1220   // U+3FFFE
1221   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1222       ConvertUTFResultContainer(conversionOK).withScalars(0x3fffe),
1223       "\xf0\xbf\xbf\xbe"));
1224
1225   // U+3FFFF
1226   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1227       ConvertUTFResultContainer(conversionOK).withScalars(0x3ffff),
1228       "\xf0\xbf\xbf\xbf"));
1229
1230   // U+4FFFE
1231   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1232       ConvertUTFResultContainer(conversionOK).withScalars(0x4fffe),
1233       "\xf1\x8f\xbf\xbe"));
1234
1235   // U+4FFFF
1236   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1237       ConvertUTFResultContainer(conversionOK).withScalars(0x4ffff),
1238       "\xf1\x8f\xbf\xbf"));
1239
1240   // U+5FFFE
1241   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1242       ConvertUTFResultContainer(conversionOK).withScalars(0x5fffe),
1243       "\xf1\x9f\xbf\xbe"));
1244
1245   // U+5FFFF
1246   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1247       ConvertUTFResultContainer(conversionOK).withScalars(0x5ffff),
1248       "\xf1\x9f\xbf\xbf"));
1249
1250   // U+6FFFE
1251   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1252       ConvertUTFResultContainer(conversionOK).withScalars(0x6fffe),
1253       "\xf1\xaf\xbf\xbe"));
1254
1255   // U+6FFFF
1256   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1257       ConvertUTFResultContainer(conversionOK).withScalars(0x6ffff),
1258       "\xf1\xaf\xbf\xbf"));
1259
1260   // U+7FFFE
1261   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1262       ConvertUTFResultContainer(conversionOK).withScalars(0x7fffe),
1263       "\xf1\xbf\xbf\xbe"));
1264
1265   // U+7FFFF
1266   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1267       ConvertUTFResultContainer(conversionOK).withScalars(0x7ffff),
1268       "\xf1\xbf\xbf\xbf"));
1269
1270   // U+8FFFE
1271   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1272       ConvertUTFResultContainer(conversionOK).withScalars(0x8fffe),
1273       "\xf2\x8f\xbf\xbe"));
1274
1275   // U+8FFFF
1276   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1277       ConvertUTFResultContainer(conversionOK).withScalars(0x8ffff),
1278       "\xf2\x8f\xbf\xbf"));
1279
1280   // U+9FFFE
1281   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1282       ConvertUTFResultContainer(conversionOK).withScalars(0x9fffe),
1283       "\xf2\x9f\xbf\xbe"));
1284
1285   // U+9FFFF
1286   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1287       ConvertUTFResultContainer(conversionOK).withScalars(0x9ffff),
1288       "\xf2\x9f\xbf\xbf"));
1289
1290   // U+AFFFE
1291   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1292       ConvertUTFResultContainer(conversionOK).withScalars(0xafffe),
1293       "\xf2\xaf\xbf\xbe"));
1294
1295   // U+AFFFF
1296   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1297       ConvertUTFResultContainer(conversionOK).withScalars(0xaffff),
1298       "\xf2\xaf\xbf\xbf"));
1299
1300   // U+BFFFE
1301   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1302       ConvertUTFResultContainer(conversionOK).withScalars(0xbfffe),
1303       "\xf2\xbf\xbf\xbe"));
1304
1305   // U+BFFFF
1306   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1307       ConvertUTFResultContainer(conversionOK).withScalars(0xbffff),
1308       "\xf2\xbf\xbf\xbf"));
1309
1310   // U+CFFFE
1311   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1312       ConvertUTFResultContainer(conversionOK).withScalars(0xcfffe),
1313       "\xf3\x8f\xbf\xbe"));
1314
1315   // U+CFFFF
1316   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1317       ConvertUTFResultContainer(conversionOK).withScalars(0xcfffF),
1318       "\xf3\x8f\xbf\xbf"));
1319
1320   // U+DFFFE
1321   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1322       ConvertUTFResultContainer(conversionOK).withScalars(0xdfffe),
1323       "\xf3\x9f\xbf\xbe"));
1324
1325   // U+DFFFF
1326   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1327       ConvertUTFResultContainer(conversionOK).withScalars(0xdffff),
1328       "\xf3\x9f\xbf\xbf"));
1329
1330   // U+EFFFE
1331   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1332       ConvertUTFResultContainer(conversionOK).withScalars(0xefffe),
1333       "\xf3\xaf\xbf\xbe"));
1334
1335   // U+EFFFF
1336   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1337       ConvertUTFResultContainer(conversionOK).withScalars(0xeffff),
1338       "\xf3\xaf\xbf\xbf"));
1339
1340   // U+FFFFE
1341   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1342       ConvertUTFResultContainer(conversionOK).withScalars(0xffffe),
1343       "\xf3\xbf\xbf\xbe"));
1344
1345   // U+FFFFF
1346   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1347       ConvertUTFResultContainer(conversionOK).withScalars(0xfffff),
1348       "\xf3\xbf\xbf\xbf"));
1349
1350   // U+10FFFE
1351   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1352       ConvertUTFResultContainer(conversionOK).withScalars(0x10fffe),
1353       "\xf4\x8f\xbf\xbe"));
1354
1355   // U+10FFFF
1356   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1357       ConvertUTFResultContainer(conversionOK).withScalars(0x10ffff),
1358       "\xf4\x8f\xbf\xbf"));
1359
1360   // U+FDD0
1361   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1362       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd0),
1363       "\xef\xb7\x90"));
1364
1365   // U+FDD1
1366   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1367       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd1),
1368       "\xef\xb7\x91"));
1369
1370   // U+FDD2
1371   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1372       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd2),
1373       "\xef\xb7\x92"));
1374
1375   // U+FDD3
1376   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1377       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd3),
1378       "\xef\xb7\x93"));
1379
1380   // U+FDD4
1381   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1382       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd4),
1383       "\xef\xb7\x94"));
1384
1385   // U+FDD5
1386   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1387       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd5),
1388       "\xef\xb7\x95"));
1389
1390   // U+FDD6
1391   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1392       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd6),
1393       "\xef\xb7\x96"));
1394
1395   // U+FDD7
1396   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1397       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd7),
1398       "\xef\xb7\x97"));
1399
1400   // U+FDD8
1401   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1402       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd8),
1403       "\xef\xb7\x98"));
1404
1405   // U+FDD9
1406   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1407       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd9),
1408       "\xef\xb7\x99"));
1409
1410   // U+FDDA
1411   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1412       ConvertUTFResultContainer(conversionOK).withScalars(0xfdda),
1413       "\xef\xb7\x9a"));
1414
1415   // U+FDDB
1416   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1417       ConvertUTFResultContainer(conversionOK).withScalars(0xfddb),
1418       "\xef\xb7\x9b"));
1419
1420   // U+FDDC
1421   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1422       ConvertUTFResultContainer(conversionOK).withScalars(0xfddc),
1423       "\xef\xb7\x9c"));
1424
1425   // U+FDDD
1426   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1427       ConvertUTFResultContainer(conversionOK).withScalars(0xfddd),
1428       "\xef\xb7\x9d"));
1429
1430   // U+FDDE
1431   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1432       ConvertUTFResultContainer(conversionOK).withScalars(0xfdde),
1433       "\xef\xb7\x9e"));
1434
1435   // U+FDDF
1436   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1437       ConvertUTFResultContainer(conversionOK).withScalars(0xfddf),
1438       "\xef\xb7\x9f"));
1439
1440   // U+FDE0
1441   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1442       ConvertUTFResultContainer(conversionOK).withScalars(0xfde0),
1443       "\xef\xb7\xa0"));
1444
1445   // U+FDE1
1446   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1447       ConvertUTFResultContainer(conversionOK).withScalars(0xfde1),
1448       "\xef\xb7\xa1"));
1449
1450   // U+FDE2
1451   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1452       ConvertUTFResultContainer(conversionOK).withScalars(0xfde2),
1453       "\xef\xb7\xa2"));
1454
1455   // U+FDE3
1456   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1457       ConvertUTFResultContainer(conversionOK).withScalars(0xfde3),
1458       "\xef\xb7\xa3"));
1459
1460   // U+FDE4
1461   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1462       ConvertUTFResultContainer(conversionOK).withScalars(0xfde4),
1463       "\xef\xb7\xa4"));
1464
1465   // U+FDE5
1466   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1467       ConvertUTFResultContainer(conversionOK).withScalars(0xfde5),
1468       "\xef\xb7\xa5"));
1469
1470   // U+FDE6
1471   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1472       ConvertUTFResultContainer(conversionOK).withScalars(0xfde6),
1473       "\xef\xb7\xa6"));
1474
1475   // U+FDE7
1476   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1477       ConvertUTFResultContainer(conversionOK).withScalars(0xfde7),
1478       "\xef\xb7\xa7"));
1479
1480   // U+FDE8
1481   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1482       ConvertUTFResultContainer(conversionOK).withScalars(0xfde8),
1483       "\xef\xb7\xa8"));
1484
1485   // U+FDE9
1486   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1487       ConvertUTFResultContainer(conversionOK).withScalars(0xfde9),
1488       "\xef\xb7\xa9"));
1489
1490   // U+FDEA
1491   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1492       ConvertUTFResultContainer(conversionOK).withScalars(0xfdea),
1493       "\xef\xb7\xaa"));
1494
1495   // U+FDEB
1496   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1497       ConvertUTFResultContainer(conversionOK).withScalars(0xfdeb),
1498       "\xef\xb7\xab"));
1499
1500   // U+FDEC
1501   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1502       ConvertUTFResultContainer(conversionOK).withScalars(0xfdec),
1503       "\xef\xb7\xac"));
1504
1505   // U+FDED
1506   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1507       ConvertUTFResultContainer(conversionOK).withScalars(0xfded),
1508       "\xef\xb7\xad"));
1509
1510   // U+FDEE
1511   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1512       ConvertUTFResultContainer(conversionOK).withScalars(0xfdee),
1513       "\xef\xb7\xae"));
1514
1515   // U+FDEF
1516   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1517       ConvertUTFResultContainer(conversionOK).withScalars(0xfdef),
1518       "\xef\xb7\xaf"));
1519
1520   // U+FDF0
1521   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1522       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf0),
1523       "\xef\xb7\xb0"));
1524
1525   // U+FDF1
1526   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1527       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf1),
1528       "\xef\xb7\xb1"));
1529
1530   // U+FDF2
1531   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1532       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf2),
1533       "\xef\xb7\xb2"));
1534
1535   // U+FDF3
1536   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1537       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf3),
1538       "\xef\xb7\xb3"));
1539
1540   // U+FDF4
1541   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1542       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf4),
1543       "\xef\xb7\xb4"));
1544
1545   // U+FDF5
1546   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1547       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf5),
1548       "\xef\xb7\xb5"));
1549
1550   // U+FDF6
1551   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1552       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf6),
1553       "\xef\xb7\xb6"));
1554
1555   // U+FDF7
1556   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1557       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf7),
1558       "\xef\xb7\xb7"));
1559
1560   // U+FDF8
1561   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1562       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf8),
1563       "\xef\xb7\xb8"));
1564
1565   // U+FDF9
1566   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1567       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf9),
1568       "\xef\xb7\xb9"));
1569
1570   // U+FDFA
1571   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1572       ConvertUTFResultContainer(conversionOK).withScalars(0xfdfa),
1573       "\xef\xb7\xba"));
1574
1575   // U+FDFB
1576   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1577       ConvertUTFResultContainer(conversionOK).withScalars(0xfdfb),
1578       "\xef\xb7\xbb"));
1579
1580   // U+FDFC
1581   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1582       ConvertUTFResultContainer(conversionOK).withScalars(0xfdfc),
1583       "\xef\xb7\xbc"));
1584
1585   // U+FDFD
1586   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1587       ConvertUTFResultContainer(conversionOK).withScalars(0xfdfd),
1588       "\xef\xb7\xbd"));
1589
1590   // U+FDFE
1591   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1592       ConvertUTFResultContainer(conversionOK).withScalars(0xfdfe),
1593       "\xef\xb7\xbe"));
1594
1595   // U+FDFF
1596   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1597       ConvertUTFResultContainer(conversionOK).withScalars(0xfdff),
1598       "\xef\xb7\xbf"));
1599 }
1600
1601 TEST(ConvertUTFTest, UTF8ToUTF32PartialLenient) {
1602   // U+0041 LATIN CAPITAL LETTER A
1603   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1604       ConvertUTFResultContainer(conversionOK).withScalars(0x0041),
1605       "\x41", true));
1606
1607   //
1608   // Sequences with one continuation byte missing
1609   //
1610
1611   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1612       ConvertUTFResultContainer(sourceExhausted),
1613       "\xc2", true));
1614   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1615       ConvertUTFResultContainer(sourceExhausted),
1616       "\xdf", true));
1617   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1618       ConvertUTFResultContainer(sourceExhausted),
1619       "\xe0\xa0", true));
1620   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1621       ConvertUTFResultContainer(sourceExhausted),
1622       "\xe0\xbf", true));
1623   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1624       ConvertUTFResultContainer(sourceExhausted),
1625       "\xe1\x80", true));
1626   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1627       ConvertUTFResultContainer(sourceExhausted),
1628       "\xec\xbf", true));
1629   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1630       ConvertUTFResultContainer(sourceExhausted),
1631       "\xed\x80", true));
1632   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1633       ConvertUTFResultContainer(sourceExhausted),
1634       "\xed\x9f", true));
1635   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1636       ConvertUTFResultContainer(sourceExhausted),
1637       "\xee\x80", true));
1638   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1639       ConvertUTFResultContainer(sourceExhausted),
1640       "\xef\xbf", true));
1641   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1642       ConvertUTFResultContainer(sourceExhausted),
1643       "\xf0\x90\x80", true));
1644   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1645       ConvertUTFResultContainer(sourceExhausted),
1646       "\xf0\xbf\xbf", true));
1647   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1648       ConvertUTFResultContainer(sourceExhausted),
1649       "\xf1\x80\x80", true));
1650   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1651       ConvertUTFResultContainer(sourceExhausted),
1652       "\xf3\xbf\xbf", true));
1653   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1654       ConvertUTFResultContainer(sourceExhausted),
1655       "\xf4\x80\x80", true));
1656   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1657       ConvertUTFResultContainer(sourceExhausted),
1658       "\xf4\x8f\xbf", true));
1659
1660   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1661       ConvertUTFResultContainer(sourceExhausted).withScalars(0x0041),
1662       "\x41\xc2", true));
1663 }
1664