Remove possibility of failures due to a race in ThreadPool unittest
[oota-llvm.git] / unittests / Support / ConvertUTFTest.cpp
1 //===- llvm/unittest/Support/ConvertUTFTest.cpp - ConvertUTF tests --------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9
10 #include "llvm/Support/ConvertUTF.h"
11 #include "llvm/Support/Format.h"
12 #include "gtest/gtest.h"
13 #include <string>
14 #include <utility>
15 #include <vector>
16
17 using namespace llvm;
18
19 TEST(ConvertUTFTest, ConvertUTF16LittleEndianToUTF8String) {
20   // Src is the look of disapproval.
21   static const char Src[] = "\xff\xfe\xa0\x0c_\x00\xa0\x0c";
22   ArrayRef<char> Ref(Src, sizeof(Src) - 1);
23   std::string Result;
24   bool Success = convertUTF16ToUTF8String(Ref, Result);
25   EXPECT_TRUE(Success);
26   std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
27   EXPECT_EQ(Expected, Result);
28 }
29
30 TEST(ConvertUTFTest, ConvertUTF16BigEndianToUTF8String) {
31   // Src is the look of disapproval.
32   static const char Src[] = "\xfe\xff\x0c\xa0\x00_\x0c\xa0";
33   ArrayRef<char> Ref(Src, sizeof(Src) - 1);
34   std::string Result;
35   bool Success = convertUTF16ToUTF8String(Ref, Result);
36   EXPECT_TRUE(Success);
37   std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
38   EXPECT_EQ(Expected, Result);
39 }
40
41 TEST(ConvertUTFTest, ConvertUTF8ToUTF16String) {
42   // Src is the look of disapproval.
43   static const char Src[] = "\xe0\xb2\xa0_\xe0\xb2\xa0";
44   StringRef Ref(Src, sizeof(Src) - 1);
45   SmallVector<UTF16, 5> Result;
46   bool Success = convertUTF8ToUTF16String(Ref, Result);
47   EXPECT_TRUE(Success);
48   static const UTF16 Expected[] = {0x0CA0, 0x005f, 0x0CA0, 0};
49   ASSERT_EQ(3u, Result.size());
50   for (int I = 0, E = 3; I != E; ++I)
51     EXPECT_EQ(Expected[I], Result[I]);
52 }
53
54 TEST(ConvertUTFTest, OddLengthInput) {
55   std::string Result;
56   bool Success = convertUTF16ToUTF8String(makeArrayRef("xxxxx", 5), Result);
57   EXPECT_FALSE(Success);
58 }
59
60 TEST(ConvertUTFTest, Empty) {
61   std::string Result;
62   bool Success = convertUTF16ToUTF8String(None, Result);
63   EXPECT_TRUE(Success);
64   EXPECT_TRUE(Result.empty());
65 }
66
67 TEST(ConvertUTFTest, HasUTF16BOM) {
68   bool HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xff\xfe", 2));
69   EXPECT_TRUE(HasBOM);
70   HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff", 2));
71   EXPECT_TRUE(HasBOM);
72   HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff ", 3));
73   EXPECT_TRUE(HasBOM); // Don't care about odd lengths.
74   HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff\x00asdf", 6));
75   EXPECT_TRUE(HasBOM);
76
77   HasBOM = hasUTF16ByteOrderMark(None);
78   EXPECT_FALSE(HasBOM);
79   HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe", 1));
80   EXPECT_FALSE(HasBOM);
81 }
82
83 struct ConvertUTFResultContainer {
84   ConversionResult ErrorCode;
85   std::vector<unsigned> UnicodeScalars;
86
87   ConvertUTFResultContainer(ConversionResult ErrorCode)
88       : ErrorCode(ErrorCode) {}
89
90   ConvertUTFResultContainer
91   withScalars(unsigned US0 = 0x110000, unsigned US1 = 0x110000,
92               unsigned US2 = 0x110000, unsigned US3 = 0x110000,
93               unsigned US4 = 0x110000, unsigned US5 = 0x110000,
94               unsigned US6 = 0x110000, unsigned US7 = 0x110000) {
95     ConvertUTFResultContainer Result(*this);
96     if (US0 != 0x110000)
97       Result.UnicodeScalars.push_back(US0);
98     if (US1 != 0x110000)
99       Result.UnicodeScalars.push_back(US1);
100     if (US2 != 0x110000)
101       Result.UnicodeScalars.push_back(US2);
102     if (US3 != 0x110000)
103       Result.UnicodeScalars.push_back(US3);
104     if (US4 != 0x110000)
105       Result.UnicodeScalars.push_back(US4);
106     if (US5 != 0x110000)
107       Result.UnicodeScalars.push_back(US5);
108     if (US6 != 0x110000)
109       Result.UnicodeScalars.push_back(US6);
110     if (US7 != 0x110000)
111       Result.UnicodeScalars.push_back(US7);
112     return Result;
113   }
114 };
115
116 std::pair<ConversionResult, std::vector<unsigned>>
117 ConvertUTF8ToUnicodeScalarsLenient(StringRef S) {
118   const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data());
119
120   const UTF8 *SourceNext = SourceStart;
121   std::vector<UTF32> Decoded(S.size(), 0);
122   UTF32 *TargetStart = Decoded.data();
123
124   auto ErrorCode =
125       ConvertUTF8toUTF32(&SourceNext, SourceStart + S.size(), &TargetStart,
126                          Decoded.data() + Decoded.size(), lenientConversion);
127
128   Decoded.resize(TargetStart - Decoded.data());
129
130   return std::make_pair(ErrorCode, Decoded);
131 }
132
133 std::pair<ConversionResult, std::vector<unsigned>>
134 ConvertUTF8ToUnicodeScalarsPartialLenient(StringRef S) {
135   const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data());
136
137   const UTF8 *SourceNext = SourceStart;
138   std::vector<UTF32> Decoded(S.size(), 0);
139   UTF32 *TargetStart = Decoded.data();
140
141   auto ErrorCode = ConvertUTF8toUTF32Partial(
142       &SourceNext, SourceStart + S.size(), &TargetStart,
143       Decoded.data() + Decoded.size(), lenientConversion);
144
145   Decoded.resize(TargetStart - Decoded.data());
146
147   return std::make_pair(ErrorCode, Decoded);
148 }
149
150 ::testing::AssertionResult
151 CheckConvertUTF8ToUnicodeScalars(ConvertUTFResultContainer Expected,
152                                  StringRef S, bool Partial = false) {
153   ConversionResult ErrorCode;
154   std::vector<unsigned> Decoded;
155   if (!Partial)
156     std::tie(ErrorCode, Decoded) = ConvertUTF8ToUnicodeScalarsLenient(S);
157   else
158     std::tie(ErrorCode, Decoded) = ConvertUTF8ToUnicodeScalarsPartialLenient(S);
159
160   if (Expected.ErrorCode != ErrorCode)
161     return ::testing::AssertionFailure() << "Expected error code "
162                                          << Expected.ErrorCode << ", actual "
163                                          << ErrorCode;
164
165   if (Expected.UnicodeScalars != Decoded)
166     return ::testing::AssertionFailure()
167            << "Expected lenient decoded result:\n"
168            << ::testing::PrintToString(Expected.UnicodeScalars) << "\n"
169            << "Actual result:\n" << ::testing::PrintToString(Decoded);
170
171   return ::testing::AssertionSuccess();
172 }
173
174 TEST(ConvertUTFTest, UTF8ToUTF32Lenient) {
175
176   //
177   // 1-byte sequences
178   //
179
180   // U+0041 LATIN CAPITAL LETTER A
181   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
182       ConvertUTFResultContainer(conversionOK).withScalars(0x0041), "\x41"));
183
184   //
185   // 2-byte sequences
186   //
187
188   // U+0283 LATIN SMALL LETTER ESH
189   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
190       ConvertUTFResultContainer(conversionOK).withScalars(0x0283),
191       "\xca\x83"));
192
193   // U+03BA GREEK SMALL LETTER KAPPA
194   // U+1F79 GREEK SMALL LETTER OMICRON WITH OXIA
195   // U+03C3 GREEK SMALL LETTER SIGMA
196   // U+03BC GREEK SMALL LETTER MU
197   // U+03B5 GREEK SMALL LETTER EPSILON
198   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
199       ConvertUTFResultContainer(conversionOK)
200           .withScalars(0x03ba, 0x1f79, 0x03c3, 0x03bc, 0x03b5),
201       "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5"));
202
203   //
204   // 3-byte sequences
205   //
206
207   // U+4F8B CJK UNIFIED IDEOGRAPH-4F8B
208   // U+6587 CJK UNIFIED IDEOGRAPH-6587
209   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
210       ConvertUTFResultContainer(conversionOK).withScalars(0x4f8b, 0x6587),
211       "\xe4\xbe\x8b\xe6\x96\x87"));
212
213   // U+D55C HANGUL SYLLABLE HAN
214   // U+AE00 HANGUL SYLLABLE GEUL
215   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
216       ConvertUTFResultContainer(conversionOK).withScalars(0xd55c, 0xae00),
217       "\xed\x95\x9c\xea\xb8\x80"));
218
219   // U+1112 HANGUL CHOSEONG HIEUH
220   // U+1161 HANGUL JUNGSEONG A
221   // U+11AB HANGUL JONGSEONG NIEUN
222   // U+1100 HANGUL CHOSEONG KIYEOK
223   // U+1173 HANGUL JUNGSEONG EU
224   // U+11AF HANGUL JONGSEONG RIEUL
225   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
226       ConvertUTFResultContainer(conversionOK)
227           .withScalars(0x1112, 0x1161, 0x11ab, 0x1100, 0x1173, 0x11af),
228       "\xe1\x84\x92\xe1\x85\xa1\xe1\x86\xab\xe1\x84\x80\xe1\x85\xb3"
229       "\xe1\x86\xaf"));
230
231   //
232   // 4-byte sequences
233   //
234
235   // U+E0100 VARIATION SELECTOR-17
236   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
237       ConvertUTFResultContainer(conversionOK).withScalars(0x000E0100),
238       "\xf3\xa0\x84\x80"));
239
240   //
241   // First possible sequence of a certain length
242   //
243
244   // U+0000 NULL
245   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
246       ConvertUTFResultContainer(conversionOK).withScalars(0x0000),
247       StringRef("\x00", 1)));
248
249   // U+0080 PADDING CHARACTER
250   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
251       ConvertUTFResultContainer(conversionOK).withScalars(0x0080),
252       "\xc2\x80"));
253
254   // U+0800 SAMARITAN LETTER ALAF
255   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
256       ConvertUTFResultContainer(conversionOK).withScalars(0x0800),
257       "\xe0\xa0\x80"));
258
259   // U+10000 LINEAR B SYLLABLE B008 A
260   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
261       ConvertUTFResultContainer(conversionOK).withScalars(0x10000),
262       "\xf0\x90\x80\x80"));
263
264   // U+200000 (invalid)
265   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
266       ConvertUTFResultContainer(sourceIllegal)
267           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
268       "\xf8\x88\x80\x80\x80"));
269
270   // U+4000000 (invalid)
271   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
272       ConvertUTFResultContainer(sourceIllegal)
273           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
274       "\xfc\x84\x80\x80\x80\x80"));
275
276   //
277   // Last possible sequence of a certain length
278   //
279
280   // U+007F DELETE
281   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
282       ConvertUTFResultContainer(conversionOK).withScalars(0x007f), "\x7f"));
283
284   // U+07FF (unassigned)
285   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
286       ConvertUTFResultContainer(conversionOK).withScalars(0x07ff),
287       "\xdf\xbf"));
288
289   // U+FFFF (noncharacter)
290   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
291       ConvertUTFResultContainer(conversionOK).withScalars(0xffff),
292       "\xef\xbf\xbf"));
293
294   // U+1FFFFF (invalid)
295   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
296       ConvertUTFResultContainer(sourceIllegal)
297           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
298       "\xf7\xbf\xbf\xbf"));
299
300   // U+3FFFFFF (invalid)
301   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
302       ConvertUTFResultContainer(sourceIllegal)
303           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
304       "\xfb\xbf\xbf\xbf\xbf"));
305
306   // U+7FFFFFFF (invalid)
307   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
308       ConvertUTFResultContainer(sourceIllegal)
309           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
310       "\xfd\xbf\xbf\xbf\xbf\xbf"));
311
312   //
313   // Other boundary conditions
314   //
315
316   // U+D7FF (unassigned)
317   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
318       ConvertUTFResultContainer(conversionOK).withScalars(0xd7ff),
319       "\xed\x9f\xbf"));
320
321   // U+E000 (private use)
322   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
323       ConvertUTFResultContainer(conversionOK).withScalars(0xe000),
324       "\xee\x80\x80"));
325
326   // U+FFFD REPLACEMENT CHARACTER
327   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
328       ConvertUTFResultContainer(conversionOK).withScalars(0xfffd),
329       "\xef\xbf\xbd"));
330
331   // U+10FFFF (noncharacter)
332   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
333       ConvertUTFResultContainer(conversionOK).withScalars(0x10ffff),
334       "\xf4\x8f\xbf\xbf"));
335
336   // U+110000 (invalid)
337   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
338       ConvertUTFResultContainer(sourceIllegal)
339           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
340       "\xf4\x90\x80\x80"));
341
342   //
343   // Unexpected continuation bytes
344   //
345
346   // A sequence of unexpected continuation bytes that don't follow a first
347   // byte, every byte is a maximal subpart.
348
349   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
350       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\x80"));
351   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
352       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xbf"));
353   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
354       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
355       "\x80\x80"));
356   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
357       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
358       "\x80\xbf"));
359   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
360       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
361       "\xbf\x80"));
362   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
363       ConvertUTFResultContainer(sourceIllegal)
364           .withScalars(0xfffd, 0xfffd, 0xfffd),
365       "\x80\xbf\x80"));
366   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
367       ConvertUTFResultContainer(sourceIllegal)
368           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
369       "\x80\xbf\x80\xbf"));
370   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
371       ConvertUTFResultContainer(sourceIllegal)
372           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
373       "\x80\xbf\x82\xbf\xaa"));
374   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
375       ConvertUTFResultContainer(sourceIllegal)
376           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
377       "\xaa\xb0\xbb\xbf\xaa\xa0"));
378   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
379       ConvertUTFResultContainer(sourceIllegal)
380           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
381       "\xaa\xb0\xbb\xbf\xaa\xa0\x8f"));
382
383   // All continuation bytes (0x80--0xbf).
384   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
385       ConvertUTFResultContainer(sourceIllegal)
386           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
387                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
388           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
389                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
390           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
391                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
392           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
393                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
394           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
395                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
396           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
397                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
398           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
399                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
400           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
401                        0xfffd, 0xfffd, 0xfffd, 0xfffd),
402       "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
403       "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
404       "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf"
405       "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"));
406
407   //
408   // Lonely start bytes
409   //
410
411   // Start bytes of 2-byte sequences (0xc0--0xdf).
412   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
413       ConvertUTFResultContainer(sourceIllegal)
414           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
415                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
416           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
417                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
418           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
419                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
420           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
421                        0xfffd, 0xfffd, 0xfffd, 0xfffd),
422       "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf"
423       "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf"));
424
425   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
426       ConvertUTFResultContainer(sourceIllegal)
427           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
428                        0xfffd, 0x0020, 0xfffd, 0x0020)
429           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
430                        0xfffd, 0x0020, 0xfffd, 0x0020)
431           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
432                        0xfffd, 0x0020, 0xfffd, 0x0020)
433           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
434                        0xfffd, 0x0020, 0xfffd, 0x0020)
435           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
436                        0xfffd, 0x0020, 0xfffd, 0x0020)
437           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
438                        0xfffd, 0x0020, 0xfffd, 0x0020)
439           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
440                        0xfffd, 0x0020, 0xfffd, 0x0020)
441           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
442                        0xfffd, 0x0020, 0xfffd, 0x0020),
443       "\xc0\x20\xc1\x20\xc2\x20\xc3\x20\xc4\x20\xc5\x20\xc6\x20\xc7\x20"
444       "\xc8\x20\xc9\x20\xca\x20\xcb\x20\xcc\x20\xcd\x20\xce\x20\xcf\x20"
445       "\xd0\x20\xd1\x20\xd2\x20\xd3\x20\xd4\x20\xd5\x20\xd6\x20\xd7\x20"
446       "\xd8\x20\xd9\x20\xda\x20\xdb\x20\xdc\x20\xdd\x20\xde\x20\xdf\x20"));
447
448   // Start bytes of 3-byte sequences (0xe0--0xef).
449   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
450       ConvertUTFResultContainer(sourceIllegal)
451           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
452                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
453           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
454                        0xfffd, 0xfffd, 0xfffd, 0xfffd),
455       "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"));
456
457   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
458       ConvertUTFResultContainer(sourceIllegal)
459           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
460                        0xfffd, 0x0020, 0xfffd, 0x0020)
461           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
462                        0xfffd, 0x0020, 0xfffd, 0x0020)
463           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
464                        0xfffd, 0x0020, 0xfffd, 0x0020)
465           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
466                        0xfffd, 0x0020, 0xfffd, 0x0020),
467       "\xe0\x20\xe1\x20\xe2\x20\xe3\x20\xe4\x20\xe5\x20\xe6\x20\xe7\x20"
468       "\xe8\x20\xe9\x20\xea\x20\xeb\x20\xec\x20\xed\x20\xee\x20\xef\x20"));
469
470   // Start bytes of 4-byte sequences (0xf0--0xf7).
471   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
472       ConvertUTFResultContainer(sourceIllegal)
473           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
474                        0xfffd, 0xfffd, 0xfffd, 0xfffd),
475       "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7"));
476
477   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
478       ConvertUTFResultContainer(sourceIllegal)
479           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
480                        0xfffd, 0x0020, 0xfffd, 0x0020)
481           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
482                        0xfffd, 0x0020, 0xfffd, 0x0020),
483       "\xf0\x20\xf1\x20\xf2\x20\xf3\x20\xf4\x20\xf5\x20\xf6\x20\xf7\x20"));
484
485   // Start bytes of 5-byte sequences (0xf8--0xfb).
486   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
487       ConvertUTFResultContainer(sourceIllegal)
488           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
489       "\xf8\xf9\xfa\xfb"));
490
491   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
492       ConvertUTFResultContainer(sourceIllegal)
493           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
494                        0xfffd, 0x0020, 0xfffd, 0x0020),
495       "\xf8\x20\xf9\x20\xfa\x20\xfb\x20"));
496
497   // Start bytes of 6-byte sequences (0xfc--0xfd).
498   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
499       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
500       "\xfc\xfd"));
501
502   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
503       ConvertUTFResultContainer(sourceIllegal)
504           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020),
505       "\xfc\x20\xfd\x20"));
506
507   //
508   // Other bytes (0xc0--0xc1, 0xfe--0xff).
509   //
510
511   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
512       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc0"));
513   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
514       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc1"));
515   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
516       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfe"));
517   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
518       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xff"));
519
520   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
521       ConvertUTFResultContainer(sourceIllegal)
522           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
523       "\xc0\xc1\xfe\xff"));
524
525   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
526       ConvertUTFResultContainer(sourceIllegal)
527           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
528       "\xfe\xfe\xff\xff"));
529
530   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
531       ConvertUTFResultContainer(sourceIllegal)
532           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
533       "\xfe\x80\x80\x80\x80\x80"));
534
535   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
536       ConvertUTFResultContainer(sourceIllegal)
537           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
538       "\xff\x80\x80\x80\x80\x80"));
539
540   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
541       ConvertUTFResultContainer(sourceIllegal)
542           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
543                        0xfffd, 0x0020, 0xfffd, 0x0020),
544       "\xc0\x20\xc1\x20\xfe\x20\xff\x20"));
545
546   //
547   // Sequences with one continuation byte missing
548   //
549
550   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
551       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc2"));
552   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
553       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xdf"));
554   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
555       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
556       "\xe0\xa0"));
557   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
558       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
559       "\xe0\xbf"));
560   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
561       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
562       "\xe1\x80"));
563   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
564       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
565       "\xec\xbf"));
566   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
567       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
568       "\xed\x80"));
569   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
570       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
571       "\xed\x9f"));
572   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
573       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
574       "\xee\x80"));
575   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
576       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
577       "\xef\xbf"));
578   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
579       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
580       "\xf0\x90\x80"));
581   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
582       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
583       "\xf0\xbf\xbf"));
584   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
585       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
586       "\xf1\x80\x80"));
587   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
588       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
589       "\xf3\xbf\xbf"));
590   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
591       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
592       "\xf4\x80\x80"));
593   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
594       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
595       "\xf4\x8f\xbf"));
596
597   // Overlong sequences with one trailing byte missing.
598   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
599       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
600       "\xc0"));
601   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
602       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
603       "\xc1"));
604   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
605       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
606       "\xe0\x80"));
607   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
608       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
609       "\xe0\x9f"));
610   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
611       ConvertUTFResultContainer(sourceIllegal)
612           .withScalars(0xfffd, 0xfffd, 0xfffd),
613       "\xf0\x80\x80"));
614   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
615       ConvertUTFResultContainer(sourceIllegal)
616           .withScalars(0xfffd, 0xfffd, 0xfffd),
617       "\xf0\x8f\x80"));
618   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
619       ConvertUTFResultContainer(sourceIllegal)
620           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
621       "\xf8\x80\x80\x80"));
622   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
623       ConvertUTFResultContainer(sourceIllegal)
624           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
625       "\xfc\x80\x80\x80\x80"));
626
627   // Sequences that represent surrogates with one trailing byte missing.
628   // High surrogates
629   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
630       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
631       "\xed\xa0"));
632   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
633       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
634       "\xed\xac"));
635   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
636       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
637       "\xed\xaf"));
638   // Low surrogates
639   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
640       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
641       "\xed\xb0"));
642   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
643       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
644       "\xed\xb4"));
645   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
646       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
647       "\xed\xbf"));
648
649   // Ill-formed 4-byte sequences.
650   // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
651   // U+1100xx (invalid)
652   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
653       ConvertUTFResultContainer(sourceIllegal)
654           .withScalars(0xfffd, 0xfffd, 0xfffd),
655       "\xf4\x90\x80"));
656   // U+13FBxx (invalid)
657   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
658       ConvertUTFResultContainer(sourceIllegal)
659           .withScalars(0xfffd, 0xfffd, 0xfffd),
660       "\xf4\xbf\xbf"));
661   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
662       ConvertUTFResultContainer(sourceIllegal)
663           .withScalars(0xfffd, 0xfffd, 0xfffd),
664       "\xf5\x80\x80"));
665   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
666       ConvertUTFResultContainer(sourceIllegal)
667           .withScalars(0xfffd, 0xfffd, 0xfffd),
668       "\xf6\x80\x80"));
669   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
670       ConvertUTFResultContainer(sourceIllegal)
671           .withScalars(0xfffd, 0xfffd, 0xfffd),
672       "\xf7\x80\x80"));
673   // U+1FFBxx (invalid)
674   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
675       ConvertUTFResultContainer(sourceIllegal)
676           .withScalars(0xfffd, 0xfffd, 0xfffd),
677       "\xf7\xbf\xbf"));
678
679   // Ill-formed 5-byte sequences.
680   // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
681   // U+2000xx (invalid)
682   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
683       ConvertUTFResultContainer(sourceIllegal)
684           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
685       "\xf8\x88\x80\x80"));
686   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
687       ConvertUTFResultContainer(sourceIllegal)
688           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
689       "\xf8\xbf\xbf\xbf"));
690   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
691       ConvertUTFResultContainer(sourceIllegal)
692           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
693       "\xf9\x80\x80\x80"));
694   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
695       ConvertUTFResultContainer(sourceIllegal)
696           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
697       "\xfa\x80\x80\x80"));
698   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
699       ConvertUTFResultContainer(sourceIllegal)
700           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
701       "\xfb\x80\x80\x80"));
702   // U+3FFFFxx (invalid)
703   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
704       ConvertUTFResultContainer(sourceIllegal)
705           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
706       "\xfb\xbf\xbf\xbf"));
707
708   // Ill-formed 6-byte sequences.
709   // 1111110u 10uuuuuu 10uzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
710   // U+40000xx (invalid)
711   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
712       ConvertUTFResultContainer(sourceIllegal)
713           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
714       "\xfc\x84\x80\x80\x80"));
715   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
716       ConvertUTFResultContainer(sourceIllegal)
717           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
718       "\xfc\xbf\xbf\xbf\xbf"));
719   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
720       ConvertUTFResultContainer(sourceIllegal)
721           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
722       "\xfd\x80\x80\x80\x80"));
723   // U+7FFFFFxx (invalid)
724   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
725       ConvertUTFResultContainer(sourceIllegal)
726           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
727       "\xfd\xbf\xbf\xbf\xbf"));
728
729   //
730   // Sequences with two continuation bytes missing
731   //
732
733   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
734       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
735       "\xf0\x90"));
736   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
737       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
738       "\xf0\xbf"));
739   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
740       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
741       "\xf1\x80"));
742   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
743       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
744       "\xf3\xbf"));
745   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
746       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
747       "\xf4\x80"));
748   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
749       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
750       "\xf4\x8f"));
751
752   // Overlong sequences with two trailing bytes missing.
753   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
754       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xe0"));
755   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
756       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
757       "\xf0\x80"));
758   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
759       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
760       "\xf0\x8f"));
761   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
762       ConvertUTFResultContainer(sourceIllegal)
763           .withScalars(0xfffd, 0xfffd, 0xfffd),
764       "\xf8\x80\x80"));
765   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
766       ConvertUTFResultContainer(sourceIllegal)
767           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
768       "\xfc\x80\x80\x80"));
769
770   // Sequences that represent surrogates with two trailing bytes missing.
771   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
772       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xed"));
773
774   // Ill-formed 4-byte sequences.
775   // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
776   // U+110yxx (invalid)
777   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
778       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
779       "\xf4\x90"));
780   // U+13Fyxx (invalid)
781   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
782       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
783       "\xf4\xbf"));
784   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
785       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
786       "\xf5\x80"));
787   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
788       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
789       "\xf6\x80"));
790   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
791       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
792       "\xf7\x80"));
793   // U+1FFyxx (invalid)
794   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
795       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
796       "\xf7\xbf"));
797
798   // Ill-formed 5-byte sequences.
799   // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
800   // U+200yxx (invalid)
801   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
802       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
803       "\xf8\x88\x80"));
804   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
805       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
806       "\xf8\xbf\xbf"));
807   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
808       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
809       "\xf9\x80\x80"));
810   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
811       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
812       "\xfa\x80\x80"));
813   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
814       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
815       "\xfb\x80\x80"));
816   // U+3FFFyxx (invalid)
817   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
818       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
819       "\xfb\xbf\xbf"));
820
821   // Ill-formed 6-byte sequences.
822   // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
823   // U+4000yxx (invalid)
824   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
825       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
826       "\xfc\x84\x80\x80"));
827   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
828       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
829       "\xfc\xbf\xbf\xbf"));
830   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
831       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
832       "\xfd\x80\x80\x80"));
833   // U+7FFFFyxx (invalid)
834   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
835       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
836       "\xfd\xbf\xbf\xbf"));
837
838   //
839   // Sequences with three continuation bytes missing
840   //
841
842   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
843       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf0"));
844   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
845       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf1"));
846   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
847       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf2"));
848   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
849       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf3"));
850   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
851       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf4"));
852
853   // Broken overlong sequences.
854   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
855       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf0"));
856   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
857       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
858       "\xf8\x80"));
859   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
860       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
861       "\xfc\x80\x80"));
862
863   // Ill-formed 4-byte sequences.
864   // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
865   // U+14yyxx (invalid)
866   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
867       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf5"));
868   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
869       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf6"));
870   // U+1Cyyxx (invalid)
871   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
872       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf7"));
873
874   // Ill-formed 5-byte sequences.
875   // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
876   // U+20yyxx (invalid)
877   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
878       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
879       "\xf8\x88"));
880   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
881       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
882       "\xf8\xbf"));
883   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
884       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
885       "\xf9\x80"));
886   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
887       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
888       "\xfa\x80"));
889   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
890       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
891       "\xfb\x80"));
892   // U+3FCyyxx (invalid)
893   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
894       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
895       "\xfb\xbf"));
896
897   // Ill-formed 6-byte sequences.
898   // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
899   // U+400yyxx (invalid)
900   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
901       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
902       "\xfc\x84\x80"));
903   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
904       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
905       "\xfc\xbf\xbf"));
906   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
907       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
908       "\xfd\x80\x80"));
909   // U+7FFCyyxx (invalid)
910   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
911       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
912       "\xfd\xbf\xbf"));
913
914   //
915   // Sequences with four continuation bytes missing
916   //
917
918   // Ill-formed 5-byte sequences.
919   // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
920   // U+uzyyxx (invalid)
921   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
922       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf8"));
923   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
924       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf9"));
925   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
926       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfa"));
927   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
928       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfb"));
929   // U+3zyyxx (invalid)
930   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
931       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfb"));
932
933   // Broken overlong sequences.
934   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
935       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf8"));
936   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
937       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
938       "\xfc\x80"));
939
940   // Ill-formed 6-byte sequences.
941   // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
942   // U+uzzyyxx (invalid)
943   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
944       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
945       "\xfc\x84"));
946   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
947       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
948       "\xfc\xbf"));
949   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
950       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
951       "\xfd\x80"));
952   // U+7Fzzyyxx (invalid)
953   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
954       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
955       "\xfd\xbf"));
956
957   //
958   // Sequences with five continuation bytes missing
959   //
960
961   // Ill-formed 6-byte sequences.
962   // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
963   // U+uzzyyxx (invalid)
964   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
965       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfc"));
966   // U+uuzzyyxx (invalid)
967   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
968       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfd"));
969
970   //
971   // Consecutive sequences with trailing bytes missing
972   //
973
974   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
975       ConvertUTFResultContainer(sourceIllegal)
976           .withScalars(0xfffd, /**/ 0xfffd, 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd)
977           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd)
978           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd)
979           .withScalars(0xfffd, /**/ 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd)
980           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd)
981           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
982       "\xc0" "\xe0\x80" "\xf0\x80\x80"
983       "\xf8\x80\x80\x80"
984       "\xfc\x80\x80\x80\x80"
985       "\xdf" "\xef\xbf" "\xf7\xbf\xbf"
986       "\xfb\xbf\xbf\xbf"
987       "\xfd\xbf\xbf\xbf\xbf"));
988
989   //
990   // Overlong UTF-8 sequences
991   //
992
993   // U+002F SOLIDUS
994   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
995       ConvertUTFResultContainer(conversionOK).withScalars(0x002f), "\x2f"));
996
997   // Overlong sequences of the above.
998   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
999       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1000       "\xc0\xaf"));
1001   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1002       ConvertUTFResultContainer(sourceIllegal)
1003           .withScalars(0xfffd, 0xfffd, 0xfffd),
1004       "\xe0\x80\xaf"));
1005   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1006       ConvertUTFResultContainer(sourceIllegal)
1007           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1008       "\xf0\x80\x80\xaf"));
1009   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1010       ConvertUTFResultContainer(sourceIllegal)
1011           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1012       "\xf8\x80\x80\x80\xaf"));
1013   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1014       ConvertUTFResultContainer(sourceIllegal)
1015           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1016       "\xfc\x80\x80\x80\x80\xaf"));
1017
1018   // U+0000 NULL
1019   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1020       ConvertUTFResultContainer(conversionOK).withScalars(0x0000),
1021       StringRef("\x00", 1)));
1022
1023   // Overlong sequences of the above.
1024   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1025       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1026       "\xc0\x80"));
1027   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1028       ConvertUTFResultContainer(sourceIllegal)
1029           .withScalars(0xfffd, 0xfffd, 0xfffd),
1030       "\xe0\x80\x80"));
1031   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1032       ConvertUTFResultContainer(sourceIllegal)
1033           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1034       "\xf0\x80\x80\x80"));
1035   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1036       ConvertUTFResultContainer(sourceIllegal)
1037           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1038       "\xf8\x80\x80\x80\x80"));
1039   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1040       ConvertUTFResultContainer(sourceIllegal)
1041           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1042       "\xfc\x80\x80\x80\x80\x80"));
1043
1044   // Other overlong sequences.
1045   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1046       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1047       "\xc0\xbf"));
1048   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1049       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1050       "\xc1\x80"));
1051   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1052       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1053       "\xc1\xbf"));
1054   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1055       ConvertUTFResultContainer(sourceIllegal)
1056           .withScalars(0xfffd, 0xfffd, 0xfffd),
1057       "\xe0\x9f\xbf"));
1058   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1059       ConvertUTFResultContainer(sourceIllegal)
1060           .withScalars(0xfffd, 0xfffd, 0xfffd),
1061       "\xed\xa0\x80"));
1062   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1063       ConvertUTFResultContainer(sourceIllegal)
1064           .withScalars(0xfffd, 0xfffd, 0xfffd),
1065       "\xed\xbf\xbf"));
1066   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1067       ConvertUTFResultContainer(sourceIllegal)
1068           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1069       "\xf0\x8f\x80\x80"));
1070   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1071       ConvertUTFResultContainer(sourceIllegal)
1072           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1073       "\xf0\x8f\xbf\xbf"));
1074   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1075       ConvertUTFResultContainer(sourceIllegal)
1076           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1077       "\xf8\x87\xbf\xbf\xbf"));
1078   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1079       ConvertUTFResultContainer(sourceIllegal)
1080           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1081       "\xfc\x83\xbf\xbf\xbf\xbf"));
1082
1083   //
1084   // Isolated surrogates
1085   //
1086
1087   // Unicode 6.3.0:
1088   //
1089   //    D71.  High-surrogate code point: A Unicode code point in the range
1090   //    U+D800 to U+DBFF.
1091   //
1092   //    D73.  Low-surrogate code point: A Unicode code point in the range
1093   //    U+DC00 to U+DFFF.
1094
1095   // Note: U+E0100 is <DB40 DD00> in UTF16.
1096
1097   // High surrogates
1098
1099   // U+D800
1100   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1101       ConvertUTFResultContainer(sourceIllegal)
1102           .withScalars(0xfffd, 0xfffd, 0xfffd),
1103       "\xed\xa0\x80"));
1104
1105   // U+DB40
1106   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1107       ConvertUTFResultContainer(sourceIllegal)
1108           .withScalars(0xfffd, 0xfffd, 0xfffd),
1109       "\xed\xac\xa0"));
1110
1111   // U+DBFF
1112   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1113       ConvertUTFResultContainer(sourceIllegal)
1114           .withScalars(0xfffd, 0xfffd, 0xfffd),
1115       "\xed\xaf\xbf"));
1116
1117   // Low surrogates
1118
1119   // U+DC00
1120   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1121       ConvertUTFResultContainer(sourceIllegal)
1122           .withScalars(0xfffd, 0xfffd, 0xfffd),
1123       "\xed\xb0\x80"));
1124
1125   // U+DD00
1126   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1127       ConvertUTFResultContainer(sourceIllegal)
1128           .withScalars(0xfffd, 0xfffd, 0xfffd),
1129       "\xed\xb4\x80"));
1130
1131   // U+DFFF
1132   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1133       ConvertUTFResultContainer(sourceIllegal)
1134           .withScalars(0xfffd, 0xfffd, 0xfffd),
1135       "\xed\xbf\xbf"));
1136
1137   // Surrogate pairs
1138
1139   // U+D800 U+DC00
1140   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1141       ConvertUTFResultContainer(sourceIllegal)
1142           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1143       "\xed\xa0\x80\xed\xb0\x80"));
1144
1145   // U+D800 U+DD00
1146   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1147       ConvertUTFResultContainer(sourceIllegal)
1148           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1149       "\xed\xa0\x80\xed\xb4\x80"));
1150
1151   // U+D800 U+DFFF
1152   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1153       ConvertUTFResultContainer(sourceIllegal)
1154           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1155       "\xed\xa0\x80\xed\xbf\xbf"));
1156
1157   // U+DB40 U+DC00
1158   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1159       ConvertUTFResultContainer(sourceIllegal)
1160           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1161       "\xed\xac\xa0\xed\xb0\x80"));
1162
1163   // U+DB40 U+DD00
1164   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1165       ConvertUTFResultContainer(sourceIllegal)
1166           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1167       "\xed\xac\xa0\xed\xb4\x80"));
1168
1169   // U+DB40 U+DFFF
1170   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1171       ConvertUTFResultContainer(sourceIllegal)
1172           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1173       "\xed\xac\xa0\xed\xbf\xbf"));
1174
1175   // U+DBFF U+DC00
1176   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1177       ConvertUTFResultContainer(sourceIllegal)
1178           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1179       "\xed\xaf\xbf\xed\xb0\x80"));
1180
1181   // U+DBFF U+DD00
1182   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1183       ConvertUTFResultContainer(sourceIllegal)
1184           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1185       "\xed\xaf\xbf\xed\xb4\x80"));
1186
1187   // U+DBFF U+DFFF
1188   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1189       ConvertUTFResultContainer(sourceIllegal)
1190           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1191       "\xed\xaf\xbf\xed\xbf\xbf"));
1192
1193   //
1194   // Noncharacters
1195   //
1196
1197   // Unicode 6.3.0:
1198   //
1199   //    D14.  Noncharacter: A code point that is permanently reserved for
1200   //    internal use and that should never be interchanged. Noncharacters
1201   //    consist of the values U+nFFFE and U+nFFFF (where n is from 0 to 0x10)
1202   //    and the values U+FDD0..U+FDEF.
1203
1204   // U+FFFE
1205   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1206       ConvertUTFResultContainer(conversionOK).withScalars(0xfffe),
1207       "\xef\xbf\xbe"));
1208
1209   // U+FFFF
1210   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1211       ConvertUTFResultContainer(conversionOK).withScalars(0xffff),
1212       "\xef\xbf\xbf"));
1213
1214   // U+1FFFE
1215   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1216       ConvertUTFResultContainer(conversionOK).withScalars(0x1fffe),
1217       "\xf0\x9f\xbf\xbe"));
1218
1219   // U+1FFFF
1220   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1221       ConvertUTFResultContainer(conversionOK).withScalars(0x1ffff),
1222       "\xf0\x9f\xbf\xbf"));
1223
1224   // U+2FFFE
1225   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1226       ConvertUTFResultContainer(conversionOK).withScalars(0x2fffe),
1227       "\xf0\xaf\xbf\xbe"));
1228
1229   // U+2FFFF
1230   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1231       ConvertUTFResultContainer(conversionOK).withScalars(0x2ffff),
1232       "\xf0\xaf\xbf\xbf"));
1233
1234   // U+3FFFE
1235   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1236       ConvertUTFResultContainer(conversionOK).withScalars(0x3fffe),
1237       "\xf0\xbf\xbf\xbe"));
1238
1239   // U+3FFFF
1240   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1241       ConvertUTFResultContainer(conversionOK).withScalars(0x3ffff),
1242       "\xf0\xbf\xbf\xbf"));
1243
1244   // U+4FFFE
1245   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1246       ConvertUTFResultContainer(conversionOK).withScalars(0x4fffe),
1247       "\xf1\x8f\xbf\xbe"));
1248
1249   // U+4FFFF
1250   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1251       ConvertUTFResultContainer(conversionOK).withScalars(0x4ffff),
1252       "\xf1\x8f\xbf\xbf"));
1253
1254   // U+5FFFE
1255   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1256       ConvertUTFResultContainer(conversionOK).withScalars(0x5fffe),
1257       "\xf1\x9f\xbf\xbe"));
1258
1259   // U+5FFFF
1260   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1261       ConvertUTFResultContainer(conversionOK).withScalars(0x5ffff),
1262       "\xf1\x9f\xbf\xbf"));
1263
1264   // U+6FFFE
1265   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1266       ConvertUTFResultContainer(conversionOK).withScalars(0x6fffe),
1267       "\xf1\xaf\xbf\xbe"));
1268
1269   // U+6FFFF
1270   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1271       ConvertUTFResultContainer(conversionOK).withScalars(0x6ffff),
1272       "\xf1\xaf\xbf\xbf"));
1273
1274   // U+7FFFE
1275   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1276       ConvertUTFResultContainer(conversionOK).withScalars(0x7fffe),
1277       "\xf1\xbf\xbf\xbe"));
1278
1279   // U+7FFFF
1280   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1281       ConvertUTFResultContainer(conversionOK).withScalars(0x7ffff),
1282       "\xf1\xbf\xbf\xbf"));
1283
1284   // U+8FFFE
1285   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1286       ConvertUTFResultContainer(conversionOK).withScalars(0x8fffe),
1287       "\xf2\x8f\xbf\xbe"));
1288
1289   // U+8FFFF
1290   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1291       ConvertUTFResultContainer(conversionOK).withScalars(0x8ffff),
1292       "\xf2\x8f\xbf\xbf"));
1293
1294   // U+9FFFE
1295   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1296       ConvertUTFResultContainer(conversionOK).withScalars(0x9fffe),
1297       "\xf2\x9f\xbf\xbe"));
1298
1299   // U+9FFFF
1300   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1301       ConvertUTFResultContainer(conversionOK).withScalars(0x9ffff),
1302       "\xf2\x9f\xbf\xbf"));
1303
1304   // U+AFFFE
1305   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1306       ConvertUTFResultContainer(conversionOK).withScalars(0xafffe),
1307       "\xf2\xaf\xbf\xbe"));
1308
1309   // U+AFFFF
1310   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1311       ConvertUTFResultContainer(conversionOK).withScalars(0xaffff),
1312       "\xf2\xaf\xbf\xbf"));
1313
1314   // U+BFFFE
1315   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1316       ConvertUTFResultContainer(conversionOK).withScalars(0xbfffe),
1317       "\xf2\xbf\xbf\xbe"));
1318
1319   // U+BFFFF
1320   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1321       ConvertUTFResultContainer(conversionOK).withScalars(0xbffff),
1322       "\xf2\xbf\xbf\xbf"));
1323
1324   // U+CFFFE
1325   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1326       ConvertUTFResultContainer(conversionOK).withScalars(0xcfffe),
1327       "\xf3\x8f\xbf\xbe"));
1328
1329   // U+CFFFF
1330   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1331       ConvertUTFResultContainer(conversionOK).withScalars(0xcfffF),
1332       "\xf3\x8f\xbf\xbf"));
1333
1334   // U+DFFFE
1335   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1336       ConvertUTFResultContainer(conversionOK).withScalars(0xdfffe),
1337       "\xf3\x9f\xbf\xbe"));
1338
1339   // U+DFFFF
1340   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1341       ConvertUTFResultContainer(conversionOK).withScalars(0xdffff),
1342       "\xf3\x9f\xbf\xbf"));
1343
1344   // U+EFFFE
1345   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1346       ConvertUTFResultContainer(conversionOK).withScalars(0xefffe),
1347       "\xf3\xaf\xbf\xbe"));
1348
1349   // U+EFFFF
1350   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1351       ConvertUTFResultContainer(conversionOK).withScalars(0xeffff),
1352       "\xf3\xaf\xbf\xbf"));
1353
1354   // U+FFFFE
1355   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1356       ConvertUTFResultContainer(conversionOK).withScalars(0xffffe),
1357       "\xf3\xbf\xbf\xbe"));
1358
1359   // U+FFFFF
1360   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1361       ConvertUTFResultContainer(conversionOK).withScalars(0xfffff),
1362       "\xf3\xbf\xbf\xbf"));
1363
1364   // U+10FFFE
1365   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1366       ConvertUTFResultContainer(conversionOK).withScalars(0x10fffe),
1367       "\xf4\x8f\xbf\xbe"));
1368
1369   // U+10FFFF
1370   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1371       ConvertUTFResultContainer(conversionOK).withScalars(0x10ffff),
1372       "\xf4\x8f\xbf\xbf"));
1373
1374   // U+FDD0
1375   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1376       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd0),
1377       "\xef\xb7\x90"));
1378
1379   // U+FDD1
1380   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1381       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd1),
1382       "\xef\xb7\x91"));
1383
1384   // U+FDD2
1385   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1386       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd2),
1387       "\xef\xb7\x92"));
1388
1389   // U+FDD3
1390   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1391       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd3),
1392       "\xef\xb7\x93"));
1393
1394   // U+FDD4
1395   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1396       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd4),
1397       "\xef\xb7\x94"));
1398
1399   // U+FDD5
1400   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1401       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd5),
1402       "\xef\xb7\x95"));
1403
1404   // U+FDD6
1405   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1406       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd6),
1407       "\xef\xb7\x96"));
1408
1409   // U+FDD7
1410   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1411       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd7),
1412       "\xef\xb7\x97"));
1413
1414   // U+FDD8
1415   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1416       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd8),
1417       "\xef\xb7\x98"));
1418
1419   // U+FDD9
1420   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1421       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd9),
1422       "\xef\xb7\x99"));
1423
1424   // U+FDDA
1425   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1426       ConvertUTFResultContainer(conversionOK).withScalars(0xfdda),
1427       "\xef\xb7\x9a"));
1428
1429   // U+FDDB
1430   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1431       ConvertUTFResultContainer(conversionOK).withScalars(0xfddb),
1432       "\xef\xb7\x9b"));
1433
1434   // U+FDDC
1435   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1436       ConvertUTFResultContainer(conversionOK).withScalars(0xfddc),
1437       "\xef\xb7\x9c"));
1438
1439   // U+FDDD
1440   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1441       ConvertUTFResultContainer(conversionOK).withScalars(0xfddd),
1442       "\xef\xb7\x9d"));
1443
1444   // U+FDDE
1445   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1446       ConvertUTFResultContainer(conversionOK).withScalars(0xfdde),
1447       "\xef\xb7\x9e"));
1448
1449   // U+FDDF
1450   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1451       ConvertUTFResultContainer(conversionOK).withScalars(0xfddf),
1452       "\xef\xb7\x9f"));
1453
1454   // U+FDE0
1455   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1456       ConvertUTFResultContainer(conversionOK).withScalars(0xfde0),
1457       "\xef\xb7\xa0"));
1458
1459   // U+FDE1
1460   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1461       ConvertUTFResultContainer(conversionOK).withScalars(0xfde1),
1462       "\xef\xb7\xa1"));
1463
1464   // U+FDE2
1465   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1466       ConvertUTFResultContainer(conversionOK).withScalars(0xfde2),
1467       "\xef\xb7\xa2"));
1468
1469   // U+FDE3
1470   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1471       ConvertUTFResultContainer(conversionOK).withScalars(0xfde3),
1472       "\xef\xb7\xa3"));
1473
1474   // U+FDE4
1475   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1476       ConvertUTFResultContainer(conversionOK).withScalars(0xfde4),
1477       "\xef\xb7\xa4"));
1478
1479   // U+FDE5
1480   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1481       ConvertUTFResultContainer(conversionOK).withScalars(0xfde5),
1482       "\xef\xb7\xa5"));
1483
1484   // U+FDE6
1485   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1486       ConvertUTFResultContainer(conversionOK).withScalars(0xfde6),
1487       "\xef\xb7\xa6"));
1488
1489   // U+FDE7
1490   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1491       ConvertUTFResultContainer(conversionOK).withScalars(0xfde7),
1492       "\xef\xb7\xa7"));
1493
1494   // U+FDE8
1495   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1496       ConvertUTFResultContainer(conversionOK).withScalars(0xfde8),
1497       "\xef\xb7\xa8"));
1498
1499   // U+FDE9
1500   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1501       ConvertUTFResultContainer(conversionOK).withScalars(0xfde9),
1502       "\xef\xb7\xa9"));
1503
1504   // U+FDEA
1505   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1506       ConvertUTFResultContainer(conversionOK).withScalars(0xfdea),
1507       "\xef\xb7\xaa"));
1508
1509   // U+FDEB
1510   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1511       ConvertUTFResultContainer(conversionOK).withScalars(0xfdeb),
1512       "\xef\xb7\xab"));
1513
1514   // U+FDEC
1515   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1516       ConvertUTFResultContainer(conversionOK).withScalars(0xfdec),
1517       "\xef\xb7\xac"));
1518
1519   // U+FDED
1520   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1521       ConvertUTFResultContainer(conversionOK).withScalars(0xfded),
1522       "\xef\xb7\xad"));
1523
1524   // U+FDEE
1525   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1526       ConvertUTFResultContainer(conversionOK).withScalars(0xfdee),
1527       "\xef\xb7\xae"));
1528
1529   // U+FDEF
1530   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1531       ConvertUTFResultContainer(conversionOK).withScalars(0xfdef),
1532       "\xef\xb7\xaf"));
1533
1534   // U+FDF0
1535   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1536       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf0),
1537       "\xef\xb7\xb0"));
1538
1539   // U+FDF1
1540   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1541       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf1),
1542       "\xef\xb7\xb1"));
1543
1544   // U+FDF2
1545   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1546       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf2),
1547       "\xef\xb7\xb2"));
1548
1549   // U+FDF3
1550   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1551       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf3),
1552       "\xef\xb7\xb3"));
1553
1554   // U+FDF4
1555   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1556       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf4),
1557       "\xef\xb7\xb4"));
1558
1559   // U+FDF5
1560   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1561       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf5),
1562       "\xef\xb7\xb5"));
1563
1564   // U+FDF6
1565   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1566       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf6),
1567       "\xef\xb7\xb6"));
1568
1569   // U+FDF7
1570   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1571       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf7),
1572       "\xef\xb7\xb7"));
1573
1574   // U+FDF8
1575   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1576       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf8),
1577       "\xef\xb7\xb8"));
1578
1579   // U+FDF9
1580   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1581       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf9),
1582       "\xef\xb7\xb9"));
1583
1584   // U+FDFA
1585   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1586       ConvertUTFResultContainer(conversionOK).withScalars(0xfdfa),
1587       "\xef\xb7\xba"));
1588
1589   // U+FDFB
1590   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1591       ConvertUTFResultContainer(conversionOK).withScalars(0xfdfb),
1592       "\xef\xb7\xbb"));
1593
1594   // U+FDFC
1595   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1596       ConvertUTFResultContainer(conversionOK).withScalars(0xfdfc),
1597       "\xef\xb7\xbc"));
1598
1599   // U+FDFD
1600   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1601       ConvertUTFResultContainer(conversionOK).withScalars(0xfdfd),
1602       "\xef\xb7\xbd"));
1603
1604   // U+FDFE
1605   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1606       ConvertUTFResultContainer(conversionOK).withScalars(0xfdfe),
1607       "\xef\xb7\xbe"));
1608
1609   // U+FDFF
1610   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1611       ConvertUTFResultContainer(conversionOK).withScalars(0xfdff),
1612       "\xef\xb7\xbf"));
1613 }
1614
1615 TEST(ConvertUTFTest, UTF8ToUTF32PartialLenient) {
1616   // U+0041 LATIN CAPITAL LETTER A
1617   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1618       ConvertUTFResultContainer(conversionOK).withScalars(0x0041),
1619       "\x41", true));
1620
1621   //
1622   // Sequences with one continuation byte missing
1623   //
1624
1625   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1626       ConvertUTFResultContainer(sourceExhausted),
1627       "\xc2", true));
1628   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1629       ConvertUTFResultContainer(sourceExhausted),
1630       "\xdf", true));
1631   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1632       ConvertUTFResultContainer(sourceExhausted),
1633       "\xe0\xa0", true));
1634   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1635       ConvertUTFResultContainer(sourceExhausted),
1636       "\xe0\xbf", true));
1637   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1638       ConvertUTFResultContainer(sourceExhausted),
1639       "\xe1\x80", true));
1640   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1641       ConvertUTFResultContainer(sourceExhausted),
1642       "\xec\xbf", true));
1643   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1644       ConvertUTFResultContainer(sourceExhausted),
1645       "\xed\x80", true));
1646   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1647       ConvertUTFResultContainer(sourceExhausted),
1648       "\xed\x9f", true));
1649   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1650       ConvertUTFResultContainer(sourceExhausted),
1651       "\xee\x80", true));
1652   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1653       ConvertUTFResultContainer(sourceExhausted),
1654       "\xef\xbf", true));
1655   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1656       ConvertUTFResultContainer(sourceExhausted),
1657       "\xf0\x90\x80", true));
1658   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1659       ConvertUTFResultContainer(sourceExhausted),
1660       "\xf0\xbf\xbf", true));
1661   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1662       ConvertUTFResultContainer(sourceExhausted),
1663       "\xf1\x80\x80", true));
1664   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1665       ConvertUTFResultContainer(sourceExhausted),
1666       "\xf3\xbf\xbf", true));
1667   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1668       ConvertUTFResultContainer(sourceExhausted),
1669       "\xf4\x80\x80", true));
1670   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1671       ConvertUTFResultContainer(sourceExhausted),
1672       "\xf4\x8f\xbf", true));
1673
1674   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1675       ConvertUTFResultContainer(sourceExhausted).withScalars(0x0041),
1676       "\x41\xc2", true));
1677 }
1678