1 //===----------------------------------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8
9 // UNSUPPORTED: no-localization
10 // UNSUPPORTED: c++03
11 // ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
12
13 // <filesystem>
14
15 // class path
16
17 // Test constructors, accessors and modifiers that convert from/to various
18 // character encodings. Constructors and modifiers (append, concat,
19 // operator/=, operator+=) accept inputs with various character encodings,
20 // and accessors (*string(), string<>(), u8string()) export the string with
21 // various encodings.
22 //
23 // Some encodings are standardized; char16_t, char32_t and the u8string
24 // accessor and u8path constructor (and normal functions taking char8_t in
25 // C++20) convert from/to UTF-16, UTF-32 and UTF-8. wchar_t can be either
26 // UTF-16 or UTF-32 depending on the size of the wchar_t type, or can be
27 // left unimplemented.
28 //
29 // Plain char is implicitly UTF-8 on POSIX systems. On Windows, plain char
30 // is supposed to be in the same encoding that the platform's native file
31 // system APIs consume in the functions that take narrow strings as path
32 // names.
33
34
35 #include "filesystem_include.h"
36 #include <type_traits>
37 #include <cassert>
38
39 #include "test_macros.h"
40 #include "filesystem_test_helper.h"
41
42 // Test conversion with strings that fit within the latin1 charset, whose
43 // characters each fit within one UTF-16 code unit, and that can be
44 // expressed in certain one-byte code pages.
test_latin_unicode()45 static void test_latin_unicode()
46 {
47 const char16_t u16str[] = { 0xe5, 0xe4, 0xf6, 0x00 };
48 const char32_t u32str[] = { 0xe5, 0xe4, 0xf6, 0x00 };
49 const char str[] = { char(0xc3), char(0xa5), char(0xc3), char(0xa4), char(0xc3), char(0xb6), 0x00 }; // UTF8, in a regular char string
50 #if TEST_STD_VER > 17 && defined(__cpp_lib_char8_t)
51 const char8_t u8str[] = { 0xc3, 0xa5, 0xc3, 0xa4, 0xc3, 0xb6, 0x00 };
52 #else
53 const char u8str[] = { char(0xc3), char(0xa5), char(0xc3), char(0xa4), char(0xc3), char(0xb6), 0x00 };
54 #endif
55 #ifndef TEST_HAS_NO_WIDE_CHARACTERS
56 const wchar_t wstr[] = { 0xe5, 0xe4, 0xf6, 0x00 };
57 #endif
58
59 // Test well-defined conversion between UTF-8, UTF-16 and UTF-32
60 {
61 const fs::path p(u16str);
62 assert(p.u8string() == u8str);
63 assert(p.u16string() == u16str);
64 assert(p.u32string() == u32str);
65 assert(p.string<char16_t>() == u16str);
66 assert(p.string<char32_t>() == u32str);
67 }
68 {
69 const fs::path p(u32str);
70 assert(p.u8string() == u8str);
71 assert(p.u16string() == u16str);
72 assert(p.u32string() == u32str);
73 assert(p.string<char16_t>() == u16str);
74 assert(p.string<char32_t>() == u32str);
75 }
76 {
77 const fs::path p = fs::u8path(str);
78 assert(p.u8string() == u8str);
79 assert(p.u16string() == u16str);
80 assert(p.u32string() == u32str);
81 assert(p.string<char16_t>() == u16str);
82 assert(p.string<char32_t>() == u32str);
83 }
84 #if TEST_STD_VER > 17 && defined(__cpp_lib_char8_t)
85 {
86 // In C++20, the path constructor can unambiguously handle UTF-8 input,
87 // even if the plain char constructor would treat it as something else.
88 const fs::path p(u8str);
89 assert(p.u8string() == u8str);
90 assert(p.u16string() == u16str);
91 assert(p.u32string() == u32str);
92 assert(p.string<char8_t>() == u8str);
93 assert(p.string<char16_t>() == u16str);
94 assert(p.string<char32_t>() == u32str);
95 }
96 // Check reading various inputs with string<char8_t>()
97 {
98 const fs::path p(u16str);
99 assert(p.string<char8_t>() == u8str);
100 }
101 {
102 const fs::path p(u32str);
103 assert(p.string<char8_t>() == u8str);
104 }
105 {
106 const fs::path p = fs::u8path(str);
107 assert(p.string<char8_t>() == u8str);
108 }
109 #endif
110 #ifndef TEST_HAS_NO_WIDE_CHARACTERS
111 // Test conversion to/from wchar_t.
112 {
113 const fs::path p(u16str);
114 assert(p.wstring() == wstr);
115 assert(p.string<wchar_t>() == wstr);
116 }
117 {
118 const fs::path p = fs::u8path(str);
119 assert(p.wstring() == wstr);
120 assert(p.string<wchar_t>() == wstr);
121 }
122 {
123 const fs::path p(wstr);
124 assert(p.wstring() == wstr);
125 assert(p.u8string() == u8str);
126 assert(p.u16string() == u16str);
127 assert(p.u32string() == u32str);
128 assert(p.string<wchar_t>() == wstr);
129 }
130 #endif // TEST_HAS_NO_WIDE_CHARACTERS
131 #ifndef _WIN32
132 // Test conversion to/from regular char-based string. On POSIX, this
133 // is implied to convert to/from UTF-8.
134 {
135 const fs::path p(str);
136 assert(p.string() == str);
137 assert(p.u16string() == u16str);
138 assert(p.string<char>() == str);
139 }
140 {
141 const fs::path p(u16str);
142 assert(p.string() == str);
143 assert(p.string<char>() == str);
144 }
145 #else
146 // On windows, the narrow char-based input/output is supposed to be
147 // in the charset that narrow file IO APIs use. This can either be the
148 // current active code page (ACP) or the OEM code page, exposed by
149 // the AreFileApisANSI() function, and settable with SetFileApisToANSI() and
150 // SetFileApisToOEM(). We can't set which codepage is active within
151 // the process, but for some specific known ones, we can check if they
152 // behave as expected.
153 SetFileApisToANSI();
154 if (GetACP() == 1252) {
155 const char latin1[] = { char(0xe5), char(0xe4), char(0xf6), 0x00 };
156 {
157 const fs::path p(wstr);
158 assert(p.string() == latin1);
159 assert(p.string<char>() == latin1);
160 }
161 {
162 const fs::path p(latin1);
163 assert(p.string() == latin1);
164 assert(p.wstring() == wstr);
165 assert(p.u8string() == u8str);
166 assert(p.u16string() == u16str);
167 assert(p.string<char>() == latin1);
168 assert(p.string<wchar_t>() == wstr);
169 }
170 }
171 SetFileApisToOEM();
172 if (GetOEMCP() == 850 || GetOEMCP() == 437) {
173 // These chars are identical in both CP 850 and 437
174 const char cp850[] = { char(0x86), char(0x84), char(0x94), 0x00 };
175 {
176 const fs::path p(wstr);
177 assert(p.string() == cp850);
178 assert(p.string<char>() == cp850);
179 }
180 {
181 const fs::path p(cp850);
182 assert(p.string() == cp850);
183 assert(p.wstring() == wstr);
184 assert(p.u8string() == u8str);
185 assert(p.u16string() == u16str);
186 assert(p.string<char>() == cp850);
187 assert(p.string<wchar_t>() == wstr);
188 }
189 }
190 #endif
191 }
192
193 // Test conversion with strings whose characters don't fit within one
194 // UTF-16 code unit. Here, wchar_t can be either UTF-16 or UTF-32 depending
195 // on the size of wchar_t on the particular platform.
test_wide_unicode()196 static void test_wide_unicode()
197 {
198 const char16_t u16str[] = { 0xd801, 0xdc37, 0x00 };
199 const char32_t u32str[] = { 0x10437, 0x00 };
200 #if TEST_STD_VER > 17 && defined(__cpp_lib_char8_t)
201 const char8_t u8str[] = { 0xf0, 0x90, 0x90, 0xb7, 0x00 };
202 #else
203 const char u8str[] = { char(0xf0), char(0x90), char(0x90), char(0xb7), 0x00 };
204 #endif
205 const char str[] = { char(0xf0), char(0x90), char(0x90), char(0xb7), 0x00 };
206 {
207 const fs::path p = fs::u8path(str);
208 assert(p.u8string() == u8str);
209 assert(p.u16string() == u16str);
210 assert(p.u32string() == u32str);
211 }
212 {
213 const fs::path p(u16str);
214 assert(p.u8string() == u8str);
215 assert(p.u16string() == u16str);
216 assert(p.u32string() == u32str);
217 }
218 {
219 const fs::path p(u32str);
220 assert(p.u8string() == u8str);
221 assert(p.u16string() == u16str);
222 assert(p.u32string() == u32str);
223 }
224 #if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(__SIZEOF_WCHAR_T__)
225 # if __SIZEOF_WCHAR_T__ == 2
226 const wchar_t wstr[] = { 0xd801, 0xdc37, 0x00 };
227 # else
228 const wchar_t wstr[] = { 0x10437, 0x00 };
229 # endif
230 // Test conversion to/from wchar_t.
231 {
232 const fs::path p = fs::u8path(str);
233 assert(p.wstring() == wstr);
234 }
235 {
236 const fs::path p(u16str);
237 assert(p.wstring() == wstr);
238 }
239 {
240 const fs::path p(u32str);
241 assert(p.wstring() == wstr);
242 }
243 {
244 const fs::path p(wstr);
245 assert(p.u8string() == u8str);
246 assert(p.u16string() == u16str);
247 assert(p.u32string() == u32str);
248 assert(p.wstring() == wstr);
249 }
250 #endif // !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(__SIZEOF_WCHAR_T__)
251 }
252
253 // Test appending paths in different encodings.
test_append()254 static void test_append()
255 {
256 const char16_t u16str[] = { 0xd801, 0xdc37, 0x00 };
257 const char32_t u32str[] = { 0x10437, 0x00 };
258 const char32_t u32ref[] = { 0x10437, fs::path::preferred_separator, 0x10437, fs::path::preferred_separator, 0x10437, 0x00 };
259 const char str[] = { char(0xf0), char(0x90), char(0x90), char(0xb7), 0x00 };
260 {
261 fs::path p = fs::u8path(str) / u16str / u32str;
262 assert(p.u32string() == u32ref);
263 p = fs::u8path(str).append(u16str).append(u32str);
264 assert(p.u32string() == u32ref);
265 p = fs::u8path(str);
266 p /= u16str;
267 p /= u32str;
268 assert(p.u32string() == u32ref);
269 }
270 #if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(__SIZEOF_WCHAR_T__)
271 # if __SIZEOF_WCHAR_T__ == 2
272 const wchar_t wstr[] = { 0xd801, 0xdc37, 0x00 };
273 # else
274 const wchar_t wstr[] = { 0x10437, 0x00 };
275 # endif
276 // Test conversion from wchar_t.
277 {
278 fs::path p = fs::path(u16str) / wstr / u32str;
279 assert(p.u32string() == u32ref);
280 p = fs::path(u16str).append(wstr).append(u32str);
281 assert(p.u32string() == u32ref);
282 p = fs::path(u16str);
283 p /= wstr;
284 p /= u32str;
285 assert(p.u32string() == u32ref);
286 }
287 #endif // !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(__SIZEOF_WCHAR_T__)
288 }
289
test_concat()290 static void test_concat()
291 {
292 const char16_t u16str[] = { 0xd801, 0xdc37, 0x00 };
293 const char32_t u32str[] = { 0x10437, 0x00 };
294 const char32_t u32ref[] = { 0x10437, 0x10437, 0x10437, 0x00 };
295 const char str[] = { char(0xf0), char(0x90), char(0x90), char(0xb7), 0x00 };
296 {
297 fs::path p = fs::u8path(str);
298 p += u16str;
299 p += u32str;
300 assert(p.u32string() == u32ref);
301 p = fs::u8path(str).concat(u16str).concat(u32str);
302 assert(p.u32string() == u32ref);
303 }
304 #if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(__SIZEOF_WCHAR_T__)
305 # if __SIZEOF_WCHAR_T__ == 2
306 const wchar_t wstr[] = { 0xd801, 0xdc37, 0x00 };
307 # else
308 const wchar_t wstr[] = { 0x10437, 0x00 };
309 # endif
310 // Test conversion from wchar_t.
311 {
312 fs::path p = fs::path(u16str);
313 p += wstr;
314 p += u32str;
315 assert(p.u32string() == u32ref);
316 p = fs::path(u16str).concat(wstr).concat(u32str);
317 assert(p.u32string() == u32ref);
318 }
319 #endif // !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(__SIZEOF_WCHAR_T__)
320 }
321
test_append_concat_narrow()322 static void test_append_concat_narrow()
323 {
324 const char16_t u16str[] = { 0xe5, 0x00 };
325 const char32_t u32ref_append[] = { 0xe5, fs::path::preferred_separator, 0xe5, 0x00 };
326 const char32_t u32ref_concat[] = { 0xe5, 0xe5, 0x00 };
327
328 #if TEST_STD_VER > 17 && defined(__cpp_lib_char8_t)
329 {
330 const char8_t u8str[] = { 0xc3, 0xa5, 0x00 };
331 // In C++20, appends of a char8_t string is unambiguously treated as
332 // UTF-8.
333 fs::path p = fs::path(u16str) / u8str;
334 assert(p.u32string() == u32ref_append);
335 p = fs::path(u16str).append(u8str);
336 assert(p.u32string() == u32ref_append);
337 p = fs::path(u16str);
338 p /= u8str;
339 assert(p.u32string() == u32ref_append);
340 p = fs::path(u16str).concat(u8str);
341 assert(p.u32string() == u32ref_concat);
342 p = fs::path(u16str);
343 p += u8str;
344 assert(p.u32string() == u32ref_concat);
345 }
346 #endif
347 #ifndef _WIN32
348 // Test appending a regular char-based string. On POSIX, this
349 // is implied to convert to/from UTF-8.
350 {
351 const char str[] = { char(0xc3), char(0xa5), 0x00 }; // UTF8, in a regular char string
352 fs::path p = fs::path(u16str) / str;
353 assert(p.u32string() == u32ref_append);
354 p = fs::path(u16str).append(str);
355 assert(p.u32string() == u32ref_append);
356 p = fs::path(u16str);
357 p /= str;
358 assert(p.u32string() == u32ref_append);
359 p = fs::path(u16str).concat(str);
360 assert(p.u32string() == u32ref_concat);
361 p = fs::path(u16str);
362 p += str;
363 assert(p.u32string() == u32ref_concat);
364 }
365 #else
366 SetFileApisToANSI();
367 if (GetACP() == 1252) {
368 const char latin1[] = { char(0xe5), 0x00 };
369 fs::path p = fs::path(u16str) / latin1;
370 assert(p.u32string() == u32ref_append);
371 p = fs::path(u16str).append(latin1);
372 assert(p.u32string() == u32ref_append);
373 p = fs::path(u16str);
374 p /= latin1;
375 assert(p.u32string() == u32ref_append);
376 p = fs::path(u16str).concat(latin1);
377 assert(p.u32string() == u32ref_concat);
378 p = fs::path(u16str);
379 p += latin1;
380 assert(p.u32string() == u32ref_concat);
381 }
382 SetFileApisToOEM();
383 if (GetOEMCP() == 850 || GetOEMCP() == 437) {
384 // This chars is identical in both CP 850 and 437
385 const char cp850[] = { char(0x86), 0x00 };
386 fs::path p = fs::path(u16str) / cp850;
387 assert(p.u32string() == u32ref_append);
388 p = fs::path(u16str).append(cp850);
389 assert(p.u32string() == u32ref_append);
390 p = fs::path(u16str);
391 p /= cp850;
392 assert(p.u32string() == u32ref_append);
393 p = fs::path(u16str).concat(cp850);
394 assert(p.u32string() == u32ref_concat);
395 p = fs::path(u16str);
396 p += cp850;
397 assert(p.u32string() == u32ref_concat);
398 }
399 #endif
400 }
401
main(int,char **)402 int main(int, char**)
403 {
404 test_latin_unicode();
405 test_wide_unicode();
406 test_append();
407 test_concat();
408 test_append_concat_narrow();
409
410 return 0;
411 }
412