1 //===----------------------------------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 // UNSUPPORTED: no-localization
10 // UNSUPPORTED: c++03
11 // ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
12 
13 // <filesystem>
14 
15 // class path
16 
17 // Test constructors, accessors and modifiers that convert from/to various
18 // character encodings. Constructors and modifiers (append, concat,
19 // operator/=, operator+=) accept inputs with various character encodings,
20 // and accessors (*string(), string<>(), u8string()) export the string with
21 // various encodings.
22 //
23 // Some encodings are standardized; char16_t, char32_t and the u8string
24 // accessor and u8path constructor (and normal functions taking char8_t in
25 // C++20) convert from/to UTF-16, UTF-32 and UTF-8. wchar_t can be either
26 // UTF-16 or UTF-32 depending on the size of the wchar_t type, or can be
27 // left unimplemented.
28 //
// Plain char is implicitly UTF-8 on POSIX systems. On Windows, plain char
// is supposed to be in the same encoding that the platform's native file
// system APIs consume in the functions that take narrow strings as path
// names.
33 
34 
35 #include "filesystem_include.h"
36 #include <type_traits>
37 #include <cassert>
38 
39 #include "test_macros.h"
40 #include "filesystem_test_helper.h"
41 
// Test conversion with strings that fit within the latin1 charset, whose
// characters each fit within one UTF-16 code unit, and that are expressible
// in certain one-byte code pages.
test_latin_unicode()45 static void test_latin_unicode()
46 {
47   const char16_t u16str[] = { 0xe5, 0xe4, 0xf6, 0x00 };
48   const char32_t u32str[] = { 0xe5, 0xe4, 0xf6, 0x00 };
49   const char str[] = { char(0xc3), char(0xa5), char(0xc3), char(0xa4), char(0xc3), char(0xb6), 0x00 }; // UTF8, in a regular char string
50 #if TEST_STD_VER > 17 && defined(__cpp_lib_char8_t)
51   const char8_t u8str[] = { 0xc3, 0xa5, 0xc3, 0xa4, 0xc3, 0xb6, 0x00 };
52 #else
53   const char u8str[] = { char(0xc3), char(0xa5), char(0xc3), char(0xa4), char(0xc3), char(0xb6), 0x00 };
54 #endif
55 #ifndef TEST_HAS_NO_WIDE_CHARACTERS
56   const wchar_t wstr[] = { 0xe5, 0xe4, 0xf6, 0x00 };
57 #endif
58 
59   // Test well-defined conversion between UTF-8, UTF-16 and UTF-32
60   {
61     const fs::path p(u16str);
62     assert(p.u8string() == u8str);
63     assert(p.u16string() == u16str);
64     assert(p.u32string() == u32str);
65     assert(p.string<char16_t>() == u16str);
66     assert(p.string<char32_t>() == u32str);
67   }
68   {
69     const fs::path p(u32str);
70     assert(p.u8string() == u8str);
71     assert(p.u16string() == u16str);
72     assert(p.u32string() == u32str);
73     assert(p.string<char16_t>() == u16str);
74     assert(p.string<char32_t>() == u32str);
75   }
76   {
77     const fs::path p = fs::u8path(str);
78     assert(p.u8string() == u8str);
79     assert(p.u16string() == u16str);
80     assert(p.u32string() == u32str);
81     assert(p.string<char16_t>() == u16str);
82     assert(p.string<char32_t>() == u32str);
83   }
84 #if TEST_STD_VER > 17 && defined(__cpp_lib_char8_t)
85   {
86     // In C++20, the path constructor can unambiguously handle UTF-8 input,
87     // even if the plain char constructor would treat it as something else.
88     const fs::path p(u8str);
89     assert(p.u8string() == u8str);
90     assert(p.u16string() == u16str);
91     assert(p.u32string() == u32str);
92     assert(p.string<char8_t>() == u8str);
93     assert(p.string<char16_t>() == u16str);
94     assert(p.string<char32_t>() == u32str);
95   }
96   // Check reading various inputs with string<char8_t>()
97   {
98     const fs::path p(u16str);
99     assert(p.string<char8_t>() == u8str);
100   }
101   {
102     const fs::path p(u32str);
103     assert(p.string<char8_t>() == u8str);
104   }
105   {
106     const fs::path p = fs::u8path(str);
107     assert(p.string<char8_t>() == u8str);
108   }
109 #endif
110 #ifndef TEST_HAS_NO_WIDE_CHARACTERS
111   // Test conversion to/from wchar_t.
112   {
113     const fs::path p(u16str);
114     assert(p.wstring() == wstr);
115     assert(p.string<wchar_t>() == wstr);
116   }
117   {
118     const fs::path p = fs::u8path(str);
119     assert(p.wstring() == wstr);
120     assert(p.string<wchar_t>() == wstr);
121   }
122   {
123     const fs::path p(wstr);
124     assert(p.wstring() == wstr);
125     assert(p.u8string() == u8str);
126     assert(p.u16string() == u16str);
127     assert(p.u32string() == u32str);
128     assert(p.string<wchar_t>() == wstr);
129   }
130 #endif // TEST_HAS_NO_WIDE_CHARACTERS
131 #ifndef _WIN32
132   // Test conversion to/from regular char-based string. On POSIX, this
133   // is implied to convert to/from UTF-8.
134   {
135     const fs::path p(str);
136     assert(p.string() == str);
137     assert(p.u16string() == u16str);
138     assert(p.string<char>() == str);
139   }
140   {
141     const fs::path p(u16str);
142     assert(p.string() == str);
143     assert(p.string<char>() == str);
144   }
145 #else
146   // On windows, the narrow char-based input/output is supposed to be
147   // in the charset that narrow file IO APIs use. This can either be the
148   // current active code page (ACP) or the OEM code page, exposed by
149   // the AreFileApisANSI() function, and settable with SetFileApisToANSI() and
150   // SetFileApisToOEM(). We can't set which codepage is active within
151   // the process, but for some specific known ones, we can check if they
152   // behave as expected.
153   SetFileApisToANSI();
154   if (GetACP() == 1252) {
155     const char latin1[] = { char(0xe5), char(0xe4), char(0xf6), 0x00 };
156     {
157       const fs::path p(wstr);
158       assert(p.string() == latin1);
159       assert(p.string<char>() == latin1);
160     }
161     {
162       const fs::path p(latin1);
163       assert(p.string() == latin1);
164       assert(p.wstring() == wstr);
165       assert(p.u8string() == u8str);
166       assert(p.u16string() == u16str);
167       assert(p.string<char>() == latin1);
168       assert(p.string<wchar_t>() == wstr);
169     }
170   }
171   SetFileApisToOEM();
172   if (GetOEMCP() == 850 || GetOEMCP() == 437) {
173     // These chars are identical in both CP 850 and 437
174     const char cp850[] = { char(0x86), char(0x84), char(0x94), 0x00 };
175     {
176       const fs::path p(wstr);
177       assert(p.string() == cp850);
178       assert(p.string<char>() == cp850);
179     }
180     {
181       const fs::path p(cp850);
182       assert(p.string() == cp850);
183       assert(p.wstring() == wstr);
184       assert(p.u8string() == u8str);
185       assert(p.u16string() == u16str);
186       assert(p.string<char>() == cp850);
187       assert(p.string<wchar_t>() == wstr);
188     }
189   }
190 #endif
191 }
192 
// Test conversion with strings whose characters don't fit within one UTF-16
// code unit (i.e. that need a surrogate pair). Here, wchar_t can be either
// UTF-16 or UTF-32 depending on its size on the particular platform.
test_wide_unicode()196 static void test_wide_unicode()
197 {
198   const char16_t u16str[] = { 0xd801, 0xdc37, 0x00 };
199   const char32_t u32str[] = { 0x10437, 0x00 };
200 #if TEST_STD_VER > 17 && defined(__cpp_lib_char8_t)
201   const char8_t u8str[] = { 0xf0, 0x90, 0x90, 0xb7, 0x00 };
202 #else
203   const char u8str[] = { char(0xf0), char(0x90), char(0x90), char(0xb7), 0x00 };
204 #endif
205   const char str[] = { char(0xf0), char(0x90), char(0x90), char(0xb7), 0x00 };
206   {
207     const fs::path p = fs::u8path(str);
208     assert(p.u8string() == u8str);
209     assert(p.u16string() == u16str);
210     assert(p.u32string() == u32str);
211   }
212   {
213     const fs::path p(u16str);
214     assert(p.u8string() == u8str);
215     assert(p.u16string() == u16str);
216     assert(p.u32string() == u32str);
217   }
218   {
219     const fs::path p(u32str);
220     assert(p.u8string() == u8str);
221     assert(p.u16string() == u16str);
222     assert(p.u32string() == u32str);
223   }
224 #if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(__SIZEOF_WCHAR_T__)
225 # if __SIZEOF_WCHAR_T__ == 2
226   const wchar_t wstr[] = { 0xd801, 0xdc37, 0x00 };
227 # else
228   const wchar_t wstr[] = { 0x10437, 0x00 };
229 # endif
230   // Test conversion to/from wchar_t.
231   {
232     const fs::path p = fs::u8path(str);
233     assert(p.wstring() == wstr);
234   }
235   {
236     const fs::path p(u16str);
237     assert(p.wstring() == wstr);
238   }
239   {
240     const fs::path p(u32str);
241     assert(p.wstring() == wstr);
242   }
243   {
244     const fs::path p(wstr);
245     assert(p.u8string() == u8str);
246     assert(p.u16string() == u16str);
247     assert(p.u32string() == u32str);
248     assert(p.wstring() == wstr);
249   }
250 #endif // !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(__SIZEOF_WCHAR_T__)
251 }
252 
253 // Test appending paths in different encodings.
test_append()254 static void test_append()
255 {
256   const char16_t u16str[] = { 0xd801, 0xdc37, 0x00 };
257   const char32_t u32str[] = { 0x10437, 0x00 };
258   const char32_t u32ref[] = { 0x10437, fs::path::preferred_separator, 0x10437, fs::path::preferred_separator, 0x10437, 0x00 };
259   const char str[] = { char(0xf0), char(0x90), char(0x90), char(0xb7), 0x00 };
260   {
261     fs::path p = fs::u8path(str) / u16str / u32str;
262     assert(p.u32string() == u32ref);
263     p = fs::u8path(str).append(u16str).append(u32str);
264     assert(p.u32string() == u32ref);
265     p = fs::u8path(str);
266     p /= u16str;
267     p /= u32str;
268     assert(p.u32string() == u32ref);
269   }
270 #if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(__SIZEOF_WCHAR_T__)
271 # if __SIZEOF_WCHAR_T__ == 2
272   const wchar_t wstr[] = { 0xd801, 0xdc37, 0x00 };
273 # else
274   const wchar_t wstr[] = { 0x10437, 0x00 };
275 # endif
276   // Test conversion from wchar_t.
277   {
278     fs::path p = fs::path(u16str) / wstr / u32str;
279     assert(p.u32string() == u32ref);
280     p = fs::path(u16str).append(wstr).append(u32str);
281     assert(p.u32string() == u32ref);
282     p = fs::path(u16str);
283     p /= wstr;
284     p /= u32str;
285     assert(p.u32string() == u32ref);
286   }
287 #endif // !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(__SIZEOF_WCHAR_T__)
288 }
289 
test_concat()290 static void test_concat()
291 {
292   const char16_t u16str[] = { 0xd801, 0xdc37, 0x00 };
293   const char32_t u32str[] = { 0x10437, 0x00 };
294   const char32_t u32ref[] = { 0x10437, 0x10437, 0x10437, 0x00 };
295   const char str[] = { char(0xf0), char(0x90), char(0x90), char(0xb7), 0x00 };
296   {
297     fs::path p = fs::u8path(str);
298     p += u16str;
299     p += u32str;
300     assert(p.u32string() == u32ref);
301     p = fs::u8path(str).concat(u16str).concat(u32str);
302     assert(p.u32string() == u32ref);
303   }
304 #if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(__SIZEOF_WCHAR_T__)
305 # if __SIZEOF_WCHAR_T__ == 2
306   const wchar_t wstr[] = { 0xd801, 0xdc37, 0x00 };
307 # else
308   const wchar_t wstr[] = { 0x10437, 0x00 };
309 # endif
310   // Test conversion from wchar_t.
311   {
312     fs::path p = fs::path(u16str);
313     p += wstr;
314     p += u32str;
315     assert(p.u32string() == u32ref);
316     p = fs::path(u16str).concat(wstr).concat(u32str);
317     assert(p.u32string() == u32ref);
318   }
319 #endif // !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(__SIZEOF_WCHAR_T__)
320 }
321 
test_append_concat_narrow()322 static void test_append_concat_narrow()
323 {
324   const char16_t u16str[] = { 0xe5, 0x00 };
325   const char32_t u32ref_append[] = { 0xe5, fs::path::preferred_separator, 0xe5, 0x00 };
326   const char32_t u32ref_concat[] = { 0xe5, 0xe5, 0x00 };
327 
328 #if TEST_STD_VER > 17 && defined(__cpp_lib_char8_t)
329   {
330     const char8_t u8str[] = { 0xc3, 0xa5, 0x00 };
331     // In C++20, appends of a char8_t string is unambiguously treated as
332     // UTF-8.
333     fs::path p = fs::path(u16str) / u8str;
334     assert(p.u32string() == u32ref_append);
335     p = fs::path(u16str).append(u8str);
336     assert(p.u32string() == u32ref_append);
337     p = fs::path(u16str);
338     p /= u8str;
339     assert(p.u32string() == u32ref_append);
340     p = fs::path(u16str).concat(u8str);
341     assert(p.u32string() == u32ref_concat);
342     p = fs::path(u16str);
343     p += u8str;
344     assert(p.u32string() == u32ref_concat);
345   }
346 #endif
347 #ifndef _WIN32
348   // Test appending a regular char-based string. On POSIX, this
349   // is implied to convert to/from UTF-8.
350   {
351     const char str[] = { char(0xc3), char(0xa5), 0x00 }; // UTF8, in a regular char string
352     fs::path p = fs::path(u16str) / str;
353     assert(p.u32string() == u32ref_append);
354     p = fs::path(u16str).append(str);
355     assert(p.u32string() == u32ref_append);
356     p = fs::path(u16str);
357     p /= str;
358     assert(p.u32string() == u32ref_append);
359     p = fs::path(u16str).concat(str);
360     assert(p.u32string() == u32ref_concat);
361     p = fs::path(u16str);
362     p += str;
363     assert(p.u32string() == u32ref_concat);
364   }
365 #else
366   SetFileApisToANSI();
367   if (GetACP() == 1252) {
368     const char latin1[] = { char(0xe5), 0x00 };
369     fs::path p = fs::path(u16str) / latin1;
370     assert(p.u32string() == u32ref_append);
371     p = fs::path(u16str).append(latin1);
372     assert(p.u32string() == u32ref_append);
373     p = fs::path(u16str);
374     p /= latin1;
375     assert(p.u32string() == u32ref_append);
376     p = fs::path(u16str).concat(latin1);
377     assert(p.u32string() == u32ref_concat);
378     p = fs::path(u16str);
379     p += latin1;
380     assert(p.u32string() == u32ref_concat);
381   }
382   SetFileApisToOEM();
383   if (GetOEMCP() == 850 || GetOEMCP() == 437) {
384     // This chars is identical in both CP 850 and 437
385     const char cp850[] = { char(0x86), 0x00 };
386     fs::path p = fs::path(u16str) / cp850;
387     assert(p.u32string() == u32ref_append);
388     p = fs::path(u16str).append(cp850);
389     assert(p.u32string() == u32ref_append);
390     p = fs::path(u16str);
391     p /= cp850;
392     assert(p.u32string() == u32ref_append);
393     p = fs::path(u16str).concat(cp850);
394     assert(p.u32string() == u32ref_concat);
395     p = fs::path(u16str);
396     p += cp850;
397     assert(p.u32string() == u32ref_concat);
398   }
399 #endif
400 }
401 
main(int,char **)402 int main(int, char**)
403 {
404   test_latin_unicode();
405   test_wide_unicode();
406   test_append();
407   test_concat();
408   test_append_concat_narrow();
409 
410   return 0;
411 }
412