1 //===----------------------------------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 // UNSUPPORTED: libcpp-has-no-localization
10 // UNSUPPORTED: c++03
11 // ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
12 
13 // <filesystem>
14 
15 // class path
16 
17 // Test constructors, accessors and modifiers that convert from/to various
18 // character encodings. Constructors and modifiers (append, concat,
19 // operator/=, operator+=) accept inputs with various character encodings,
20 // and accessors (*string(), string<>(), u8string()) export the string with
21 // various encodings.
22 //
23 // Some encodings are standardized; char16_t, char32_t and the u8string
24 // accessor and u8path constructor (and normal functions taking char8_t in
25 // C++20) convert from/to UTF-16, UTF-32 and UTF-8. wchar_t can be either
26 // UTF-16 or UTF-32 depending on the size of the wchar_t type, or can be
27 // left unimplemented.
28 //
// Plain char is implicitly UTF-8 on POSIX systems. On Windows, plain char
// is supposed to be in the same encoding that the platform's native file
// system APIs consume in the functions that take narrow strings as path
// names.
33 
34 
35 #include "filesystem_include.h"
36 #include <type_traits>
37 #include <cassert>
38 
39 #include "test_macros.h"
40 #include "filesystem_test_helper.h"
41 
// libstdc++ only provides conversions from/to wchar_t on Windows. Everywhere
// else HAS_NO_WCHAR is defined so that the wchar_t parts of the tests below
// are compiled out.
#ifdef __GLIBCXX__
#  ifndef _WIN32
#    define HAS_NO_WCHAR
#  endif
#endif
46 
// Test conversion with strings that fit within the latin1 charset, where each
// character fits within one code unit in UTF-16, and that are expressible in
// certain one-byte code pages.
50 static void test_latin_unicode()
51 {
52   const char16_t u16str[] = { 0xe5, 0xe4, 0xf6, 0x00 };
53   const char32_t u32str[] = { 0xe5, 0xe4, 0xf6, 0x00 };
54   const char str[] = { char(0xc3), char(0xa5), char(0xc3), char(0xa4), char(0xc3), char(0xb6), 0x00 }; // UTF8, in a regular char string
55 #if TEST_STD_VER > 17 && defined(__cpp_lib_char8_t)
56   const char8_t u8str[] = { 0xc3, 0xa5, 0xc3, 0xa4, 0xc3, 0xb6, 0x00 };
57 #else
58   const char u8str[] = { char(0xc3), char(0xa5), char(0xc3), char(0xa4), char(0xc3), char(0xb6), 0x00 };
59 #endif
60 #ifndef HAS_NO_WCHAR
61   const wchar_t wstr[] = { 0xe5, 0xe4, 0xf6, 0x00 };
62 #endif
63 
64   // Test well-defined conversion between UTF-8, UTF-16 and UTF-32
65   {
66     const fs::path p(u16str);
67     assert(p.u8string() == u8str);
68     assert(p.u16string() == u16str);
69     assert(p.u32string() == u32str);
70     assert(p.string<char16_t>() == u16str);
71     assert(p.string<char32_t>() == u32str);
72   }
73   {
74     const fs::path p(u32str);
75     assert(p.u8string() == u8str);
76     assert(p.u16string() == u16str);
77     assert(p.u32string() == u32str);
78     assert(p.string<char16_t>() == u16str);
79     assert(p.string<char32_t>() == u32str);
80   }
81   {
82     const fs::path p = fs::u8path(str);
83     assert(p.u8string() == u8str);
84     assert(p.u16string() == u16str);
85     assert(p.u32string() == u32str);
86     assert(p.string<char16_t>() == u16str);
87     assert(p.string<char32_t>() == u32str);
88   }
89 #if TEST_STD_VER > 17 && defined(__cpp_lib_char8_t)
90   {
91     // In C++20, the path constructor can unambiguously handle UTF-8 input,
92     // even if the plain char constructor would treat it as something else.
93     const fs::path p(u8str);
94     assert(p.u8string() == u8str);
95     assert(p.u16string() == u16str);
96     assert(p.u32string() == u32str);
97     assert(p.string<char8_t>() == u8str);
98     assert(p.string<char16_t>() == u16str);
99     assert(p.string<char32_t>() == u32str);
100   }
101   // Check reading various inputs with string<char8_t>()
102   {
103     const fs::path p(u16str);
104     assert(p.string<char8_t>() == u8str);
105   }
106   {
107     const fs::path p(u32str);
108     assert(p.string<char8_t>() == u8str);
109   }
110   {
111     const fs::path p = fs::u8path(str);
112     assert(p.string<char8_t>() == u8str);
113   }
114 #endif
115 #ifndef HAS_NO_WCHAR
116   // Test conversion to/from wchar_t.
117   {
118     const fs::path p(u16str);
119     assert(p.wstring() == wstr);
120     assert(p.string<wchar_t>() == wstr);
121   }
122   {
123     const fs::path p = fs::u8path(str);
124     assert(p.wstring() == wstr);
125     assert(p.string<wchar_t>() == wstr);
126   }
127   {
128     const fs::path p(wstr);
129     assert(p.wstring() == wstr);
130     assert(p.u8string() == u8str);
131     assert(p.u16string() == u16str);
132     assert(p.u32string() == u32str);
133     assert(p.string<wchar_t>() == wstr);
134   }
135 #endif
136 #ifndef _WIN32
137   // Test conversion to/from regular char-based string. On POSIX, this
138   // is implied to convert to/from UTF-8.
139   {
140     const fs::path p(str);
141     assert(p.string() == str);
142     assert(p.u16string() == u16str);
143     assert(p.string<char>() == str);
144   }
145   {
146     const fs::path p(u16str);
147     assert(p.string() == str);
148     assert(p.string<char>() == str);
149   }
150 #else
151   // On windows, the narrow char-based input/output is supposed to be
152   // in the charset that narrow file IO APIs use. This can either be the
153   // current active code page (ACP) or the OEM code page, exposed by
154   // the AreFileApisANSI() function, and settable with SetFileApisToANSI() and
155   // SetFileApisToOEM(). We can't set which codepage is active within
156   // the process, but for some specific known ones, we can check if they
157   // behave as expected.
158   SetFileApisToANSI();
159   if (GetACP() == 1252) {
160     const char latin1[] = { char(0xe5), char(0xe4), char(0xf6), 0x00 };
161     {
162       const fs::path p(wstr);
163       assert(p.string() == latin1);
164       assert(p.string<char>() == latin1);
165     }
166     {
167       const fs::path p(latin1);
168       assert(p.string() == latin1);
169       assert(p.wstring() == wstr);
170       assert(p.u8string() == u8str);
171       assert(p.u16string() == u16str);
172       assert(p.string<char>() == latin1);
173       assert(p.string<wchar_t>() == wstr);
174     }
175   }
176   SetFileApisToOEM();
177   if (GetOEMCP() == 850 || GetOEMCP() == 437) {
178     // These chars are identical in both CP 850 and 437
179     const char cp850[] = { char(0x86), char(0x84), char(0x94), 0x00 };
180     {
181       const fs::path p(wstr);
182       assert(p.string() == cp850);
183       assert(p.string<char>() == cp850);
184     }
185     {
186       const fs::path p(cp850);
187       assert(p.string() == cp850);
188       assert(p.wstring() == wstr);
189       assert(p.u8string() == u8str);
190       assert(p.u16string() == u16str);
191       assert(p.string<char>() == cp850);
192       assert(p.string<wchar_t>() == wstr);
193     }
194   }
195 #endif
196 }
197 
// Test conversion with strings whose characters don't fit within one UTF-16
// code unit. Here, wchar_t can be either UTF-16 or UTF-32 depending on its
// size on the particular platform.
201 static void test_wide_unicode()
202 {
203   const char16_t u16str[] = { 0xd801, 0xdc37, 0x00 };
204   const char32_t u32str[] = { 0x10437, 0x00 };
205 #if TEST_STD_VER > 17 && defined(__cpp_lib_char8_t)
206   const char8_t u8str[] = { 0xf0, 0x90, 0x90, 0xb7, 0x00 };
207 #else
208   const char u8str[] = { char(0xf0), char(0x90), char(0x90), char(0xb7), 0x00 };
209 #endif
210   const char str[] = { char(0xf0), char(0x90), char(0x90), char(0xb7), 0x00 };
211   {
212     const fs::path p = fs::u8path(str);
213     assert(p.u8string() == u8str);
214     assert(p.u16string() == u16str);
215     assert(p.u32string() == u32str);
216   }
217   {
218     const fs::path p(u16str);
219     assert(p.u8string() == u8str);
220     assert(p.u16string() == u16str);
221     assert(p.u32string() == u32str);
222   }
223   {
224     const fs::path p(u32str);
225     assert(p.u8string() == u8str);
226     assert(p.u16string() == u16str);
227     assert(p.u32string() == u32str);
228   }
229 #if !defined(HAS_NO_WCHAR) && defined(__SIZEOF_WCHAR_T__)
230 #if __SIZEOF_WCHAR_T__ == 2
231   const wchar_t wstr[] = { 0xd801, 0xdc37, 0x00 };
232 #else
233   const wchar_t wstr[] = { 0x10437, 0x00 };
234 #endif
235   // Test conversion to/from wchar_t.
236   // libstdc++ doesn't define conversions from/to wchar_t outside of windows.
237   {
238     const fs::path p = fs::u8path(str);
239     assert(p.wstring() == wstr);
240   }
241   {
242     const fs::path p(u16str);
243     assert(p.wstring() == wstr);
244   }
245   {
246     const fs::path p(u32str);
247     assert(p.wstring() == wstr);
248   }
249   {
250     const fs::path p(wstr);
251     assert(p.u8string() == u8str);
252     assert(p.u16string() == u16str);
253     assert(p.u32string() == u32str);
254     assert(p.wstring() == wstr);
255   }
256 #endif
257 }
258 
259 // Test appending paths in different encodings.
260 static void test_append()
261 {
262   const char16_t u16str[] = { 0xd801, 0xdc37, 0x00 };
263   const char32_t u32str[] = { 0x10437, 0x00 };
264   const char32_t u32ref[] = { 0x10437, fs::path::preferred_separator, 0x10437, fs::path::preferred_separator, 0x10437, 0x00 };
265   const char str[] = { char(0xf0), char(0x90), char(0x90), char(0xb7), 0x00 };
266   {
267     fs::path p = fs::u8path(str) / u16str / u32str;
268     assert(p.u32string() == u32ref);
269     p = fs::u8path(str).append(u16str).append(u32str);
270     assert(p.u32string() == u32ref);
271     p = fs::u8path(str);
272     p /= u16str;
273     p /= u32str;
274     assert(p.u32string() == u32ref);
275   }
276 #if !defined(HAS_NO_WCHAR) && defined(__SIZEOF_WCHAR_T__)
277 #if __SIZEOF_WCHAR_T__ == 2
278   const wchar_t wstr[] = { 0xd801, 0xdc37, 0x00 };
279 #else
280   const wchar_t wstr[] = { 0x10437, 0x00 };
281 #endif
282   // Test conversion from wchar_t.
283   // libstdc++ doesn't define conversions from/to wchar_t outside of windows.
284   {
285     fs::path p = fs::path(u16str) / wstr / u32str;
286     assert(p.u32string() == u32ref);
287     p = fs::path(u16str).append(wstr).append(u32str);
288     assert(p.u32string() == u32ref);
289     p = fs::path(u16str);
290     p /= wstr;
291     p /= u32str;
292     assert(p.u32string() == u32ref);
293   }
294 #endif
295 }
296 
297 static void test_concat()
298 {
299   const char16_t u16str[] = { 0xd801, 0xdc37, 0x00 };
300   const char32_t u32str[] = { 0x10437, 0x00 };
301   const char32_t u32ref[] = { 0x10437, 0x10437, 0x10437, 0x00 };
302   const char str[] = { char(0xf0), char(0x90), char(0x90), char(0xb7), 0x00 };
303   {
304     fs::path p = fs::u8path(str);
305     p += u16str;
306     p += u32str;
307     assert(p.u32string() == u32ref);
308     p = fs::u8path(str).concat(u16str).concat(u32str);
309     assert(p.u32string() == u32ref);
310   }
311 #if !defined(HAS_NO_WCHAR) && defined(__SIZEOF_WCHAR_T__)
312 #if __SIZEOF_WCHAR_T__ == 2
313   const wchar_t wstr[] = { 0xd801, 0xdc37, 0x00 };
314 #else
315   const wchar_t wstr[] = { 0x10437, 0x00 };
316 #endif
317   // Test conversion from wchar_t.
318   // libstdc++ doesn't define conversions from/to wchar_t outside of windows.
319   {
320     fs::path p = fs::path(u16str);
321     p += wstr;
322     p += u32str;
323     assert(p.u32string() == u32ref);
324     p = fs::path(u16str).concat(wstr).concat(u32str);
325     assert(p.u32string() == u32ref);
326   }
327 #endif
328 }
329 
330 static void test_append_concat_narrow()
331 {
332   const char16_t u16str[] = { 0xe5, 0x00 };
333   const char32_t u32ref_append[] = { 0xe5, fs::path::preferred_separator, 0xe5, 0x00 };
334   const char32_t u32ref_concat[] = { 0xe5, 0xe5, 0x00 };
335 
336 #if TEST_STD_VER > 17 && defined(__cpp_lib_char8_t)
337   {
338     const char8_t u8str[] = { 0xc3, 0xa5, 0x00 };
339     // In C++20, appends of a char8_t string is unambiguously treated as
340     // UTF-8.
341     fs::path p = fs::path(u16str) / u8str;
342     assert(p.u32string() == u32ref_append);
343     p = fs::path(u16str).append(u8str);
344     assert(p.u32string() == u32ref_append);
345     p = fs::path(u16str);
346     p /= u8str;
347     assert(p.u32string() == u32ref_append);
348     p = fs::path(u16str).concat(u8str);
349     assert(p.u32string() == u32ref_concat);
350     p = fs::path(u16str);
351     p += u8str;
352     assert(p.u32string() == u32ref_concat);
353   }
354 #endif
355 #ifndef _WIN32
356   // Test appending a regular char-based string. On POSIX, this
357   // is implied to convert to/from UTF-8.
358   {
359     const char str[] = { char(0xc3), char(0xa5), 0x00 }; // UTF8, in a regular char string
360     fs::path p = fs::path(u16str) / str;
361     assert(p.u32string() == u32ref_append);
362     p = fs::path(u16str).append(str);
363     assert(p.u32string() == u32ref_append);
364     p = fs::path(u16str);
365     p /= str;
366     assert(p.u32string() == u32ref_append);
367     p = fs::path(u16str).concat(str);
368     assert(p.u32string() == u32ref_concat);
369     p = fs::path(u16str);
370     p += str;
371     assert(p.u32string() == u32ref_concat);
372   }
373 #else
374   SetFileApisToANSI();
375   if (GetACP() == 1252) {
376     const char latin1[] = { char(0xe5), 0x00 };
377     fs::path p = fs::path(u16str) / latin1;
378     assert(p.u32string() == u32ref_append);
379     p = fs::path(u16str).append(latin1);
380     assert(p.u32string() == u32ref_append);
381     p = fs::path(u16str);
382     p /= latin1;
383     assert(p.u32string() == u32ref_append);
384     p = fs::path(u16str).concat(latin1);
385     assert(p.u32string() == u32ref_concat);
386     p = fs::path(u16str);
387     p += latin1;
388     assert(p.u32string() == u32ref_concat);
389   }
390   SetFileApisToOEM();
391   if (GetOEMCP() == 850 || GetOEMCP() == 437) {
392     // This chars is identical in both CP 850 and 437
393     const char cp850[] = { char(0x86), 0x00 };
394     fs::path p = fs::path(u16str) / cp850;
395     assert(p.u32string() == u32ref_append);
396     p = fs::path(u16str).append(cp850);
397     assert(p.u32string() == u32ref_append);
398     p = fs::path(u16str);
399     p /= cp850;
400     assert(p.u32string() == u32ref_append);
401     p = fs::path(u16str).concat(cp850);
402     assert(p.u32string() == u32ref_concat);
403     p = fs::path(u16str);
404     p += cp850;
405     assert(p.u32string() == u32ref_concat);
406   }
407 #endif
408 }
409 
410 int main(int, char**)
411 {
412   test_latin_unicode();
413   test_wide_unicode();
414   test_append();
415   test_concat();
416   test_append_concat_narrow();
417 
418   return 0;
419 }
420