1 //===----------------------------------------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 // UNSUPPORTED: no-localization 10 // UNSUPPORTED: c++03 11 // ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS 12 13 // <filesystem> 14 15 // class path 16 17 // Test constructors, accessors and modifiers that convert from/to various 18 // character encodings. Constructors and modifiers (append, concat, 19 // operator/=, operator+=) accept inputs with various character encodings, 20 // and accessors (*string(), string<>(), u8string()) export the string with 21 // various encodings. 22 // 23 // Some encodings are standardized; char16_t, char32_t and the u8string 24 // accessor and u8path constructor (and normal functions taking char8_t in 25 // C++20) convert from/to UTF-16, UTF-32 and UTF-8. wchar_t can be either 26 // UTF-16 or UTF-32 depending on the size of the wchar_t type, or can be 27 // left unimplemented. 28 // 29 // Plain char is implicitly UTF-8 on posix systems. On Windows, plain char 30 // is supposed to be in the same encoding as the platform's native file 31 // system APIs consumes in the functions that take narrow strings as path 32 // names. 33 34 35 #include "filesystem_include.h" 36 #include <type_traits> 37 #include <cassert> 38 39 #include "test_macros.h" 40 #include "filesystem_test_helper.h" 41 42 // Test conversion with strings that fit within the latin1 charset, that fit 43 // within one code point in UTF-16, and that can be expressible in certain 44 // one-byte code pages. 45 static void test_latin_unicode() 46 { 47 const char16_t u16str[] = { 0xe5, 0xe4, 0xf6, 0x00 }; 48 const char32_t u32str[] = { 0xe5, 0xe4, 0xf6, 0x00 }; 49 const char str[] = { char(0xc3), char(0xa5), char(0xc3), char(0xa4), char(0xc3), char(0xb6), 0x00 }; // UTF8, in a regular char string 50 #if TEST_STD_VER > 17 && defined(__cpp_lib_char8_t) 51 const char8_t u8str[] = { 0xc3, 0xa5, 0xc3, 0xa4, 0xc3, 0xb6, 0x00 }; 52 #else 53 const char u8str[] = { char(0xc3), char(0xa5), char(0xc3), char(0xa4), char(0xc3), char(0xb6), 0x00 }; 54 #endif 55 #ifndef TEST_HAS_NO_WIDE_CHARACTERS 56 const wchar_t wstr[] = { 0xe5, 0xe4, 0xf6, 0x00 }; 57 #endif 58 59 // Test well-defined conversion between UTF-8, UTF-16 and UTF-32 60 { 61 const fs::path p(u16str); 62 assert(p.u8string() == u8str); 63 assert(p.u16string() == u16str); 64 assert(p.u32string() == u32str); 65 assert(p.string<char16_t>() == u16str); 66 assert(p.string<char32_t>() == u32str); 67 } 68 { 69 const fs::path p(u32str); 70 assert(p.u8string() == u8str); 71 assert(p.u16string() == u16str); 72 assert(p.u32string() == u32str); 73 assert(p.string<char16_t>() == u16str); 74 assert(p.string<char32_t>() == u32str); 75 } 76 { 77 const fs::path p = fs::u8path(str); 78 assert(p.u8string() == u8str); 79 assert(p.u16string() == u16str); 80 assert(p.u32string() == u32str); 81 assert(p.string<char16_t>() == u16str); 82 assert(p.string<char32_t>() == u32str); 83 } 84 #if TEST_STD_VER > 17 && defined(__cpp_lib_char8_t) 85 { 86 // In C++20, the path constructor can unambiguously handle UTF-8 input, 87 // even if the plain char constructor would treat it as something else. 88 const fs::path p(u8str); 89 assert(p.u8string() == u8str); 90 assert(p.u16string() == u16str); 91 assert(p.u32string() == u32str); 92 assert(p.string<char8_t>() == u8str); 93 assert(p.string<char16_t>() == u16str); 94 assert(p.string<char32_t>() == u32str); 95 } 96 // Check reading various inputs with string<char8_t>() 97 { 98 const fs::path p(u16str); 99 assert(p.string<char8_t>() == u8str); 100 } 101 { 102 const fs::path p(u32str); 103 assert(p.string<char8_t>() == u8str); 104 } 105 { 106 const fs::path p = fs::u8path(str); 107 assert(p.string<char8_t>() == u8str); 108 } 109 #endif 110 #ifndef TEST_HAS_NO_WIDE_CHARACTERS 111 // Test conversion to/from wchar_t. 112 { 113 const fs::path p(u16str); 114 assert(p.wstring() == wstr); 115 assert(p.string<wchar_t>() == wstr); 116 } 117 { 118 const fs::path p = fs::u8path(str); 119 assert(p.wstring() == wstr); 120 assert(p.string<wchar_t>() == wstr); 121 } 122 { 123 const fs::path p(wstr); 124 assert(p.wstring() == wstr); 125 assert(p.u8string() == u8str); 126 assert(p.u16string() == u16str); 127 assert(p.u32string() == u32str); 128 assert(p.string<wchar_t>() == wstr); 129 } 130 #endif // TEST_HAS_NO_WIDE_CHARACTERS 131 #ifndef _WIN32 132 // Test conversion to/from regular char-based string. On POSIX, this 133 // is implied to convert to/from UTF-8. 134 { 135 const fs::path p(str); 136 assert(p.string() == str); 137 assert(p.u16string() == u16str); 138 assert(p.string<char>() == str); 139 } 140 { 141 const fs::path p(u16str); 142 assert(p.string() == str); 143 assert(p.string<char>() == str); 144 } 145 #else 146 // On windows, the narrow char-based input/output is supposed to be 147 // in the charset that narrow file IO APIs use. This can either be the 148 // current active code page (ACP) or the OEM code page, exposed by 149 // the AreFileApisANSI() function, and settable with SetFileApisToANSI() and 150 // SetFileApisToOEM(). We can't set which codepage is active within 151 // the process, but for some specific known ones, we can check if they 152 // behave as expected. 153 SetFileApisToANSI(); 154 if (GetACP() == 1252) { 155 const char latin1[] = { char(0xe5), char(0xe4), char(0xf6), 0x00 }; 156 { 157 const fs::path p(wstr); 158 assert(p.string() == latin1); 159 assert(p.string<char>() == latin1); 160 } 161 { 162 const fs::path p(latin1); 163 assert(p.string() == latin1); 164 assert(p.wstring() == wstr); 165 assert(p.u8string() == u8str); 166 assert(p.u16string() == u16str); 167 assert(p.string<char>() == latin1); 168 assert(p.string<wchar_t>() == wstr); 169 } 170 } 171 SetFileApisToOEM(); 172 if (GetOEMCP() == 850 || GetOEMCP() == 437) { 173 // These chars are identical in both CP 850 and 437 174 const char cp850[] = { char(0x86), char(0x84), char(0x94), 0x00 }; 175 { 176 const fs::path p(wstr); 177 assert(p.string() == cp850); 178 assert(p.string<char>() == cp850); 179 } 180 { 181 const fs::path p(cp850); 182 assert(p.string() == cp850); 183 assert(p.wstring() == wstr); 184 assert(p.u8string() == u8str); 185 assert(p.u16string() == u16str); 186 assert(p.string<char>() == cp850); 187 assert(p.string<wchar_t>() == wstr); 188 } 189 } 190 #endif 191 } 192 193 // Test conversion with strings that don't fit within one UTF-16 code point. 194 // Here, wchar_t can be either UTF-16 or UTF-32 depending on the size on the 195 // particular platform. 196 static void test_wide_unicode() 197 { 198 const char16_t u16str[] = { 0xd801, 0xdc37, 0x00 }; 199 const char32_t u32str[] = { 0x10437, 0x00 }; 200 #if TEST_STD_VER > 17 && defined(__cpp_lib_char8_t) 201 const char8_t u8str[] = { 0xf0, 0x90, 0x90, 0xb7, 0x00 }; 202 #else 203 const char u8str[] = { char(0xf0), char(0x90), char(0x90), char(0xb7), 0x00 }; 204 #endif 205 const char str[] = { char(0xf0), char(0x90), char(0x90), char(0xb7), 0x00 }; 206 { 207 const fs::path p = fs::u8path(str); 208 assert(p.u8string() == u8str); 209 assert(p.u16string() == u16str); 210 assert(p.u32string() == u32str); 211 } 212 { 213 const fs::path p(u16str); 214 assert(p.u8string() == u8str); 215 assert(p.u16string() == u16str); 216 assert(p.u32string() == u32str); 217 } 218 { 219 const fs::path p(u32str); 220 assert(p.u8string() == u8str); 221 assert(p.u16string() == u16str); 222 assert(p.u32string() == u32str); 223 } 224 #if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(__SIZEOF_WCHAR_T__) 225 # if __SIZEOF_WCHAR_T__ == 2 226 const wchar_t wstr[] = { 0xd801, 0xdc37, 0x00 }; 227 # else 228 const wchar_t wstr[] = { 0x10437, 0x00 }; 229 # endif 230 // Test conversion to/from wchar_t. 231 { 232 const fs::path p = fs::u8path(str); 233 assert(p.wstring() == wstr); 234 } 235 { 236 const fs::path p(u16str); 237 assert(p.wstring() == wstr); 238 } 239 { 240 const fs::path p(u32str); 241 assert(p.wstring() == wstr); 242 } 243 { 244 const fs::path p(wstr); 245 assert(p.u8string() == u8str); 246 assert(p.u16string() == u16str); 247 assert(p.u32string() == u32str); 248 assert(p.wstring() == wstr); 249 } 250 #endif // !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(__SIZEOF_WCHAR_T__) 251 } 252 253 // Test appending paths in different encodings. 254 static void test_append() 255 { 256 const char16_t u16str[] = { 0xd801, 0xdc37, 0x00 }; 257 const char32_t u32str[] = { 0x10437, 0x00 }; 258 const char32_t u32ref[] = { 0x10437, fs::path::preferred_separator, 0x10437, fs::path::preferred_separator, 0x10437, 0x00 }; 259 const char str[] = { char(0xf0), char(0x90), char(0x90), char(0xb7), 0x00 }; 260 { 261 fs::path p = fs::u8path(str) / u16str / u32str; 262 assert(p.u32string() == u32ref); 263 p = fs::u8path(str).append(u16str).append(u32str); 264 assert(p.u32string() == u32ref); 265 p = fs::u8path(str); 266 p /= u16str; 267 p /= u32str; 268 assert(p.u32string() == u32ref); 269 } 270 #if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(__SIZEOF_WCHAR_T__) 271 # if __SIZEOF_WCHAR_T__ == 2 272 const wchar_t wstr[] = { 0xd801, 0xdc37, 0x00 }; 273 # else 274 const wchar_t wstr[] = { 0x10437, 0x00 }; 275 # endif 276 // Test conversion from wchar_t. 277 { 278 fs::path p = fs::path(u16str) / wstr / u32str; 279 assert(p.u32string() == u32ref); 280 p = fs::path(u16str).append(wstr).append(u32str); 281 assert(p.u32string() == u32ref); 282 p = fs::path(u16str); 283 p /= wstr; 284 p /= u32str; 285 assert(p.u32string() == u32ref); 286 } 287 #endif // !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(__SIZEOF_WCHAR_T__) 288 } 289 290 static void test_concat() 291 { 292 const char16_t u16str[] = { 0xd801, 0xdc37, 0x00 }; 293 const char32_t u32str[] = { 0x10437, 0x00 }; 294 const char32_t u32ref[] = { 0x10437, 0x10437, 0x10437, 0x00 }; 295 const char str[] = { char(0xf0), char(0x90), char(0x90), char(0xb7), 0x00 }; 296 { 297 fs::path p = fs::u8path(str); 298 p += u16str; 299 p += u32str; 300 assert(p.u32string() == u32ref); 301 p = fs::u8path(str).concat(u16str).concat(u32str); 302 assert(p.u32string() == u32ref); 303 } 304 #if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(__SIZEOF_WCHAR_T__) 305 # if __SIZEOF_WCHAR_T__ == 2 306 const wchar_t wstr[] = { 0xd801, 0xdc37, 0x00 }; 307 # else 308 const wchar_t wstr[] = { 0x10437, 0x00 }; 309 # endif 310 // Test conversion from wchar_t. 311 { 312 fs::path p = fs::path(u16str); 313 p += wstr; 314 p += u32str; 315 assert(p.u32string() == u32ref); 316 p = fs::path(u16str).concat(wstr).concat(u32str); 317 assert(p.u32string() == u32ref); 318 } 319 #endif // !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(__SIZEOF_WCHAR_T__) 320 } 321 322 static void test_append_concat_narrow() 323 { 324 const char16_t u16str[] = { 0xe5, 0x00 }; 325 const char32_t u32ref_append[] = { 0xe5, fs::path::preferred_separator, 0xe5, 0x00 }; 326 const char32_t u32ref_concat[] = { 0xe5, 0xe5, 0x00 }; 327 328 #if TEST_STD_VER > 17 && defined(__cpp_lib_char8_t) 329 { 330 const char8_t u8str[] = { 0xc3, 0xa5, 0x00 }; 331 // In C++20, appends of a char8_t string is unambiguously treated as 332 // UTF-8. 333 fs::path p = fs::path(u16str) / u8str; 334 assert(p.u32string() == u32ref_append); 335 p = fs::path(u16str).append(u8str); 336 assert(p.u32string() == u32ref_append); 337 p = fs::path(u16str); 338 p /= u8str; 339 assert(p.u32string() == u32ref_append); 340 p = fs::path(u16str).concat(u8str); 341 assert(p.u32string() == u32ref_concat); 342 p = fs::path(u16str); 343 p += u8str; 344 assert(p.u32string() == u32ref_concat); 345 } 346 #endif 347 #ifndef _WIN32 348 // Test appending a regular char-based string. On POSIX, this 349 // is implied to convert to/from UTF-8. 350 { 351 const char str[] = { char(0xc3), char(0xa5), 0x00 }; // UTF8, in a regular char string 352 fs::path p = fs::path(u16str) / str; 353 assert(p.u32string() == u32ref_append); 354 p = fs::path(u16str).append(str); 355 assert(p.u32string() == u32ref_append); 356 p = fs::path(u16str); 357 p /= str; 358 assert(p.u32string() == u32ref_append); 359 p = fs::path(u16str).concat(str); 360 assert(p.u32string() == u32ref_concat); 361 p = fs::path(u16str); 362 p += str; 363 assert(p.u32string() == u32ref_concat); 364 } 365 #else 366 SetFileApisToANSI(); 367 if (GetACP() == 1252) { 368 const char latin1[] = { char(0xe5), 0x00 }; 369 fs::path p = fs::path(u16str) / latin1; 370 assert(p.u32string() == u32ref_append); 371 p = fs::path(u16str).append(latin1); 372 assert(p.u32string() == u32ref_append); 373 p = fs::path(u16str); 374 p /= latin1; 375 assert(p.u32string() == u32ref_append); 376 p = fs::path(u16str).concat(latin1); 377 assert(p.u32string() == u32ref_concat); 378 p = fs::path(u16str); 379 p += latin1; 380 assert(p.u32string() == u32ref_concat); 381 } 382 SetFileApisToOEM(); 383 if (GetOEMCP() == 850 || GetOEMCP() == 437) { 384 // This chars is identical in both CP 850 and 437 385 const char cp850[] = { char(0x86), 0x00 }; 386 fs::path p = fs::path(u16str) / cp850; 387 assert(p.u32string() == u32ref_append); 388 p = fs::path(u16str).append(cp850); 389 assert(p.u32string() == u32ref_append); 390 p = fs::path(u16str); 391 p /= cp850; 392 assert(p.u32string() == u32ref_append); 393 p = fs::path(u16str).concat(cp850); 394 assert(p.u32string() == u32ref_concat); 395 p = fs::path(u16str); 396 p += cp850; 397 assert(p.u32string() == u32ref_concat); 398 } 399 #endif 400 } 401 402 int main(int, char**) 403 { 404 test_latin_unicode(); 405 test_wide_unicode(); 406 test_append(); 407 test_concat(); 408 test_append_concat_narrow(); 409 410 return 0; 411 } 412