1 //===----------------------------------------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 // UNSUPPORTED: libcpp-has-no-localization 10 // UNSUPPORTED: c++03 11 // ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS 12 13 // <filesystem> 14 15 // class path 16 17 // Test constructors, accessors and modifiers that convert from/to various 18 // character encodings. Constructors and modifiers (append, concat, 19 // operator/=, operator+=) accept inputs with various character encodings, 20 // and accessors (*string(), string<>(), u8string()) export the string with 21 // various encodings. 22 // 23 // Some encodings are standardized; char16_t, char32_t and the u8string 24 // accessor and u8path constructor (and normal functions taking char8_t in 25 // C++20) convert from/to UTF-16, UTF-32 and UTF-8. wchar_t can be either 26 // UTF-16 or UTF-32 depending on the size of the wchar_t type, or can be 27 // left unimplemented. 28 // 29 // Plain char is implicitly UTF-8 on posix systems. On Windows, plain char 30 // is supposed to be in the same encoding as the platform's native file 31 // system APIs consumes in the functions that take narrow strings as path 32 // names. 33 34 35 #include "filesystem_include.h" 36 #include <type_traits> 37 #include <cassert> 38 39 #include "test_macros.h" 40 #include "filesystem_test_helper.h" 41 42 // libstdc++ doesn't define conversions from/to wchar_t outside of windows. 43 #if defined(__GLIBCXX__) && !defined(_WIN32) 44 # define HAS_NO_WCHAR 45 #endif 46 47 // Test conversion with strings that fit within the latin1 charset, that fit 48 // within one code point in UTF-16, and that can be expressible in certain 49 // one-byte code pages. 50 static void test_latin_unicode() 51 { 52 const char16_t u16str[] = { 0xe5, 0xe4, 0xf6, 0x00 }; 53 const char32_t u32str[] = { 0xe5, 0xe4, 0xf6, 0x00 }; 54 const char str[] = { char(0xc3), char(0xa5), char(0xc3), char(0xa4), char(0xc3), char(0xb6), 0x00 }; // UTF8, in a regular char string 55 #if TEST_STD_VER > 17 && defined(__cpp_lib_char8_t) 56 const char8_t u8str[] = { 0xc3, 0xa5, 0xc3, 0xa4, 0xc3, 0xb6, 0x00 }; 57 #else 58 const char u8str[] = { char(0xc3), char(0xa5), char(0xc3), char(0xa4), char(0xc3), char(0xb6), 0x00 }; 59 #endif 60 #ifndef HAS_NO_WCHAR 61 const wchar_t wstr[] = { 0xe5, 0xe4, 0xf6, 0x00 }; 62 #endif 63 64 // Test well-defined conversion between UTF-8, UTF-16 and UTF-32 65 { 66 const fs::path p(u16str); 67 assert(p.u8string() == u8str); 68 assert(p.u16string() == u16str); 69 assert(p.u32string() == u32str); 70 assert(p.string<char16_t>() == u16str); 71 assert(p.string<char32_t>() == u32str); 72 } 73 { 74 const fs::path p(u32str); 75 assert(p.u8string() == u8str); 76 assert(p.u16string() == u16str); 77 assert(p.u32string() == u32str); 78 assert(p.string<char16_t>() == u16str); 79 assert(p.string<char32_t>() == u32str); 80 } 81 { 82 const fs::path p = fs::u8path(str); 83 assert(p.u8string() == u8str); 84 assert(p.u16string() == u16str); 85 assert(p.u32string() == u32str); 86 assert(p.string<char16_t>() == u16str); 87 assert(p.string<char32_t>() == u32str); 88 } 89 #if TEST_STD_VER > 17 && defined(__cpp_lib_char8_t) 90 { 91 // In C++20, the path constructor can unambiguously handle UTF-8 input, 92 // even if the plain char constructor would treat it as something else. 93 const fs::path p(u8str); 94 assert(p.u8string() == u8str); 95 assert(p.u16string() == u16str); 96 assert(p.u32string() == u32str); 97 assert(p.string<char8_t>() == u8str); 98 assert(p.string<char16_t>() == u16str); 99 assert(p.string<char32_t>() == u32str); 100 } 101 // Check reading various inputs with string<char8_t>() 102 { 103 const fs::path p(u16str); 104 assert(p.string<char8_t>() == u8str); 105 } 106 { 107 const fs::path p(u32str); 108 assert(p.string<char8_t>() == u8str); 109 } 110 { 111 const fs::path p = fs::u8path(str); 112 assert(p.string<char8_t>() == u8str); 113 } 114 #endif 115 #ifndef HAS_NO_WCHAR 116 // Test conversion to/from wchar_t. 117 { 118 const fs::path p(u16str); 119 assert(p.wstring() == wstr); 120 assert(p.string<wchar_t>() == wstr); 121 } 122 { 123 const fs::path p = fs::u8path(str); 124 assert(p.wstring() == wstr); 125 assert(p.string<wchar_t>() == wstr); 126 } 127 { 128 const fs::path p(wstr); 129 assert(p.wstring() == wstr); 130 assert(p.u8string() == u8str); 131 assert(p.u16string() == u16str); 132 assert(p.u32string() == u32str); 133 assert(p.string<wchar_t>() == wstr); 134 } 135 #endif 136 #ifndef _WIN32 137 // Test conversion to/from regular char-based string. On POSIX, this 138 // is implied to convert to/from UTF-8. 139 { 140 const fs::path p(str); 141 assert(p.string() == str); 142 assert(p.u16string() == u16str); 143 assert(p.string<char>() == str); 144 } 145 { 146 const fs::path p(u16str); 147 assert(p.string() == str); 148 assert(p.string<char>() == str); 149 } 150 #else 151 // On windows, the narrow char-based input/output is supposed to be 152 // in the charset that narrow file IO APIs use. This can either be the 153 // current active code page (ACP) or the OEM code page, exposed by 154 // the AreFileApisANSI() function, and settable with SetFileApisToANSI() and 155 // SetFileApisToOEM(). We can't set which codepage is active within 156 // the process, but for some specific known ones, we can check if they 157 // behave as expected. 158 SetFileApisToANSI(); 159 if (GetACP() == 1252) { 160 const char latin1[] = { char(0xe5), char(0xe4), char(0xf6), 0x00 }; 161 { 162 const fs::path p(wstr); 163 assert(p.string() == latin1); 164 assert(p.string<char>() == latin1); 165 } 166 { 167 const fs::path p(latin1); 168 assert(p.string() == latin1); 169 assert(p.wstring() == wstr); 170 assert(p.u8string() == u8str); 171 assert(p.u16string() == u16str); 172 assert(p.string<char>() == latin1); 173 assert(p.string<wchar_t>() == wstr); 174 } 175 } 176 SetFileApisToOEM(); 177 if (GetOEMCP() == 850 || GetOEMCP() == 437) { 178 // These chars are identical in both CP 850 and 437 179 const char cp850[] = { char(0x86), char(0x84), char(0x94), 0x00 }; 180 { 181 const fs::path p(wstr); 182 assert(p.string() == cp850); 183 assert(p.string<char>() == cp850); 184 } 185 { 186 const fs::path p(cp850); 187 assert(p.string() == cp850); 188 assert(p.wstring() == wstr); 189 assert(p.u8string() == u8str); 190 assert(p.u16string() == u16str); 191 assert(p.string<char>() == cp850); 192 assert(p.string<wchar_t>() == wstr); 193 } 194 } 195 #endif 196 } 197 198 // Test conversion with strings that don't fit within one UTF-16 code point. 199 // Here, wchar_t can be either UTF-16 or UTF-32 depending on the size on the 200 // particular platform. 201 static void test_wide_unicode() 202 { 203 const char16_t u16str[] = { 0xd801, 0xdc37, 0x00 }; 204 const char32_t u32str[] = { 0x10437, 0x00 }; 205 #if TEST_STD_VER > 17 && defined(__cpp_lib_char8_t) 206 const char8_t u8str[] = { 0xf0, 0x90, 0x90, 0xb7, 0x00 }; 207 #else 208 const char u8str[] = { char(0xf0), char(0x90), char(0x90), char(0xb7), 0x00 }; 209 #endif 210 const char str[] = { char(0xf0), char(0x90), char(0x90), char(0xb7), 0x00 }; 211 { 212 const fs::path p = fs::u8path(str); 213 assert(p.u8string() == u8str); 214 assert(p.u16string() == u16str); 215 assert(p.u32string() == u32str); 216 } 217 { 218 const fs::path p(u16str); 219 assert(p.u8string() == u8str); 220 assert(p.u16string() == u16str); 221 assert(p.u32string() == u32str); 222 } 223 { 224 const fs::path p(u32str); 225 assert(p.u8string() == u8str); 226 assert(p.u16string() == u16str); 227 assert(p.u32string() == u32str); 228 } 229 #if !defined(HAS_NO_WCHAR) && defined(__SIZEOF_WCHAR_T__) 230 #if __SIZEOF_WCHAR_T__ == 2 231 const wchar_t wstr[] = { 0xd801, 0xdc37, 0x00 }; 232 #else 233 const wchar_t wstr[] = { 0x10437, 0x00 }; 234 #endif 235 // Test conversion to/from wchar_t. 236 // libstdc++ doesn't define conversions from/to wchar_t outside of windows. 237 { 238 const fs::path p = fs::u8path(str); 239 assert(p.wstring() == wstr); 240 } 241 { 242 const fs::path p(u16str); 243 assert(p.wstring() == wstr); 244 } 245 { 246 const fs::path p(u32str); 247 assert(p.wstring() == wstr); 248 } 249 { 250 const fs::path p(wstr); 251 assert(p.u8string() == u8str); 252 assert(p.u16string() == u16str); 253 assert(p.u32string() == u32str); 254 assert(p.wstring() == wstr); 255 } 256 #endif 257 } 258 259 // Test appending paths in different encodings. 260 static void test_append() 261 { 262 const char16_t u16str[] = { 0xd801, 0xdc37, 0x00 }; 263 const char32_t u32str[] = { 0x10437, 0x00 }; 264 const char32_t u32ref[] = { 0x10437, fs::path::preferred_separator, 0x10437, fs::path::preferred_separator, 0x10437, 0x00 }; 265 const char str[] = { char(0xf0), char(0x90), char(0x90), char(0xb7), 0x00 }; 266 { 267 fs::path p = fs::u8path(str) / u16str / u32str; 268 assert(p.u32string() == u32ref); 269 p = fs::u8path(str).append(u16str).append(u32str); 270 assert(p.u32string() == u32ref); 271 p = fs::u8path(str); 272 p /= u16str; 273 p /= u32str; 274 assert(p.u32string() == u32ref); 275 } 276 #if !defined(HAS_NO_WCHAR) && defined(__SIZEOF_WCHAR_T__) 277 #if __SIZEOF_WCHAR_T__ == 2 278 const wchar_t wstr[] = { 0xd801, 0xdc37, 0x00 }; 279 #else 280 const wchar_t wstr[] = { 0x10437, 0x00 }; 281 #endif 282 // Test conversion from wchar_t. 283 // libstdc++ doesn't define conversions from/to wchar_t outside of windows. 284 { 285 fs::path p = fs::path(u16str) / wstr / u32str; 286 assert(p.u32string() == u32ref); 287 p = fs::path(u16str).append(wstr).append(u32str); 288 assert(p.u32string() == u32ref); 289 p = fs::path(u16str); 290 p /= wstr; 291 p /= u32str; 292 assert(p.u32string() == u32ref); 293 } 294 #endif 295 } 296 297 static void test_concat() 298 { 299 const char16_t u16str[] = { 0xd801, 0xdc37, 0x00 }; 300 const char32_t u32str[] = { 0x10437, 0x00 }; 301 const char32_t u32ref[] = { 0x10437, 0x10437, 0x10437, 0x00 }; 302 const char str[] = { char(0xf0), char(0x90), char(0x90), char(0xb7), 0x00 }; 303 { 304 fs::path p = fs::u8path(str); 305 p += u16str; 306 p += u32str; 307 assert(p.u32string() == u32ref); 308 p = fs::u8path(str).concat(u16str).concat(u32str); 309 assert(p.u32string() == u32ref); 310 } 311 #if !defined(HAS_NO_WCHAR) && defined(__SIZEOF_WCHAR_T__) 312 #if __SIZEOF_WCHAR_T__ == 2 313 const wchar_t wstr[] = { 0xd801, 0xdc37, 0x00 }; 314 #else 315 const wchar_t wstr[] = { 0x10437, 0x00 }; 316 #endif 317 // Test conversion from wchar_t. 318 // libstdc++ doesn't define conversions from/to wchar_t outside of windows. 319 { 320 fs::path p = fs::path(u16str); 321 p += wstr; 322 p += u32str; 323 assert(p.u32string() == u32ref); 324 p = fs::path(u16str).concat(wstr).concat(u32str); 325 assert(p.u32string() == u32ref); 326 } 327 #endif 328 } 329 330 static void test_append_concat_narrow() 331 { 332 const char16_t u16str[] = { 0xe5, 0x00 }; 333 const char32_t u32ref_append[] = { 0xe5, fs::path::preferred_separator, 0xe5, 0x00 }; 334 const char32_t u32ref_concat[] = { 0xe5, 0xe5, 0x00 }; 335 336 #if TEST_STD_VER > 17 && defined(__cpp_lib_char8_t) 337 { 338 const char8_t u8str[] = { 0xc3, 0xa5, 0x00 }; 339 // In C++20, appends of a char8_t string is unambiguously treated as 340 // UTF-8. 341 fs::path p = fs::path(u16str) / u8str; 342 assert(p.u32string() == u32ref_append); 343 p = fs::path(u16str).append(u8str); 344 assert(p.u32string() == u32ref_append); 345 p = fs::path(u16str); 346 p /= u8str; 347 assert(p.u32string() == u32ref_append); 348 p = fs::path(u16str).concat(u8str); 349 assert(p.u32string() == u32ref_concat); 350 p = fs::path(u16str); 351 p += u8str; 352 assert(p.u32string() == u32ref_concat); 353 } 354 #endif 355 #ifndef _WIN32 356 // Test appending a regular char-based string. On POSIX, this 357 // is implied to convert to/from UTF-8. 358 { 359 const char str[] = { char(0xc3), char(0xa5), 0x00 }; // UTF8, in a regular char string 360 fs::path p = fs::path(u16str) / str; 361 assert(p.u32string() == u32ref_append); 362 p = fs::path(u16str).append(str); 363 assert(p.u32string() == u32ref_append); 364 p = fs::path(u16str); 365 p /= str; 366 assert(p.u32string() == u32ref_append); 367 p = fs::path(u16str).concat(str); 368 assert(p.u32string() == u32ref_concat); 369 p = fs::path(u16str); 370 p += str; 371 assert(p.u32string() == u32ref_concat); 372 } 373 #else 374 SetFileApisToANSI(); 375 if (GetACP() == 1252) { 376 const char latin1[] = { char(0xe5), 0x00 }; 377 fs::path p = fs::path(u16str) / latin1; 378 assert(p.u32string() == u32ref_append); 379 p = fs::path(u16str).append(latin1); 380 assert(p.u32string() == u32ref_append); 381 p = fs::path(u16str); 382 p /= latin1; 383 assert(p.u32string() == u32ref_append); 384 p = fs::path(u16str).concat(latin1); 385 assert(p.u32string() == u32ref_concat); 386 p = fs::path(u16str); 387 p += latin1; 388 assert(p.u32string() == u32ref_concat); 389 } 390 SetFileApisToOEM(); 391 if (GetOEMCP() == 850 || GetOEMCP() == 437) { 392 // This chars is identical in both CP 850 and 437 393 const char cp850[] = { char(0x86), 0x00 }; 394 fs::path p = fs::path(u16str) / cp850; 395 assert(p.u32string() == u32ref_append); 396 p = fs::path(u16str).append(cp850); 397 assert(p.u32string() == u32ref_append); 398 p = fs::path(u16str); 399 p /= cp850; 400 assert(p.u32string() == u32ref_append); 401 p = fs::path(u16str).concat(cp850); 402 assert(p.u32string() == u32ref_concat); 403 p = fs::path(u16str); 404 p += cp850; 405 assert(p.u32string() == u32ref_concat); 406 } 407 #endif 408 } 409 410 int main(int, char**) 411 { 412 test_latin_unicode(); 413 test_wide_unicode(); 414 test_append(); 415 test_concat(); 416 test_append_concat_narrow(); 417 418 return 0; 419 } 420