1 // -*- C++ -*- 2 //===----------------------------------------------------------------------===// 3 // 4 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 5 // See https://llvm.org/LICENSE.txt for license information. 6 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 // 8 //===----------------------------------------------------------------------===// 9 10 #ifndef _LIBCPP___FORMAT_PARSER_STD_FORMAT_SPEC_H 11 #define _LIBCPP___FORMAT_PARSER_STD_FORMAT_SPEC_H 12 13 #include <__algorithm/find_if.h> 14 #include <__algorithm/min.h> 15 #include <__config> 16 #include <__debug> 17 #include <__format/format_arg.h> 18 #include <__format/format_error.h> 19 #include <__format/format_string.h> 20 #include <__variant/monostate.h> 21 #include <bit> 22 #include <concepts> 23 #include <cstdint> 24 #include <type_traits> 25 26 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) 27 # pragma GCC system_header 28 #endif 29 30 _LIBCPP_PUSH_MACROS 31 #include <__undef_macros> 32 33 _LIBCPP_BEGIN_NAMESPACE_STD 34 35 #if _LIBCPP_STD_VER > 17 36 37 // TODO FMT Remove this once we require compilers with proper C++20 support. 38 // If the compiler has no concepts support, the format header will be disabled. 39 // Without concepts support enable_if needs to be used and that too much effort 40 // to support compilers with partial C++20 support. 41 # if !defined(_LIBCPP_HAS_NO_CONCEPTS) 42 43 namespace __format_spec { 44 45 /** 46 * Contains the flags for the std-format-spec. 47 * 48 * Some format-options can only be used for specific C++types and may depend on 49 * the selected format-type. 50 * * The C++type filtering can be done using the proper policies for 51 * @ref __parser_std. 52 * * The format-type filtering needs to be done post parsing in the parser 53 * derived from @ref __parser_std. 54 */ 55 class _LIBCPP_TYPE_VIS _Flags { 56 public: 57 enum class _LIBCPP_ENUM_VIS _Alignment : uint8_t { 58 /** 59 * No alignment is set in the format string. 60 * 61 * Zero-padding is ignored when an alignment is selected. 62 * The default alignment depends on the selected format-type. 63 */ 64 __default, 65 __left, 66 __center, 67 __right 68 }; 69 enum class _LIBCPP_ENUM_VIS _Sign : uint8_t { 70 /** 71 * No sign is set in the format string. 72 * 73 * The sign isn't allowed for certain format-types. By using this value 74 * it's possible to detect whether or not the user explicitly set the sign 75 * flag. For formatting purposes it behaves the same as @ref __minus. 76 */ 77 __default, 78 __minus, 79 __plus, 80 __space 81 }; 82 83 _Alignment __alignment : 2 {_Alignment::__default}; 84 _Sign __sign : 2 {_Sign::__default}; 85 uint8_t __alternate_form : 1 {false}; 86 uint8_t __zero_padding : 1 {false}; 87 uint8_t __locale_specific_form : 1 {false}; 88 89 enum class _LIBCPP_ENUM_VIS _Type : uint8_t { 90 __default, 91 __string, 92 __binary_lower_case, 93 __binary_upper_case, 94 __octal, 95 __decimal, 96 __hexadecimal_lower_case, 97 __hexadecimal_upper_case, 98 __pointer, 99 __char, 100 __float_hexadecimal_lower_case, 101 __float_hexadecimal_upper_case, 102 __scientific_lower_case, 103 __scientific_upper_case, 104 __fixed_lower_case, 105 __fixed_upper_case, 106 __general_lower_case, 107 __general_upper_case 108 }; 109 110 _Type __type{_Type::__default}; 111 }; 112 113 namespace __detail { 114 template <class _CharT> 115 _LIBCPP_HIDE_FROM_ABI constexpr bool 116 __parse_alignment(_CharT __c, _Flags& __flags) noexcept { 117 switch (__c) { 118 case _CharT('<'): 119 __flags.__alignment = _Flags::_Alignment::__left; 120 return true; 121 122 case _CharT('^'): 123 __flags.__alignment = _Flags::_Alignment::__center; 124 return true; 125 126 case _CharT('>'): 127 __flags.__alignment = _Flags::_Alignment::__right; 128 return true; 129 } 130 return false; 131 } 132 } // namespace __detail 133 134 template <class _CharT> 135 class _LIBCPP_TEMPLATE_VIS __parser_fill_align { 136 public: 137 // TODO FMT The standard doesn't specify this character is a Unicode 138 // character. Validate what fmt and MSVC have implemented. 139 _CharT __fill{_CharT(' ')}; 140 141 protected: 142 _LIBCPP_HIDE_FROM_ABI constexpr const _CharT* 143 __parse(const _CharT* __begin, const _CharT* __end, _Flags& __flags) { 144 _LIBCPP_ASSERT(__begin != __end, 145 "When called with an empty input the function will cause " 146 "undefined behavior by evaluating data not in the input"); 147 if (__begin + 1 != __end) { 148 if (__detail::__parse_alignment(*(__begin + 1), __flags)) { 149 if (*__begin == _CharT('{') || *__begin == _CharT('}')) 150 __throw_format_error( 151 "The format-spec fill field contains an invalid character"); 152 __fill = *__begin; 153 return __begin + 2; 154 } 155 } 156 157 if (__detail::__parse_alignment(*__begin, __flags)) 158 return __begin + 1; 159 160 return __begin; 161 } 162 }; 163 164 template <class _CharT> 165 _LIBCPP_HIDE_FROM_ABI constexpr const _CharT* 166 __parse_sign(const _CharT* __begin, _Flags& __flags) noexcept { 167 switch (*__begin) { 168 case _CharT('-'): 169 __flags.__sign = _Flags::_Sign::__minus; 170 break; 171 case _CharT('+'): 172 __flags.__sign = _Flags::_Sign::__plus; 173 break; 174 case _CharT(' '): 175 __flags.__sign = _Flags::_Sign::__space; 176 break; 177 default: 178 return __begin; 179 } 180 return __begin + 1; 181 } 182 183 template <class _CharT> 184 _LIBCPP_HIDE_FROM_ABI constexpr const _CharT* 185 __parse_alternate_form(const _CharT* __begin, _Flags& __flags) noexcept { 186 if (*__begin == _CharT('#')) { 187 __flags.__alternate_form = true; 188 ++__begin; 189 } 190 191 return __begin; 192 } 193 194 template <class _CharT> 195 _LIBCPP_HIDE_FROM_ABI constexpr const _CharT* 196 __parse_zero_padding(const _CharT* __begin, _Flags& __flags) noexcept { 197 if (*__begin == _CharT('0')) { 198 __flags.__zero_padding = true; 199 ++__begin; 200 } 201 202 return __begin; 203 } 204 205 template <class _CharT> 206 _LIBCPP_HIDE_FROM_ABI constexpr __format::__parse_number_result< _CharT> 207 __parse_arg_id(const _CharT* __begin, const _CharT* __end, auto& __parse_ctx) { 208 // This function is a wrapper to call the real parser. But it does the 209 // validation for the pre-conditions and post-conditions. 210 if (__begin == __end) 211 __throw_format_error("End of input while parsing format-spec arg-id"); 212 213 __format::__parse_number_result __r = 214 __format::__parse_arg_id(__begin, __end, __parse_ctx); 215 216 if (__r.__ptr == __end || *__r.__ptr != _CharT('}')) 217 __throw_format_error("A format-spec arg-id should terminate at a '}'"); 218 219 ++__r.__ptr; 220 return __r; 221 } 222 223 template <class _Context> 224 _LIBCPP_HIDE_FROM_ABI constexpr uint32_t 225 __substitute_arg_id(basic_format_arg<_Context> __arg) { 226 return visit_format_arg( 227 [](auto __arg) -> uint32_t { 228 using _Type = decltype(__arg); 229 if constexpr (integral<_Type>) { 230 if constexpr (signed_integral<_Type>) { 231 if (__arg < 0) 232 __throw_format_error("A format-spec arg-id replacement shouldn't " 233 "have a negative value"); 234 } 235 236 using _CT = common_type_t<_Type, decltype(__format::__number_max)>; 237 if (static_cast<_CT>(__arg) > 238 static_cast<_CT>(__format::__number_max)) 239 __throw_format_error("A format-spec arg-id replacement exceeds " 240 "the maximum supported value"); 241 return __arg; 242 } else if constexpr (same_as<_Type, monostate>) 243 __throw_format_error("Argument index out of bounds"); 244 else 245 __throw_format_error("A format-spec arg-id replacement argument " 246 "isn't an integral type"); 247 }, 248 __arg); 249 } 250 251 class _LIBCPP_TYPE_VIS __parser_width { 252 public: 253 /** Contains a width or an arg-id. */ 254 uint32_t __width : 31 {0}; 255 /** Determines whether the value stored is a width or an arg-id. */ 256 uint32_t __width_as_arg : 1 {0}; 257 258 protected: 259 /** 260 * Does the supplied std-format-spec contain a width field? 261 * 262 * When the field isn't present there's no padding required. This can be used 263 * to optimize the formatting. 264 */ 265 constexpr bool __has_width_field() const noexcept { 266 return __width_as_arg || __width; 267 } 268 269 /** 270 * Does the supplied width field contain an arg-id? 271 * 272 * If @c true the formatter needs to call @ref __substitute_width_arg_id. 273 */ 274 constexpr bool __width_needs_substitution() const noexcept { 275 return __width_as_arg; 276 } 277 278 template <class _CharT> 279 _LIBCPP_HIDE_FROM_ABI constexpr const _CharT* 280 __parse(const _CharT* __begin, const _CharT* __end, auto& __parse_ctx) { 281 if (*__begin == _CharT('0')) 282 __throw_format_error( 283 "A format-spec width field shouldn't have a leading zero"); 284 285 if (*__begin == _CharT('{')) { 286 __format::__parse_number_result __r = 287 __parse_arg_id(++__begin, __end, __parse_ctx); 288 __width = __r.__value; 289 __width_as_arg = 1; 290 return __r.__ptr; 291 } 292 293 if (*__begin < _CharT('0') || *__begin > _CharT('9')) 294 return __begin; 295 296 __format::__parse_number_result __r = 297 __format::__parse_number(__begin, __end); 298 __width = __r.__value; 299 _LIBCPP_ASSERT(__width != 0, 300 "A zero value isn't allowed and should be impossible, " 301 "due to validations in this function"); 302 return __r.__ptr; 303 } 304 305 void _LIBCPP_HIDE_FROM_ABI constexpr __substitute_width_arg_id(auto __arg) { 306 _LIBCPP_ASSERT(__width_as_arg == 1, 307 "Substitute width called when no substitution is required"); 308 309 // The clearing of the flag isn't required but looks better when debugging 310 // the code. 311 __width_as_arg = 0; 312 __width = __substitute_arg_id(__arg); 313 if (__width == 0) 314 __throw_format_error( 315 "A format-spec width field replacement should have a positive value"); 316 } 317 }; 318 319 class _LIBCPP_TYPE_VIS __parser_precision { 320 public: 321 /** Contains a precision or an arg-id. */ 322 uint32_t __precision : 31 {__format::__number_max}; 323 /** 324 * Determines whether the value stored is a precision or an arg-id. 325 * 326 * @note Since @ref __precision == @ref __format::__number_max is a valid 327 * value, the default value contains an arg-id of INT32_MAX. (This number of 328 * arguments isn't supported by compilers.) This is used to detect whether 329 * the std-format-spec contains a precision field. 330 */ 331 uint32_t __precision_as_arg : 1 {1}; 332 333 protected: 334 /** 335 * Does the supplied std-format-spec contain a precision field? 336 * 337 * When the field isn't present there's no truncating required. This can be 338 * used to optimize the formatting. 339 */ 340 constexpr bool __has_precision_field() const noexcept { 341 342 return __precision_as_arg == 0 || // Contains a value? 343 __precision != __format::__number_max; // The arg-id is valid? 344 } 345 346 /** 347 * Does the supplied precision field contain an arg-id? 348 * 349 * If @c true the formatter needs to call @ref __substitute_precision_arg_id. 350 */ 351 constexpr bool __precision_needs_substitution() const noexcept { 352 return __precision_as_arg && __precision != __format::__number_max; 353 } 354 355 template <class _CharT> 356 _LIBCPP_HIDE_FROM_ABI constexpr const _CharT* 357 __parse(const _CharT* __begin, const _CharT* __end, auto& __parse_ctx) { 358 if (*__begin != _CharT('.')) 359 return __begin; 360 361 ++__begin; 362 if (__begin == __end) 363 __throw_format_error("End of input while parsing format-spec precision"); 364 365 if (*__begin == _CharT('0')) { 366 ++__begin; 367 if (__begin != __end && *__begin >= '0' && *__begin <= '9') 368 __throw_format_error( 369 "A format-spec precision field shouldn't have a leading zero"); 370 371 __precision = 0; 372 __precision_as_arg = 0; 373 return __begin; 374 } 375 376 if (*__begin == _CharT('{')) { 377 __format::__parse_number_result __arg_id = 378 __parse_arg_id(++__begin, __end, __parse_ctx); 379 _LIBCPP_ASSERT(__arg_id.__value != __format::__number_max, 380 "Unsupported number of arguments, since this number of " 381 "arguments is used a special value"); 382 __precision = __arg_id.__value; 383 return __arg_id.__ptr; 384 } 385 386 if (*__begin < _CharT('0') || *__begin > _CharT('9')) 387 __throw_format_error( 388 "The format-spec precision field doesn't contain a value or arg-id"); 389 390 __format::__parse_number_result __r = 391 __format::__parse_number(__begin, __end); 392 __precision = __r.__value; 393 __precision_as_arg = 0; 394 return __r.__ptr; 395 } 396 397 void _LIBCPP_HIDE_FROM_ABI constexpr __substitute_precision_arg_id( 398 auto __arg) { 399 _LIBCPP_ASSERT( 400 __precision_as_arg == 1 && __precision != __format::__number_max, 401 "Substitute precision called when no substitution is required"); 402 403 // The clearing of the flag isn't required but looks better when debugging 404 // the code. 405 __precision_as_arg = 0; 406 __precision = __substitute_arg_id(__arg); 407 } 408 }; 409 410 template <class _CharT> 411 _LIBCPP_HIDE_FROM_ABI constexpr const _CharT* 412 __parse_locale_specific_form(const _CharT* __begin, _Flags& __flags) noexcept { 413 if (*__begin == _CharT('L')) { 414 __flags.__locale_specific_form = true; 415 ++__begin; 416 } 417 418 return __begin; 419 } 420 421 template <class _CharT> 422 _LIBCPP_HIDE_FROM_ABI constexpr const _CharT* 423 __parse_type(const _CharT* __begin, _Flags& __flags) { 424 425 // Determines the type. It does not validate whether the selected type is 426 // valid. Most formatters have optional fields that are only allowed for 427 // certain types. These parsers need to do validation after the type has 428 // been parsed. So its easier to implement the validation for all types in 429 // the specific parse function. 430 switch (*__begin) { 431 case 'A': 432 __flags.__type = _Flags::_Type::__float_hexadecimal_upper_case; 433 break; 434 case 'B': 435 __flags.__type = _Flags::_Type::__binary_upper_case; 436 break; 437 case 'E': 438 __flags.__type = _Flags::_Type::__scientific_upper_case; 439 break; 440 case 'F': 441 __flags.__type = _Flags::_Type::__fixed_upper_case; 442 break; 443 case 'G': 444 __flags.__type = _Flags::_Type::__general_upper_case; 445 break; 446 case 'X': 447 __flags.__type = _Flags::_Type::__hexadecimal_upper_case; 448 break; 449 case 'a': 450 __flags.__type = _Flags::_Type::__float_hexadecimal_lower_case; 451 break; 452 case 'b': 453 __flags.__type = _Flags::_Type::__binary_lower_case; 454 break; 455 case 'c': 456 __flags.__type = _Flags::_Type::__char; 457 break; 458 case 'd': 459 __flags.__type = _Flags::_Type::__decimal; 460 break; 461 case 'e': 462 __flags.__type = _Flags::_Type::__scientific_lower_case; 463 break; 464 case 'f': 465 __flags.__type = _Flags::_Type::__fixed_lower_case; 466 break; 467 case 'g': 468 __flags.__type = _Flags::_Type::__general_lower_case; 469 break; 470 case 'o': 471 __flags.__type = _Flags::_Type::__octal; 472 break; 473 case 'p': 474 __flags.__type = _Flags::_Type::__pointer; 475 break; 476 case 's': 477 __flags.__type = _Flags::_Type::__string; 478 break; 479 case 'x': 480 __flags.__type = _Flags::_Type::__hexadecimal_lower_case; 481 break; 482 default: 483 return __begin; 484 } 485 return ++__begin; 486 } 487 488 /** 489 * The parser for the std-format-spec. 490 * 491 * [format.string.std]/1 specifies the std-format-spec: 492 * fill-and-align sign # 0 width precision L type 493 * 494 * All these fields are optional. Whether these fields can be used depend on: 495 * - The type supplied to the format string. 496 * E.g. A string never uses the sign field so the field may not be set. 497 * This constrain is validated by the parsers in this file. 498 * - The supplied value for the optional type field. 499 * E.g. A int formatted as decimal uses the sign field. 500 * When formatted as a char the sign field may no longer be set. 501 * This constrain isn't validated by the parsers in this file. 502 * 503 * The base classes are ordered to minimize the amount of padding. 504 * 505 * This implements the parser for the string types. 506 */ 507 template <class _CharT> 508 class _LIBCPP_TEMPLATE_VIS __parser_string 509 : public __parser_width, // provides __width(|as_arg) 510 public __parser_precision, // provides __precision(|as_arg) 511 public __parser_fill_align<_CharT>, // provides __fill and uses __flags 512 public _Flags // provides __flags 513 { 514 public: 515 using char_type = _CharT; 516 517 _LIBCPP_HIDE_FROM_ABI constexpr __parser_string() { 518 this->__alignment = _Flags::_Alignment::__left; 519 } 520 521 /** 522 * The low-level std-format-spec parse function. 523 * 524 * @pre __begin points at the beginning of the std-format-spec. This means 525 * directly after the ':'. 526 * @pre The std-format-spec parses the entire input, or the first unmatched 527 * character is a '}'. 528 * 529 * @returns The iterator pointing at the last parsed character. 530 */ 531 _LIBCPP_HIDE_FROM_ABI constexpr auto parse(auto& __parse_ctx) 532 -> decltype(__parse_ctx.begin()) { 533 auto __it = __parse(__parse_ctx); 534 __process_display_type(); 535 return __it; 536 } 537 538 private: 539 /** 540 * Parses the std-format-spec. 541 * 542 * @throws __throw_format_error When @a __parse_ctx contains an ill-formed 543 * std-format-spec. 544 * 545 * @returns An iterator to the end of input or point at the closing '}'. 546 */ 547 _LIBCPP_HIDE_FROM_ABI constexpr auto __parse(auto& __parse_ctx) 548 -> decltype(__parse_ctx.begin()) { 549 550 auto __begin = __parse_ctx.begin(); 551 auto __end = __parse_ctx.end(); 552 if (__begin == __end) 553 return __begin; 554 555 __begin = __parser_fill_align<_CharT>::__parse(__begin, __end, 556 static_cast<_Flags&>(*this)); 557 if (__begin == __end) 558 return __begin; 559 560 __begin = __parser_width::__parse(__begin, __end, __parse_ctx); 561 if (__begin == __end) 562 return __begin; 563 564 __begin = __parser_precision::__parse(__begin, __end, __parse_ctx); 565 if (__begin == __end) 566 return __begin; 567 568 __begin = __parse_type(__begin, static_cast<_Flags&>(*this)); 569 570 if (__begin != __end && *__begin != _CharT('}')) 571 __throw_format_error( 572 "The format-spec should consume the input or end with a '}'"); 573 574 return __begin; 575 } 576 577 /** Processes the parsed std-format-spec based on the parsed display type. */ 578 void _LIBCPP_HIDE_FROM_ABI constexpr __process_display_type() { 579 switch (this->__type) { 580 case _Flags::_Type::__default: 581 case _Flags::_Type::__string: 582 break; 583 584 default: 585 __throw_format_error("The format-spec type has a type not supported for " 586 "a string argument"); 587 } 588 } 589 }; 590 591 /** 592 * The parser for the std-format-spec. 593 * 594 * This implements the parser for the integral types. This includes the 595 * character type and boolean type. 596 * 597 * See @ref __parser_string. 598 */ 599 template <class _CharT> 600 class _LIBCPP_TEMPLATE_VIS __parser_integral 601 : public __parser_width, // provides __width(|as_arg) 602 public __parser_fill_align<_CharT>, // provides __fill and uses __flags 603 public _Flags // provides __flags 604 { 605 public: 606 using char_type = _CharT; 607 608 // TODO FMT This class probably doesn't need public member functions after 609 // format.string.std/std_format_spec_integral.pass.cpp has been retired. 610 611 /** 612 * The low-level std-format-spec parse function. 613 * 614 * @pre __begin points at the beginning of the std-format-spec. This means 615 * directly after the ':'. 616 * @pre The std-format-spec parses the entire input, or the first unmatched 617 * character is a '}'. 618 * 619 * @returns The iterator pointing at the last parsed character. 620 */ 621 _LIBCPP_HIDE_FROM_ABI constexpr auto parse(auto& __parse_ctx) 622 -> decltype(__parse_ctx.begin()) { 623 auto __begin = __parse_ctx.begin(); 624 auto __end = __parse_ctx.end(); 625 if (__begin == __end) 626 return __begin; 627 628 __begin = __parser_fill_align<_CharT>::__parse(__begin, __end, 629 static_cast<_Flags&>(*this)); 630 if (__begin == __end) 631 return __begin; 632 633 __begin = __parse_sign(__begin, static_cast<_Flags&>(*this)); 634 if (__begin == __end) 635 return __begin; 636 637 __begin = __parse_alternate_form(__begin, static_cast<_Flags&>(*this)); 638 if (__begin == __end) 639 return __begin; 640 641 __begin = __parse_zero_padding(__begin, static_cast<_Flags&>(*this)); 642 if (__begin == __end) 643 return __begin; 644 645 __begin = __parser_width::__parse(__begin, __end, __parse_ctx); 646 if (__begin == __end) 647 return __begin; 648 649 __begin = 650 __parse_locale_specific_form(__begin, static_cast<_Flags&>(*this)); 651 if (__begin == __end) 652 return __begin; 653 654 __begin = __parse_type(__begin, static_cast<_Flags&>(*this)); 655 656 if (__begin != __end && *__begin != _CharT('}')) 657 __throw_format_error( 658 "The format-spec should consume the input or end with a '}'"); 659 660 return __begin; 661 } 662 663 protected: 664 /** 665 * Handles the post-parsing updates for the integer types. 666 * 667 * Updates the zero-padding and alignment for integer types. 668 * 669 * [format.string.std]/13 670 * If the 0 character and an align option both appear, the 0 character is 671 * ignored. 672 * 673 * For the formatter a @ref __default alignment means zero-padding. Update 674 * the alignment based on parsed format string. 675 */ 676 _LIBCPP_HIDE_FROM_ABI constexpr void __handle_integer() noexcept { 677 this->__zero_padding &= this->__alignment == _Flags::_Alignment::__default; 678 if (!this->__zero_padding && 679 this->__alignment == _Flags::_Alignment::__default) 680 this->__alignment = _Flags::_Alignment::__right; 681 } 682 683 /** 684 * Handles the post-parsing updates for the character types. 685 * 686 * Sets the alignment and validates the format flags set for a character type. 687 * 688 * At the moment the validation for a character and a Boolean behave the 689 * same, but this may change in the future. 690 * Specifically at the moment the locale-specific form is allowed for the 691 * char output type, but it has no effect on the output. 692 */ 693 _LIBCPP_HIDE_FROM_ABI constexpr void __handle_char() { __handle_bool(); } 694 695 /** 696 * Handles the post-parsing updates for the Boolean types. 697 * 698 * Sets the alignment and validates the format flags set for a Boolean type. 699 */ 700 _LIBCPP_HIDE_FROM_ABI constexpr void __handle_bool() { 701 if (this->__sign != _Flags::_Sign::__default) 702 __throw_format_error("A sign field isn't allowed in this format-spec"); 703 704 if (this->__alternate_form) 705 __throw_format_error( 706 "An alternate form field isn't allowed in this format-spec"); 707 708 if (this->__zero_padding) 709 __throw_format_error( 710 "A zero-padding field isn't allowed in this format-spec"); 711 712 if (this->__alignment == _Flags::_Alignment::__default) 713 this->__alignment = _Flags::_Alignment::__left; 714 } 715 }; 716 717 // TODO FMT Add a parser for floating-point values. 718 // TODO FMT Add a parser for pointer values. 719 720 /** Helper struct returned from @ref __get_string_alignment. */ 721 template <class _CharT> 722 struct _LIBCPP_TEMPLATE_VIS __string_alignment { 723 /** Points beyond the last character to write to the output. */ 724 const _CharT* __last; 725 /** 726 * The estimated number of columns in the output or 0. 727 * 728 * Only when the output needs to be aligned it's required to know the exact 729 * number of columns in the output. So if the formatted output has only a 730 * minimum width the exact size isn't important. It's only important to know 731 * the minimum has been reached. The minimum width is the width specified in 732 * the format-spec. 733 * 734 * For example in this code @code std::format("{:10}", MyString); @endcode 735 * the width estimation can stop once the algorithm has determined the output 736 * width is 10 columns. 737 * 738 * So if: 739 * * @ref __align == @c true the @ref __size is the estimated number of 740 * columns required. 741 * * @ref __align == @c false the @ref __size is the estimated number of 742 * columns required or 0 when the estimation algorithm stopped prematurely. 743 */ 744 ptrdiff_t __size; 745 /** 746 * Does the output need to be aligned. 747 * 748 * When alignment is needed the output algorithm needs to add the proper 749 * padding. Else the output algorithm just needs to copy the input up to 750 * @ref __last. 751 */ 752 bool __align; 753 }; 754 755 #ifndef _LIBCPP_HAS_NO_UNICODE 756 namespace __detail { 757 758 /** 759 * Unicode column width estimates. 760 * 761 * Unicode can be stored in several formats: UTF-8, UTF-16, and UTF-32. 762 * Depending on format the relation between the number of code units stored and 763 * the number of output columns differs. The first relation is the number of 764 * code units forming a code point. (The text assumes the code units are 765 * unsigned.) 766 * - UTF-8 The number of code units is between one and four. The first 127 767 * Unicode code points match the ASCII character set. When the highest bit is 768 * set it means the code point has more than one code unit. 769 * - UTF-16: The number of code units is between 1 and 2. When the first 770 * code unit is in the range [0xd800,0xdfff) it means the code point uses two 771 * code units. 772 * - UTF-32: The number of code units is always one. 773 * 774 * The code point to the number of columns isn't well defined. The code uses the 775 * estimations defined in [format.string.std]/11. This list might change in the 776 * future. 777 * 778 * The algorithm of @ref __get_string_alignment uses two different scanners: 779 * - The simple scanner @ref __estimate_column_width_fast. This scanner assumes 780 * 1 code unit is 1 column. This scanner stops when it can't be sure the 781 * assumption is valid: 782 * - UTF-8 when the code point is encoded in more than 1 code unit. 783 * - UTF-16 and UTF-32 when the first multi-column code point is encountered. 784 * (The code unit's value is lower than 0xd800 so the 2 code unit encoding 785 * is irrelevant for this scanner.) 786 * Due to these assumptions the scanner is faster than the full scanner. It 787 * can process all text only containing ASCII. For UTF-16/32 it can process 788 * most (all?) European languages. (Note the set it can process might be 789 * reduced in the future, due to updates in the scanning rules.) 790 * - The full scanner @ref __estimate_column_width. This scanner, if needed, 791 * converts multiple code units into one code point then converts the code 792 * point to a column width. 793 * 794 * See also: 795 * - [format.string.general]/11 796 * - https://en.wikipedia.org/wiki/UTF-8#Encoding 797 * - https://en.wikipedia.org/wiki/UTF-16#U+D800_to_U+DFFF 798 */ 799 800 /** 801 * The first 2 column code point. 802 * 803 * This is the point where the fast UTF-16/32 scanner needs to stop processing. 804 */ 805 inline constexpr uint32_t __two_column_code_point = 0x1100; 806 807 /** Helper concept for an UTF-8 character type. */ 808 template <class _CharT> 809 concept __utf8_character = same_as<_CharT, char> || same_as<_CharT, char8_t>; 810 811 /** Helper concept for an UTF-16 character type. */ 812 template <class _CharT> 813 concept __utf16_character = (same_as<_CharT, wchar_t> && sizeof(wchar_t) == 2) || same_as<_CharT, char16_t>; 814 815 /** Helper concept for an UTF-32 character type. */ 816 template <class _CharT> 817 concept __utf32_character = (same_as<_CharT, wchar_t> && sizeof(wchar_t) == 4) || same_as<_CharT, char32_t>; 818 819 /** Helper concept for an UTF-16 or UTF-32 character type. */ 820 template <class _CharT> 821 concept __utf16_or_32_character = __utf16_character<_CharT> || __utf32_character<_CharT>; 822 823 /** 824 * Converts a code point to the column width. 825 * 826 * The estimations are conforming to [format.string.general]/11 827 * 828 * This version expects a value less than 0x1'0000, which is a 3-byte UTF-8 829 * character. 830 */ 831 _LIBCPP_HIDE_FROM_ABI inline constexpr int __column_width_3(uint32_t __c) noexcept { 832 _LIBCPP_ASSERT(__c < 0x1'0000, 833 "Use __column_width_4 or __column_width for larger values"); 834 835 // clang-format off 836 return 1 + (__c >= 0x1100 && (__c <= 0x115f || 837 (__c >= 0x2329 && (__c <= 0x232a || 838 (__c >= 0x2e80 && (__c <= 0x303e || 839 (__c >= 0x3040 && (__c <= 0xa4cf || 840 (__c >= 0xac00 && (__c <= 0xd7a3 || 841 (__c >= 0xf900 && (__c <= 0xfaff || 842 (__c >= 0xfe10 && (__c <= 0xfe19 || 843 (__c >= 0xfe30 && (__c <= 0xfe6f || 844 (__c >= 0xff00 && (__c <= 0xff60 || 845 (__c >= 0xffe0 && (__c <= 0xffe6 846 )))))))))))))))))))); 847 // clang-format on 848 } 849 850 /** 851 * @overload 852 * 853 * This version expects a value greater than or equal to 0x1'0000, which is a 854 * 4-byte UTF-8 character. 855 */ 856 _LIBCPP_HIDE_FROM_ABI inline constexpr int __column_width_4(uint32_t __c) noexcept { 857 _LIBCPP_ASSERT(__c >= 0x1'0000, 858 "Use __column_width_3 or __column_width for smaller values"); 859 860 // clang-format off 861 return 1 + (__c >= 0x1'f300 && (__c <= 0x1'f64f || 862 (__c >= 0x1'f900 && (__c <= 0x1'f9ff || 863 (__c >= 0x2'0000 && (__c <= 0x2'fffd || 864 (__c >= 0x3'0000 && (__c <= 0x3'fffd 865 )))))))); 866 // clang-format on 867 } 868 869 /** 870 * @overload 871 * 872 * The general case, accepting all values. 873 */ 874 _LIBCPP_HIDE_FROM_ABI inline constexpr int __column_width(uint32_t __c) noexcept { 875 if (__c < 0x1'0000) 876 return __column_width_3(__c); 877 878 return __column_width_4(__c); 879 } 880 881 /** 882 * Estimate the column width for the UTF-8 sequence using the fast algorithm. 883 */ 884 template <__utf8_character _CharT> 885 _LIBCPP_HIDE_FROM_ABI constexpr const _CharT* 886 __estimate_column_width_fast(const _CharT* __first, 887 const _CharT* __last) noexcept { 888 return _VSTD::find_if(__first, __last, 889 [](unsigned char __c) { return __c & 0x80; }); 890 } 891 892 /** 893 * @overload 894 * 895 * The implementation for UTF-16/32. 896 */ 897 template <__utf16_or_32_character _CharT> 898 _LIBCPP_HIDE_FROM_ABI constexpr const _CharT* 899 __estimate_column_width_fast(const _CharT* __first, 900 const _CharT* __last) noexcept { 901 return _VSTD::find_if(__first, __last, 902 [](uint32_t __c) { return __c >= 0x1100; }); 903 } 904 905 template <class _CharT> 906 struct _LIBCPP_TEMPLATE_VIS __column_width_result { 907 /** The number of output columns. */ 908 size_t __width; 909 /** 910 * The last parsed element. 911 * 912 * This limits the original output to fit in the wanted number of columns. 913 */ 914 const _CharT* __ptr; 915 }; 916 917 /** 918 * Small helper to determine the width of malformed Unicode. 919 * 920 * @note This function's only needed for UTF-8. During scanning UTF-8 there 921 * are multiple place where it can be detected that the Unicode is malformed. 922 * UTF-16 only requires 1 test and UTF-32 requires no testing. 923 */ 924 template <__utf8_character _CharT> 925 _LIBCPP_HIDE_FROM_ABI constexpr __column_width_result<_CharT> 926 __estimate_column_width_malformed(const _CharT* __first, const _CharT* __last, 927 size_t __maximum, size_t __result) noexcept { 928 size_t __size = __last - __first; 929 size_t __n = _VSTD::min(__size, __maximum); 930 return {__result + __n, __first + __n}; 931 } 932 933 /** 934 * Determines the number of output columns needed to render the input. 935 * 936 * @note When the scanner encounters malformed Unicode it acts as-if every code 937 * unit at the end of the input is one output column. It's expected the output 938 * terminal will replace these malformed code units with a one column 939 * replacement characters. 940 * 941 * @param __first Points to the first element of the input range. 942 * @param __last Points beyond the last element of the input range. 943 * @param __maximum The maximum number of output columns. The returned number 944 * of estimated output columns will not exceed this value. 945 */ 946 template <__utf8_character _CharT> 947 _LIBCPP_HIDE_FROM_ABI constexpr __column_width_result<_CharT> 948 __estimate_column_width(const _CharT* __first, const _CharT* __last, 949 size_t __maximum) noexcept { 950 size_t __result = 0; 951 952 while (__first != __last) { 953 // Based on the number of leading 1 bits the number of code units in the 954 // code point can be determined. See 955 // https://en.wikipedia.org/wiki/UTF-8#Encoding 956 switch (_VSTD::countl_one(static_cast<unsigned char>(*__first))) { 957 case 0: // 1-code unit encoding: all 1 column 958 ++__result; 959 ++__first; 960 break; 961 962 case 2: // 2-code unit encoding: all 1 column 963 // Malformed Unicode. 964 if (__last - __first < 2) [[unlikely]] 965 return __estimate_column_width_malformed(__first, __last, __maximum, 966 __result); 967 __first += 2; 968 ++__result; 969 break; 970 971 case 3: // 3-code unit encoding: either 1 or 2 columns 972 // Malformed Unicode. 973 if (__last - __first < 3) [[unlikely]] 974 return __estimate_column_width_malformed(__first, __last, __maximum, 975 __result); 976 { 977 uint32_t __c = static_cast<unsigned char>(*__first++) & 0x0f; 978 __c <<= 6; 979 __c |= static_cast<unsigned char>(*__first++) & 0x3f; 980 __c <<= 6; 981 __c |= static_cast<unsigned char>(*__first++) & 0x3f; 982 __result += __column_width_3(__c); 983 if (__result > __maximum) 984 return {__result - 2, __first - 3}; 985 } 986 break; 987 case 4: // 4-code unit encoding: either 1 or 2 columns 988 // Malformed Unicode. 989 if (__last - __first < 4) [[unlikely]] 990 return __estimate_column_width_malformed(__first, __last, __maximum, 991 __result); 992 { 993 uint32_t __c = static_cast<unsigned char>(*__first++) & 0x07; 994 __c <<= 6; 995 __c |= static_cast<unsigned char>(*__first++) & 0x3f; 996 __c <<= 6; 997 __c |= static_cast<unsigned char>(*__first++) & 0x3f; 998 __c <<= 6; 999 __c |= static_cast<unsigned char>(*__first++) & 0x3f; 1000 __result += __column_width_4(__c); 1001 if (__result > __maximum) 1002 return {__result - 2, __first - 4}; 1003 } 1004 break; 1005 default: 1006 // Malformed Unicode. 1007 return __estimate_column_width_malformed(__first, __last, __maximum, 1008 __result); 1009 } 1010 1011 if (__result >= __maximum) 1012 return {__result, __first}; 1013 } 1014 return {__result, __first}; 1015 } 1016 1017 template <__utf16_character _CharT> 1018 _LIBCPP_HIDE_FROM_ABI constexpr __column_width_result<_CharT> 1019 __estimate_column_width(const _CharT* __first, const _CharT* __last, 1020 size_t __maximum) noexcept { 1021 size_t __result = 0; 1022 1023 while (__first != __last) { 1024 uint32_t __c = *__first; 1025 // Is the code unit part of a surrogate pair? See 1026 // https://en.wikipedia.org/wiki/UTF-16#U+D800_to_U+DFFF 1027 if (__c >= 0xd800 && __c <= 0xDfff) { 1028 // Malformed Unicode. 1029 if (__last - __first < 2) [[unlikely]] 1030 return {__result + 1, __first + 1}; 1031 1032 __c -= 0xd800; 1033 __c <<= 10; 1034 __c += (*(__first + 1) - 0xdc00); 1035 __c += 0x10'000; 1036 1037 __result += __column_width_4(__c); 1038 if (__result > __maximum) 1039 return {__result - 2, __first}; 1040 __first += 2; 1041 } else { 1042 __result += __column_width_3(__c); 1043 if (__result > __maximum) 1044 return {__result - 2, __first}; 1045 ++__first; 1046 } 1047 1048 if (__result >= __maximum) 1049 return {__result, __first}; 1050 } 1051 1052 return {__result, __first}; 1053 } 1054 1055 template <__utf32_character _CharT> 1056 _LIBCPP_HIDE_FROM_ABI constexpr __column_width_result<_CharT> 1057 __estimate_column_width(const _CharT* __first, const _CharT* __last, 1058 size_t __maximum) noexcept { 1059 size_t __result = 0; 1060 1061 while (__first != __last) { 1062 wchar_t __c = *__first; 1063 __result += __column_width(__c); 1064 1065 if (__result > __maximum) 1066 return {__result - 2, __first}; 1067 1068 ++__first; 1069 if (__result >= __maximum) 1070 return {__result, __first}; 1071 } 1072 1073 return {__result, __first}; 1074 } 1075 1076 } // namespace __detail 1077 1078 template <class _CharT> 1079 _LIBCPP_HIDE_FROM_ABI constexpr __string_alignment<_CharT> 1080 __get_string_alignment(const _CharT* __first, const _CharT* __last, 1081 ptrdiff_t __width, ptrdiff_t __precision) noexcept { 1082 _LIBCPP_ASSERT(__width != 0 || __precision != -1, 1083 "The function has no effect and shouldn't be used"); 1084 1085 // TODO FMT There might be more optimizations possible: 1086 // If __precision == __format::__number_max and the encoding is: 1087 // * UTF-8 : 4 * (__last - __first) >= __width 1088 // * UTF-16 : 2 * (__last - __first) >= __width 1089 // * UTF-32 : (__last - __first) >= __width 1090 // In these cases it's certain the output is at least the requested width. 1091 // It's unknown how often this happens in practice. For now the improvement 1092 // isn't implemented. 1093 1094 /* 1095 * First assume there are no special Unicode code units in the input. 1096 * - Apply the precision (this may reduce the size of the input). When 1097 * __precison == -1 this step is omitted. 1098 * - Scan for special code units in the input. 1099 * If our assumption was correct the __pos will be at the end of the input. 1100 */ 1101 const ptrdiff_t __length = __last - __first; 1102 const _CharT* __limit = 1103 __first + 1104 (__precision == -1 ? __length : _VSTD::min(__length, __precision)); 1105 ptrdiff_t __size = __limit - __first; 1106 const _CharT* __pos = 1107 __detail::__estimate_column_width_fast(__first, __limit); 1108 1109 if (__pos == __limit) 1110 return {__limit, __size, __size < __width}; 1111 1112 /* 1113 * Our assumption was wrong, there are special Unicode code units. 1114 * The range [__first, __pos) contains a set of code units with the 1115 * following property: 1116 * Every _CharT in the range will be rendered in 1 column. 1117 * 1118 * If there's no maximum width and the parsed size already exceeds the 1119 * minimum required width. The real size isn't important. So bail out. 1120 */ 1121 if (__precision == -1 && (__pos - __first) >= __width) 1122 return {__last, 0, false}; 1123 1124 /* If there's a __precision, truncate the output to that width. */ 1125 ptrdiff_t __prefix = __pos - __first; 1126 if (__precision != -1) { 1127 _LIBCPP_ASSERT(__precision > __prefix, "Logic error."); 1128 auto __lengh_info = __detail::__estimate_column_width( 1129 __pos, __last, __precision - __prefix); 1130 __size = __lengh_info.__width + __prefix; 1131 return {__lengh_info.__ptr, __size, __size < __width}; 1132 } 1133 1134 /* Else use __width to determine the number of required padding characters. */ 1135 _LIBCPP_ASSERT(__width > __prefix, "Logic error."); 1136 /* 1137 * The column width is always one or two columns. For the precision the wanted 1138 * column width is the maximum, for the width it's the minimum. Using the 1139 * width estimation with its truncating behavior will result in the wrong 1140 * result in the following case: 1141 * - The last code unit processed requires two columns and exceeds the 1142 * maximum column width. 1143 * By increasing the __maximum by one avoids this issue. (It means it may 1144 * pass one code point more than required to determine the proper result; 1145 * that however isn't a problem for the algorithm.) 1146 */ 1147 size_t __maximum = 1 + __width - __prefix; 1148 auto __lengh_info = 1149 __detail::__estimate_column_width(__pos, __last, __maximum); 1150 if (__lengh_info.__ptr != __last) { 1151 // Consumed the width number of code units. The exact size of the string 1152 // is unknown. We only know we don't need to align the output. 1153 _LIBCPP_ASSERT(static_cast<ptrdiff_t>(__lengh_info.__width + __prefix) >= 1154 __width, 1155 "Logic error"); 1156 return {__last, 0, false}; 1157 } 1158 1159 __size = __lengh_info.__width + __prefix; 1160 return {__last, __size, __size < __width}; 1161 } 1162 #else // _LIBCPP_HAS_NO_UNICODE 1163 template <class _CharT> 1164 _LIBCPP_HIDE_FROM_ABI constexpr __string_alignment<_CharT> 1165 __get_string_alignment(const _CharT* __first, const _CharT* __last, 1166 ptrdiff_t __width, ptrdiff_t __precision) noexcept { 1167 const ptrdiff_t __length = __last - __first; 1168 const _CharT* __limit = 1169 __first + 1170 (__precision == -1 ? __length : _VSTD::min(__length, __precision)); 1171 ptrdiff_t __size = __limit - __first; 1172 return {__limit, __size, __size < __width}; 1173 } 1174 #endif // _LIBCPP_HAS_NO_UNICODE 1175 1176 } // namespace __format_spec 1177 1178 # endif // !defined(_LIBCPP_HAS_NO_CONCEPTS) 1179 1180 #endif //_LIBCPP_STD_VER > 17 1181 1182 _LIBCPP_END_NAMESPACE_STD 1183 1184 _LIBCPP_POP_MACROS 1185 1186 #endif // _LIBCPP___FORMAT_PARSER_STD_FORMAT_SPEC_H 1187