1 //===-- StringPrinter.cpp -------------------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "lldb/DataFormatters/StringPrinter.h" 10 11 #include "lldb/Core/Debugger.h" 12 #include "lldb/Core/ValueObject.h" 13 #include "lldb/Target/Language.h" 14 #include "lldb/Target/Process.h" 15 #include "lldb/Target/Target.h" 16 #include "lldb/Utility/Status.h" 17 18 #include "llvm/Support/ConvertUTF.h" 19 20 #include <ctype.h> 21 #include <locale> 22 #include <memory> 23 24 using namespace lldb; 25 using namespace lldb_private; 26 using namespace lldb_private::formatters; 27 using GetPrintableElementType = StringPrinter::GetPrintableElementType; 28 using StringElementType = StringPrinter::StringElementType; 29 30 /// DecodedCharBuffer stores the decoded contents of a single character. It 31 /// avoids managing memory on the heap by copying decoded bytes into an in-line 32 /// buffer. 33 class DecodedCharBuffer { 34 public: 35 DecodedCharBuffer(std::nullptr_t) {} 36 37 DecodedCharBuffer(const uint8_t *bytes, size_t size) : m_size(size) { 38 if (size > MaxLength) 39 llvm_unreachable("unsupported length"); 40 memcpy(m_data, bytes, size); 41 } 42 43 DecodedCharBuffer(const char *bytes, size_t size) 44 : DecodedCharBuffer(reinterpret_cast<const uint8_t *>(bytes), size) {} 45 46 const uint8_t *GetBytes() const { return m_data; } 47 48 size_t GetSize() const { return m_size; } 49 50 private: 51 static constexpr unsigned MaxLength = 16; 52 53 size_t m_size = 0; 54 uint8_t m_data[MaxLength] = {0}; 55 }; 56 57 using EscapingHelper = 58 std::function<DecodedCharBuffer(uint8_t *, uint8_t *, uint8_t *&)>; 59 60 // we define this for all values of type but only implement it for those we 61 // care about that's good because we get linker errors for any unsupported type 62 template <StringElementType type> 63 static DecodedCharBuffer 64 GetPrintableImpl(uint8_t *buffer, uint8_t *buffer_end, uint8_t *&next, 65 StringPrinter::EscapeStyle escape_style); 66 67 // Mimic isprint() for Unicode codepoints. 68 static bool isprint32(char32_t codepoint) { 69 if (codepoint <= 0x1F || codepoint == 0x7F) // C0 70 { 71 return false; 72 } 73 if (codepoint >= 0x80 && codepoint <= 0x9F) // C1 74 { 75 return false; 76 } 77 if (codepoint == 0x2028 || codepoint == 0x2029) // line/paragraph separators 78 { 79 return false; 80 } 81 if (codepoint == 0x200E || codepoint == 0x200F || 82 (codepoint >= 0x202A && 83 codepoint <= 0x202E)) // bidirectional text control 84 { 85 return false; 86 } 87 if (codepoint >= 0xFFF9 && 88 codepoint <= 0xFFFF) // interlinears and generally specials 89 { 90 return false; 91 } 92 return true; 93 } 94 95 DecodedCharBuffer attemptASCIIEscape(char32_t c, 96 StringPrinter::EscapeStyle escape_style) { 97 const bool is_swift_escape_style = 98 escape_style == StringPrinter::EscapeStyle::Swift; 99 switch (c) { 100 case 0: 101 return {"\\0", 2}; 102 case '\a': 103 return {"\\a", 2}; 104 case '\b': 105 if (is_swift_escape_style) 106 return nullptr; 107 return {"\\b", 2}; 108 case '\f': 109 if (is_swift_escape_style) 110 return nullptr; 111 return {"\\f", 2}; 112 case '\n': 113 return {"\\n", 2}; 114 case '\r': 115 return {"\\r", 2}; 116 case '\t': 117 return {"\\t", 2}; 118 case '\v': 119 if (is_swift_escape_style) 120 return nullptr; 121 return {"\\v", 2}; 122 case '\"': 123 return {"\\\"", 2}; 124 case '\'': 125 if (is_swift_escape_style) 126 return {"\\'", 2}; 127 return nullptr; 128 case '\\': 129 return {"\\\\", 2}; 130 } 131 return nullptr; 132 } 133 134 template <> 135 DecodedCharBuffer GetPrintableImpl<StringElementType::ASCII>( 136 uint8_t *buffer, uint8_t *buffer_end, uint8_t *&next, 137 StringPrinter::EscapeStyle escape_style) { 138 // The ASCII helper always advances 1 byte at a time. 139 next = buffer + 1; 140 141 DecodedCharBuffer retval = attemptASCIIEscape(*buffer, escape_style); 142 if (retval.GetSize()) 143 return retval; 144 if (isprint(*buffer)) 145 return {buffer, 1}; 146 147 unsigned escaped_len; 148 constexpr unsigned max_buffer_size = 7; 149 uint8_t data[max_buffer_size]; 150 switch (escape_style) { 151 case StringPrinter::EscapeStyle::CXX: 152 // Prints 4 characters, then a \0 terminator. 153 escaped_len = sprintf((char *)data, "\\x%02x", *buffer); 154 break; 155 case StringPrinter::EscapeStyle::Swift: 156 // Prints up to 6 characters, then a \0 terminator. 157 escaped_len = sprintf((char *)data, "\\u{%x}", *buffer); 158 break; 159 } 160 lldbassert(escaped_len > 0 && "unknown string escape style"); 161 return {data, escaped_len}; 162 } 163 164 static char32_t ConvertUTF8ToCodePoint(unsigned char c0, unsigned char c1) { 165 return (c0 - 192) * 64 + (c1 - 128); 166 } 167 static char32_t ConvertUTF8ToCodePoint(unsigned char c0, unsigned char c1, 168 unsigned char c2) { 169 return (c0 - 224) * 4096 + (c1 - 128) * 64 + (c2 - 128); 170 } 171 static char32_t ConvertUTF8ToCodePoint(unsigned char c0, unsigned char c1, 172 unsigned char c2, unsigned char c3) { 173 return (c0 - 240) * 262144 + (c2 - 128) * 4096 + (c2 - 128) * 64 + (c3 - 128); 174 } 175 176 template <> 177 DecodedCharBuffer GetPrintableImpl<StringElementType::UTF8>( 178 uint8_t *buffer, uint8_t *buffer_end, uint8_t *&next, 179 StringPrinter::EscapeStyle escape_style) { 180 const unsigned utf8_encoded_len = llvm::getNumBytesForUTF8(*buffer); 181 182 // If the utf8 encoded length is invalid, or if there aren't enough bytes to 183 // print, this is some kind of corrupted string. 184 if (utf8_encoded_len == 0 || utf8_encoded_len > 4) 185 return nullptr; 186 if ((buffer_end - buffer) < utf8_encoded_len) 187 // There's no room in the buffer for the utf8 sequence. 188 return nullptr; 189 190 char32_t codepoint = 0; 191 switch (utf8_encoded_len) { 192 case 1: 193 // this is just an ASCII byte - ask ASCII 194 return GetPrintableImpl<StringElementType::ASCII>(buffer, buffer_end, next, 195 escape_style); 196 case 2: 197 codepoint = ConvertUTF8ToCodePoint((unsigned char)*buffer, 198 (unsigned char)*(buffer + 1)); 199 break; 200 case 3: 201 codepoint = ConvertUTF8ToCodePoint((unsigned char)*buffer, 202 (unsigned char)*(buffer + 1), 203 (unsigned char)*(buffer + 2)); 204 break; 205 case 4: 206 codepoint = ConvertUTF8ToCodePoint( 207 (unsigned char)*buffer, (unsigned char)*(buffer + 1), 208 (unsigned char)*(buffer + 2), (unsigned char)*(buffer + 3)); 209 break; 210 } 211 212 // We couldn't figure out how to print this codepoint. 213 if (!codepoint) 214 return nullptr; 215 216 // The UTF8 helper always advances by the utf8 encoded length. 217 next = buffer + utf8_encoded_len; 218 DecodedCharBuffer retval = attemptASCIIEscape(codepoint, escape_style); 219 if (retval.GetSize()) 220 return retval; 221 if (isprint32(codepoint)) 222 return {buffer, utf8_encoded_len}; 223 224 unsigned escaped_len; 225 constexpr unsigned max_buffer_size = 13; 226 uint8_t data[max_buffer_size]; 227 switch (escape_style) { 228 case StringPrinter::EscapeStyle::CXX: 229 // Prints 10 characters, then a \0 terminator. 230 escaped_len = sprintf((char *)data, "\\U%08x", (unsigned)codepoint); 231 break; 232 case StringPrinter::EscapeStyle::Swift: 233 // Prints up to 12 characters, then a \0 terminator. 234 escaped_len = sprintf((char *)data, "\\u{%x}", (unsigned)codepoint); 235 break; 236 } 237 lldbassert(escaped_len > 0 && "unknown string escape style"); 238 return {data, escaped_len}; 239 } 240 241 // Given a sequence of bytes, this function returns: a sequence of bytes to 242 // actually print out + a length the following unscanned position of the buffer 243 // is in next 244 static DecodedCharBuffer GetPrintable(StringElementType type, uint8_t *buffer, 245 uint8_t *buffer_end, uint8_t *&next, 246 StringPrinter::EscapeStyle escape_style) { 247 if (!buffer || buffer >= buffer_end) 248 return {nullptr}; 249 250 switch (type) { 251 case StringElementType::ASCII: 252 return GetPrintableImpl<StringElementType::ASCII>(buffer, buffer_end, next, 253 escape_style); 254 case StringElementType::UTF8: 255 return GetPrintableImpl<StringElementType::UTF8>(buffer, buffer_end, next, 256 escape_style); 257 default: 258 return {nullptr}; 259 } 260 } 261 262 static EscapingHelper 263 GetDefaultEscapingHelper(GetPrintableElementType elem_type, 264 StringPrinter::EscapeStyle escape_style) { 265 switch (elem_type) { 266 case GetPrintableElementType::UTF8: 267 case GetPrintableElementType::ASCII: 268 return [escape_style, elem_type](uint8_t *buffer, uint8_t *buffer_end, 269 uint8_t *&next) -> DecodedCharBuffer { 270 return GetPrintable(elem_type == GetPrintableElementType::UTF8 271 ? StringElementType::UTF8 272 : StringElementType::ASCII, 273 buffer, buffer_end, next, escape_style); 274 }; 275 } 276 llvm_unreachable("bad element type"); 277 } 278 279 /// Read a string encoded in accordance with \tparam SourceDataType from a 280 /// host-side LLDB buffer, then pretty-print it to a stream using \p style. 281 template <typename SourceDataType> 282 static bool DumpEncodedBufferToStream( 283 GetPrintableElementType style, 284 llvm::ConversionResult (*ConvertFunction)(const SourceDataType **, 285 const SourceDataType *, 286 llvm::UTF8 **, llvm::UTF8 *, 287 llvm::ConversionFlags), 288 const StringPrinter::ReadBufferAndDumpToStreamOptions &dump_options) { 289 assert(dump_options.GetStream() && "need a Stream to print the string to"); 290 Stream &stream(*dump_options.GetStream()); 291 if (dump_options.GetPrefixToken() != nullptr) 292 stream.Printf("%s", dump_options.GetPrefixToken()); 293 if (dump_options.GetQuote() != 0) 294 stream.Printf("%c", dump_options.GetQuote()); 295 auto data(dump_options.GetData()); 296 auto source_size(dump_options.GetSourceSize()); 297 if (data.GetByteSize() && data.GetDataStart() && data.GetDataEnd()) { 298 const int bufferSPSize = data.GetByteSize(); 299 if (dump_options.GetSourceSize() == 0) { 300 const int origin_encoding = 8 * sizeof(SourceDataType); 301 source_size = bufferSPSize / (origin_encoding / 4); 302 } 303 304 const SourceDataType *data_ptr = 305 (const SourceDataType *)data.GetDataStart(); 306 const SourceDataType *data_end_ptr = data_ptr + source_size; 307 308 const bool zero_is_terminator = dump_options.GetBinaryZeroIsTerminator(); 309 310 if (zero_is_terminator) { 311 while (data_ptr < data_end_ptr) { 312 if (!*data_ptr) { 313 data_end_ptr = data_ptr; 314 break; 315 } 316 data_ptr++; 317 } 318 319 data_ptr = (const SourceDataType *)data.GetDataStart(); 320 } 321 322 lldb::DataBufferSP utf8_data_buffer_sp; 323 llvm::UTF8 *utf8_data_ptr = nullptr; 324 llvm::UTF8 *utf8_data_end_ptr = nullptr; 325 326 if (ConvertFunction) { 327 utf8_data_buffer_sp = 328 std::make_shared<DataBufferHeap>(4 * bufferSPSize, 0); 329 utf8_data_ptr = (llvm::UTF8 *)utf8_data_buffer_sp->GetBytes(); 330 utf8_data_end_ptr = utf8_data_ptr + utf8_data_buffer_sp->GetByteSize(); 331 ConvertFunction(&data_ptr, data_end_ptr, &utf8_data_ptr, 332 utf8_data_end_ptr, llvm::lenientConversion); 333 if (!zero_is_terminator) 334 utf8_data_end_ptr = utf8_data_ptr; 335 // needed because the ConvertFunction will change the value of the 336 // data_ptr. 337 utf8_data_ptr = 338 (llvm::UTF8 *)utf8_data_buffer_sp->GetBytes(); 339 } else { 340 // just copy the pointers - the cast is necessary to make the compiler 341 // happy but this should only happen if we are reading UTF8 data 342 utf8_data_ptr = const_cast<llvm::UTF8 *>( 343 reinterpret_cast<const llvm::UTF8 *>(data_ptr)); 344 utf8_data_end_ptr = const_cast<llvm::UTF8 *>( 345 reinterpret_cast<const llvm::UTF8 *>(data_end_ptr)); 346 } 347 348 const bool escape_non_printables = dump_options.GetEscapeNonPrintables(); 349 EscapingHelper escaping_callback; 350 if (escape_non_printables) 351 escaping_callback = 352 GetDefaultEscapingHelper(style, dump_options.GetEscapeStyle()); 353 354 // since we tend to accept partial data (and even partially malformed data) 355 // we might end up with no NULL terminator before the end_ptr hence we need 356 // to take a slower route and ensure we stay within boundaries 357 for (; utf8_data_ptr < utf8_data_end_ptr;) { 358 if (zero_is_terminator && !*utf8_data_ptr) 359 break; 360 361 if (escape_non_printables) { 362 uint8_t *next_data = nullptr; 363 auto printable = 364 escaping_callback(utf8_data_ptr, utf8_data_end_ptr, next_data); 365 auto printable_bytes = printable.GetBytes(); 366 auto printable_size = printable.GetSize(); 367 368 // We failed to figure out how to print this string. 369 if (!printable_bytes || !next_data) 370 return false; 371 372 for (unsigned c = 0; c < printable_size; c++) 373 stream.Printf("%c", *(printable_bytes + c)); 374 utf8_data_ptr = (uint8_t *)next_data; 375 } else { 376 stream.Printf("%c", *utf8_data_ptr); 377 utf8_data_ptr++; 378 } 379 } 380 } 381 if (dump_options.GetQuote() != 0) 382 stream.Printf("%c", dump_options.GetQuote()); 383 if (dump_options.GetSuffixToken() != nullptr) 384 stream.Printf("%s", dump_options.GetSuffixToken()); 385 if (dump_options.GetIsTruncated()) 386 stream.Printf("..."); 387 return true; 388 } 389 390 lldb_private::formatters::StringPrinter::ReadStringAndDumpToStreamOptions:: 391 ReadStringAndDumpToStreamOptions(ValueObject &valobj) 392 : ReadStringAndDumpToStreamOptions() { 393 SetEscapeNonPrintables( 394 valobj.GetTargetSP()->GetDebugger().GetEscapeNonPrintables()); 395 } 396 397 lldb_private::formatters::StringPrinter::ReadBufferAndDumpToStreamOptions:: 398 ReadBufferAndDumpToStreamOptions(ValueObject &valobj) 399 : ReadBufferAndDumpToStreamOptions() { 400 SetEscapeNonPrintables( 401 valobj.GetTargetSP()->GetDebugger().GetEscapeNonPrintables()); 402 } 403 404 lldb_private::formatters::StringPrinter::ReadBufferAndDumpToStreamOptions:: 405 ReadBufferAndDumpToStreamOptions( 406 const ReadStringAndDumpToStreamOptions &options) 407 : ReadBufferAndDumpToStreamOptions() { 408 SetStream(options.GetStream()); 409 SetPrefixToken(options.GetPrefixToken()); 410 SetSuffixToken(options.GetSuffixToken()); 411 SetQuote(options.GetQuote()); 412 SetEscapeNonPrintables(options.GetEscapeNonPrintables()); 413 SetBinaryZeroIsTerminator(options.GetBinaryZeroIsTerminator()); 414 SetEscapeStyle(options.GetEscapeStyle()); 415 } 416 417 namespace lldb_private { 418 419 namespace formatters { 420 421 template <typename SourceDataType> 422 static bool ReadEncodedBufferAndDumpToStream( 423 StringElementType elem_type, 424 const StringPrinter::ReadStringAndDumpToStreamOptions &options, 425 llvm::ConversionResult (*ConvertFunction)(const SourceDataType **, 426 const SourceDataType *, 427 llvm::UTF8 **, llvm::UTF8 *, 428 llvm::ConversionFlags)) { 429 assert(options.GetStream() && "need a Stream to print the string to"); 430 if (!options.GetStream()) 431 return false; 432 433 if (options.GetLocation() == 0 || 434 options.GetLocation() == LLDB_INVALID_ADDRESS) 435 return false; 436 437 lldb::ProcessSP process_sp(options.GetProcessSP()); 438 if (!process_sp) 439 return false; 440 441 constexpr int type_width = sizeof(SourceDataType); 442 constexpr int origin_encoding = 8 * type_width; 443 if (origin_encoding != 8 && origin_encoding != 16 && origin_encoding != 32) 444 return false; 445 // If not UTF8 or ASCII, conversion to UTF8 is necessary. 446 if (origin_encoding != 8 && !ConvertFunction) 447 return false; 448 449 bool needs_zero_terminator = options.GetNeedsZeroTermination(); 450 451 bool is_truncated = false; 452 const auto max_size = process_sp->GetTarget().GetMaximumSizeOfStringSummary(); 453 454 uint32_t sourceSize; 455 if (elem_type == StringElementType::ASCII && !options.GetSourceSize()) { 456 // FIXME: The NSString formatter sets HasSourceSize(true) when the size is 457 // actually unknown, as well as SetBinaryZeroIsTerminator(false). IIUC the 458 // C++ formatter also sets SetBinaryZeroIsTerminator(false) when it doesn't 459 // mean to. I don't see how this makes sense: we should fix the formatters. 460 // 461 // Until then, the behavior that's expected for ASCII strings with unknown 462 // lengths is to read up to the max size and then null-terminate. Do that. 463 sourceSize = max_size; 464 needs_zero_terminator = true; 465 } else if (options.HasSourceSize()) { 466 sourceSize = options.GetSourceSize(); 467 if (!options.GetIgnoreMaxLength()) { 468 if (sourceSize > max_size) { 469 sourceSize = max_size; 470 is_truncated = true; 471 } 472 } 473 } else { 474 sourceSize = max_size; 475 needs_zero_terminator = true; 476 } 477 478 const int bufferSPSize = sourceSize * type_width; 479 lldb::DataBufferSP buffer_sp(new DataBufferHeap(bufferSPSize, 0)); 480 481 // Check if we got bytes. We never get any bytes if we have an empty 482 // string, but we still continue so that we end up actually printing 483 // an empty string (""). 484 if (sourceSize != 0 && !buffer_sp->GetBytes()) 485 return false; 486 487 Status error; 488 char *buffer = reinterpret_cast<char *>(buffer_sp->GetBytes()); 489 490 if (elem_type == StringElementType::ASCII) 491 process_sp->ReadCStringFromMemory(options.GetLocation(), buffer, 492 bufferSPSize, error); 493 else if (needs_zero_terminator) 494 process_sp->ReadStringFromMemory(options.GetLocation(), buffer, 495 bufferSPSize, error, type_width); 496 else 497 process_sp->ReadMemoryFromInferior(options.GetLocation(), buffer, 498 bufferSPSize, error); 499 if (error.Fail()) { 500 options.GetStream()->Printf("unable to read data"); 501 return true; 502 } 503 504 DataExtractor data(buffer_sp, process_sp->GetByteOrder(), 505 process_sp->GetAddressByteSize()); 506 507 StringPrinter::ReadBufferAndDumpToStreamOptions dump_options(options); 508 dump_options.SetData(data); 509 dump_options.SetSourceSize(sourceSize); 510 dump_options.SetIsTruncated(is_truncated); 511 dump_options.SetNeedsZeroTermination(needs_zero_terminator); 512 if (needs_zero_terminator) 513 dump_options.SetBinaryZeroIsTerminator(true); 514 515 GetPrintableElementType print_style = (elem_type == StringElementType::ASCII) 516 ? GetPrintableElementType::ASCII 517 : GetPrintableElementType::UTF8; 518 return DumpEncodedBufferToStream(print_style, ConvertFunction, dump_options); 519 } 520 521 template <> 522 bool StringPrinter::ReadStringAndDumpToStream<StringElementType::UTF8>( 523 const ReadStringAndDumpToStreamOptions &options) { 524 return ReadEncodedBufferAndDumpToStream<llvm::UTF8>(StringElementType::UTF8, 525 options, nullptr); 526 } 527 528 template <> 529 bool StringPrinter::ReadStringAndDumpToStream<StringElementType::UTF16>( 530 const ReadStringAndDumpToStreamOptions &options) { 531 return ReadEncodedBufferAndDumpToStream<llvm::UTF16>( 532 StringElementType::UTF16, options, llvm::ConvertUTF16toUTF8); 533 } 534 535 template <> 536 bool StringPrinter::ReadStringAndDumpToStream<StringElementType::UTF32>( 537 const ReadStringAndDumpToStreamOptions &options) { 538 return ReadEncodedBufferAndDumpToStream<llvm::UTF32>( 539 StringElementType::UTF32, options, llvm::ConvertUTF32toUTF8); 540 } 541 542 template <> 543 bool StringPrinter::ReadStringAndDumpToStream<StringElementType::ASCII>( 544 const ReadStringAndDumpToStreamOptions &options) { 545 return ReadEncodedBufferAndDumpToStream<char>(StringElementType::ASCII, 546 options, nullptr); 547 } 548 549 template <> 550 bool StringPrinter::ReadBufferAndDumpToStream<StringElementType::UTF8>( 551 const ReadBufferAndDumpToStreamOptions &options) { 552 return DumpEncodedBufferToStream<llvm::UTF8>(GetPrintableElementType::UTF8, 553 nullptr, options); 554 } 555 556 template <> 557 bool StringPrinter::ReadBufferAndDumpToStream<StringElementType::UTF16>( 558 const ReadBufferAndDumpToStreamOptions &options) { 559 return DumpEncodedBufferToStream(GetPrintableElementType::UTF8, 560 llvm::ConvertUTF16toUTF8, options); 561 } 562 563 template <> 564 bool StringPrinter::ReadBufferAndDumpToStream<StringElementType::UTF32>( 565 const ReadBufferAndDumpToStreamOptions &options) { 566 return DumpEncodedBufferToStream(GetPrintableElementType::UTF8, 567 llvm::ConvertUTF32toUTF8, options); 568 } 569 570 template <> 571 bool StringPrinter::ReadBufferAndDumpToStream<StringElementType::ASCII>( 572 const ReadBufferAndDumpToStreamOptions &options) { 573 // Treat ASCII the same as UTF8. 574 // 575 // FIXME: This is probably not the right thing to do (well, it's debatable). 576 // If an ASCII-encoded string happens to contain a sequence of invalid bytes 577 // that forms a valid UTF8 character, we'll print out that character. This is 578 // good if you're playing fast and loose with encodings (probably good for 579 // std::string users), but maybe not so good if you care about your string 580 // formatter respecting the semantics of your selected string encoding. In 581 // the latter case you'd want to see the character byte sequence ('\x..'), not 582 // the UTF8 character itself. 583 return ReadBufferAndDumpToStream<StringElementType::UTF8>(options); 584 } 585 586 } // namespace formatters 587 588 } // namespace lldb_private 589