1 //===-- StringPrinter.cpp -------------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "lldb/DataFormatters/StringPrinter.h"
10 
11 #include "lldb/Core/Debugger.h"
12 #include "lldb/Core/ValueObject.h"
13 #include "lldb/Target/Language.h"
14 #include "lldb/Target/Process.h"
15 #include "lldb/Target/Target.h"
16 #include "lldb/Utility/Status.h"
17 
18 #include "llvm/Support/ConvertUTF.h"
19 
20 #include <ctype.h>
21 #include <locale>
22 #include <memory>
23 
24 using namespace lldb;
25 using namespace lldb_private;
26 using namespace lldb_private::formatters;
27 using GetPrintableElementType = StringPrinter::GetPrintableElementType;
28 using StringElementType = StringPrinter::StringElementType;
29 
30 /// DecodedCharBuffer stores the decoded contents of a single character. It
31 /// avoids managing memory on the heap by copying decoded bytes into an in-line
32 /// buffer.
33 struct DecodedCharBuffer {
34   static constexpr unsigned MaxLength = 16;
35 
36 public:
37   DecodedCharBuffer(std::nullptr_t) {}
38 
39   DecodedCharBuffer(const uint8_t *bytes, size_t size) : m_size(size) {
40     if (size > MaxLength)
41       llvm_unreachable("unsupported length");
42     memcpy(m_data, bytes, size);
43   }
44 
45   DecodedCharBuffer(const char *bytes, size_t size)
46       : DecodedCharBuffer(reinterpret_cast<const uint8_t *>(bytes), size) {}
47 
48   const uint8_t *GetBytes() const { return m_data; }
49 
50   size_t GetSize() const { return m_size; }
51 
52 private:
53   size_t m_size = 0;
54   uint8_t m_data[MaxLength] = {0};
55 };
56 
/// Callback used while dumping a string with escaping enabled. It is invoked
/// with the current position in the buffer, the end of the buffer, and a
/// reference that receives the next position to scan; it returns the bytes
/// that should be printed for the character at the current position.
using EscapingHelper =
    std::function<DecodedCharBuffer(uint8_t *, uint8_t *, uint8_t *&)>;
59 
// This template is declared for every StringElementType but only specialized
// for the element types we care about. That is intentional: using it with an
// unsupported element type produces a linker error instead of silently
// compiling.
template <StringElementType type>
static DecodedCharBuffer
GetPrintableImpl(uint8_t *buffer, uint8_t *buffer_end, uint8_t *&next,
                 StringPrinter::EscapeStyle escape_style);
66 
// Unicode-aware counterpart of isprint(): returns false for the codepoint
// ranges listed below and true for everything else.
static bool isprint32(char32_t codepoint) {
  // C0 control characters, plus DEL.
  const bool is_c0_control = codepoint <= 0x1F || codepoint == 0x7F;
  // C1 control characters.
  const bool is_c1_control = codepoint >= 0x80 && codepoint <= 0x9F;
  // Line and paragraph separators.
  const bool is_separator = codepoint == 0x2028 || codepoint == 0x2029;
  // Bidirectional text control characters.
  const bool is_bidi_control = codepoint == 0x200E || codepoint == 0x200F ||
                               (codepoint >= 0x202A && codepoint <= 0x202E);
  // Interlinear annotation characters and the specials at the end of the BMP.
  const bool is_special = codepoint >= 0xFFF9 && codepoint <= 0xFFFF;

  return !(is_c0_control || is_c1_control || is_separator ||
           is_bidi_control || is_special);
}
94 
95 DecodedCharBuffer attemptASCIIEscape(char32_t c,
96                                      StringPrinter::EscapeStyle escape_style) {
97   const bool is_swift_escape_style =
98       escape_style == StringPrinter::EscapeStyle::Swift;
99   switch (c) {
100   case 0:
101     return {"\\0", 2};
102   case '\a':
103     return {"\\a", 2};
104   case '\b':
105     if (is_swift_escape_style)
106       return nullptr;
107     return {"\\b", 2};
108   case '\f':
109     if (is_swift_escape_style)
110       return nullptr;
111     return {"\\f", 2};
112   case '\n':
113     return {"\\n", 2};
114   case '\r':
115     return {"\\r", 2};
116   case '\t':
117     return {"\\t", 2};
118   case '\v':
119     if (is_swift_escape_style)
120       return nullptr;
121     return {"\\v", 2};
122   case '\"':
123     return {"\\\"", 2};
124   case '\'':
125     if (is_swift_escape_style)
126       return {"\\'", 2};
127     return nullptr;
128   case '\\':
129     return {"\\\\", 2};
130   }
131   return nullptr;
132 }
133 
134 template <>
135 DecodedCharBuffer GetPrintableImpl<StringElementType::ASCII>(
136     uint8_t *buffer, uint8_t *buffer_end, uint8_t *&next,
137     StringPrinter::EscapeStyle escape_style) {
138   // The ASCII helper always advances 1 byte at a time.
139   next = buffer + 1;
140 
141   DecodedCharBuffer retval = attemptASCIIEscape(*buffer, escape_style);
142   if (retval.GetSize())
143     return retval;
144   if (isprint(*buffer))
145     return {buffer, 1};
146 
147   unsigned escaped_len;
148   constexpr unsigned max_buffer_size = 7;
149   uint8_t data[max_buffer_size];
150   switch (escape_style) {
151   case StringPrinter::EscapeStyle::CXX:
152     // Prints 4 characters, then a \0 terminator.
153     escaped_len = sprintf((char *)data, "\\x%02x", *buffer);
154     break;
155   case StringPrinter::EscapeStyle::Swift:
156     // Prints up to 6 characters, then a \0 terminator.
157     escaped_len = sprintf((char *)data, "\\u{%x}", *buffer);
158     break;
159   }
160   lldbassert(escaped_len > 0 && "unknown string escape style");
161   return {data, escaped_len};
162 }
163 
/// Combine the two bytes of a 2-byte UTF8 sequence into a codepoint. The lead
/// byte contributes its value above 0xC0, the trailing byte its value above
/// 0x80; the bytes are not validated.
static char32_t ConvertUTF8ToCodePoint(unsigned char c0, unsigned char c1) {
  const int lead_part = (c0 - 0xC0) * 64;
  const int tail_part = c1 - 0x80;
  return lead_part + tail_part;
}
/// Combine the three bytes of a 3-byte UTF8 sequence into a codepoint. The
/// lead byte contributes its value above 0xE0, each trailing byte its value
/// above 0x80; the bytes are not validated.
static char32_t ConvertUTF8ToCodePoint(unsigned char c0, unsigned char c1,
                                       unsigned char c2) {
  const int high_part = (c0 - 0xE0) * 4096;
  const int middle_part = (c1 - 0x80) * 64;
  const int low_part = c2 - 0x80;
  return high_part + middle_part + low_part;
}
/// Combine the four bytes of a 4-byte UTF8 sequence into a codepoint. The lead
/// byte contributes its value above 0xF0 (bits 18 and up), and each of the
/// three trailing bytes contributes its value above 0x80 as the next lower
/// group of 6 bits; the bytes are not validated.
static char32_t ConvertUTF8ToCodePoint(unsigned char c0, unsigned char c1,
                                       unsigned char c2, unsigned char c3) {
  // Each trailing byte must be used exactly once: c1 supplies bits 12-17,
  // c2 bits 6-11 and c3 bits 0-5.
  return (c0 - 240) * 262144 + (c1 - 128) * 4096 + (c2 - 128) * 64 + (c3 - 128);
}
175 
176 template <>
177 DecodedCharBuffer GetPrintableImpl<StringElementType::UTF8>(
178     uint8_t *buffer, uint8_t *buffer_end, uint8_t *&next,
179     StringPrinter::EscapeStyle escape_style) {
180   const unsigned utf8_encoded_len = llvm::getNumBytesForUTF8(*buffer);
181 
182   // If the utf8 encoded length is invalid, or if there aren't enough bytes to
183   // print, this is some kind of corrupted string.
184   if (utf8_encoded_len == 0 || utf8_encoded_len > 4)
185     return nullptr;
186   if ((buffer_end - buffer) < utf8_encoded_len)
187     // There's no room in the buffer for the utf8 sequence.
188     return nullptr;
189 
190   char32_t codepoint = 0;
191   switch (utf8_encoded_len) {
192   case 1:
193     // this is just an ASCII byte - ask ASCII
194     return GetPrintableImpl<StringElementType::ASCII>(buffer, buffer_end, next,
195                                                       escape_style);
196   case 2:
197     codepoint = ConvertUTF8ToCodePoint((unsigned char)*buffer,
198                                        (unsigned char)*(buffer + 1));
199     break;
200   case 3:
201     codepoint = ConvertUTF8ToCodePoint((unsigned char)*buffer,
202                                        (unsigned char)*(buffer + 1),
203                                        (unsigned char)*(buffer + 2));
204     break;
205   case 4:
206     codepoint = ConvertUTF8ToCodePoint(
207         (unsigned char)*buffer, (unsigned char)*(buffer + 1),
208         (unsigned char)*(buffer + 2), (unsigned char)*(buffer + 3));
209     break;
210   }
211 
212   // We couldn't figure out how to print this codepoint.
213   if (!codepoint)
214     return nullptr;
215 
216   // The UTF8 helper always advances by the utf8 encoded length.
217   next = buffer + utf8_encoded_len;
218   DecodedCharBuffer retval = attemptASCIIEscape(codepoint, escape_style);
219   if (retval.GetSize())
220     return retval;
221   if (isprint32(codepoint))
222     return {buffer, utf8_encoded_len};
223 
224   unsigned escaped_len;
225   constexpr unsigned max_buffer_size = 13;
226   uint8_t data[max_buffer_size];
227   switch (escape_style) {
228   case StringPrinter::EscapeStyle::CXX:
229     // Prints 10 characters, then a \0 terminator.
230     escaped_len = sprintf((char *)data, "\\U%08x", (unsigned)codepoint);
231     break;
232   case StringPrinter::EscapeStyle::Swift:
233     // Prints up to 12 characters, then a \0 terminator.
234     escaped_len = sprintf((char *)data, "\\u{%x}", (unsigned)codepoint);
235     break;
236   }
237   lldbassert(escaped_len > 0 && "unknown string escape style");
238   return {data, escaped_len};
239 }
240 
241 // Given a sequence of bytes, this function returns: a sequence of bytes to
242 // actually print out + a length the following unscanned position of the buffer
243 // is in next
244 static DecodedCharBuffer GetPrintable(StringElementType type, uint8_t *buffer,
245                                       uint8_t *buffer_end, uint8_t *&next,
246                                       StringPrinter::EscapeStyle escape_style) {
247   if (!buffer || buffer >= buffer_end)
248     return {nullptr};
249 
250   switch (type) {
251   case StringElementType::ASCII:
252     return GetPrintableImpl<StringElementType::ASCII>(buffer, buffer_end, next,
253                                                       escape_style);
254   case StringElementType::UTF8:
255     return GetPrintableImpl<StringElementType::UTF8>(buffer, buffer_end, next,
256                                                      escape_style);
257   default:
258     return {nullptr};
259   }
260 }
261 
262 static EscapingHelper
263 GetDefaultEscapingHelper(GetPrintableElementType elem_type,
264                          StringPrinter::EscapeStyle escape_style) {
265   switch (elem_type) {
266   case GetPrintableElementType::UTF8:
267   case GetPrintableElementType::ASCII:
268     return [escape_style, elem_type](uint8_t *buffer, uint8_t *buffer_end,
269                                      uint8_t *&next) -> DecodedCharBuffer {
270       return GetPrintable(elem_type == GetPrintableElementType::UTF8
271                               ? StringElementType::UTF8
272                               : StringElementType::ASCII,
273                           buffer, buffer_end, next, escape_style);
274     };
275   }
276   llvm_unreachable("bad element type");
277 }
278 
/// Read a string whose elements are of type \c SourceDataType from a
/// host-side LLDB buffer, then pretty-print it to a stream using \p style.
///
/// The output consists of the optional prefix token, the optional opening
/// quote, the string contents (escaped if requested), the optional closing
/// quote, the optional suffix token and, if the options mark the string as
/// truncated, a trailing "...".
///
/// \param style            Element type handed to the default escaping helper.
/// \param ConvertFunction  Converter from \c SourceDataType to UTF8, or null
///                         if the data is printed byte-by-byte as is.
/// \param dump_options     Stream, data and formatting options.
/// \return false if escaping was requested and a character could not be
///         decoded (output written so far is left in the stream); true
///         otherwise.
template <typename SourceDataType>
static bool DumpEncodedBufferToStream(
    GetPrintableElementType style,
    llvm::ConversionResult (*ConvertFunction)(const SourceDataType **,
                                              const SourceDataType *,
                                              llvm::UTF8 **, llvm::UTF8 *,
                                              llvm::ConversionFlags),
    const StringPrinter::ReadBufferAndDumpToStreamOptions &dump_options) {
  assert(dump_options.GetStream() && "need a Stream to print the string to");
  Stream &stream(*dump_options.GetStream());
  // Leading decorations: prefix token first, then the opening quote.
  if (dump_options.GetPrefixToken() != nullptr)
    stream.Printf("%s", dump_options.GetPrefixToken());
  if (dump_options.GetQuote() != 0)
    stream.Printf("%c", dump_options.GetQuote());
  auto data(dump_options.GetData());
  auto source_size(dump_options.GetSourceSize());
  // Only look at the contents if there is a non-empty, valid data range.
  if (data.GetByteSize() && data.GetDataStart() && data.GetDataEnd()) {
    const int bufferSPSize = data.GetByteSize();
    if (dump_options.GetSourceSize() == 0) {
      // No explicit element count was given: derive one from the byte size.
      // NOTE(review): origin_encoding / 4 equals 2 * sizeof(SourceDataType),
      // so this yields half the number of elements that fit in the buffer -
      // confirm that this is intended.
      const int origin_encoding = 8 * sizeof(SourceDataType);
      source_size = bufferSPSize / (origin_encoding / 4);
    }

    const SourceDataType *data_ptr =
        (const SourceDataType *)data.GetDataStart();
    const SourceDataType *data_end_ptr = data_ptr + source_size;

    const bool zero_is_terminator = dump_options.GetBinaryZeroIsTerminator();

    if (zero_is_terminator) {
      // Shrink the source range so that it ends at the first zero element.
      while (data_ptr < data_end_ptr) {
        if (!*data_ptr) {
          data_end_ptr = data_ptr;
          break;
        }
        data_ptr++;
      }

      // The scan above moved data_ptr; rewind it to the start of the data.
      data_ptr = (const SourceDataType *)data.GetDataStart();
    }

    lldb::DataBufferSP utf8_data_buffer_sp;
    llvm::UTF8 *utf8_data_ptr = nullptr;
    llvm::UTF8 *utf8_data_end_ptr = nullptr;

    if (ConvertFunction) {
      // Convert into a scratch buffer of four times the source byte size.
      utf8_data_buffer_sp =
          std::make_shared<DataBufferHeap>(4 * bufferSPSize, 0);
      utf8_data_ptr = (llvm::UTF8 *)utf8_data_buffer_sp->GetBytes();
      utf8_data_end_ptr = utf8_data_ptr + utf8_data_buffer_sp->GetByteSize();
      ConvertFunction(&data_ptr, data_end_ptr, &utf8_data_ptr,
                      utf8_data_end_ptr, llvm::lenientConversion);
      // When zero does not terminate the string, the end of the converted
      // data is wherever the conversion left the output cursor.
      if (!zero_is_terminator)
        utf8_data_end_ptr = utf8_data_ptr;
      // The ConvertFunction advanced utf8_data_ptr while writing its output,
      // so point it back at the start of the converted data.
      utf8_data_ptr =
          (llvm::UTF8 *)utf8_data_buffer_sp->GetBytes();
    } else {
      // just copy the pointers - the cast is necessary to make the compiler
      // happy but this should only happen if we are reading UTF8 data
      utf8_data_ptr = const_cast<llvm::UTF8 *>(
          reinterpret_cast<const llvm::UTF8 *>(data_ptr));
      utf8_data_end_ptr = const_cast<llvm::UTF8 *>(
          reinterpret_cast<const llvm::UTF8 *>(data_end_ptr));
    }

    // The escaping callback is only created when escaping is requested.
    const bool escape_non_printables = dump_options.GetEscapeNonPrintables();
    EscapingHelper escaping_callback;
    if (escape_non_printables)
      escaping_callback =
          GetDefaultEscapingHelper(style, dump_options.GetEscapeStyle());

    // since we tend to accept partial data (and even partially malformed data)
    // we might end up with no NULL terminator before the end_ptr hence we need
    // to take a slower route and ensure we stay within boundaries
    for (; utf8_data_ptr < utf8_data_end_ptr;) {
      if (zero_is_terminator && !*utf8_data_ptr)
        break;

      if (escape_non_printables) {
        // Let the callback decode one character; it reports how far to
        // advance through next_data.
        uint8_t *next_data = nullptr;
        auto printable =
            escaping_callback(utf8_data_ptr, utf8_data_end_ptr, next_data);
        auto printable_bytes = printable.GetBytes();
        auto printable_size = printable.GetSize();

        // We failed to figure out how to print this string.
        if (!printable_bytes || !next_data)
          return false;

        for (unsigned c = 0; c < printable_size; c++)
          stream.Printf("%c", *(printable_bytes + c));
        utf8_data_ptr = (uint8_t *)next_data;
      } else {
        // No escaping: emit the bytes one at a time, unchanged.
        stream.Printf("%c", *utf8_data_ptr);
        utf8_data_ptr++;
      }
    }
  }
  // Trailing decorations: closing quote, suffix token, truncation marker.
  if (dump_options.GetQuote() != 0)
    stream.Printf("%c", dump_options.GetQuote());
  if (dump_options.GetSuffixToken() != nullptr)
    stream.Printf("%s", dump_options.GetSuffixToken());
  if (dump_options.GetIsTruncated())
    stream.Printf("...");
  return true;
}
389 
390 lldb_private::formatters::StringPrinter::ReadStringAndDumpToStreamOptions::
391     ReadStringAndDumpToStreamOptions(ValueObject &valobj)
392     : ReadStringAndDumpToStreamOptions() {
393   SetEscapeNonPrintables(
394       valobj.GetTargetSP()->GetDebugger().GetEscapeNonPrintables());
395 }
396 
397 lldb_private::formatters::StringPrinter::ReadBufferAndDumpToStreamOptions::
398     ReadBufferAndDumpToStreamOptions(ValueObject &valobj)
399     : ReadBufferAndDumpToStreamOptions() {
400   SetEscapeNonPrintables(
401       valobj.GetTargetSP()->GetDebugger().GetEscapeNonPrintables());
402 }
403 
/// Build buffer-dumping options from string-reading options by copying the
/// settings the two share: stream, prefix and suffix tokens, quote character,
/// escaping of non-printables, zero-termination handling and escape style.
/// Everything else keeps its default-constructed value.
lldb_private::formatters::StringPrinter::ReadBufferAndDumpToStreamOptions::
    ReadBufferAndDumpToStreamOptions(
        const ReadStringAndDumpToStreamOptions &options)
    : ReadBufferAndDumpToStreamOptions() {
  SetStream(options.GetStream());
  SetPrefixToken(options.GetPrefixToken());
  SetSuffixToken(options.GetSuffixToken());
  SetQuote(options.GetQuote());
  SetEscapeNonPrintables(options.GetEscapeNonPrintables());
  SetBinaryZeroIsTerminator(options.GetBinaryZeroIsTerminator());
  SetEscapeStyle(options.GetEscapeStyle());
}
416 
417 namespace lldb_private {
418 
419 namespace formatters {
420 
421 template <typename SourceDataType>
422 static bool ReadEncodedBufferAndDumpToStream(
423     StringElementType elem_type,
424     const StringPrinter::ReadStringAndDumpToStreamOptions &options,
425     llvm::ConversionResult (*ConvertFunction)(const SourceDataType **,
426                                               const SourceDataType *,
427                                               llvm::UTF8 **, llvm::UTF8 *,
428                                               llvm::ConversionFlags)) {
429   assert(options.GetStream() && "need a Stream to print the string to");
430   if (!options.GetStream())
431     return false;
432 
433   if (options.GetLocation() == 0 ||
434       options.GetLocation() == LLDB_INVALID_ADDRESS)
435     return false;
436 
437   lldb::ProcessSP process_sp(options.GetProcessSP());
438   if (!process_sp)
439     return false;
440 
441   constexpr int type_width = sizeof(SourceDataType);
442   constexpr int origin_encoding = 8 * type_width;
443   if (origin_encoding != 8 && origin_encoding != 16 && origin_encoding != 32)
444     return false;
445   // If not UTF8 or ASCII, conversion to UTF8 is necessary.
446   if (origin_encoding != 8 && !ConvertFunction)
447     return false;
448 
449   bool needs_zero_terminator = options.GetNeedsZeroTermination();
450 
451   bool is_truncated = false;
452   const auto max_size = process_sp->GetTarget().GetMaximumSizeOfStringSummary();
453 
454   uint32_t sourceSize;
455   if (elem_type == StringElementType::ASCII && !options.GetSourceSize()) {
456     // FIXME: The NSString formatter sets HasSourceSize(true) when the size is
457     // actually unknown, as well as SetBinaryZeroIsTerminator(false). IIUC the
458     // C++ formatter also sets SetBinaryZeroIsTerminator(false) when it doesn't
459     // mean to. I don't see how this makes sense: we should fix the formatters.
460     //
461     // Until then, the behavior that's expected for ASCII strings with unknown
462     // lengths is to read up to the max size and then null-terminate. Do that.
463     sourceSize = max_size;
464     needs_zero_terminator = true;
465   } else if (options.HasSourceSize()) {
466     sourceSize = options.GetSourceSize();
467     if (!options.GetIgnoreMaxLength()) {
468       if (sourceSize > max_size) {
469         sourceSize = max_size;
470         is_truncated = true;
471       }
472     }
473   } else {
474     sourceSize = max_size;
475     needs_zero_terminator = true;
476   }
477 
478   const int bufferSPSize = sourceSize * type_width;
479   lldb::DataBufferSP buffer_sp(new DataBufferHeap(bufferSPSize, 0));
480 
481   // Check if we got bytes. We never get any bytes if we have an empty
482   // string, but we still continue so that we end up actually printing
483   // an empty string ("").
484   if (sourceSize != 0 && !buffer_sp->GetBytes())
485     return false;
486 
487   Status error;
488   char *buffer = reinterpret_cast<char *>(buffer_sp->GetBytes());
489 
490   if (elem_type == StringElementType::ASCII)
491     process_sp->ReadCStringFromMemory(options.GetLocation(), buffer,
492                                       bufferSPSize, error);
493   else if (needs_zero_terminator)
494     process_sp->ReadStringFromMemory(options.GetLocation(), buffer,
495                                      bufferSPSize, error, type_width);
496   else
497     process_sp->ReadMemoryFromInferior(options.GetLocation(), buffer,
498                                        bufferSPSize, error);
499   if (error.Fail()) {
500     options.GetStream()->Printf("unable to read data");
501     return true;
502   }
503 
504   DataExtractor data(buffer_sp, process_sp->GetByteOrder(),
505                      process_sp->GetAddressByteSize());
506 
507   StringPrinter::ReadBufferAndDumpToStreamOptions dump_options(options);
508   dump_options.SetData(data);
509   dump_options.SetSourceSize(sourceSize);
510   dump_options.SetIsTruncated(is_truncated);
511   dump_options.SetNeedsZeroTermination(needs_zero_terminator);
512   if (needs_zero_terminator)
513     dump_options.SetBinaryZeroIsTerminator(true);
514 
515   GetPrintableElementType print_style = (elem_type == StringElementType::ASCII)
516                                             ? GetPrintableElementType::ASCII
517                                             : GetPrintableElementType::UTF8;
518   return DumpEncodedBufferToStream(print_style, ConvertFunction, dump_options);
519 }
520 
521 template <>
522 bool StringPrinter::ReadStringAndDumpToStream<StringElementType::UTF8>(
523     const ReadStringAndDumpToStreamOptions &options) {
524   return ReadEncodedBufferAndDumpToStream<llvm::UTF8>(StringElementType::UTF8,
525                                                       options, nullptr);
526 }
527 
528 template <>
529 bool StringPrinter::ReadStringAndDumpToStream<StringElementType::UTF16>(
530     const ReadStringAndDumpToStreamOptions &options) {
531   return ReadEncodedBufferAndDumpToStream<llvm::UTF16>(
532       StringElementType::UTF16, options, llvm::ConvertUTF16toUTF8);
533 }
534 
535 template <>
536 bool StringPrinter::ReadStringAndDumpToStream<StringElementType::UTF32>(
537     const ReadStringAndDumpToStreamOptions &options) {
538   return ReadEncodedBufferAndDumpToStream<llvm::UTF32>(
539       StringElementType::UTF32, options, llvm::ConvertUTF32toUTF8);
540 }
541 
542 template <>
543 bool StringPrinter::ReadStringAndDumpToStream<StringElementType::ASCII>(
544     const ReadStringAndDumpToStreamOptions &options) {
545   return ReadEncodedBufferAndDumpToStream<char>(StringElementType::ASCII,
546                                                 options, nullptr);
547 }
548 
549 template <>
550 bool StringPrinter::ReadBufferAndDumpToStream<StringElementType::UTF8>(
551     const ReadBufferAndDumpToStreamOptions &options) {
552   return DumpEncodedBufferToStream<llvm::UTF8>(GetPrintableElementType::UTF8,
553                                                nullptr, options);
554 }
555 
556 template <>
557 bool StringPrinter::ReadBufferAndDumpToStream<StringElementType::UTF16>(
558     const ReadBufferAndDumpToStreamOptions &options) {
559   return DumpEncodedBufferToStream(GetPrintableElementType::UTF8,
560                                    llvm::ConvertUTF16toUTF8, options);
561 }
562 
563 template <>
564 bool StringPrinter::ReadBufferAndDumpToStream<StringElementType::UTF32>(
565     const ReadBufferAndDumpToStreamOptions &options) {
566   return DumpEncodedBufferToStream(GetPrintableElementType::UTF8,
567                                    llvm::ConvertUTF32toUTF8, options);
568 }
569 
570 template <>
571 bool StringPrinter::ReadBufferAndDumpToStream<StringElementType::ASCII>(
572     const ReadBufferAndDumpToStreamOptions &options) {
573   // Treat ASCII the same as UTF8.
574   //
575   // FIXME: This is probably not the right thing to do (well, it's debatable).
576   // If an ASCII-encoded string happens to contain a sequence of invalid bytes
577   // that forms a valid UTF8 character, we'll print out that character. This is
578   // good if you're playing fast and loose with encodings (probably good for
579   // std::string users), but maybe not so good if you care about your string
580   // formatter respecting the semantics of your selected string encoding. In
581   // the latter case you'd want to see the character byte sequence ('\x..'), not
582   // the UTF8 character itself.
583   return ReadBufferAndDumpToStream<StringElementType::UTF8>(options);
584 }
585 
586 } // namespace formatters
587 
588 } // namespace lldb_private
589