1 //===-- StringPrinter.cpp -------------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "lldb/DataFormatters/StringPrinter.h"
10 
11 #include "lldb/Core/Debugger.h"
12 #include "lldb/Core/ValueObject.h"
13 #include "lldb/Target/Language.h"
14 #include "lldb/Target/Process.h"
15 #include "lldb/Target/Target.h"
16 #include "lldb/Utility/Status.h"
17 
18 #include "llvm/Support/ConvertUTF.h"
19 
20 #include <ctype.h>
21 #include <locale>
22 #include <memory>
23 
24 using namespace lldb;
25 using namespace lldb_private;
26 using namespace lldb_private::formatters;
27 
28 // we define this for all values of type but only implement it for those we
29 // care about that's good because we get linker errors for any unsupported type
30 template <lldb_private::formatters::StringPrinter::StringElementType type>
31 static StringPrinter::StringPrinterBufferPointer
32 GetPrintableImpl(uint8_t *buffer, uint8_t *buffer_end, uint8_t *&next);
33 
// Unicode-aware counterpart of isprint(): returns false for code points that
// fall in one of the non-printable groups listed below, true for everything
// else.
static bool isprint(char32_t codepoint) {
  const auto within = [codepoint](char32_t first, char32_t last) {
    return first <= codepoint && codepoint <= last;
  };

  // C0 controls, plus DEL.
  const bool is_c0_control = within(0x00, 0x1F) || codepoint == 0x7F;
  // C1 controls.
  const bool is_c1_control = within(0x80, 0x9F);
  // Line and paragraph separators.
  const bool is_separator = within(0x2028, 0x2029);
  // Bidirectional text control.
  const bool is_bidi_control =
      within(0x200E, 0x200F) || within(0x202A, 0x202E);
  // Interlinears and generally specials.
  const bool is_special = within(0xFFF9, 0xFFFF);

  return !(is_c0_control || is_c1_control || is_separator ||
           is_bidi_control || is_special);
}
61 
62 template <>
63 StringPrinter::StringPrinterBufferPointer
64 GetPrintableImpl<StringPrinter::StringElementType::ASCII>(uint8_t *buffer,
65                                                           uint8_t *buffer_end,
66                                                           uint8_t *&next) {
67   StringPrinter::StringPrinterBufferPointer retval = {nullptr};
68 
69   switch (*buffer) {
70   case 0:
71     retval = {"\\0", 2};
72     break;
73   case '\a':
74     retval = {"\\a", 2};
75     break;
76   case '\b':
77     retval = {"\\b", 2};
78     break;
79   case '\f':
80     retval = {"\\f", 2};
81     break;
82   case '\n':
83     retval = {"\\n", 2};
84     break;
85   case '\r':
86     retval = {"\\r", 2};
87     break;
88   case '\t':
89     retval = {"\\t", 2};
90     break;
91   case '\v':
92     retval = {"\\v", 2};
93     break;
94   case '\"':
95     retval = {"\\\"", 2};
96     break;
97   case '\\':
98     retval = {"\\\\", 2};
99     break;
100   default:
101     if (isprint(*buffer))
102       retval = {buffer, 1};
103     else {
104       uint8_t *data = new uint8_t[5];
105       sprintf((char *)data, "\\x%02x", *buffer);
106       retval = {data, 4, [](const uint8_t *c) { delete[] c; }};
107       break;
108     }
109   }
110 
111   next = buffer + 1;
112   return retval;
113 }
114 
// Decodes a two-byte UTF-8 sequence (lead byte c0, continuation byte c1) by
// removing the 0xC0 / 0x80 marker values and combining the remaining payloads.
// No validation is performed on either byte.
static char32_t ConvertUTF8ToCodePoint(unsigned char c0, unsigned char c1) {
  const int lead_payload = c0 - 0xC0;
  const int tail_payload = c1 - 0x80;
  return lead_payload * 64 + tail_payload;
}
// Decodes a three-byte UTF-8 sequence (lead byte c0, continuation bytes c1
// and c2) by removing the 0xE0 / 0x80 marker values and combining the
// remaining payloads. No validation is performed on any byte.
static char32_t ConvertUTF8ToCodePoint(unsigned char c0, unsigned char c1,
                                       unsigned char c2) {
  const int lead_payload = c0 - 0xE0;
  const int mid_payload = c1 - 0x80;
  const int tail_payload = c2 - 0x80;
  return lead_payload * 4096 + mid_payload * 64 + tail_payload;
}
// Decodes a four-byte UTF-8 sequence (lead byte c0, continuation bytes c1, c2
// and c3) by removing the 0xF0 / 0x80 marker values and combining the
// remaining payloads: 3 bits from c0 and 6 bits from each continuation byte.
// No validation is performed on any byte.
static char32_t ConvertUTF8ToCodePoint(unsigned char c0, unsigned char c1,
                                       unsigned char c2, unsigned char c3) {
  // c1 carries bits 12..17 of the code point; it must not be confused with
  // c2, which carries bits 6..11.
  return (c0 - 240) * 262144 + (c1 - 128) * 4096 + (c2 - 128) * 64 + (c3 - 128);
}
126 
127 template <>
128 StringPrinter::StringPrinterBufferPointer
129 GetPrintableImpl<StringPrinter::StringElementType::UTF8>(uint8_t *buffer,
130                                                          uint8_t *buffer_end,
131                                                          uint8_t *&next) {
132   StringPrinter::StringPrinterBufferPointer retval{nullptr};
133 
134   const unsigned utf8_encoded_len = llvm::getNumBytesForUTF8(*buffer);
135 
136   // If the utf8 encoded length is invalid, or if there aren't enough bytes to
137   // print, this is some kind of corrupted string.
138   if (utf8_encoded_len == 0 || utf8_encoded_len > 4)
139     return retval;
140   if ((buffer_end - buffer) < utf8_encoded_len)
141     // There's no room in the buffer for the utf8 sequence.
142     return retval;
143 
144   char32_t codepoint = 0;
145   switch (utf8_encoded_len) {
146   case 1:
147     // this is just an ASCII byte - ask ASCII
148     return GetPrintableImpl<StringPrinter::StringElementType::ASCII>(
149         buffer, buffer_end, next);
150   case 2:
151     codepoint = ConvertUTF8ToCodePoint((unsigned char)*buffer,
152                                        (unsigned char)*(buffer + 1));
153     break;
154   case 3:
155     codepoint = ConvertUTF8ToCodePoint((unsigned char)*buffer,
156                                        (unsigned char)*(buffer + 1),
157                                        (unsigned char)*(buffer + 2));
158     break;
159   case 4:
160     codepoint = ConvertUTF8ToCodePoint(
161         (unsigned char)*buffer, (unsigned char)*(buffer + 1),
162         (unsigned char)*(buffer + 2), (unsigned char)*(buffer + 3));
163     break;
164   }
165 
166   if (codepoint) {
167     switch (codepoint) {
168     case 0:
169       retval = {"\\0", 2};
170       break;
171     case '\a':
172       retval = {"\\a", 2};
173       break;
174     case '\b':
175       retval = {"\\b", 2};
176       break;
177     case '\f':
178       retval = {"\\f", 2};
179       break;
180     case '\n':
181       retval = {"\\n", 2};
182       break;
183     case '\r':
184       retval = {"\\r", 2};
185       break;
186     case '\t':
187       retval = {"\\t", 2};
188       break;
189     case '\v':
190       retval = {"\\v", 2};
191       break;
192     case '\"':
193       retval = {"\\\"", 2};
194       break;
195     case '\\':
196       retval = {"\\\\", 2};
197       break;
198     default:
199       if (isprint(codepoint))
200         retval = {buffer, utf8_encoded_len};
201       else {
202         uint8_t *data = new uint8_t[11];
203         sprintf((char *)data, "\\U%08x", (unsigned)codepoint);
204         retval = {data, 10, [](const uint8_t *c) { delete[] c; }};
205         break;
206       }
207     }
208 
209     next = buffer + utf8_encoded_len;
210     return retval;
211   }
212 
213   // We couldn't figure out how to print this string.
214   return retval;
215 }
216 
217 // Given a sequence of bytes, this function returns: a sequence of bytes to
218 // actually print out + a length the following unscanned position of the buffer
219 // is in next
220 static StringPrinter::StringPrinterBufferPointer
221 GetPrintable(StringPrinter::StringElementType type, uint8_t *buffer,
222              uint8_t *buffer_end, uint8_t *&next) {
223   if (!buffer || buffer >= buffer_end)
224     return {nullptr};
225 
226   switch (type) {
227   case StringPrinter::StringElementType::ASCII:
228     return GetPrintableImpl<StringPrinter::StringElementType::ASCII>(
229         buffer, buffer_end, next);
230   case StringPrinter::StringElementType::UTF8:
231     return GetPrintableImpl<StringPrinter::StringElementType::UTF8>(
232         buffer, buffer_end, next);
233   default:
234     return {nullptr};
235   }
236 }
237 
238 StringPrinter::EscapingHelper
239 StringPrinter::GetDefaultEscapingHelper(GetPrintableElementType elem_type) {
240   switch (elem_type) {
241   case GetPrintableElementType::UTF8:
242     return [](uint8_t *buffer, uint8_t *buffer_end,
243               uint8_t *&next) -> StringPrinter::StringPrinterBufferPointer {
244       return GetPrintable(StringPrinter::StringElementType::UTF8, buffer,
245                           buffer_end, next);
246     };
247   case GetPrintableElementType::ASCII:
248     return [](uint8_t *buffer, uint8_t *buffer_end,
249               uint8_t *&next) -> StringPrinter::StringPrinterBufferPointer {
250       return GetPrintable(StringPrinter::StringElementType::ASCII, buffer,
251                           buffer_end, next);
252     };
253   }
254   llvm_unreachable("bad element type");
255 }
256 
257 // use this call if you already have an LLDB-side buffer for the data
258 template <typename SourceDataType>
259 static bool DumpUTFBufferToStream(
260     llvm::ConversionResult (*ConvertFunction)(const SourceDataType **,
261                                               const SourceDataType *,
262                                               llvm::UTF8 **, llvm::UTF8 *,
263                                               llvm::ConversionFlags),
264     const StringPrinter::ReadBufferAndDumpToStreamOptions &dump_options) {
265   Stream &stream(*dump_options.GetStream());
266   if (dump_options.GetPrefixToken() != nullptr)
267     stream.Printf("%s", dump_options.GetPrefixToken());
268   if (dump_options.GetQuote() != 0)
269     stream.Printf("%c", dump_options.GetQuote());
270   auto data(dump_options.GetData());
271   auto source_size(dump_options.GetSourceSize());
272   if (data.GetByteSize() && data.GetDataStart() && data.GetDataEnd()) {
273     const int bufferSPSize = data.GetByteSize();
274     if (dump_options.GetSourceSize() == 0) {
275       const int origin_encoding = 8 * sizeof(SourceDataType);
276       source_size = bufferSPSize / (origin_encoding / 4);
277     }
278 
279     const SourceDataType *data_ptr =
280         (const SourceDataType *)data.GetDataStart();
281     const SourceDataType *data_end_ptr = data_ptr + source_size;
282 
283     const bool zero_is_terminator = dump_options.GetBinaryZeroIsTerminator();
284 
285     if (zero_is_terminator) {
286       while (data_ptr < data_end_ptr) {
287         if (!*data_ptr) {
288           data_end_ptr = data_ptr;
289           break;
290         }
291         data_ptr++;
292       }
293 
294       data_ptr = (const SourceDataType *)data.GetDataStart();
295     }
296 
297     lldb::DataBufferSP utf8_data_buffer_sp;
298     llvm::UTF8 *utf8_data_ptr = nullptr;
299     llvm::UTF8 *utf8_data_end_ptr = nullptr;
300 
301     if (ConvertFunction) {
302       utf8_data_buffer_sp =
303           std::make_shared<DataBufferHeap>(4 * bufferSPSize, 0);
304       utf8_data_ptr = (llvm::UTF8 *)utf8_data_buffer_sp->GetBytes();
305       utf8_data_end_ptr = utf8_data_ptr + utf8_data_buffer_sp->GetByteSize();
306       ConvertFunction(&data_ptr, data_end_ptr, &utf8_data_ptr,
307                       utf8_data_end_ptr, llvm::lenientConversion);
308       if (!zero_is_terminator)
309         utf8_data_end_ptr = utf8_data_ptr;
310       // needed because the ConvertFunction will change the value of the
311       // data_ptr.
312       utf8_data_ptr =
313           (llvm::UTF8 *)utf8_data_buffer_sp->GetBytes();
314     } else {
315       // just copy the pointers - the cast is necessary to make the compiler
316       // happy but this should only happen if we are reading UTF8 data
317       utf8_data_ptr = const_cast<llvm::UTF8 *>(
318           reinterpret_cast<const llvm::UTF8 *>(data_ptr));
319       utf8_data_end_ptr = const_cast<llvm::UTF8 *>(
320           reinterpret_cast<const llvm::UTF8 *>(data_end_ptr));
321     }
322 
323     const bool escape_non_printables = dump_options.GetEscapeNonPrintables();
324     lldb_private::formatters::StringPrinter::EscapingHelper escaping_callback;
325     if (escape_non_printables) {
326       if (Language *language = Language::FindPlugin(dump_options.GetLanguage()))
327         escaping_callback = language->GetStringPrinterEscapingHelper(
328             lldb_private::formatters::StringPrinter::GetPrintableElementType::
329                 UTF8);
330       else
331         escaping_callback =
332             lldb_private::formatters::StringPrinter::GetDefaultEscapingHelper(
333                 lldb_private::formatters::StringPrinter::
334                     GetPrintableElementType::UTF8);
335     }
336 
337     // since we tend to accept partial data (and even partially malformed data)
338     // we might end up with no NULL terminator before the end_ptr hence we need
339     // to take a slower route and ensure we stay within boundaries
340     for (; utf8_data_ptr < utf8_data_end_ptr;) {
341       if (zero_is_terminator && !*utf8_data_ptr)
342         break;
343 
344       if (escape_non_printables) {
345         uint8_t *next_data = nullptr;
346         auto printable =
347             escaping_callback(utf8_data_ptr, utf8_data_end_ptr, next_data);
348         auto printable_bytes = printable.GetBytes();
349         auto printable_size = printable.GetSize();
350 
351         // We failed to figure out how to print this string.
352         if (!printable_bytes || !next_data)
353           return false;
354 
355         for (unsigned c = 0; c < printable_size; c++)
356           stream.Printf("%c", *(printable_bytes + c));
357         utf8_data_ptr = (uint8_t *)next_data;
358       } else {
359         stream.Printf("%c", *utf8_data_ptr);
360         utf8_data_ptr++;
361       }
362     }
363   }
364   if (dump_options.GetQuote() != 0)
365     stream.Printf("%c", dump_options.GetQuote());
366   if (dump_options.GetSuffixToken() != nullptr)
367     stream.Printf("%s", dump_options.GetSuffixToken());
368   if (dump_options.GetIsTruncated())
369     stream.Printf("...");
370   return true;
371 }
372 
373 lldb_private::formatters::StringPrinter::ReadStringAndDumpToStreamOptions::
374     ReadStringAndDumpToStreamOptions(ValueObject &valobj)
375     : ReadStringAndDumpToStreamOptions() {
376   SetEscapeNonPrintables(
377       valobj.GetTargetSP()->GetDebugger().GetEscapeNonPrintables());
378 }
379 
380 lldb_private::formatters::StringPrinter::ReadBufferAndDumpToStreamOptions::
381     ReadBufferAndDumpToStreamOptions(ValueObject &valobj)
382     : ReadBufferAndDumpToStreamOptions() {
383   SetEscapeNonPrintables(
384       valobj.GetTargetSP()->GetDebugger().GetEscapeNonPrintables());
385 }
386 
387 lldb_private::formatters::StringPrinter::ReadBufferAndDumpToStreamOptions::
388     ReadBufferAndDumpToStreamOptions(
389         const ReadStringAndDumpToStreamOptions &options)
390     : ReadBufferAndDumpToStreamOptions() {
391   SetStream(options.GetStream());
392   SetPrefixToken(options.GetPrefixToken());
393   SetSuffixToken(options.GetSuffixToken());
394   SetQuote(options.GetQuote());
395   SetEscapeNonPrintables(options.GetEscapeNonPrintables());
396   SetBinaryZeroIsTerminator(options.GetBinaryZeroIsTerminator());
397   SetLanguage(options.GetLanguage());
398 }
399 
400 namespace lldb_private {
401 
402 namespace formatters {
403 
404 template <>
405 bool StringPrinter::ReadStringAndDumpToStream<
406     StringPrinter::StringElementType::ASCII>(
407     const ReadStringAndDumpToStreamOptions &options) {
408   assert(options.GetStream() && "need a Stream to print the string to");
409   Status my_error;
410 
411   ProcessSP process_sp(options.GetProcessSP());
412 
413   if (process_sp.get() == nullptr || options.GetLocation() == 0)
414     return false;
415 
416   size_t size;
417   const auto max_size = process_sp->GetTarget().GetMaximumSizeOfStringSummary();
418   bool is_truncated = false;
419 
420   if (options.GetSourceSize() == 0)
421     size = max_size;
422   else if (!options.GetIgnoreMaxLength()) {
423     size = options.GetSourceSize();
424     if (size > max_size) {
425       size = max_size;
426       is_truncated = true;
427     }
428   } else
429     size = options.GetSourceSize();
430 
431   lldb::DataBufferSP buffer_sp(new DataBufferHeap(size, 0));
432 
433   process_sp->ReadCStringFromMemory(
434       options.GetLocation(), (char *)buffer_sp->GetBytes(), size, my_error);
435 
436   if (my_error.Fail())
437     return false;
438 
439   const char *prefix_token = options.GetPrefixToken();
440   char quote = options.GetQuote();
441 
442   if (prefix_token != nullptr)
443     options.GetStream()->Printf("%s%c", prefix_token, quote);
444   else if (quote != 0)
445     options.GetStream()->Printf("%c", quote);
446 
447   uint8_t *data_end = buffer_sp->GetBytes() + buffer_sp->GetByteSize();
448 
449   const bool escape_non_printables = options.GetEscapeNonPrintables();
450   lldb_private::formatters::StringPrinter::EscapingHelper escaping_callback;
451   if (escape_non_printables) {
452     if (Language *language = Language::FindPlugin(options.GetLanguage()))
453       escaping_callback = language->GetStringPrinterEscapingHelper(
454           lldb_private::formatters::StringPrinter::GetPrintableElementType::
455               ASCII);
456     else
457       escaping_callback =
458           lldb_private::formatters::StringPrinter::GetDefaultEscapingHelper(
459               lldb_private::formatters::StringPrinter::GetPrintableElementType::
460                   ASCII);
461   }
462 
463   // since we tend to accept partial data (and even partially malformed data)
464   // we might end up with no NULL terminator before the end_ptr hence we need
465   // to take a slower route and ensure we stay within boundaries
466   for (uint8_t *data = buffer_sp->GetBytes(); *data && (data < data_end);) {
467     if (escape_non_printables) {
468       uint8_t *next_data = nullptr;
469       auto printable = escaping_callback(data, data_end, next_data);
470       auto printable_bytes = printable.GetBytes();
471       auto printable_size = printable.GetSize();
472 
473       // We failed to figure out how to print this string.
474       if (!printable_bytes || !next_data)
475         return false;
476 
477       for (unsigned c = 0; c < printable_size; c++)
478         options.GetStream()->Printf("%c", *(printable_bytes + c));
479       data = (uint8_t *)next_data;
480     } else {
481       options.GetStream()->Printf("%c", *data);
482       data++;
483     }
484   }
485 
486   const char *suffix_token = options.GetSuffixToken();
487 
488   if (suffix_token != nullptr)
489     options.GetStream()->Printf("%c%s", quote, suffix_token);
490   else if (quote != 0)
491     options.GetStream()->Printf("%c", quote);
492 
493   if (is_truncated)
494     options.GetStream()->Printf("...");
495 
496   return true;
497 }
498 
499 template <typename SourceDataType>
500 static bool ReadUTFBufferAndDumpToStream(
501     const StringPrinter::ReadStringAndDumpToStreamOptions &options,
502     llvm::ConversionResult (*ConvertFunction)(const SourceDataType **,
503                                               const SourceDataType *,
504                                               llvm::UTF8 **, llvm::UTF8 *,
505                                               llvm::ConversionFlags)) {
506   assert(options.GetStream() && "need a Stream to print the string to");
507 
508   if (options.GetLocation() == 0 ||
509       options.GetLocation() == LLDB_INVALID_ADDRESS)
510     return false;
511 
512   lldb::ProcessSP process_sp(options.GetProcessSP());
513 
514   if (!process_sp)
515     return false;
516 
517   const int type_width = sizeof(SourceDataType);
518   const int origin_encoding = 8 * type_width;
519   if (origin_encoding != 8 && origin_encoding != 16 && origin_encoding != 32)
520     return false;
521   // if not UTF8, I need a conversion function to return proper UTF8
522   if (origin_encoding != 8 && !ConvertFunction)
523     return false;
524 
525   if (!options.GetStream())
526     return false;
527 
528   uint32_t sourceSize = options.GetSourceSize();
529   bool needs_zero_terminator = options.GetNeedsZeroTermination();
530 
531   bool is_truncated = false;
532   const auto max_size = process_sp->GetTarget().GetMaximumSizeOfStringSummary();
533 
534   if (!sourceSize) {
535     sourceSize = max_size;
536     needs_zero_terminator = true;
537   } else if (!options.GetIgnoreMaxLength()) {
538     if (sourceSize > max_size) {
539       sourceSize = max_size;
540       is_truncated = true;
541     }
542   }
543 
544   const int bufferSPSize = sourceSize * type_width;
545 
546   lldb::DataBufferSP buffer_sp(new DataBufferHeap(bufferSPSize, 0));
547 
548   if (!buffer_sp->GetBytes())
549     return false;
550 
551   Status error;
552   char *buffer = reinterpret_cast<char *>(buffer_sp->GetBytes());
553 
554   if (needs_zero_terminator)
555     process_sp->ReadStringFromMemory(options.GetLocation(), buffer,
556                                      bufferSPSize, error, type_width);
557   else
558     process_sp->ReadMemoryFromInferior(options.GetLocation(),
559                                        (char *)buffer_sp->GetBytes(),
560                                        bufferSPSize, error);
561 
562   if (error.Fail()) {
563     options.GetStream()->Printf("unable to read data");
564     return true;
565   }
566 
567   DataExtractor data(buffer_sp, process_sp->GetByteOrder(),
568                      process_sp->GetAddressByteSize());
569 
570   StringPrinter::ReadBufferAndDumpToStreamOptions dump_options(options);
571   dump_options.SetData(data);
572   dump_options.SetSourceSize(sourceSize);
573   dump_options.SetIsTruncated(is_truncated);
574 
575   return DumpUTFBufferToStream(ConvertFunction, dump_options);
576 }
577 
578 template <>
579 bool StringPrinter::ReadStringAndDumpToStream<
580     StringPrinter::StringElementType::UTF8>(
581     const ReadStringAndDumpToStreamOptions &options) {
582   return ReadUTFBufferAndDumpToStream<llvm::UTF8>(options, nullptr);
583 }
584 
585 template <>
586 bool StringPrinter::ReadStringAndDumpToStream<
587     StringPrinter::StringElementType::UTF16>(
588     const ReadStringAndDumpToStreamOptions &options) {
589   return ReadUTFBufferAndDumpToStream<llvm::UTF16>(options,
590                                                    llvm::ConvertUTF16toUTF8);
591 }
592 
593 template <>
594 bool StringPrinter::ReadStringAndDumpToStream<
595     StringPrinter::StringElementType::UTF32>(
596     const ReadStringAndDumpToStreamOptions &options) {
597   return ReadUTFBufferAndDumpToStream<llvm::UTF32>(options,
598                                                    llvm::ConvertUTF32toUTF8);
599 }
600 
601 template <>
602 bool StringPrinter::ReadBufferAndDumpToStream<
603     StringPrinter::StringElementType::UTF8>(
604     const ReadBufferAndDumpToStreamOptions &options) {
605   assert(options.GetStream() && "need a Stream to print the string to");
606 
607   return DumpUTFBufferToStream<llvm::UTF8>(nullptr, options);
608 }
609 
610 template <>
611 bool StringPrinter::ReadBufferAndDumpToStream<
612     StringPrinter::StringElementType::ASCII>(
613     const ReadBufferAndDumpToStreamOptions &options) {
614   // treat ASCII the same as UTF8
615   // FIXME: can we optimize ASCII some more?
616   return ReadBufferAndDumpToStream<StringElementType::UTF8>(options);
617 }
618 
619 template <>
620 bool StringPrinter::ReadBufferAndDumpToStream<
621     StringPrinter::StringElementType::UTF16>(
622     const ReadBufferAndDumpToStreamOptions &options) {
623   assert(options.GetStream() && "need a Stream to print the string to");
624 
625   return DumpUTFBufferToStream(llvm::ConvertUTF16toUTF8, options);
626 }
627 
628 template <>
629 bool StringPrinter::ReadBufferAndDumpToStream<
630     StringPrinter::StringElementType::UTF32>(
631     const ReadBufferAndDumpToStreamOptions &options) {
632   assert(options.GetStream() && "need a Stream to print the string to");
633 
634   return DumpUTFBufferToStream(llvm::ConvertUTF32toUTF8, options);
635 }
636 
637 } // namespace formatters
638 
639 } // namespace lldb_private
640