1 //===-- StringPrinter.cpp ----------------------------------------*- C++
2 //-*-===//
3 //
4 //                     The LLVM Compiler Infrastructure
5 //
6 // This file is distributed under the University of Illinois Open Source
7 // License. See LICENSE.TXT for details.
8 //
9 //===----------------------------------------------------------------------===//
10 
11 #include "lldb/DataFormatters/StringPrinter.h"
12 
13 #include "lldb/Core/Debugger.h"
14 #include "lldb/Core/Error.h"
15 #include "lldb/Core/ValueObject.h"
16 #include "lldb/Target/Language.h"
17 #include "lldb/Target/Process.h"
18 #include "lldb/Target/Target.h"
19 
20 #include "llvm/Support/ConvertUTF.h"
21 
22 #include <ctype.h>
23 #include <locale>
24 
25 using namespace lldb;
26 using namespace lldb_private;
27 using namespace lldb_private::formatters;
28 
// Primary template for the per-encoding escaping routine.
//
// It is declared for every StringElementType value but only specialized (in
// this file) for the encodings we care about, ASCII and UTF8. That is
// deliberate: using it with any unsupported element type produces a linker
// error instead of silently doing the wrong thing.
//
// Each specialization inspects the element starting at `buffer`, returns the
// bytes that should be printed for it, and stores in `next` the position at
// which scanning should resume.
template <lldb_private::formatters::StringPrinter::StringElementType type>
static StringPrinter::StringPrinterBufferPointer<>
GetPrintableImpl(uint8_t *buffer, uint8_t *buffer_end, uint8_t *&next);
35 
// Unicode-aware counterpart of isprint(): returns true when the code point may
// be emitted verbatim, false when it belongs to one of the non-printable
// classes listed below and therefore has to be escaped.
static bool isprint(char32_t codepoint) {
  // C0 control characters, plus DEL
  const bool is_c0_control = codepoint <= 0x1F || codepoint == 0x7F;
  // C1 control characters
  const bool is_c1_control = codepoint >= 0x80 && codepoint <= 0x9F;
  // line/paragraph separators
  const bool is_separator = codepoint == 0x2028 || codepoint == 0x2029;
  // bidirectional text control
  const bool is_bidi_control = codepoint == 0x200E || codepoint == 0x200F ||
                               (codepoint >= 0x202A && codepoint <= 0x202E);
  // interlinears and generally specials
  const bool is_special = codepoint >= 0xFFF9 && codepoint <= 0xFFFF;

  return !(is_c0_control || is_c1_control || is_separator || is_bidi_control ||
           is_special);
}
63 
64 template <>
65 StringPrinter::StringPrinterBufferPointer<>
66 GetPrintableImpl<StringPrinter::StringElementType::ASCII>(uint8_t *buffer,
67                                                           uint8_t *buffer_end,
68                                                           uint8_t *&next) {
69   StringPrinter::StringPrinterBufferPointer<> retval = {nullptr};
70 
71   switch (*buffer) {
72   case 0:
73     retval = {"\\0", 2};
74     break;
75   case '\a':
76     retval = {"\\a", 2};
77     break;
78   case '\b':
79     retval = {"\\b", 2};
80     break;
81   case '\f':
82     retval = {"\\f", 2};
83     break;
84   case '\n':
85     retval = {"\\n", 2};
86     break;
87   case '\r':
88     retval = {"\\r", 2};
89     break;
90   case '\t':
91     retval = {"\\t", 2};
92     break;
93   case '\v':
94     retval = {"\\v", 2};
95     break;
96   case '\"':
97     retval = {"\\\"", 2};
98     break;
99   case '\\':
100     retval = {"\\\\", 2};
101     break;
102   default:
103     if (isprint(*buffer))
104       retval = {buffer, 1};
105     else {
106       uint8_t *data = new uint8_t[5];
107       sprintf((char *)data, "\\x%02x", *buffer);
108       retval = {data, 4, [](const uint8_t *c) { delete[] c; }};
109       break;
110     }
111   }
112 
113   next = buffer + 1;
114   return retval;
115 }
116 
// Decodes a two-byte UTF-8 sequence (lead byte c0, continuation byte c1) into
// its code point. The bytes are not validated.
static char32_t ConvertUTF8ToCodePoint(unsigned char c0, unsigned char c1) {
  const int lead_payload = c0 - 0xC0;  // strip the two-byte lead marker
  const int trail_payload = c1 - 0x80; // strip the continuation marker
  return lead_payload * 64 + trail_payload;
}
// Decodes a three-byte UTF-8 sequence (lead byte c0, continuation bytes c1 and
// c2) into its code point. The bytes are not validated.
static char32_t ConvertUTF8ToCodePoint(unsigned char c0, unsigned char c1,
                                       unsigned char c2) {
  const int lead_payload = c0 - 0xE0; // strip the three-byte lead marker
  const int mid_payload = c1 - 0x80;  // strip the continuation marker
  const int low_payload = c2 - 0x80;
  return lead_payload * 4096 + mid_payload * 64 + low_payload;
}
// Decodes a four-byte UTF-8 sequence (lead byte c0, continuation bytes c1, c2
// and c3) into its code point. The bytes are not validated.
//
// Each continuation byte contributes six payload bits, so the weights are
// 2^18 for c0, 2^12 for c1, 2^6 for c2 and 2^0 for c3.
static char32_t ConvertUTF8ToCodePoint(unsigned char c0, unsigned char c1,
                                       unsigned char c2, unsigned char c3) {
  return (c0 - 240) * 262144 + (c1 - 128) * 4096 + (c2 - 128) * 64 + (c3 - 128);
}
128 
129 template <>
130 StringPrinter::StringPrinterBufferPointer<>
131 GetPrintableImpl<StringPrinter::StringElementType::UTF8>(uint8_t *buffer,
132                                                          uint8_t *buffer_end,
133                                                          uint8_t *&next) {
134   StringPrinter::StringPrinterBufferPointer<> retval{nullptr};
135 
136   unsigned utf8_encoded_len = getNumBytesForUTF8(*buffer);
137 
138   if (1 + buffer_end - buffer < utf8_encoded_len) {
139     // I don't have enough bytes - print whatever I have left
140     retval = {buffer, static_cast<size_t>(1 + buffer_end - buffer)};
141     next = buffer_end + 1;
142     return retval;
143   }
144 
145   char32_t codepoint = 0;
146   switch (utf8_encoded_len) {
147   case 1:
148     // this is just an ASCII byte - ask ASCII
149     return GetPrintableImpl<StringPrinter::StringElementType::ASCII>(
150         buffer, buffer_end, next);
151   case 2:
152     codepoint = ConvertUTF8ToCodePoint((unsigned char)*buffer,
153                                        (unsigned char)*(buffer + 1));
154     break;
155   case 3:
156     codepoint = ConvertUTF8ToCodePoint((unsigned char)*buffer,
157                                        (unsigned char)*(buffer + 1),
158                                        (unsigned char)*(buffer + 2));
159     break;
160   case 4:
161     codepoint = ConvertUTF8ToCodePoint(
162         (unsigned char)*buffer, (unsigned char)*(buffer + 1),
163         (unsigned char)*(buffer + 2), (unsigned char)*(buffer + 3));
164     break;
165   default:
166     // this is probably some bogus non-character thing
167     // just print it as-is and hope to sync up again soon
168     retval = {buffer, 1};
169     next = buffer + 1;
170     return retval;
171   }
172 
173   if (codepoint) {
174     switch (codepoint) {
175     case 0:
176       retval = {"\\0", 2};
177       break;
178     case '\a':
179       retval = {"\\a", 2};
180       break;
181     case '\b':
182       retval = {"\\b", 2};
183       break;
184     case '\f':
185       retval = {"\\f", 2};
186       break;
187     case '\n':
188       retval = {"\\n", 2};
189       break;
190     case '\r':
191       retval = {"\\r", 2};
192       break;
193     case '\t':
194       retval = {"\\t", 2};
195       break;
196     case '\v':
197       retval = {"\\v", 2};
198       break;
199     case '\"':
200       retval = {"\\\"", 2};
201       break;
202     case '\\':
203       retval = {"\\\\", 2};
204       break;
205     default:
206       if (isprint(codepoint))
207         retval = {buffer, utf8_encoded_len};
208       else {
209         uint8_t *data = new uint8_t[11];
210         sprintf((char *)data, "\\U%08x", (unsigned)codepoint);
211         retval = {data, 10, [](const uint8_t *c) { delete[] c; }};
212         break;
213       }
214     }
215 
216     next = buffer + utf8_encoded_len;
217     return retval;
218   }
219 
220   // this should not happen - but just in case.. try to resync at some point
221   retval = {buffer, 1};
222   next = buffer + 1;
223   return retval;
224 }
225 
226 // Given a sequence of bytes, this function returns:
227 // a sequence of bytes to actually print out + a length
228 // the following unscanned position of the buffer is in next
229 static StringPrinter::StringPrinterBufferPointer<>
230 GetPrintable(StringPrinter::StringElementType type, uint8_t *buffer,
231              uint8_t *buffer_end, uint8_t *&next) {
232   if (!buffer)
233     return {nullptr};
234 
235   switch (type) {
236   case StringPrinter::StringElementType::ASCII:
237     return GetPrintableImpl<StringPrinter::StringElementType::ASCII>(
238         buffer, buffer_end, next);
239   case StringPrinter::StringElementType::UTF8:
240     return GetPrintableImpl<StringPrinter::StringElementType::UTF8>(
241         buffer, buffer_end, next);
242   default:
243     return {nullptr};
244   }
245 }
246 
247 StringPrinter::EscapingHelper
248 StringPrinter::GetDefaultEscapingHelper(GetPrintableElementType elem_type) {
249   switch (elem_type) {
250   case GetPrintableElementType::UTF8:
251     return [](uint8_t *buffer, uint8_t *buffer_end,
252               uint8_t *&next) -> StringPrinter::StringPrinterBufferPointer<> {
253       return GetPrintable(StringPrinter::StringElementType::UTF8, buffer,
254                           buffer_end, next);
255     };
256   case GetPrintableElementType::ASCII:
257     return [](uint8_t *buffer, uint8_t *buffer_end,
258               uint8_t *&next) -> StringPrinter::StringPrinterBufferPointer<> {
259       return GetPrintable(StringPrinter::StringElementType::ASCII, buffer,
260                           buffer_end, next);
261     };
262   }
263   llvm_unreachable("bad element type");
264 }
265 
// use this call if you already have an LLDB-side buffer for the data
//
// Writes the string held in dump_options.GetData() to
// dump_options.GetStream(), in this order: optional prefix token, optional
// opening quote, the string contents, optional closing quote, optional suffix
// token, and "..." when the options say the data was truncated.
//
// ConvertFunction, when non-null, transcodes the SourceDataType elements to
// UTF-8 before printing; when null the data is printed as if it already were
// UTF-8. Always returns true.
template <typename SourceDataType>
static bool DumpUTFBufferToStream(
    ConversionResult (*ConvertFunction)(const SourceDataType **,
                                        const SourceDataType *, UTF8 **, UTF8 *,
                                        ConversionFlags),
    const StringPrinter::ReadBufferAndDumpToStreamOptions &dump_options) {
  Stream &stream(*dump_options.GetStream());
  if (dump_options.GetPrefixToken() != 0)
    stream.Printf("%s", dump_options.GetPrefixToken());
  if (dump_options.GetQuote() != 0)
    stream.Printf("%c", dump_options.GetQuote());
  auto data(dump_options.GetData());
  auto source_size(dump_options.GetSourceSize());
  // only look at the contents when there actually is a non-empty buffer
  if (data.GetByteSize() && data.GetDataStart() && data.GetDataEnd()) {
    const int bufferSPSize = data.GetByteSize();
    if (dump_options.GetSourceSize() == 0) {
      // no explicit element count was given - derive one from the buffer size
      // NOTE(review): origin_encoding / 4 equals 2 * sizeof(SourceDataType),
      // so this yields half the number of elements that fit in the buffer -
      // confirm whether that halving is intended.
      const int origin_encoding = 8 * sizeof(SourceDataType);
      source_size = bufferSPSize / (origin_encoding / 4);
    }

    const SourceDataType *data_ptr =
        (const SourceDataType *)data.GetDataStart();
    const SourceDataType *data_end_ptr = data_ptr + source_size;

    const bool zero_is_terminator = dump_options.GetBinaryZeroIsTerminator();

    if (zero_is_terminator) {
      // shrink the source range so that it ends at the first zero element
      while (data_ptr < data_end_ptr) {
        if (!*data_ptr) {
          data_end_ptr = data_ptr;
          break;
        }
        data_ptr++;
      }

      // rewind: the scan above moved data_ptr away from the start
      data_ptr = (const SourceDataType *)data.GetDataStart();
    }

    lldb::DataBufferSP utf8_data_buffer_sp;
    UTF8 *utf8_data_ptr = nullptr;
    UTF8 *utf8_data_end_ptr = nullptr;

    if (ConvertFunction) {
      // transcode into a scratch buffer four times the size of the source
      // buffer (presumably zero-filled, given the 0 passed to DataBufferHeap)
      utf8_data_buffer_sp.reset(new DataBufferHeap(4 * bufferSPSize, 0));
      utf8_data_ptr = (UTF8 *)utf8_data_buffer_sp->GetBytes();
      utf8_data_end_ptr = utf8_data_ptr + utf8_data_buffer_sp->GetByteSize();
      ConvertFunction(&data_ptr, data_end_ptr, &utf8_data_ptr,
                      utf8_data_end_ptr, lenientConversion);
      // when zero is not a terminator, print exactly what was converted;
      // otherwise keep the end of the scratch buffer as the limit and let the
      // print loop below stop at the first zero byte
      if (false == zero_is_terminator)
        utf8_data_end_ptr = utf8_data_ptr;
      utf8_data_ptr =
          (UTF8 *)utf8_data_buffer_sp->GetBytes(); // needed because the
                                                   // ConvertFunction will
                                                   // change the value of the
                                                   // data_ptr
    } else {
      // just copy the pointers - the cast is necessary to make the compiler
      // happy
      // but this should only happen if we are reading UTF8 data
      utf8_data_ptr =
          const_cast<UTF8 *>(reinterpret_cast<const UTF8 *>(data_ptr));
      utf8_data_end_ptr =
          const_cast<UTF8 *>(reinterpret_cast<const UTF8 *>(data_end_ptr));
    }

    // pick the escaping callback: the one provided by the language plugin for
    // dump_options.GetLanguage() if there is such a plugin, the default UTF8
    // one otherwise
    const bool escape_non_printables = dump_options.GetEscapeNonPrintables();
    lldb_private::formatters::StringPrinter::EscapingHelper escaping_callback;
    if (escape_non_printables) {
      if (Language *language = Language::FindPlugin(dump_options.GetLanguage()))
        escaping_callback = language->GetStringPrinterEscapingHelper(
            lldb_private::formatters::StringPrinter::GetPrintableElementType::
                UTF8);
      else
        escaping_callback =
            lldb_private::formatters::StringPrinter::GetDefaultEscapingHelper(
                lldb_private::formatters::StringPrinter::
                    GetPrintableElementType::UTF8);
    }

    // since we tend to accept partial data (and even partially malformed data)
    // we might end up with no NULL terminator before the end_ptr
    // hence we need to take a slower route and ensure we stay within boundaries
    for (; utf8_data_ptr < utf8_data_end_ptr;) {
      if (zero_is_terminator && !*utf8_data_ptr)
        break;

      if (escape_non_printables) {
        // the callback returns the bytes to print for the current element and
        // reports through next_data where the following element starts
        uint8_t *next_data = nullptr;
        auto printable =
            escaping_callback(utf8_data_ptr, utf8_data_end_ptr, next_data);
        auto printable_bytes = printable.GetBytes();
        auto printable_size = printable.GetSize();
        if (!printable_bytes || !next_data) {
          // GetPrintable() failed on us - print one byte in a desperate resync
          // attempt
          printable_bytes = utf8_data_ptr;
          printable_size = 1;
          next_data = utf8_data_ptr + 1;
        }
        for (unsigned c = 0; c < printable_size; c++)
          stream.Printf("%c", *(printable_bytes + c));
        utf8_data_ptr = (uint8_t *)next_data;
      } else {
        // no escaping requested: copy the bytes through one at a time
        stream.Printf("%c", *utf8_data_ptr);
        utf8_data_ptr++;
      }
    }
  }
  if (dump_options.GetQuote() != 0)
    stream.Printf("%c", dump_options.GetQuote());
  if (dump_options.GetSuffixToken() != 0)
    stream.Printf("%s", dump_options.GetSuffixToken());
  if (dump_options.GetIsTruncated())
    stream.Printf("...");
  return true;
}
383 
384 lldb_private::formatters::StringPrinter::ReadStringAndDumpToStreamOptions::
385     ReadStringAndDumpToStreamOptions(ValueObject &valobj)
386     : ReadStringAndDumpToStreamOptions() {
387   SetEscapeNonPrintables(
388       valobj.GetTargetSP()->GetDebugger().GetEscapeNonPrintables());
389 }
390 
391 lldb_private::formatters::StringPrinter::ReadBufferAndDumpToStreamOptions::
392     ReadBufferAndDumpToStreamOptions(ValueObject &valobj)
393     : ReadBufferAndDumpToStreamOptions() {
394   SetEscapeNonPrintables(
395       valobj.GetTargetSP()->GetDebugger().GetEscapeNonPrintables());
396 }
397 
398 lldb_private::formatters::StringPrinter::ReadBufferAndDumpToStreamOptions::
399     ReadBufferAndDumpToStreamOptions(
400         const ReadStringAndDumpToStreamOptions &options)
401     : ReadBufferAndDumpToStreamOptions() {
402   SetStream(options.GetStream());
403   SetPrefixToken(options.GetPrefixToken());
404   SetSuffixToken(options.GetSuffixToken());
405   SetQuote(options.GetQuote());
406   SetEscapeNonPrintables(options.GetEscapeNonPrintables());
407   SetBinaryZeroIsTerminator(options.GetBinaryZeroIsTerminator());
408   SetLanguage(options.GetLanguage());
409 }
410 
411 namespace lldb_private {
412 
413 namespace formatters {
414 
415 template <>
416 bool StringPrinter::ReadStringAndDumpToStream<
417     StringPrinter::StringElementType::ASCII>(
418     const ReadStringAndDumpToStreamOptions &options) {
419   assert(options.GetStream() && "need a Stream to print the string to");
420   Error my_error;
421 
422   ProcessSP process_sp(options.GetProcessSP());
423 
424   if (process_sp.get() == nullptr || options.GetLocation() == 0)
425     return false;
426 
427   size_t size;
428   const auto max_size = process_sp->GetTarget().GetMaximumSizeOfStringSummary();
429   bool is_truncated = false;
430 
431   if (options.GetSourceSize() == 0)
432     size = max_size;
433   else if (!options.GetIgnoreMaxLength()) {
434     size = options.GetSourceSize();
435     if (size > max_size) {
436       size = max_size;
437       is_truncated = true;
438     }
439   } else
440     size = options.GetSourceSize();
441 
442   lldb::DataBufferSP buffer_sp(new DataBufferHeap(size, 0));
443 
444   process_sp->ReadCStringFromMemory(
445       options.GetLocation(), (char *)buffer_sp->GetBytes(), size, my_error);
446 
447   if (my_error.Fail())
448     return false;
449 
450   const char *prefix_token = options.GetPrefixToken();
451   char quote = options.GetQuote();
452 
453   if (prefix_token != 0)
454     options.GetStream()->Printf("%s%c", prefix_token, quote);
455   else if (quote != 0)
456     options.GetStream()->Printf("%c", quote);
457 
458   uint8_t *data_end = buffer_sp->GetBytes() + buffer_sp->GetByteSize();
459 
460   const bool escape_non_printables = options.GetEscapeNonPrintables();
461   lldb_private::formatters::StringPrinter::EscapingHelper escaping_callback;
462   if (escape_non_printables) {
463     if (Language *language = Language::FindPlugin(options.GetLanguage()))
464       escaping_callback = language->GetStringPrinterEscapingHelper(
465           lldb_private::formatters::StringPrinter::GetPrintableElementType::
466               ASCII);
467     else
468       escaping_callback =
469           lldb_private::formatters::StringPrinter::GetDefaultEscapingHelper(
470               lldb_private::formatters::StringPrinter::GetPrintableElementType::
471                   ASCII);
472   }
473 
474   // since we tend to accept partial data (and even partially malformed data)
475   // we might end up with no NULL terminator before the end_ptr
476   // hence we need to take a slower route and ensure we stay within boundaries
477   for (uint8_t *data = buffer_sp->GetBytes(); *data && (data < data_end);) {
478     if (escape_non_printables) {
479       uint8_t *next_data = nullptr;
480       auto printable = escaping_callback(data, data_end, next_data);
481       auto printable_bytes = printable.GetBytes();
482       auto printable_size = printable.GetSize();
483       if (!printable_bytes || !next_data) {
484         // GetPrintable() failed on us - print one byte in a desperate resync
485         // attempt
486         printable_bytes = data;
487         printable_size = 1;
488         next_data = data + 1;
489       }
490       for (unsigned c = 0; c < printable_size; c++)
491         options.GetStream()->Printf("%c", *(printable_bytes + c));
492       data = (uint8_t *)next_data;
493     } else {
494       options.GetStream()->Printf("%c", *data);
495       data++;
496     }
497   }
498 
499   const char *suffix_token = options.GetSuffixToken();
500 
501   if (suffix_token != 0)
502     options.GetStream()->Printf("%c%s", quote, suffix_token);
503   else if (quote != 0)
504     options.GetStream()->Printf("%c", quote);
505 
506   if (is_truncated)
507     options.GetStream()->Printf("...");
508 
509   return true;
510 }
511 
512 template <typename SourceDataType>
513 static bool ReadUTFBufferAndDumpToStream(
514     const StringPrinter::ReadStringAndDumpToStreamOptions &options,
515     ConversionResult (*ConvertFunction)(const SourceDataType **,
516                                         const SourceDataType *, UTF8 **, UTF8 *,
517                                         ConversionFlags)) {
518   assert(options.GetStream() && "need a Stream to print the string to");
519 
520   if (options.GetLocation() == 0 ||
521       options.GetLocation() == LLDB_INVALID_ADDRESS)
522     return false;
523 
524   lldb::ProcessSP process_sp(options.GetProcessSP());
525 
526   if (!process_sp)
527     return false;
528 
529   const int type_width = sizeof(SourceDataType);
530   const int origin_encoding = 8 * type_width;
531   if (origin_encoding != 8 && origin_encoding != 16 && origin_encoding != 32)
532     return false;
533   // if not UTF8, I need a conversion function to return proper UTF8
534   if (origin_encoding != 8 && !ConvertFunction)
535     return false;
536 
537   if (!options.GetStream())
538     return false;
539 
540   uint32_t sourceSize = options.GetSourceSize();
541   bool needs_zero_terminator = options.GetNeedsZeroTermination();
542 
543   bool is_truncated = false;
544   const auto max_size = process_sp->GetTarget().GetMaximumSizeOfStringSummary();
545 
546   if (!sourceSize) {
547     sourceSize = max_size;
548     needs_zero_terminator = true;
549   } else if (!options.GetIgnoreMaxLength()) {
550     if (sourceSize > max_size) {
551       sourceSize = max_size;
552       is_truncated = true;
553     }
554   }
555 
556   const int bufferSPSize = sourceSize * type_width;
557 
558   lldb::DataBufferSP buffer_sp(new DataBufferHeap(bufferSPSize, 0));
559 
560   if (!buffer_sp->GetBytes())
561     return false;
562 
563   Error error;
564   char *buffer = reinterpret_cast<char *>(buffer_sp->GetBytes());
565 
566   if (needs_zero_terminator)
567     process_sp->ReadStringFromMemory(options.GetLocation(), buffer,
568                                      bufferSPSize, error, type_width);
569   else
570     process_sp->ReadMemoryFromInferior(options.GetLocation(),
571                                        (char *)buffer_sp->GetBytes(),
572                                        bufferSPSize, error);
573 
574   if (error.Fail()) {
575     options.GetStream()->Printf("unable to read data");
576     return true;
577   }
578 
579   DataExtractor data(buffer_sp, process_sp->GetByteOrder(),
580                      process_sp->GetAddressByteSize());
581 
582   StringPrinter::ReadBufferAndDumpToStreamOptions dump_options(options);
583   dump_options.SetData(data);
584   dump_options.SetSourceSize(sourceSize);
585   dump_options.SetIsTruncated(is_truncated);
586 
587   return DumpUTFBufferToStream(ConvertFunction, dump_options);
588 }
589 
590 template <>
591 bool StringPrinter::ReadStringAndDumpToStream<
592     StringPrinter::StringElementType::UTF8>(
593     const ReadStringAndDumpToStreamOptions &options) {
594   return ReadUTFBufferAndDumpToStream<UTF8>(options, nullptr);
595 }
596 
597 template <>
598 bool StringPrinter::ReadStringAndDumpToStream<
599     StringPrinter::StringElementType::UTF16>(
600     const ReadStringAndDumpToStreamOptions &options) {
601   return ReadUTFBufferAndDumpToStream<UTF16>(options, ConvertUTF16toUTF8);
602 }
603 
604 template <>
605 bool StringPrinter::ReadStringAndDumpToStream<
606     StringPrinter::StringElementType::UTF32>(
607     const ReadStringAndDumpToStreamOptions &options) {
608   return ReadUTFBufferAndDumpToStream<UTF32>(options, ConvertUTF32toUTF8);
609 }
610 
611 template <>
612 bool StringPrinter::ReadBufferAndDumpToStream<
613     StringPrinter::StringElementType::UTF8>(
614     const ReadBufferAndDumpToStreamOptions &options) {
615   assert(options.GetStream() && "need a Stream to print the string to");
616 
617   return DumpUTFBufferToStream<UTF8>(nullptr, options);
618 }
619 
620 template <>
621 bool StringPrinter::ReadBufferAndDumpToStream<
622     StringPrinter::StringElementType::ASCII>(
623     const ReadBufferAndDumpToStreamOptions &options) {
624   // treat ASCII the same as UTF8
625   // FIXME: can we optimize ASCII some more?
626   return ReadBufferAndDumpToStream<StringElementType::UTF8>(options);
627 }
628 
629 template <>
630 bool StringPrinter::ReadBufferAndDumpToStream<
631     StringPrinter::StringElementType::UTF16>(
632     const ReadBufferAndDumpToStreamOptions &options) {
633   assert(options.GetStream() && "need a Stream to print the string to");
634 
635   return DumpUTFBufferToStream(ConvertUTF16toUTF8, options);
636 }
637 
638 template <>
639 bool StringPrinter::ReadBufferAndDumpToStream<
640     StringPrinter::StringElementType::UTF32>(
641     const ReadBufferAndDumpToStreamOptions &options) {
642   assert(options.GetStream() && "need a Stream to print the string to");
643 
644   return DumpUTFBufferToStream(ConvertUTF32toUTF8, options);
645 }
646 
647 } // namespace formatters
648 
649 } // namespace lldb_private
650