1 //===-- StringPrinter.cpp ----------------------------------------*- C++ -*-===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 
10 #include "lldb/DataFormatters/StringPrinter.h"
11 
12 #include "lldb/Core/DataExtractor.h"
13 #include "lldb/Core/Debugger.h"
14 #include "lldb/Core/Error.h"
15 #include "lldb/Core/ValueObject.h"
16 #include "lldb/Target/Process.h"
17 #include "lldb/Target/Target.h"
18 
19 #include "llvm/Support/ConvertUTF.h"
20 
21 #include <ctype.h>
22 #include <functional>
23 #include <locale>
24 
25 using namespace lldb;
26 using namespace lldb_private;
27 using namespace lldb_private::formatters;
28 
// A (pointer, size) pair describing bytes to print, which may or may not own those bytes.
// std::unique_ptr does not fit because its Deleter is a template argument, and the same type
// has to represent both buffers that must be freed and buffers that must be left alone;
// here the optional Deleter is a data member instead.
// Ownership is handed over on copy as well as on move: the source object's data pointer is
// reset to null, which is why m_data is mutable.
// It's very specialized to the needs of this file, and not suggested for general use
template <typename T = uint8_t, typename U = char, typename S = size_t>
struct StringPrinterBufferPointer
{
public:

    // called with the data pointer when an owning object lets go of it; may be empty
    typedef std::function<void(const T*)> Deleter;

    // an empty buffer: no bytes, nothing to free
    StringPrinterBufferPointer (std::nullptr_t ptr) :
    m_data(nullptr),
    m_size(0),
    m_deleter()
    {}

    // wraps size elements starting at bytes; deleter (if any) is later run on bytes
    StringPrinterBufferPointer(const T* bytes, S size, Deleter deleter = nullptr) :
    m_data(bytes),
    m_size(size),
    m_deleter(deleter)
    {}

    // same as above for data typed as U (e.g. string literals); the pointer is reinterpreted as T
    StringPrinterBufferPointer(const U* bytes, S size, Deleter deleter = nullptr) :
    m_data(reinterpret_cast<const T*>(bytes)),
    m_size(size),
    m_deleter(deleter)
    {}

    // takes over rhs's buffer; rhs is left pointing at nothing
    StringPrinterBufferPointer(StringPrinterBufferPointer&& rhs) :
    m_data(rhs.m_data),
    m_size(rhs.m_size),
    m_deleter(rhs.m_deleter)
    {
        rhs.m_data = nullptr;
    }

    // "copying" also takes over rhs's buffer, so that only one object ever runs the deleter
    StringPrinterBufferPointer(const StringPrinterBufferPointer& rhs) :
    m_data(rhs.m_data),
    m_size(rhs.m_size),
    m_deleter(rhs.m_deleter)
    {
        rhs.m_data = nullptr; // this is why m_data has to be mutable
    }

    // the bytes to print, or nullptr for an empty (or handed-over) buffer
    const T*
    GetBytes () const
    {
        return m_data;
    }

    // the size this object was constructed with
    S
    GetSize () const
    {
        return m_size;
    }

    ~StringPrinterBufferPointer ()
    {
        Release();
    }

    // releases the buffer currently held, then takes over rhs's buffer (rhs is left empty)
    StringPrinterBufferPointer&
    operator = (const StringPrinterBufferPointer& rhs)
    {
        // assigning an object to itself must neither free nor forget its buffer
        if (this == &rhs)
            return *this;
        Release();
        m_data = rhs.m_data;
        m_size = rhs.m_size;
        m_deleter = rhs.m_deleter;
        rhs.m_data = nullptr;
        return *this;
    }

private:
    // runs the deleter (when there is one) on the held pointer and forgets the pointer
    void
    Release ()
    {
        if (m_data && m_deleter)
            m_deleter(m_data);
        m_data = nullptr;
    }

    mutable const T* m_data;
    S m_size;
    Deleter m_deleter;
};
110 
// Declared here for every StringElementType value, but only specialized (further down in this
// file) for the encodings we care about.
// That's good because using it with any unsupported type gives a linker error.
// buffer points at the next byte to examine and buffer_end bounds the data; on return, next
// is set to the position where scanning should resume.
template <StringElementType type>
static StringPrinterBufferPointer<>
GetPrintableImpl(uint8_t* buffer, uint8_t* buffer_end, uint8_t*& next);
116 
// isprint() counterpart for Unicode codepoints: returns false for the ranges listed below
// and true for everything else
static bool
isprint(char32_t codepoint)
{
    // C0 controls, plus DEL
    const bool is_c0_control = (codepoint <= 0x1F) || (codepoint == 0x7F);
    // C1 controls
    const bool is_c1_control = (codepoint >= 0x80) && (codepoint <= 0x9F);
    // line/paragraph separators
    const bool is_separator = (codepoint == 0x2028) || (codepoint == 0x2029);
    // bidirectional text control
    const bool is_bidi_control = (codepoint == 0x200E) ||
                                 (codepoint == 0x200F) ||
                                 ((codepoint >= 0x202A) && (codepoint <= 0x202E));
    // interlinears and generally specials
    const bool is_special = (codepoint >= 0xFFF9) && (codepoint <= 0xFFFF);

    return !(is_c0_control || is_c1_control || is_separator || is_bidi_control || is_special);
}
143 
144 template <>
145 StringPrinterBufferPointer<>
146 GetPrintableImpl<StringElementType::ASCII> (uint8_t* buffer, uint8_t* buffer_end, uint8_t*& next)
147 {
148     StringPrinterBufferPointer<> retval = {nullptr};
149 
150     switch (*buffer)
151     {
152         case 0:
153             retval = {"\\0",2};
154             break;
155         case '\a':
156             retval = {"\\a",2};
157             break;
158         case '\b':
159             retval = {"\\b",2};
160             break;
161         case '\f':
162             retval = {"\\f",2};
163             break;
164         case '\n':
165             retval = {"\\n",2};
166             break;
167         case '\r':
168             retval = {"\\r",2};
169             break;
170         case '\t':
171             retval = {"\\t",2};
172             break;
173         case '\v':
174             retval = {"\\v",2};
175             break;
176         case '\"':
177             retval = {"\\\"",2};
178             break;
179         case '\\':
180             retval = {"\\\\",2};
181             break;
182         default:
183           if (isprint(*buffer))
184               retval = {buffer,1};
185           else
186           {
187               uint8_t* data = new uint8_t[5];
188               sprintf((char*)data,"\\x%02x",*buffer);
189               retval = {data, 4, [] (const uint8_t* c) {delete[] c;} };
190               break;
191           }
192     }
193 
194     next = buffer + 1;
195     return retval;
196 }
197 
// Combines the two bytes of a 2-byte UTF8 sequence into a codepoint.
// No validation is done: the marker values are simply subtracted from each byte.
static char32_t
ConvertUTF8ToCodePoint (unsigned char c0, unsigned char c1)
{
    const int lead_bits = c0 - 192;   // lead byte minus its 110xxxxx marker
    const int trail_bits = c1 - 128;  // continuation byte minus its 10xxxxxx marker
    return lead_bits * 64 + trail_bits;
}
// Combines the three bytes of a 3-byte UTF8 sequence into a codepoint.
// No validation is done: the marker values are simply subtracted from each byte.
static char32_t
ConvertUTF8ToCodePoint (unsigned char c0, unsigned char c1, unsigned char c2)
{
    const int lead_bits = c0 - 224;    // lead byte minus its 1110xxxx marker
    const int middle_bits = c1 - 128;  // continuation bytes minus their 10xxxxxx marker
    const int low_bits = c2 - 128;
    return lead_bits * 4096 + middle_bits * 64 + low_bits;
}
// Combines the four bytes of a 4-byte UTF8 sequence into a codepoint.
// No validation is done: the marker values are simply subtracted from each byte.
// Each byte contributes its own payload: c0 the top bits, then c1, c2 and c3 six bits apiece.
static char32_t
ConvertUTF8ToCodePoint (unsigned char c0, unsigned char c1, unsigned char c2, unsigned char c3)
{
    return (c0-240)*262144+(c1-128)*4096+(c2-128)*64+(c3-128);
}
213 
214 template <>
215 StringPrinterBufferPointer<>
216 GetPrintableImpl<StringElementType::UTF8> (uint8_t* buffer, uint8_t* buffer_end, uint8_t*& next)
217 {
218     StringPrinterBufferPointer<> retval {nullptr};
219 
220     unsigned utf8_encoded_len = getNumBytesForUTF8(*buffer);
221 
222     if (1+buffer_end-buffer < utf8_encoded_len)
223     {
224         // I don't have enough bytes - print whatever I have left
225         retval = {buffer,static_cast<size_t>(1+buffer_end-buffer)};
226         next = buffer_end+1;
227         return retval;
228     }
229 
230     char32_t codepoint = 0;
231     switch (utf8_encoded_len)
232     {
233         case 1:
234             // this is just an ASCII byte - ask ASCII
235             return GetPrintableImpl<StringElementType::ASCII>(buffer, buffer_end, next);
236         case 2:
237             codepoint = ConvertUTF8ToCodePoint((unsigned char)*buffer, (unsigned char)*(buffer+1));
238             break;
239         case 3:
240             codepoint = ConvertUTF8ToCodePoint((unsigned char)*buffer, (unsigned char)*(buffer+1), (unsigned char)*(buffer+2));
241             break;
242         case 4:
243             codepoint = ConvertUTF8ToCodePoint((unsigned char)*buffer, (unsigned char)*(buffer+1), (unsigned char)*(buffer+2), (unsigned char)*(buffer+3));
244             break;
245         default:
246             // this is probably some bogus non-character thing
247             // just print it as-is and hope to sync up again soon
248             retval = {buffer,1};
249             next = buffer+1;
250             return retval;
251     }
252 
253     if (codepoint)
254     {
255         switch (codepoint)
256         {
257             case 0:
258                 retval = {"\\0",2};
259                 break;
260             case '\a':
261                 retval = {"\\a",2};
262                 break;
263             case '\b':
264                 retval = {"\\b",2};
265                 break;
266             case '\f':
267                 retval = {"\\f",2};
268                 break;
269             case '\n':
270                 retval = {"\\n",2};
271                 break;
272             case '\r':
273                 retval = {"\\r",2};
274                 break;
275             case '\t':
276                 retval = {"\\t",2};
277                 break;
278             case '\v':
279                 retval = {"\\v",2};
280                 break;
281             case '\"':
282                 retval = {"\\\"",2};
283                 break;
284             case '\\':
285                 retval = {"\\\\",2};
286                 break;
287             default:
288                 if (isprint(codepoint))
289                     retval = {buffer,utf8_encoded_len};
290                 else
291                 {
292                     uint8_t* data = new uint8_t[11];
293                     sprintf((char*)data,"\\U%08x",codepoint);
294                     retval = { data,10,[] (const uint8_t* c) {delete[] c;} };
295                     break;
296                 }
297         }
298 
299         next = buffer + utf8_encoded_len;
300         return retval;
301     }
302 
303     // this should not happen - but just in case.. try to resync at some point
304     retval = {buffer,1};
305     next = buffer+1;
306     return retval;
307 }
308 
309 // Given a sequence of bytes, this function returns:
310 // a sequence of bytes to actually print out + a length
311 // the following unscanned position of the buffer is in next
312 static StringPrinterBufferPointer<>
313 GetPrintable(StringElementType type, uint8_t* buffer, uint8_t* buffer_end, uint8_t*& next)
314 {
315     if (!buffer)
316         return {nullptr};
317 
318     switch (type)
319     {
320         case StringElementType::ASCII:
321             return GetPrintableImpl<StringElementType::ASCII>(buffer, buffer_end, next);
322         case StringElementType::UTF8:
323             return GetPrintableImpl<StringElementType::UTF8>(buffer, buffer_end, next);
324         default:
325             return {nullptr};
326     }
327 }
328 
329 // use this call if you already have an LLDB-side buffer for the data
330 template<typename SourceDataType>
331 static bool
332 DumpUTFBufferToStream (ConversionResult (*ConvertFunction) (const SourceDataType**,
333                                                             const SourceDataType*,
334                                                             UTF8**,
335                                                             UTF8*,
336                                                             ConversionFlags),
337                        const ReadBufferAndDumpToStreamOptions& dump_options)
338 {
339     Stream &stream(*dump_options.GetStream());
340     if (dump_options.GetPrefixToken() != 0)
341         stream.Printf("%c",dump_options.GetPrefixToken());
342     if (dump_options.GetQuote() != 0)
343         stream.Printf("%c",dump_options.GetQuote());
344     auto data(dump_options.GetData());
345     auto source_size(dump_options.GetSourceSize());
346     if (data.GetByteSize() && data.GetDataStart() && data.GetDataEnd())
347     {
348         const int bufferSPSize = data.GetByteSize();
349         if (dump_options.GetSourceSize() == 0)
350         {
351             const int origin_encoding = 8*sizeof(SourceDataType);
352             source_size = bufferSPSize/(origin_encoding / 4);
353         }
354 
355         const SourceDataType *data_ptr = (const SourceDataType*)data.GetDataStart();
356         const SourceDataType *data_end_ptr = data_ptr + source_size;
357 
358         const bool zero_is_terminator = dump_options.GetBinaryZeroIsTerminator();
359 
360         if (zero_is_terminator)
361         {
362             while (data_ptr < data_end_ptr)
363             {
364                 if (!*data_ptr)
365                 {
366                     data_end_ptr = data_ptr;
367                     break;
368                 }
369                 data_ptr++;
370             }
371 
372             data_ptr = (const SourceDataType*)data.GetDataStart();
373         }
374 
375         lldb::DataBufferSP utf8_data_buffer_sp;
376         UTF8* utf8_data_ptr = nullptr;
377         UTF8* utf8_data_end_ptr = nullptr;
378 
379         if (ConvertFunction)
380         {
381             utf8_data_buffer_sp.reset(new DataBufferHeap(4*bufferSPSize,0));
382             utf8_data_ptr = (UTF8*)utf8_data_buffer_sp->GetBytes();
383             utf8_data_end_ptr = utf8_data_ptr + utf8_data_buffer_sp->GetByteSize();
384             ConvertFunction ( &data_ptr, data_end_ptr, &utf8_data_ptr, utf8_data_end_ptr, lenientConversion );
385             if (false == zero_is_terminator)
386                 utf8_data_end_ptr = utf8_data_ptr;
387             utf8_data_ptr = (UTF8*)utf8_data_buffer_sp->GetBytes(); // needed because the ConvertFunction will change the value of the data_ptr
388         }
389         else
390         {
391             // just copy the pointers - the cast is necessary to make the compiler happy
392             // but this should only happen if we are reading UTF8 data
393             utf8_data_ptr = (UTF8*)data_ptr;
394             utf8_data_end_ptr = (UTF8*)data_end_ptr;
395         }
396 
397         const bool escape_non_printables = dump_options.GetEscapeNonPrintables();
398 
399         // since we tend to accept partial data (and even partially malformed data)
400         // we might end up with no NULL terminator before the end_ptr
401         // hence we need to take a slower route and ensure we stay within boundaries
402         for (;utf8_data_ptr < utf8_data_end_ptr;)
403         {
404             if (zero_is_terminator && !*utf8_data_ptr)
405                 break;
406 
407             if (escape_non_printables)
408             {
409                 uint8_t* next_data = nullptr;
410                 auto printable = GetPrintable(StringElementType::UTF8, utf8_data_ptr, utf8_data_end_ptr, next_data);
411                 auto printable_bytes = printable.GetBytes();
412                 auto printable_size = printable.GetSize();
413                 if (!printable_bytes || !next_data)
414                 {
415                     // GetPrintable() failed on us - print one byte in a desperate resync attempt
416                     printable_bytes = utf8_data_ptr;
417                     printable_size = 1;
418                     next_data = utf8_data_ptr+1;
419                 }
420                 for (unsigned c = 0; c < printable_size; c++)
421                     stream.Printf("%c", *(printable_bytes+c));
422                 utf8_data_ptr = (uint8_t*)next_data;
423             }
424             else
425             {
426                 stream.Printf("%c",*utf8_data_ptr);
427                 utf8_data_ptr++;
428             }
429         }
430     }
431     if (dump_options.GetQuote() != 0)
432         stream.Printf("%c",dump_options.GetQuote());
433     return true;
434 }
435 
436 lldb_private::formatters::ReadStringAndDumpToStreamOptions::ReadStringAndDumpToStreamOptions (ValueObject& valobj) :
437     ReadStringAndDumpToStreamOptions()
438 {
439     SetEscapeNonPrintables(valobj.GetTargetSP()->GetDebugger().GetEscapeNonPrintables());
440 }
441 
442 lldb_private::formatters::ReadBufferAndDumpToStreamOptions::ReadBufferAndDumpToStreamOptions (ValueObject& valobj) :
443     ReadBufferAndDumpToStreamOptions()
444 {
445     SetEscapeNonPrintables(valobj.GetTargetSP()->GetDebugger().GetEscapeNonPrintables());
446 }
447 
448 lldb_private::formatters::ReadBufferAndDumpToStreamOptions::ReadBufferAndDumpToStreamOptions (const lldb_private::formatters::ReadStringAndDumpToStreamOptions& options) :
449     ReadBufferAndDumpToStreamOptions()
450 {
451     SetStream(options.GetStream());
452     SetPrefixToken(options.GetPrefixToken());
453     SetQuote(options.GetQuote());
454     SetEscapeNonPrintables(options.GetEscapeNonPrintables());
455     SetBinaryZeroIsTerminator(options.GetBinaryZeroIsTerminator());
456 }
457 
458 
459 namespace lldb_private
460 {
461 
462 namespace formatters
463 {
464 
465 template <>
466 bool
467 ReadStringAndDumpToStream<StringElementType::ASCII> (const ReadStringAndDumpToStreamOptions& options)
468 {
469     assert(options.GetStream() && "need a Stream to print the string to");
470     Error my_error;
471 
472     ProcessSP process_sp(options.GetProcessSP());
473 
474     if (process_sp.get() == nullptr || options.GetLocation() == 0)
475         return false;
476 
477     size_t size;
478 
479     if (options.GetSourceSize() == 0)
480         size = process_sp->GetTarget().GetMaximumSizeOfStringSummary();
481     else if (!options.GetIgnoreMaxLength())
482         size = std::min(options.GetSourceSize(),process_sp->GetTarget().GetMaximumSizeOfStringSummary());
483     else
484         size = options.GetSourceSize();
485 
486     lldb::DataBufferSP buffer_sp(new DataBufferHeap(size,0));
487 
488     process_sp->ReadCStringFromMemory(options.GetLocation(), (char*)buffer_sp->GetBytes(), size, my_error);
489 
490     if (my_error.Fail())
491         return false;
492 
493     char prefix_token = options.GetPrefixToken();
494     char quote = options.GetQuote();
495 
496     if (prefix_token != 0)
497         options.GetStream()->Printf("%c%c",prefix_token,quote);
498     else if (quote != 0)
499         options.GetStream()->Printf("%c",quote);
500 
501     uint8_t* data_end = buffer_sp->GetBytes()+buffer_sp->GetByteSize();
502 
503     // since we tend to accept partial data (and even partially malformed data)
504     // we might end up with no NULL terminator before the end_ptr
505     // hence we need to take a slower route and ensure we stay within boundaries
506     for (uint8_t* data = buffer_sp->GetBytes(); *data && (data < data_end);)
507     {
508         if (options.GetEscapeNonPrintables())
509         {
510             uint8_t* next_data = nullptr;
511             auto printable = GetPrintable(StringElementType::ASCII, data, data_end, next_data);
512             auto printable_bytes = printable.GetBytes();
513             auto printable_size = printable.GetSize();
514             if (!printable_bytes || !next_data)
515             {
516                 // GetPrintable() failed on us - print one byte in a desperate resync attempt
517                 printable_bytes = data;
518                 printable_size = 1;
519                 next_data = data+1;
520             }
521             for (unsigned c = 0; c < printable_size; c++)
522                 options.GetStream()->Printf("%c", *(printable_bytes+c));
523             data = (uint8_t*)next_data;
524         }
525         else
526         {
527             options.GetStream()->Printf("%c",*data);
528             data++;
529         }
530     }
531 
532     if (quote != 0)
533         options.GetStream()->Printf("%c",quote);
534 
535     return true;
536 }
537 
538 template<typename SourceDataType>
539 static bool
540 ReadUTFBufferAndDumpToStream (const ReadStringAndDumpToStreamOptions& options,
541                               ConversionResult (*ConvertFunction) (const SourceDataType**,
542                                                                    const SourceDataType*,
543                                                                    UTF8**,
544                                                                    UTF8*,
545                                                                    ConversionFlags))
546 {
547     assert(options.GetStream() && "need a Stream to print the string to");
548 
549     if (options.GetLocation() == 0 || options.GetLocation() == LLDB_INVALID_ADDRESS)
550         return false;
551 
552     lldb::ProcessSP process_sp(options.GetProcessSP());
553 
554     if (!process_sp)
555         return false;
556 
557     const int type_width = sizeof(SourceDataType);
558     const int origin_encoding = 8 * type_width ;
559     if (origin_encoding != 8 && origin_encoding != 16 && origin_encoding != 32)
560         return false;
561     // if not UTF8, I need a conversion function to return proper UTF8
562     if (origin_encoding != 8 && !ConvertFunction)
563         return false;
564 
565     if (!options.GetStream())
566         return false;
567 
568     uint32_t sourceSize = options.GetSourceSize();
569     bool needs_zero_terminator = options.GetNeedsZeroTermination();
570 
571     if (!sourceSize)
572     {
573         sourceSize = process_sp->GetTarget().GetMaximumSizeOfStringSummary();
574         needs_zero_terminator = true;
575     }
576     else if (!options.GetIgnoreMaxLength())
577         sourceSize = std::min(sourceSize,process_sp->GetTarget().GetMaximumSizeOfStringSummary());
578 
579     const int bufferSPSize = sourceSize * type_width;
580 
581     lldb::DataBufferSP buffer_sp(new DataBufferHeap(bufferSPSize,0));
582 
583     if (!buffer_sp->GetBytes())
584         return false;
585 
586     Error error;
587     char *buffer = reinterpret_cast<char *>(buffer_sp->GetBytes());
588 
589     if (needs_zero_terminator)
590         process_sp->ReadStringFromMemory(options.GetLocation(), buffer, bufferSPSize, error, type_width);
591     else
592         process_sp->ReadMemoryFromInferior(options.GetLocation(), (char*)buffer_sp->GetBytes(), bufferSPSize, error);
593 
594     if (error.Fail())
595     {
596         options.GetStream()->Printf("unable to read data");
597         return true;
598     }
599 
600     DataExtractor data(buffer_sp, process_sp->GetByteOrder(), process_sp->GetAddressByteSize());
601 
602     ReadBufferAndDumpToStreamOptions dump_options(options);
603     dump_options.SetData(data);
604     dump_options.SetSourceSize(sourceSize);
605 
606     return DumpUTFBufferToStream(ConvertFunction, dump_options);
607 }
608 
609 template <>
610 bool
611 ReadStringAndDumpToStream<StringElementType::UTF8> (const ReadStringAndDumpToStreamOptions& options)
612 {
613     return ReadUTFBufferAndDumpToStream<UTF8>(options,
614                                               nullptr);
615 }
616 
617 template <>
618 bool
619 ReadStringAndDumpToStream<StringElementType::UTF16> (const ReadStringAndDumpToStreamOptions& options)
620 {
621     return ReadUTFBufferAndDumpToStream<UTF16>(options,
622                                                ConvertUTF16toUTF8);
623 }
624 
625 template <>
626 bool
627 ReadStringAndDumpToStream<StringElementType::UTF32> (const ReadStringAndDumpToStreamOptions& options)
628 {
629     return ReadUTFBufferAndDumpToStream<UTF32>(options,
630                                                ConvertUTF32toUTF8);
631 }
632 
633 template <>
634 bool
635 ReadBufferAndDumpToStream<StringElementType::UTF8> (const ReadBufferAndDumpToStreamOptions& options)
636 {
637     assert(options.GetStream() && "need a Stream to print the string to");
638 
639     return DumpUTFBufferToStream<UTF8>(nullptr, options);
640 }
641 
642 template <>
643 bool
644 ReadBufferAndDumpToStream<StringElementType::ASCII> (const ReadBufferAndDumpToStreamOptions& options)
645 {
646     // treat ASCII the same as UTF8
647     // FIXME: can we optimize ASCII some more?
648     return ReadBufferAndDumpToStream<StringElementType::UTF8>(options);
649 }
650 
651 template <>
652 bool
653 ReadBufferAndDumpToStream<StringElementType::UTF16> (const ReadBufferAndDumpToStreamOptions& options)
654 {
655     assert(options.GetStream() && "need a Stream to print the string to");
656 
657     return DumpUTFBufferToStream(ConvertUTF16toUTF8, options);
658 }
659 
660 template <>
661 bool
662 ReadBufferAndDumpToStream<StringElementType::UTF32> (const ReadBufferAndDumpToStreamOptions& options)
663 {
664     assert(options.GetStream() && "need a Stream to print the string to");
665 
666     return DumpUTFBufferToStream(ConvertUTF32toUTF8, options);
667 }
668 
669 } // namespace formatters
670 
671 } // namespace lldb_private
672