1 //=== JSON.cpp - JSON value, parsing and serialization - C++ -----------*-===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===---------------------------------------------------------------------===//
9 
10 #include "llvm/Support/JSON.h"
11 #include "llvm/Support/Format.h"
12 #include <cctype>
13 
14 namespace llvm {
15 namespace json {
16 
17 Value &Object::operator[](const ObjectKey &K) {
18   return try_emplace(K, nullptr).first->getSecond();
19 }
20 Value &Object::operator[](ObjectKey &&K) {
21   return try_emplace(std::move(K), nullptr).first->getSecond();
22 }
23 Value *Object::get(StringRef K) {
24   auto I = find(K);
25   if (I == end())
26     return nullptr;
27   return &I->second;
28 }
29 const Value *Object::get(StringRef K) const {
30   auto I = find(K);
31   if (I == end())
32     return nullptr;
33   return &I->second;
34 }
35 llvm::Optional<std::nullptr_t> Object::getNull(StringRef K) const {
36   if (auto *V = get(K))
37     return V->getAsNull();
38   return llvm::None;
39 }
40 llvm::Optional<bool> Object::getBoolean(StringRef K) const {
41   if (auto *V = get(K))
42     return V->getAsBoolean();
43   return llvm::None;
44 }
45 llvm::Optional<double> Object::getNumber(StringRef K) const {
46   if (auto *V = get(K))
47     return V->getAsNumber();
48   return llvm::None;
49 }
50 llvm::Optional<int64_t> Object::getInteger(StringRef K) const {
51   if (auto *V = get(K))
52     return V->getAsInteger();
53   return llvm::None;
54 }
55 llvm::Optional<llvm::StringRef> Object::getString(StringRef K) const {
56   if (auto *V = get(K))
57     return V->getAsString();
58   return llvm::None;
59 }
60 const json::Object *Object::getObject(StringRef K) const {
61   if (auto *V = get(K))
62     return V->getAsObject();
63   return nullptr;
64 }
65 json::Object *Object::getObject(StringRef K) {
66   if (auto *V = get(K))
67     return V->getAsObject();
68   return nullptr;
69 }
70 const json::Array *Object::getArray(StringRef K) const {
71   if (auto *V = get(K))
72     return V->getAsArray();
73   return nullptr;
74 }
75 json::Array *Object::getArray(StringRef K) {
76   if (auto *V = get(K))
77     return V->getAsArray();
78   return nullptr;
79 }
80 bool operator==(const Object &LHS, const Object &RHS) {
81   if (LHS.size() != RHS.size())
82     return false;
83   for (const auto &L : LHS) {
84     auto R = RHS.find(L.first);
85     if (R == RHS.end() || L.second != R->second)
86       return false;
87   }
88   return true;
89 }
90 
91 Array::Array(std::initializer_list<Value> Elements) {
92   V.reserve(Elements.size());
93   for (const Value &V : Elements) {
94     emplace_back(nullptr);
95     back().moveFrom(std::move(V));
96   }
97 }
98 
99 Value::Value(std::initializer_list<Value> Elements)
100     : Value(json::Array(Elements)) {}
101 
102 void Value::copyFrom(const Value &M) {
103   Type = M.Type;
104   switch (Type) {
105   case T_Null:
106   case T_Boolean:
107   case T_Number:
108     memcpy(Union.buffer, M.Union.buffer, sizeof(Union.buffer));
109     break;
110   case T_StringRef:
111     create<StringRef>(M.as<StringRef>());
112     break;
113   case T_String:
114     create<std::string>(M.as<std::string>());
115     break;
116   case T_Object:
117     create<json::Object>(M.as<json::Object>());
118     break;
119   case T_Array:
120     create<json::Array>(M.as<json::Array>());
121     break;
122   }
123 }
124 
125 void Value::moveFrom(const Value &&M) {
126   Type = M.Type;
127   switch (Type) {
128   case T_Null:
129   case T_Boolean:
130   case T_Number:
131     memcpy(Union.buffer, M.Union.buffer, sizeof(Union.buffer));
132     break;
133   case T_StringRef:
134     create<StringRef>(M.as<StringRef>());
135     break;
136   case T_String:
137     create<std::string>(std::move(M.as<std::string>()));
138     M.Type = T_Null;
139     break;
140   case T_Object:
141     create<json::Object>(std::move(M.as<json::Object>()));
142     M.Type = T_Null;
143     break;
144   case T_Array:
145     create<json::Array>(std::move(M.as<json::Array>()));
146     M.Type = T_Null;
147     break;
148   }
149 }
150 
151 void Value::destroy() {
152   switch (Type) {
153   case T_Null:
154   case T_Boolean:
155   case T_Number:
156     break;
157   case T_StringRef:
158     as<StringRef>().~StringRef();
159     break;
160   case T_String:
161     as<std::string>().~basic_string();
162     break;
163   case T_Object:
164     as<json::Object>().~Object();
165     break;
166   case T_Array:
167     as<json::Array>().~Array();
168     break;
169   }
170 }
171 
172 bool operator==(const Value &L, const Value &R) {
173   if (L.kind() != R.kind())
174     return false;
175   switch (L.kind()) {
176   case Value::Null:
177     return *L.getAsNull() == *R.getAsNull();
178   case Value::Boolean:
179     return *L.getAsBoolean() == *R.getAsBoolean();
180   case Value::Number:
181     return *L.getAsNumber() == *R.getAsNumber();
182   case Value::String:
183     return *L.getAsString() == *R.getAsString();
184   case Value::Array:
185     return *L.getAsArray() == *R.getAsArray();
186   case Value::Object:
187     return *L.getAsObject() == *R.getAsObject();
188   }
189   llvm_unreachable("Unknown value kind");
190 }
191 
192 namespace {
193 // Simple recursive-descent JSON parser.
194 class Parser {
195 public:
196   Parser(StringRef JSON)
197       : Start(JSON.begin()), P(JSON.begin()), End(JSON.end()) {}
198 
199   bool parseValue(Value &Out);
200 
201   bool assertEnd() {
202     eatWhitespace();
203     if (P == End)
204       return true;
205     return parseError("Text after end of document");
206   }
207 
208   Error takeError() {
209     assert(Err);
210     return std::move(*Err);
211   }
212 
213 private:
214   void eatWhitespace() {
215     while (P != End && (*P == ' ' || *P == '\r' || *P == '\n' || *P == '\t'))
216       ++P;
217   }
218 
219   // On invalid syntax, parseX() functions return false and set Err.
220   bool parseNumber(char First, double &Out);
221   bool parseString(std::string &Out);
222   bool parseUnicode(std::string &Out);
223   bool parseError(const char *Msg); // always returns false
224 
225   char next() { return P == End ? 0 : *P++; }
226   char peek() { return P == End ? 0 : *P; }
227   static bool isNumber(char C) {
228     return C == '0' || C == '1' || C == '2' || C == '3' || C == '4' ||
229            C == '5' || C == '6' || C == '7' || C == '8' || C == '9' ||
230            C == 'e' || C == 'E' || C == '+' || C == '-' || C == '.';
231   }
232 
233   Optional<Error> Err;
234   const char *Start, *P, *End;
235 };
236 
237 bool Parser::parseValue(Value &Out) {
238   eatWhitespace();
239   if (P == End)
240     return parseError("Unexpected EOF");
241   switch (char C = next()) {
242   // Bare null/true/false are easy - first char identifies them.
243   case 'n':
244     Out = nullptr;
245     return (next() == 'u' && next() == 'l' && next() == 'l') ||
246            parseError("Invalid JSON value (null?)");
247   case 't':
248     Out = true;
249     return (next() == 'r' && next() == 'u' && next() == 'e') ||
250            parseError("Invalid JSON value (true?)");
251   case 'f':
252     Out = false;
253     return (next() == 'a' && next() == 'l' && next() == 's' && next() == 'e') ||
254            parseError("Invalid JSON value (false?)");
255   case '"': {
256     std::string S;
257     if (parseString(S)) {
258       Out = std::move(S);
259       return true;
260     }
261     return false;
262   }
263   case '[': {
264     Out = Array{};
265     Array &A = *Out.getAsArray();
266     eatWhitespace();
267     if (peek() == ']') {
268       ++P;
269       return true;
270     }
271     for (;;) {
272       A.emplace_back(nullptr);
273       if (!parseValue(A.back()))
274         return false;
275       eatWhitespace();
276       switch (next()) {
277       case ',':
278         eatWhitespace();
279         continue;
280       case ']':
281         return true;
282       default:
283         return parseError("Expected , or ] after array element");
284       }
285     }
286   }
287   case '{': {
288     Out = Object{};
289     Object &O = *Out.getAsObject();
290     eatWhitespace();
291     if (peek() == '}') {
292       ++P;
293       return true;
294     }
295     for (;;) {
296       if (next() != '"')
297         return parseError("Expected object key");
298       std::string K;
299       if (!parseString(K))
300         return false;
301       eatWhitespace();
302       if (next() != ':')
303         return parseError("Expected : after object key");
304       eatWhitespace();
305       if (!parseValue(O[std::move(K)]))
306         return false;
307       eatWhitespace();
308       switch (next()) {
309       case ',':
310         eatWhitespace();
311         continue;
312       case '}':
313         return true;
314       default:
315         return parseError("Expected , or } after object property");
316       }
317     }
318   }
319   default:
320     if (isNumber(C)) {
321       double Num;
322       if (parseNumber(C, Num)) {
323         Out = Num;
324         return true;
325       } else {
326         return false;
327       }
328     }
329     return parseError("Invalid JSON value");
330   }
331 }
332 
333 bool Parser::parseNumber(char First, double &Out) {
334   SmallString<24> S;
335   S.push_back(First);
336   while (isNumber(peek()))
337     S.push_back(next());
338   char *End;
339   Out = std::strtod(S.c_str(), &End);
340   return End == S.end() || parseError("Invalid JSON value (number?)");
341 }
342 
343 bool Parser::parseString(std::string &Out) {
344   // leading quote was already consumed.
345   for (char C = next(); C != '"'; C = next()) {
346     if (LLVM_UNLIKELY(P == End))
347       return parseError("Unterminated string");
348     if (LLVM_UNLIKELY((C & 0x1f) == C))
349       return parseError("Control character in string");
350     if (LLVM_LIKELY(C != '\\')) {
351       Out.push_back(C);
352       continue;
353     }
354     // Handle escape sequence.
355     switch (C = next()) {
356     case '"':
357     case '\\':
358     case '/':
359       Out.push_back(C);
360       break;
361     case 'b':
362       Out.push_back('\b');
363       break;
364     case 'f':
365       Out.push_back('\f');
366       break;
367     case 'n':
368       Out.push_back('\n');
369       break;
370     case 'r':
371       Out.push_back('\r');
372       break;
373     case 't':
374       Out.push_back('\t');
375       break;
376     case 'u':
377       if (!parseUnicode(Out))
378         return false;
379       break;
380     default:
381       return parseError("Invalid escape sequence");
382     }
383   }
384   return true;
385 }
386 
387 static void encodeUtf8(uint32_t Rune, std::string &Out) {
388   if (Rune < 0x80) {
389     Out.push_back(Rune & 0x7F);
390   } else if (Rune < 0x800) {
391     uint8_t FirstByte = 0xC0 | ((Rune & 0x7C0) >> 6);
392     uint8_t SecondByte = 0x80 | (Rune & 0x3F);
393     Out.push_back(FirstByte);
394     Out.push_back(SecondByte);
395   } else if (Rune < 0x10000) {
396     uint8_t FirstByte = 0xE0 | ((Rune & 0xF000) >> 12);
397     uint8_t SecondByte = 0x80 | ((Rune & 0xFC0) >> 6);
398     uint8_t ThirdByte = 0x80 | (Rune & 0x3F);
399     Out.push_back(FirstByte);
400     Out.push_back(SecondByte);
401     Out.push_back(ThirdByte);
402   } else if (Rune < 0x110000) {
403     uint8_t FirstByte = 0xF0 | ((Rune & 0x1F0000) >> 18);
404     uint8_t SecondByte = 0x80 | ((Rune & 0x3F000) >> 12);
405     uint8_t ThirdByte = 0x80 | ((Rune & 0xFC0) >> 6);
406     uint8_t FourthByte = 0x80 | (Rune & 0x3F);
407     Out.push_back(FirstByte);
408     Out.push_back(SecondByte);
409     Out.push_back(ThirdByte);
410     Out.push_back(FourthByte);
411   } else {
412     llvm_unreachable("Invalid codepoint");
413   }
414 }
415 
416 // Parse a UTF-16 \uNNNN escape sequence. "\u" has already been consumed.
417 // May parse several sequential escapes to ensure proper surrogate handling.
418 // We do not use ConvertUTF.h, it can't accept and replace unpaired surrogates.
419 // These are invalid Unicode but valid JSON (RFC 8259, section 8.2).
420 bool Parser::parseUnicode(std::string &Out) {
421   // Invalid UTF is not a JSON error (RFC 8529§8.2). It gets replaced by U+FFFD.
422   auto Invalid = [&] { Out.append(/* UTF-8 */ {'\xef', '\xbf', '\xbd'}); };
423   // Decodes 4 hex digits from the stream into Out, returns false on error.
424   auto Parse4Hex = [this](uint16_t &Out) -> bool {
425     Out = 0;
426     char Bytes[] = {next(), next(), next(), next()};
427     for (unsigned char C : Bytes) {
428       if (!std::isxdigit(C))
429         return parseError("Invalid \\u escape sequence");
430       Out <<= 4;
431       Out |= (C > '9') ? (C & ~0x20) - 'A' + 10 : (C - '0');
432     }
433     return true;
434   };
435   uint16_t First; // UTF-16 code unit from the first \u escape.
436   if (!Parse4Hex(First))
437     return false;
438 
439   // We loop to allow proper surrogate-pair error handling.
440   while (true) {
441     // Case 1: the UTF-16 code unit is already a codepoint in the BMP.
442     if (LLVM_LIKELY(First < 0xD800 || First >= 0xE000)) {
443       encodeUtf8(First, Out);
444       return true;
445     }
446 
447     // Case 2: it's an (unpaired) trailing surrogate.
448     if (LLVM_UNLIKELY(First >= 0xDC00)) {
449       Invalid();
450       return true;
451     }
452 
453     // Case 3: it's a leading surrogate. We expect a trailing one next.
454     // Case 3a: there's no trailing \u escape. Don't advance in the stream.
455     if (!LLVM_LIKELY(P + 2 <= End && *P == '\\' && *(P + 1) == 'u')) {
456       Invalid(); // Leading surrogate was unpaired.
457       return true;
458     }
459     P += 2;
460     uint16_t Second;
461     if (!Parse4Hex(Second))
462       return false;
463     // Case 3b: there was another \u escape, but it wasn't a trailing surrogate.
464     if (LLVM_UNLIKELY(Second < 0xDC00 || Second >= 0xE000)) {
465       Invalid();      // Leading surrogate was unpaired.
466       First = Second; // Second escape still needs to be processed.
467       continue;
468     }
469     // Case 3c: a valid surrogate pair encoding an astral codepoint.
470     encodeUtf8(0x10000 | ((First - 0xD800) << 10) | (Second - 0xDC00), Out);
471     return true;
472   }
473 }
474 
475 bool Parser::parseError(const char *Msg) {
476   int Line = 1;
477   const char *StartOfLine = Start;
478   for (const char *X = Start; X < P; ++X) {
479     if (*X == 0x0A) {
480       ++Line;
481       StartOfLine = X + 1;
482     }
483   }
484   Err.emplace(
485       llvm::make_unique<ParseError>(Msg, Line, P - StartOfLine, P - Start));
486   return false;
487 }
488 } // namespace
489 
490 Expected<Value> parse(StringRef JSON) {
491   Parser P(JSON);
492   Value E = nullptr;
493   if (P.parseValue(E))
494     if (P.assertEnd())
495       return std::move(E);
496   return P.takeError();
497 }
498 char ParseError::ID = 0;
499 
500 static std::vector<const Object::value_type *> sortedElements(const Object &O) {
501   std::vector<const Object::value_type *> Elements;
502   for (const auto &E : O)
503     Elements.push_back(&E);
504   llvm::sort(Elements.begin(), Elements.end(),
505              [](const Object::value_type *L, const Object::value_type *R) {
506                return L->first < R->first;
507              });
508   return Elements;
509 }
510 
511 } // namespace json
512 } // namespace llvm
513 
514 static void quote(llvm::raw_ostream &OS, llvm::StringRef S) {
515   OS << '\"';
516   for (unsigned char C : S) {
517     if (C == 0x22 || C == 0x5C)
518       OS << '\\';
519     if (C >= 0x20) {
520       OS << C;
521       continue;
522     }
523     OS << '\\';
524     switch (C) {
525     // A few characters are common enough to make short escapes worthwhile.
526     case '\t':
527       OS << 't';
528       break;
529     case '\n':
530       OS << 'n';
531       break;
532     case '\r':
533       OS << 'r';
534       break;
535     default:
536       OS << 'u';
537       llvm::write_hex(OS, C, llvm::HexPrintStyle::Lower, 4);
538       break;
539     }
540   }
541   OS << '\"';
542 }
543 
544 enum IndenterAction {
545   Indent,
546   Outdent,
547   Newline,
548   Space,
549 };
550 
551 // Prints JSON. The indenter can be used to control formatting.
552 template <typename Indenter>
553 void llvm::json::Value::print(raw_ostream &OS, const Indenter &I) const {
554   switch (Type) {
555   case T_Null:
556     OS << "null";
557     break;
558   case T_Boolean:
559     OS << (as<bool>() ? "true" : "false");
560     break;
561   case T_Number:
562     OS << format("%g", as<double>());
563     break;
564   case T_StringRef:
565     quote(OS, as<StringRef>());
566     break;
567   case T_String:
568     quote(OS, as<std::string>());
569     break;
570   case T_Object: {
571     bool Comma = false;
572     OS << '{';
573     I(Indent);
574     for (const auto *P : sortedElements(as<json::Object>())) {
575       if (Comma)
576         OS << ',';
577       Comma = true;
578       I(Newline);
579       quote(OS, P->first);
580       OS << ':';
581       I(Space);
582       P->second.print(OS, I);
583     }
584     I(Outdent);
585     if (Comma)
586       I(Newline);
587     OS << '}';
588     break;
589   }
590   case T_Array: {
591     bool Comma = false;
592     OS << '[';
593     I(Indent);
594     for (const auto &E : as<json::Array>()) {
595       if (Comma)
596         OS << ',';
597       Comma = true;
598       I(Newline);
599       E.print(OS, I);
600     }
601     I(Outdent);
602     if (Comma)
603       I(Newline);
604     OS << ']';
605     break;
606   }
607   }
608 }
609 
610 void llvm::format_provider<llvm::json::Value>::format(
611     const llvm::json::Value &E, raw_ostream &OS, StringRef Options) {
612   if (Options.empty()) {
613     OS << E;
614     return;
615   }
616   unsigned IndentAmount = 0;
617   if (Options.getAsInteger(/*Radix=*/10, IndentAmount))
618     llvm_unreachable("json::Value format options should be an integer");
619   unsigned IndentLevel = 0;
620   E.print(OS, [&](IndenterAction A) {
621     switch (A) {
622     case Newline:
623       OS << '\n';
624       OS.indent(IndentLevel);
625       break;
626     case Space:
627       OS << ' ';
628       break;
629     case Indent:
630       IndentLevel += IndentAmount;
631       break;
632     case Outdent:
633       IndentLevel -= IndentAmount;
634       break;
635     };
636   });
637 }
638 
639 llvm::raw_ostream &llvm::json::operator<<(raw_ostream &OS, const Value &E) {
640   E.print(OS, [](IndenterAction A) { /*ignore*/ });
641   return OS;
642 }
643