//=== JSON.cpp - JSON value, parsing and serialization - C++ -----------*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===---------------------------------------------------------------------===//
86be38247SSam McCall 
#include "llvm/Support/JSON.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/Format.h"
#include <cctype>
#include <cerrno>
136be38247SSam McCall 
146be38247SSam McCall namespace llvm {
156be38247SSam McCall namespace json {
166be38247SSam McCall 
176be38247SSam McCall Value &Object::operator[](const ObjectKey &K) {
186be38247SSam McCall   return try_emplace(K, nullptr).first->getSecond();
196be38247SSam McCall }
206be38247SSam McCall Value &Object::operator[](ObjectKey &&K) {
216be38247SSam McCall   return try_emplace(std::move(K), nullptr).first->getSecond();
226be38247SSam McCall }
236be38247SSam McCall Value *Object::get(StringRef K) {
246be38247SSam McCall   auto I = find(K);
256be38247SSam McCall   if (I == end())
266be38247SSam McCall     return nullptr;
276be38247SSam McCall   return &I->second;
286be38247SSam McCall }
296be38247SSam McCall const Value *Object::get(StringRef K) const {
306be38247SSam McCall   auto I = find(K);
316be38247SSam McCall   if (I == end())
326be38247SSam McCall     return nullptr;
336be38247SSam McCall   return &I->second;
346be38247SSam McCall }
356be38247SSam McCall llvm::Optional<std::nullptr_t> Object::getNull(StringRef K) const {
366be38247SSam McCall   if (auto *V = get(K))
376be38247SSam McCall     return V->getAsNull();
386be38247SSam McCall   return llvm::None;
396be38247SSam McCall }
406be38247SSam McCall llvm::Optional<bool> Object::getBoolean(StringRef K) const {
416be38247SSam McCall   if (auto *V = get(K))
426be38247SSam McCall     return V->getAsBoolean();
436be38247SSam McCall   return llvm::None;
446be38247SSam McCall }
456be38247SSam McCall llvm::Optional<double> Object::getNumber(StringRef K) const {
466be38247SSam McCall   if (auto *V = get(K))
476be38247SSam McCall     return V->getAsNumber();
486be38247SSam McCall   return llvm::None;
496be38247SSam McCall }
506be38247SSam McCall llvm::Optional<int64_t> Object::getInteger(StringRef K) const {
516be38247SSam McCall   if (auto *V = get(K))
526be38247SSam McCall     return V->getAsInteger();
536be38247SSam McCall   return llvm::None;
546be38247SSam McCall }
556be38247SSam McCall llvm::Optional<llvm::StringRef> Object::getString(StringRef K) const {
566be38247SSam McCall   if (auto *V = get(K))
576be38247SSam McCall     return V->getAsString();
586be38247SSam McCall   return llvm::None;
596be38247SSam McCall }
606be38247SSam McCall const json::Object *Object::getObject(StringRef K) const {
616be38247SSam McCall   if (auto *V = get(K))
626be38247SSam McCall     return V->getAsObject();
636be38247SSam McCall   return nullptr;
646be38247SSam McCall }
656be38247SSam McCall json::Object *Object::getObject(StringRef K) {
666be38247SSam McCall   if (auto *V = get(K))
676be38247SSam McCall     return V->getAsObject();
686be38247SSam McCall   return nullptr;
696be38247SSam McCall }
706be38247SSam McCall const json::Array *Object::getArray(StringRef K) const {
716be38247SSam McCall   if (auto *V = get(K))
726be38247SSam McCall     return V->getAsArray();
736be38247SSam McCall   return nullptr;
746be38247SSam McCall }
756be38247SSam McCall json::Array *Object::getArray(StringRef K) {
766be38247SSam McCall   if (auto *V = get(K))
776be38247SSam McCall     return V->getAsArray();
786be38247SSam McCall   return nullptr;
796be38247SSam McCall }
806be38247SSam McCall bool operator==(const Object &LHS, const Object &RHS) {
816be38247SSam McCall   if (LHS.size() != RHS.size())
826be38247SSam McCall     return false;
836be38247SSam McCall   for (const auto &L : LHS) {
846be38247SSam McCall     auto R = RHS.find(L.first);
856be38247SSam McCall     if (R == RHS.end() || L.second != R->second)
866be38247SSam McCall       return false;
876be38247SSam McCall   }
886be38247SSam McCall   return true;
896be38247SSam McCall }
906be38247SSam McCall 
916be38247SSam McCall Array::Array(std::initializer_list<Value> Elements) {
926be38247SSam McCall   V.reserve(Elements.size());
936be38247SSam McCall   for (const Value &V : Elements) {
946be38247SSam McCall     emplace_back(nullptr);
956be38247SSam McCall     back().moveFrom(std::move(V));
966be38247SSam McCall   }
976be38247SSam McCall }
986be38247SSam McCall 
996be38247SSam McCall Value::Value(std::initializer_list<Value> Elements)
1006be38247SSam McCall     : Value(json::Array(Elements)) {}
1016be38247SSam McCall 
1026be38247SSam McCall void Value::copyFrom(const Value &M) {
1036be38247SSam McCall   Type = M.Type;
1046be38247SSam McCall   switch (Type) {
1056be38247SSam McCall   case T_Null:
1066be38247SSam McCall   case T_Boolean:
107d93eaeb7SSam McCall   case T_Double:
108d93eaeb7SSam McCall   case T_Integer:
1096be38247SSam McCall     memcpy(Union.buffer, M.Union.buffer, sizeof(Union.buffer));
1106be38247SSam McCall     break;
1116be38247SSam McCall   case T_StringRef:
1126be38247SSam McCall     create<StringRef>(M.as<StringRef>());
1136be38247SSam McCall     break;
1146be38247SSam McCall   case T_String:
1156be38247SSam McCall     create<std::string>(M.as<std::string>());
1166be38247SSam McCall     break;
1176be38247SSam McCall   case T_Object:
1186be38247SSam McCall     create<json::Object>(M.as<json::Object>());
1196be38247SSam McCall     break;
1206be38247SSam McCall   case T_Array:
1216be38247SSam McCall     create<json::Array>(M.as<json::Array>());
1226be38247SSam McCall     break;
1236be38247SSam McCall   }
1246be38247SSam McCall }
1256be38247SSam McCall 
1266be38247SSam McCall void Value::moveFrom(const Value &&M) {
1276be38247SSam McCall   Type = M.Type;
1286be38247SSam McCall   switch (Type) {
1296be38247SSam McCall   case T_Null:
1306be38247SSam McCall   case T_Boolean:
131d93eaeb7SSam McCall   case T_Double:
132d93eaeb7SSam McCall   case T_Integer:
1336be38247SSam McCall     memcpy(Union.buffer, M.Union.buffer, sizeof(Union.buffer));
1346be38247SSam McCall     break;
1356be38247SSam McCall   case T_StringRef:
1366be38247SSam McCall     create<StringRef>(M.as<StringRef>());
1376be38247SSam McCall     break;
1386be38247SSam McCall   case T_String:
1396be38247SSam McCall     create<std::string>(std::move(M.as<std::string>()));
1406be38247SSam McCall     M.Type = T_Null;
1416be38247SSam McCall     break;
1426be38247SSam McCall   case T_Object:
1436be38247SSam McCall     create<json::Object>(std::move(M.as<json::Object>()));
1446be38247SSam McCall     M.Type = T_Null;
1456be38247SSam McCall     break;
1466be38247SSam McCall   case T_Array:
1476be38247SSam McCall     create<json::Array>(std::move(M.as<json::Array>()));
1486be38247SSam McCall     M.Type = T_Null;
1496be38247SSam McCall     break;
1506be38247SSam McCall   }
1516be38247SSam McCall }
1526be38247SSam McCall 
1536be38247SSam McCall void Value::destroy() {
1546be38247SSam McCall   switch (Type) {
1556be38247SSam McCall   case T_Null:
1566be38247SSam McCall   case T_Boolean:
157d93eaeb7SSam McCall   case T_Double:
158d93eaeb7SSam McCall   case T_Integer:
1596be38247SSam McCall     break;
1606be38247SSam McCall   case T_StringRef:
1616be38247SSam McCall     as<StringRef>().~StringRef();
1626be38247SSam McCall     break;
1636be38247SSam McCall   case T_String:
1646be38247SSam McCall     as<std::string>().~basic_string();
1656be38247SSam McCall     break;
1666be38247SSam McCall   case T_Object:
1676be38247SSam McCall     as<json::Object>().~Object();
1686be38247SSam McCall     break;
1696be38247SSam McCall   case T_Array:
1706be38247SSam McCall     as<json::Array>().~Array();
1716be38247SSam McCall     break;
1726be38247SSam McCall   }
1736be38247SSam McCall }
1746be38247SSam McCall 
1756be38247SSam McCall bool operator==(const Value &L, const Value &R) {
1766be38247SSam McCall   if (L.kind() != R.kind())
1776be38247SSam McCall     return false;
1786be38247SSam McCall   switch (L.kind()) {
1796be38247SSam McCall   case Value::Null:
1806be38247SSam McCall     return *L.getAsNull() == *R.getAsNull();
1816be38247SSam McCall   case Value::Boolean:
1826be38247SSam McCall     return *L.getAsBoolean() == *R.getAsBoolean();
1836be38247SSam McCall   case Value::Number:
1841e7491eaSSam McCall     // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=323
1851e7491eaSSam McCall     // The same integer must convert to the same double, per the standard.
1861e7491eaSSam McCall     // However we see 64-vs-80-bit precision comparisons with gcc-7 -O3 -m32.
1871e7491eaSSam McCall     // So we avoid floating point promotion for exact comparisons.
1881e7491eaSSam McCall     if (L.Type == Value::T_Integer || R.Type == Value::T_Integer)
1891e7491eaSSam McCall       return L.getAsInteger() == R.getAsInteger();
1906be38247SSam McCall     return *L.getAsNumber() == *R.getAsNumber();
1916be38247SSam McCall   case Value::String:
1926be38247SSam McCall     return *L.getAsString() == *R.getAsString();
1936be38247SSam McCall   case Value::Array:
1946be38247SSam McCall     return *L.getAsArray() == *R.getAsArray();
1956be38247SSam McCall   case Value::Object:
1966be38247SSam McCall     return *L.getAsObject() == *R.getAsObject();
1976be38247SSam McCall   }
1986be38247SSam McCall   llvm_unreachable("Unknown value kind");
1996be38247SSam McCall }
2006be38247SSam McCall 
2016be38247SSam McCall namespace {
2026be38247SSam McCall // Simple recursive-descent JSON parser.
2036be38247SSam McCall class Parser {
2046be38247SSam McCall public:
2056be38247SSam McCall   Parser(StringRef JSON)
2066be38247SSam McCall       : Start(JSON.begin()), P(JSON.begin()), End(JSON.end()) {}
2076be38247SSam McCall 
208e6057bc6SSam McCall   bool checkUTF8() {
209e6057bc6SSam McCall     size_t ErrOffset;
210e6057bc6SSam McCall     if (isUTF8(StringRef(Start, End - Start), &ErrOffset))
211e6057bc6SSam McCall       return true;
212e6057bc6SSam McCall     P = Start + ErrOffset; // For line/column calculation.
213e6057bc6SSam McCall     return parseError("Invalid UTF-8 sequence");
214e6057bc6SSam McCall   }
215e6057bc6SSam McCall 
2166be38247SSam McCall   bool parseValue(Value &Out);
2176be38247SSam McCall 
2186be38247SSam McCall   bool assertEnd() {
2196be38247SSam McCall     eatWhitespace();
2206be38247SSam McCall     if (P == End)
2216be38247SSam McCall       return true;
2226be38247SSam McCall     return parseError("Text after end of document");
2236be38247SSam McCall   }
2246be38247SSam McCall 
2256be38247SSam McCall   Error takeError() {
2266be38247SSam McCall     assert(Err);
2276be38247SSam McCall     return std::move(*Err);
2286be38247SSam McCall   }
2296be38247SSam McCall 
2306be38247SSam McCall private:
2316be38247SSam McCall   void eatWhitespace() {
2326be38247SSam McCall     while (P != End && (*P == ' ' || *P == '\r' || *P == '\n' || *P == '\t'))
2336be38247SSam McCall       ++P;
2346be38247SSam McCall   }
2356be38247SSam McCall 
2366be38247SSam McCall   // On invalid syntax, parseX() functions return false and set Err.
237d93eaeb7SSam McCall   bool parseNumber(char First, Value &Out);
2386be38247SSam McCall   bool parseString(std::string &Out);
2396be38247SSam McCall   bool parseUnicode(std::string &Out);
2406be38247SSam McCall   bool parseError(const char *Msg); // always returns false
2416be38247SSam McCall 
2426be38247SSam McCall   char next() { return P == End ? 0 : *P++; }
2436be38247SSam McCall   char peek() { return P == End ? 0 : *P; }
2446be38247SSam McCall   static bool isNumber(char C) {
2456be38247SSam McCall     return C == '0' || C == '1' || C == '2' || C == '3' || C == '4' ||
2466be38247SSam McCall            C == '5' || C == '6' || C == '7' || C == '8' || C == '9' ||
2476be38247SSam McCall            C == 'e' || C == 'E' || C == '+' || C == '-' || C == '.';
2486be38247SSam McCall   }
2496be38247SSam McCall 
2506be38247SSam McCall   Optional<Error> Err;
2516be38247SSam McCall   const char *Start, *P, *End;
2526be38247SSam McCall };
2536be38247SSam McCall 
2546be38247SSam McCall bool Parser::parseValue(Value &Out) {
2556be38247SSam McCall   eatWhitespace();
2566be38247SSam McCall   if (P == End)
2576be38247SSam McCall     return parseError("Unexpected EOF");
2586be38247SSam McCall   switch (char C = next()) {
2596be38247SSam McCall   // Bare null/true/false are easy - first char identifies them.
2606be38247SSam McCall   case 'n':
2616be38247SSam McCall     Out = nullptr;
2626be38247SSam McCall     return (next() == 'u' && next() == 'l' && next() == 'l') ||
2636be38247SSam McCall            parseError("Invalid JSON value (null?)");
2646be38247SSam McCall   case 't':
2656be38247SSam McCall     Out = true;
2666be38247SSam McCall     return (next() == 'r' && next() == 'u' && next() == 'e') ||
2676be38247SSam McCall            parseError("Invalid JSON value (true?)");
2686be38247SSam McCall   case 'f':
2696be38247SSam McCall     Out = false;
2706be38247SSam McCall     return (next() == 'a' && next() == 'l' && next() == 's' && next() == 'e') ||
2716be38247SSam McCall            parseError("Invalid JSON value (false?)");
2726be38247SSam McCall   case '"': {
2736be38247SSam McCall     std::string S;
2746be38247SSam McCall     if (parseString(S)) {
2756be38247SSam McCall       Out = std::move(S);
2766be38247SSam McCall       return true;
2776be38247SSam McCall     }
2786be38247SSam McCall     return false;
2796be38247SSam McCall   }
2806be38247SSam McCall   case '[': {
2816be38247SSam McCall     Out = Array{};
2826be38247SSam McCall     Array &A = *Out.getAsArray();
2836be38247SSam McCall     eatWhitespace();
2846be38247SSam McCall     if (peek() == ']') {
2856be38247SSam McCall       ++P;
2866be38247SSam McCall       return true;
2876be38247SSam McCall     }
2886be38247SSam McCall     for (;;) {
2896be38247SSam McCall       A.emplace_back(nullptr);
2906be38247SSam McCall       if (!parseValue(A.back()))
2916be38247SSam McCall         return false;
2926be38247SSam McCall       eatWhitespace();
2936be38247SSam McCall       switch (next()) {
2946be38247SSam McCall       case ',':
2956be38247SSam McCall         eatWhitespace();
2966be38247SSam McCall         continue;
2976be38247SSam McCall       case ']':
2986be38247SSam McCall         return true;
2996be38247SSam McCall       default:
3006be38247SSam McCall         return parseError("Expected , or ] after array element");
3016be38247SSam McCall       }
3026be38247SSam McCall     }
3036be38247SSam McCall   }
3046be38247SSam McCall   case '{': {
3056be38247SSam McCall     Out = Object{};
3066be38247SSam McCall     Object &O = *Out.getAsObject();
3076be38247SSam McCall     eatWhitespace();
3086be38247SSam McCall     if (peek() == '}') {
3096be38247SSam McCall       ++P;
3106be38247SSam McCall       return true;
3116be38247SSam McCall     }
3126be38247SSam McCall     for (;;) {
3136be38247SSam McCall       if (next() != '"')
3146be38247SSam McCall         return parseError("Expected object key");
3156be38247SSam McCall       std::string K;
3166be38247SSam McCall       if (!parseString(K))
3176be38247SSam McCall         return false;
3186be38247SSam McCall       eatWhitespace();
3196be38247SSam McCall       if (next() != ':')
3206be38247SSam McCall         return parseError("Expected : after object key");
3216be38247SSam McCall       eatWhitespace();
3226be38247SSam McCall       if (!parseValue(O[std::move(K)]))
3236be38247SSam McCall         return false;
3246be38247SSam McCall       eatWhitespace();
3256be38247SSam McCall       switch (next()) {
3266be38247SSam McCall       case ',':
3276be38247SSam McCall         eatWhitespace();
3286be38247SSam McCall         continue;
3296be38247SSam McCall       case '}':
3306be38247SSam McCall         return true;
3316be38247SSam McCall       default:
3326be38247SSam McCall         return parseError("Expected , or } after object property");
3336be38247SSam McCall       }
3346be38247SSam McCall     }
3356be38247SSam McCall   }
3366be38247SSam McCall   default:
337d93eaeb7SSam McCall     if (isNumber(C))
338d93eaeb7SSam McCall       return parseNumber(C, Out);
3396be38247SSam McCall     return parseError("Invalid JSON value");
3406be38247SSam McCall   }
3416be38247SSam McCall }
3426be38247SSam McCall 
343d93eaeb7SSam McCall bool Parser::parseNumber(char First, Value &Out) {
344d93eaeb7SSam McCall   // Read the number into a string. (Must be null-terminated for strto*).
3456be38247SSam McCall   SmallString<24> S;
3466be38247SSam McCall   S.push_back(First);
3476be38247SSam McCall   while (isNumber(peek()))
3486be38247SSam McCall     S.push_back(next());
3496be38247SSam McCall   char *End;
350d93eaeb7SSam McCall   // Try first to parse as integer, and if so preserve full 64 bits.
351d93eaeb7SSam McCall   // strtoll returns long long >= 64 bits, so check it's in range too.
352d93eaeb7SSam McCall   auto I = std::strtoll(S.c_str(), &End, 10);
353d93eaeb7SSam McCall   if (End == S.end() && I >= std::numeric_limits<int64_t>::min() &&
354d93eaeb7SSam McCall       I <= std::numeric_limits<int64_t>::max()) {
355d93eaeb7SSam McCall     Out = int64_t(I);
356d93eaeb7SSam McCall     return true;
357d93eaeb7SSam McCall   }
358d93eaeb7SSam McCall   // If it's not an integer
3596be38247SSam McCall   Out = std::strtod(S.c_str(), &End);
3606be38247SSam McCall   return End == S.end() || parseError("Invalid JSON value (number?)");
3616be38247SSam McCall }
3626be38247SSam McCall 
3636be38247SSam McCall bool Parser::parseString(std::string &Out) {
3646be38247SSam McCall   // leading quote was already consumed.
3656be38247SSam McCall   for (char C = next(); C != '"'; C = next()) {
3666be38247SSam McCall     if (LLVM_UNLIKELY(P == End))
3676be38247SSam McCall       return parseError("Unterminated string");
3686be38247SSam McCall     if (LLVM_UNLIKELY((C & 0x1f) == C))
3696be38247SSam McCall       return parseError("Control character in string");
3706be38247SSam McCall     if (LLVM_LIKELY(C != '\\')) {
3716be38247SSam McCall       Out.push_back(C);
3726be38247SSam McCall       continue;
3736be38247SSam McCall     }
3746be38247SSam McCall     // Handle escape sequence.
3756be38247SSam McCall     switch (C = next()) {
3766be38247SSam McCall     case '"':
3776be38247SSam McCall     case '\\':
3786be38247SSam McCall     case '/':
3796be38247SSam McCall       Out.push_back(C);
3806be38247SSam McCall       break;
3816be38247SSam McCall     case 'b':
3826be38247SSam McCall       Out.push_back('\b');
3836be38247SSam McCall       break;
3846be38247SSam McCall     case 'f':
3856be38247SSam McCall       Out.push_back('\f');
3866be38247SSam McCall       break;
3876be38247SSam McCall     case 'n':
3886be38247SSam McCall       Out.push_back('\n');
3896be38247SSam McCall       break;
3906be38247SSam McCall     case 'r':
3916be38247SSam McCall       Out.push_back('\r');
3926be38247SSam McCall       break;
3936be38247SSam McCall     case 't':
3946be38247SSam McCall       Out.push_back('\t');
3956be38247SSam McCall       break;
3966be38247SSam McCall     case 'u':
3976be38247SSam McCall       if (!parseUnicode(Out))
3986be38247SSam McCall         return false;
3996be38247SSam McCall       break;
4006be38247SSam McCall     default:
4016be38247SSam McCall       return parseError("Invalid escape sequence");
4026be38247SSam McCall     }
4036be38247SSam McCall   }
4046be38247SSam McCall   return true;
4056be38247SSam McCall }
4066be38247SSam McCall 
4076be38247SSam McCall static void encodeUtf8(uint32_t Rune, std::string &Out) {
4086be38247SSam McCall   if (Rune < 0x80) {
4096be38247SSam McCall     Out.push_back(Rune & 0x7F);
4106be38247SSam McCall   } else if (Rune < 0x800) {
4116be38247SSam McCall     uint8_t FirstByte = 0xC0 | ((Rune & 0x7C0) >> 6);
4126be38247SSam McCall     uint8_t SecondByte = 0x80 | (Rune & 0x3F);
4136be38247SSam McCall     Out.push_back(FirstByte);
4146be38247SSam McCall     Out.push_back(SecondByte);
4156be38247SSam McCall   } else if (Rune < 0x10000) {
4166be38247SSam McCall     uint8_t FirstByte = 0xE0 | ((Rune & 0xF000) >> 12);
4176be38247SSam McCall     uint8_t SecondByte = 0x80 | ((Rune & 0xFC0) >> 6);
4186be38247SSam McCall     uint8_t ThirdByte = 0x80 | (Rune & 0x3F);
4196be38247SSam McCall     Out.push_back(FirstByte);
4206be38247SSam McCall     Out.push_back(SecondByte);
4216be38247SSam McCall     Out.push_back(ThirdByte);
4226be38247SSam McCall   } else if (Rune < 0x110000) {
4236be38247SSam McCall     uint8_t FirstByte = 0xF0 | ((Rune & 0x1F0000) >> 18);
4246be38247SSam McCall     uint8_t SecondByte = 0x80 | ((Rune & 0x3F000) >> 12);
4256be38247SSam McCall     uint8_t ThirdByte = 0x80 | ((Rune & 0xFC0) >> 6);
4266be38247SSam McCall     uint8_t FourthByte = 0x80 | (Rune & 0x3F);
4276be38247SSam McCall     Out.push_back(FirstByte);
4286be38247SSam McCall     Out.push_back(SecondByte);
4296be38247SSam McCall     Out.push_back(ThirdByte);
4306be38247SSam McCall     Out.push_back(FourthByte);
4316be38247SSam McCall   } else {
4326be38247SSam McCall     llvm_unreachable("Invalid codepoint");
4336be38247SSam McCall   }
4346be38247SSam McCall }
4356be38247SSam McCall 
4366be38247SSam McCall // Parse a UTF-16 \uNNNN escape sequence. "\u" has already been consumed.
4376be38247SSam McCall // May parse several sequential escapes to ensure proper surrogate handling.
4386be38247SSam McCall // We do not use ConvertUTF.h, it can't accept and replace unpaired surrogates.
4396be38247SSam McCall // These are invalid Unicode but valid JSON (RFC 8259, section 8.2).
4406be38247SSam McCall bool Parser::parseUnicode(std::string &Out) {
4416be38247SSam McCall   // Invalid UTF is not a JSON error (RFC 8529§8.2). It gets replaced by U+FFFD.
4426be38247SSam McCall   auto Invalid = [&] { Out.append(/* UTF-8 */ {'\xef', '\xbf', '\xbd'}); };
4436be38247SSam McCall   // Decodes 4 hex digits from the stream into Out, returns false on error.
4446be38247SSam McCall   auto Parse4Hex = [this](uint16_t &Out) -> bool {
4456be38247SSam McCall     Out = 0;
4466be38247SSam McCall     char Bytes[] = {next(), next(), next(), next()};
4476be38247SSam McCall     for (unsigned char C : Bytes) {
4486be38247SSam McCall       if (!std::isxdigit(C))
4496be38247SSam McCall         return parseError("Invalid \\u escape sequence");
4506be38247SSam McCall       Out <<= 4;
4516be38247SSam McCall       Out |= (C > '9') ? (C & ~0x20) - 'A' + 10 : (C - '0');
4526be38247SSam McCall     }
4536be38247SSam McCall     return true;
4546be38247SSam McCall   };
4556be38247SSam McCall   uint16_t First; // UTF-16 code unit from the first \u escape.
4566be38247SSam McCall   if (!Parse4Hex(First))
4576be38247SSam McCall     return false;
4586be38247SSam McCall 
4596be38247SSam McCall   // We loop to allow proper surrogate-pair error handling.
4606be38247SSam McCall   while (true) {
4616be38247SSam McCall     // Case 1: the UTF-16 code unit is already a codepoint in the BMP.
4626be38247SSam McCall     if (LLVM_LIKELY(First < 0xD800 || First >= 0xE000)) {
4636be38247SSam McCall       encodeUtf8(First, Out);
4646be38247SSam McCall       return true;
4656be38247SSam McCall     }
4666be38247SSam McCall 
4676be38247SSam McCall     // Case 2: it's an (unpaired) trailing surrogate.
4686be38247SSam McCall     if (LLVM_UNLIKELY(First >= 0xDC00)) {
4696be38247SSam McCall       Invalid();
4706be38247SSam McCall       return true;
4716be38247SSam McCall     }
4726be38247SSam McCall 
4736be38247SSam McCall     // Case 3: it's a leading surrogate. We expect a trailing one next.
4746be38247SSam McCall     // Case 3a: there's no trailing \u escape. Don't advance in the stream.
475e6057bc6SSam McCall     if (LLVM_UNLIKELY(P + 2 > End || *P != '\\' || *(P + 1) != 'u')) {
4766be38247SSam McCall       Invalid(); // Leading surrogate was unpaired.
4776be38247SSam McCall       return true;
4786be38247SSam McCall     }
4796be38247SSam McCall     P += 2;
4806be38247SSam McCall     uint16_t Second;
4816be38247SSam McCall     if (!Parse4Hex(Second))
4826be38247SSam McCall       return false;
4836be38247SSam McCall     // Case 3b: there was another \u escape, but it wasn't a trailing surrogate.
4846be38247SSam McCall     if (LLVM_UNLIKELY(Second < 0xDC00 || Second >= 0xE000)) {
4856be38247SSam McCall       Invalid();      // Leading surrogate was unpaired.
4866be38247SSam McCall       First = Second; // Second escape still needs to be processed.
4876be38247SSam McCall       continue;
4886be38247SSam McCall     }
4896be38247SSam McCall     // Case 3c: a valid surrogate pair encoding an astral codepoint.
4906be38247SSam McCall     encodeUtf8(0x10000 | ((First - 0xD800) << 10) | (Second - 0xDC00), Out);
4916be38247SSam McCall     return true;
4926be38247SSam McCall   }
4936be38247SSam McCall }
4946be38247SSam McCall 
4956be38247SSam McCall bool Parser::parseError(const char *Msg) {
4966be38247SSam McCall   int Line = 1;
4976be38247SSam McCall   const char *StartOfLine = Start;
4986be38247SSam McCall   for (const char *X = Start; X < P; ++X) {
4996be38247SSam McCall     if (*X == 0x0A) {
5006be38247SSam McCall       ++Line;
5016be38247SSam McCall       StartOfLine = X + 1;
5026be38247SSam McCall     }
5036be38247SSam McCall   }
5046be38247SSam McCall   Err.emplace(
505*0eaee545SJonas Devlieghere       std::make_unique<ParseError>(Msg, Line, P - StartOfLine, P - Start));
5066be38247SSam McCall   return false;
5076be38247SSam McCall }
5086be38247SSam McCall } // namespace
5096be38247SSam McCall 
5106be38247SSam McCall Expected<Value> parse(StringRef JSON) {
5116be38247SSam McCall   Parser P(JSON);
5126be38247SSam McCall   Value E = nullptr;
513e6057bc6SSam McCall   if (P.checkUTF8())
5146be38247SSam McCall     if (P.parseValue(E))
5156be38247SSam McCall       if (P.assertEnd())
5166be38247SSam McCall         return std::move(E);
5176be38247SSam McCall   return P.takeError();
5186be38247SSam McCall }
5196be38247SSam McCall char ParseError::ID = 0;
5206be38247SSam McCall 
5216be38247SSam McCall static std::vector<const Object::value_type *> sortedElements(const Object &O) {
5226be38247SSam McCall   std::vector<const Object::value_type *> Elements;
5236be38247SSam McCall   for (const auto &E : O)
5246be38247SSam McCall     Elements.push_back(&E);
5250cac726aSFangrui Song   llvm::sort(Elements,
5266be38247SSam McCall              [](const Object::value_type *L, const Object::value_type *R) {
5276be38247SSam McCall                return L->first < R->first;
5286be38247SSam McCall              });
5296be38247SSam McCall   return Elements;
5306be38247SSam McCall }
5316be38247SSam McCall 
532e6057bc6SSam McCall bool isUTF8(llvm::StringRef S, size_t *ErrOffset) {
533e6057bc6SSam McCall   // Fast-path for ASCII, which is valid UTF-8.
534e6057bc6SSam McCall   if (LLVM_LIKELY(isASCII(S)))
535e6057bc6SSam McCall     return true;
536e6057bc6SSam McCall 
537e6057bc6SSam McCall   const UTF8 *Data = reinterpret_cast<const UTF8 *>(S.data()), *Rest = Data;
538e6057bc6SSam McCall   if (LLVM_LIKELY(isLegalUTF8String(&Rest, Data + S.size())))
539e6057bc6SSam McCall     return true;
540e6057bc6SSam McCall 
541e6057bc6SSam McCall   if (ErrOffset)
542e6057bc6SSam McCall     *ErrOffset = Rest - Data;
543e6057bc6SSam McCall   return false;
544e6057bc6SSam McCall }
545e6057bc6SSam McCall 
546e6057bc6SSam McCall std::string fixUTF8(llvm::StringRef S) {
547e6057bc6SSam McCall   // This isn't particularly efficient, but is only for error-recovery.
548e6057bc6SSam McCall   std::vector<UTF32> Codepoints(S.size()); // 1 codepoint per byte suffices.
549e6057bc6SSam McCall   const UTF8 *In8 = reinterpret_cast<const UTF8 *>(S.data());
550e6057bc6SSam McCall   UTF32 *Out32 = Codepoints.data();
551e6057bc6SSam McCall   ConvertUTF8toUTF32(&In8, In8 + S.size(), &Out32, Out32 + Codepoints.size(),
552e6057bc6SSam McCall                      lenientConversion);
553e6057bc6SSam McCall   Codepoints.resize(Out32 - Codepoints.data());
554e6057bc6SSam McCall   std::string Res(4 * Codepoints.size(), 0); // 4 bytes per codepoint suffice
555e6057bc6SSam McCall   const UTF32 *In32 = Codepoints.data();
556e6057bc6SSam McCall   UTF8 *Out8 = reinterpret_cast<UTF8 *>(&Res[0]);
557e6057bc6SSam McCall   ConvertUTF32toUTF8(&In32, In32 + Codepoints.size(), &Out8, Out8 + Res.size(),
558e6057bc6SSam McCall                      strictConversion);
559e6057bc6SSam McCall   Res.resize(reinterpret_cast<char *>(Out8) - Res.data());
560e6057bc6SSam McCall   return Res;
561e6057bc6SSam McCall }
562e6057bc6SSam McCall 
5636be38247SSam McCall static void quote(llvm::raw_ostream &OS, llvm::StringRef S) {
5646be38247SSam McCall   OS << '\"';
5656be38247SSam McCall   for (unsigned char C : S) {
5666be38247SSam McCall     if (C == 0x22 || C == 0x5C)
5676be38247SSam McCall       OS << '\\';
5686be38247SSam McCall     if (C >= 0x20) {
5696be38247SSam McCall       OS << C;
5706be38247SSam McCall       continue;
5716be38247SSam McCall     }
5726be38247SSam McCall     OS << '\\';
5736be38247SSam McCall     switch (C) {
5746be38247SSam McCall     // A few characters are common enough to make short escapes worthwhile.
5756be38247SSam McCall     case '\t':
5766be38247SSam McCall       OS << 't';
5776be38247SSam McCall       break;
5786be38247SSam McCall     case '\n':
5796be38247SSam McCall       OS << 'n';
5806be38247SSam McCall       break;
5816be38247SSam McCall     case '\r':
5826be38247SSam McCall       OS << 'r';
5836be38247SSam McCall       break;
5846be38247SSam McCall     default:
5856be38247SSam McCall       OS << 'u';
5866be38247SSam McCall       llvm::write_hex(OS, C, llvm::HexPrintStyle::Lower, 4);
5876be38247SSam McCall       break;
5886be38247SSam McCall     }
5896be38247SSam McCall   }
5906be38247SSam McCall   OS << '\"';
5916be38247SSam McCall }
5926be38247SSam McCall 
593a7edcfb5SSam McCall void llvm::json::OStream::value(const Value &V) {
594a7edcfb5SSam McCall   switch (V.kind()) {
595a7edcfb5SSam McCall   case Value::Null:
596a7edcfb5SSam McCall     valueBegin();
5976be38247SSam McCall     OS << "null";
598a7edcfb5SSam McCall     return;
599a7edcfb5SSam McCall   case Value::Boolean:
600a7edcfb5SSam McCall     valueBegin();
601a7edcfb5SSam McCall     OS << (*V.getAsBoolean() ? "true" : "false");
602a7edcfb5SSam McCall     return;
603a7edcfb5SSam McCall   case Value::Number:
604a7edcfb5SSam McCall     valueBegin();
605a7edcfb5SSam McCall     if (V.Type == Value::T_Integer)
606a7edcfb5SSam McCall       OS << *V.getAsInteger();
607a7edcfb5SSam McCall     else
608d93eaeb7SSam McCall       OS << format("%.*g", std::numeric_limits<double>::max_digits10,
609a7edcfb5SSam McCall                    *V.getAsNumber());
610a7edcfb5SSam McCall     return;
611a7edcfb5SSam McCall   case Value::String:
612a7edcfb5SSam McCall     valueBegin();
613a7edcfb5SSam McCall     quote(OS, *V.getAsString());
614a7edcfb5SSam McCall     return;
615a7edcfb5SSam McCall   case Value::Array:
616a7edcfb5SSam McCall     return array([&] {
617a7edcfb5SSam McCall       for (const Value &E : *V.getAsArray())
618a7edcfb5SSam McCall         value(E);
619a7edcfb5SSam McCall     });
620a7edcfb5SSam McCall   case Value::Object:
621a7edcfb5SSam McCall     return object([&] {
622a7edcfb5SSam McCall       for (const Object::value_type *E : sortedElements(*V.getAsObject()))
623a7edcfb5SSam McCall         attribute(E->first, E->second);
624a7edcfb5SSam McCall     });
625a7edcfb5SSam McCall   }
626a7edcfb5SSam McCall }
627a7edcfb5SSam McCall 
628a7edcfb5SSam McCall void llvm::json::OStream::valueBegin() {
629a7edcfb5SSam McCall   assert(Stack.back().Ctx != Object && "Only attributes allowed here");
630a7edcfb5SSam McCall   if (Stack.back().HasValue) {
631a7edcfb5SSam McCall     assert(Stack.back().Ctx != Singleton && "Only one value allowed here");
6326be38247SSam McCall     OS << ',';
6336be38247SSam McCall   }
634a7edcfb5SSam McCall   if (Stack.back().Ctx == Array)
635a7edcfb5SSam McCall     newline();
636a7edcfb5SSam McCall   Stack.back().HasValue = true;
6376be38247SSam McCall }
638a7edcfb5SSam McCall 
639a7edcfb5SSam McCall void llvm::json::OStream::newline() {
640a7edcfb5SSam McCall   if (IndentSize) {
641a7edcfb5SSam McCall     OS.write('\n');
642a7edcfb5SSam McCall     OS.indent(Indent);
643a7edcfb5SSam McCall   }
644a7edcfb5SSam McCall }
645a7edcfb5SSam McCall 
646a7edcfb5SSam McCall void llvm::json::OStream::arrayBegin() {
647a7edcfb5SSam McCall   valueBegin();
648a7edcfb5SSam McCall   Stack.emplace_back();
649a7edcfb5SSam McCall   Stack.back().Ctx = Array;
650a7edcfb5SSam McCall   Indent += IndentSize;
6516be38247SSam McCall   OS << '[';
6526be38247SSam McCall }
653a7edcfb5SSam McCall 
654a7edcfb5SSam McCall void llvm::json::OStream::arrayEnd() {
655a7edcfb5SSam McCall   assert(Stack.back().Ctx == Array);
656a7edcfb5SSam McCall   Indent -= IndentSize;
657a7edcfb5SSam McCall   if (Stack.back().HasValue)
658a7edcfb5SSam McCall     newline();
6596be38247SSam McCall   OS << ']';
660a7edcfb5SSam McCall   Stack.pop_back();
661a7edcfb5SSam McCall   assert(!Stack.empty());
6626be38247SSam McCall }
663a7edcfb5SSam McCall 
664a7edcfb5SSam McCall void llvm::json::OStream::objectBegin() {
665a7edcfb5SSam McCall   valueBegin();
666a7edcfb5SSam McCall   Stack.emplace_back();
667a7edcfb5SSam McCall   Stack.back().Ctx = Object;
668a7edcfb5SSam McCall   Indent += IndentSize;
669a7edcfb5SSam McCall   OS << '{';
6706be38247SSam McCall }
671a7edcfb5SSam McCall 
672a7edcfb5SSam McCall void llvm::json::OStream::objectEnd() {
673a7edcfb5SSam McCall   assert(Stack.back().Ctx == Object);
674a7edcfb5SSam McCall   Indent -= IndentSize;
675a7edcfb5SSam McCall   if (Stack.back().HasValue)
676a7edcfb5SSam McCall     newline();
677a7edcfb5SSam McCall   OS << '}';
678a7edcfb5SSam McCall   Stack.pop_back();
679a7edcfb5SSam McCall   assert(!Stack.empty());
6806be38247SSam McCall }
6816be38247SSam McCall 
682a7edcfb5SSam McCall void llvm::json::OStream::attributeBegin(llvm::StringRef Key) {
683a7edcfb5SSam McCall   assert(Stack.back().Ctx == Object);
684a7edcfb5SSam McCall   if (Stack.back().HasValue)
685a7edcfb5SSam McCall     OS << ',';
686a7edcfb5SSam McCall   newline();
687a7edcfb5SSam McCall   Stack.back().HasValue = true;
688a7edcfb5SSam McCall   Stack.emplace_back();
689a7edcfb5SSam McCall   Stack.back().Ctx = Singleton;
690a7edcfb5SSam McCall   if (LLVM_LIKELY(isUTF8(Key))) {
691a7edcfb5SSam McCall     quote(OS, Key);
692a7edcfb5SSam McCall   } else {
693a7edcfb5SSam McCall     assert(false && "Invalid UTF-8 in attribute key");
694a7edcfb5SSam McCall     quote(OS, fixUTF8(Key));
695a7edcfb5SSam McCall   }
696a7edcfb5SSam McCall   OS.write(':');
697a7edcfb5SSam McCall   if (IndentSize)
698a7edcfb5SSam McCall     OS.write(' ');
699a7edcfb5SSam McCall }
700a7edcfb5SSam McCall 
701a7edcfb5SSam McCall void llvm::json::OStream::attributeEnd() {
702a7edcfb5SSam McCall   assert(Stack.back().Ctx == Singleton);
703a7edcfb5SSam McCall   assert(Stack.back().HasValue && "Attribute must have a value");
704a7edcfb5SSam McCall   Stack.pop_back();
705a7edcfb5SSam McCall   assert(Stack.back().Ctx == Object);
706a7edcfb5SSam McCall }
707a7edcfb5SSam McCall 
708a7edcfb5SSam McCall } // namespace json
709a7edcfb5SSam McCall } // namespace llvm
710a7edcfb5SSam McCall 
7116be38247SSam McCall void llvm::format_provider<llvm::json::Value>::format(
7126be38247SSam McCall     const llvm::json::Value &E, raw_ostream &OS, StringRef Options) {
7136be38247SSam McCall   unsigned IndentAmount = 0;
714a7edcfb5SSam McCall   if (!Options.empty() && Options.getAsInteger(/*Radix=*/10, IndentAmount))
7156be38247SSam McCall     llvm_unreachable("json::Value format options should be an integer");
716a7edcfb5SSam McCall   json::OStream(OS, IndentAmount).value(E);
7176be38247SSam McCall }
7186be38247SSam McCall 
719