16be38247SSam McCall //=== JSON.cpp - JSON value, parsing and serialization - C++ -----------*-===// 26be38247SSam McCall // 32946cd70SChandler Carruth // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 42946cd70SChandler Carruth // See https://llvm.org/LICENSE.txt for license information. 52946cd70SChandler Carruth // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 66be38247SSam McCall // 76be38247SSam McCall //===---------------------------------------------------------------------===// 86be38247SSam McCall 96be38247SSam McCall #include "llvm/Support/JSON.h" 10e6057bc6SSam McCall #include "llvm/Support/ConvertUTF.h" 116be38247SSam McCall #include "llvm/Support/Format.h" 126be38247SSam McCall #include <cctype> 136be38247SSam McCall 146be38247SSam McCall namespace llvm { 156be38247SSam McCall namespace json { 166be38247SSam McCall 176be38247SSam McCall Value &Object::operator[](const ObjectKey &K) { 186be38247SSam McCall return try_emplace(K, nullptr).first->getSecond(); 196be38247SSam McCall } 206be38247SSam McCall Value &Object::operator[](ObjectKey &&K) { 216be38247SSam McCall return try_emplace(std::move(K), nullptr).first->getSecond(); 226be38247SSam McCall } 236be38247SSam McCall Value *Object::get(StringRef K) { 246be38247SSam McCall auto I = find(K); 256be38247SSam McCall if (I == end()) 266be38247SSam McCall return nullptr; 276be38247SSam McCall return &I->second; 286be38247SSam McCall } 296be38247SSam McCall const Value *Object::get(StringRef K) const { 306be38247SSam McCall auto I = find(K); 316be38247SSam McCall if (I == end()) 326be38247SSam McCall return nullptr; 336be38247SSam McCall return &I->second; 346be38247SSam McCall } 356be38247SSam McCall llvm::Optional<std::nullptr_t> Object::getNull(StringRef K) const { 366be38247SSam McCall if (auto *V = get(K)) 376be38247SSam McCall return V->getAsNull(); 386be38247SSam McCall return llvm::None; 396be38247SSam McCall } 406be38247SSam McCall llvm::Optional<bool> Object::getBoolean(StringRef K) const { 416be38247SSam McCall if (auto *V = get(K)) 426be38247SSam McCall return V->getAsBoolean(); 436be38247SSam McCall return llvm::None; 446be38247SSam McCall } 456be38247SSam McCall llvm::Optional<double> Object::getNumber(StringRef K) const { 466be38247SSam McCall if (auto *V = get(K)) 476be38247SSam McCall return V->getAsNumber(); 486be38247SSam McCall return llvm::None; 496be38247SSam McCall } 506be38247SSam McCall llvm::Optional<int64_t> Object::getInteger(StringRef K) const { 516be38247SSam McCall if (auto *V = get(K)) 526be38247SSam McCall return V->getAsInteger(); 536be38247SSam McCall return llvm::None; 546be38247SSam McCall } 556be38247SSam McCall llvm::Optional<llvm::StringRef> Object::getString(StringRef K) const { 566be38247SSam McCall if (auto *V = get(K)) 576be38247SSam McCall return V->getAsString(); 586be38247SSam McCall return llvm::None; 596be38247SSam McCall } 606be38247SSam McCall const json::Object *Object::getObject(StringRef K) const { 616be38247SSam McCall if (auto *V = get(K)) 626be38247SSam McCall return V->getAsObject(); 636be38247SSam McCall return nullptr; 646be38247SSam McCall } 656be38247SSam McCall json::Object *Object::getObject(StringRef K) { 666be38247SSam McCall if (auto *V = get(K)) 676be38247SSam McCall return V->getAsObject(); 686be38247SSam McCall return nullptr; 696be38247SSam McCall } 706be38247SSam McCall const json::Array *Object::getArray(StringRef K) const { 716be38247SSam McCall if (auto *V = get(K)) 726be38247SSam McCall return V->getAsArray(); 736be38247SSam McCall return nullptr; 746be38247SSam McCall } 756be38247SSam McCall json::Array *Object::getArray(StringRef K) { 766be38247SSam McCall if (auto *V = get(K)) 776be38247SSam McCall return V->getAsArray(); 786be38247SSam McCall return nullptr; 796be38247SSam McCall } 806be38247SSam McCall bool operator==(const Object &LHS, const Object &RHS) { 816be38247SSam McCall if (LHS.size() != RHS.size()) 826be38247SSam McCall return false; 836be38247SSam McCall for (const auto &L : LHS) { 846be38247SSam McCall auto R = RHS.find(L.first); 856be38247SSam McCall if (R == RHS.end() || L.second != R->second) 866be38247SSam McCall return false; 876be38247SSam McCall } 886be38247SSam McCall return true; 896be38247SSam McCall } 906be38247SSam McCall 916be38247SSam McCall Array::Array(std::initializer_list<Value> Elements) { 926be38247SSam McCall V.reserve(Elements.size()); 936be38247SSam McCall for (const Value &V : Elements) { 946be38247SSam McCall emplace_back(nullptr); 956be38247SSam McCall back().moveFrom(std::move(V)); 966be38247SSam McCall } 976be38247SSam McCall } 986be38247SSam McCall 996be38247SSam McCall Value::Value(std::initializer_list<Value> Elements) 1006be38247SSam McCall : Value(json::Array(Elements)) {} 1016be38247SSam McCall 1026be38247SSam McCall void Value::copyFrom(const Value &M) { 1036be38247SSam McCall Type = M.Type; 1046be38247SSam McCall switch (Type) { 1056be38247SSam McCall case T_Null: 1066be38247SSam McCall case T_Boolean: 107d93eaeb7SSam McCall case T_Double: 108d93eaeb7SSam McCall case T_Integer: 1096be38247SSam McCall memcpy(Union.buffer, M.Union.buffer, sizeof(Union.buffer)); 1106be38247SSam McCall break; 1116be38247SSam McCall case T_StringRef: 1126be38247SSam McCall create<StringRef>(M.as<StringRef>()); 1136be38247SSam McCall break; 1146be38247SSam McCall case T_String: 1156be38247SSam McCall create<std::string>(M.as<std::string>()); 1166be38247SSam McCall break; 1176be38247SSam McCall case T_Object: 1186be38247SSam McCall create<json::Object>(M.as<json::Object>()); 1196be38247SSam McCall break; 1206be38247SSam McCall case T_Array: 1216be38247SSam McCall create<json::Array>(M.as<json::Array>()); 1226be38247SSam McCall break; 1236be38247SSam McCall } 1246be38247SSam McCall } 1256be38247SSam McCall 1266be38247SSam McCall void Value::moveFrom(const Value &&M) { 1276be38247SSam McCall Type = M.Type; 1286be38247SSam McCall switch (Type) { 1296be38247SSam McCall case T_Null: 1306be38247SSam McCall case T_Boolean: 131d93eaeb7SSam McCall case T_Double: 132d93eaeb7SSam McCall case T_Integer: 1336be38247SSam McCall memcpy(Union.buffer, M.Union.buffer, sizeof(Union.buffer)); 1346be38247SSam McCall break; 1356be38247SSam McCall case T_StringRef: 1366be38247SSam McCall create<StringRef>(M.as<StringRef>()); 1376be38247SSam McCall break; 1386be38247SSam McCall case T_String: 1396be38247SSam McCall create<std::string>(std::move(M.as<std::string>())); 1406be38247SSam McCall M.Type = T_Null; 1416be38247SSam McCall break; 1426be38247SSam McCall case T_Object: 1436be38247SSam McCall create<json::Object>(std::move(M.as<json::Object>())); 1446be38247SSam McCall M.Type = T_Null; 1456be38247SSam McCall break; 1466be38247SSam McCall case T_Array: 1476be38247SSam McCall create<json::Array>(std::move(M.as<json::Array>())); 1486be38247SSam McCall M.Type = T_Null; 1496be38247SSam McCall break; 1506be38247SSam McCall } 1516be38247SSam McCall } 1526be38247SSam McCall 1536be38247SSam McCall void Value::destroy() { 1546be38247SSam McCall switch (Type) { 1556be38247SSam McCall case T_Null: 1566be38247SSam McCall case T_Boolean: 157d93eaeb7SSam McCall case T_Double: 158d93eaeb7SSam McCall case T_Integer: 1596be38247SSam McCall break; 1606be38247SSam McCall case T_StringRef: 1616be38247SSam McCall as<StringRef>().~StringRef(); 1626be38247SSam McCall break; 1636be38247SSam McCall case T_String: 1646be38247SSam McCall as<std::string>().~basic_string(); 1656be38247SSam McCall break; 1666be38247SSam McCall case T_Object: 1676be38247SSam McCall as<json::Object>().~Object(); 1686be38247SSam McCall break; 1696be38247SSam McCall case T_Array: 1706be38247SSam McCall as<json::Array>().~Array(); 1716be38247SSam McCall break; 1726be38247SSam McCall } 1736be38247SSam McCall } 1746be38247SSam McCall 1756be38247SSam McCall bool operator==(const Value &L, const Value &R) { 1766be38247SSam McCall if (L.kind() != R.kind()) 1776be38247SSam McCall return false; 1786be38247SSam McCall switch (L.kind()) { 1796be38247SSam McCall case Value::Null: 1806be38247SSam McCall return *L.getAsNull() == *R.getAsNull(); 1816be38247SSam McCall case Value::Boolean: 1826be38247SSam McCall return *L.getAsBoolean() == *R.getAsBoolean(); 1836be38247SSam McCall case Value::Number: 1841e7491eaSSam McCall // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=323 1851e7491eaSSam McCall // The same integer must convert to the same double, per the standard. 1861e7491eaSSam McCall // However we see 64-vs-80-bit precision comparisons with gcc-7 -O3 -m32. 1871e7491eaSSam McCall // So we avoid floating point promotion for exact comparisons. 1881e7491eaSSam McCall if (L.Type == Value::T_Integer || R.Type == Value::T_Integer) 1891e7491eaSSam McCall return L.getAsInteger() == R.getAsInteger(); 1906be38247SSam McCall return *L.getAsNumber() == *R.getAsNumber(); 1916be38247SSam McCall case Value::String: 1926be38247SSam McCall return *L.getAsString() == *R.getAsString(); 1936be38247SSam McCall case Value::Array: 1946be38247SSam McCall return *L.getAsArray() == *R.getAsArray(); 1956be38247SSam McCall case Value::Object: 1966be38247SSam McCall return *L.getAsObject() == *R.getAsObject(); 1976be38247SSam McCall } 1986be38247SSam McCall llvm_unreachable("Unknown value kind"); 1996be38247SSam McCall } 2006be38247SSam McCall 2016be38247SSam McCall namespace { 2026be38247SSam McCall // Simple recursive-descent JSON parser. 2036be38247SSam McCall class Parser { 2046be38247SSam McCall public: 2056be38247SSam McCall Parser(StringRef JSON) 2066be38247SSam McCall : Start(JSON.begin()), P(JSON.begin()), End(JSON.end()) {} 2076be38247SSam McCall 208e6057bc6SSam McCall bool checkUTF8() { 209e6057bc6SSam McCall size_t ErrOffset; 210e6057bc6SSam McCall if (isUTF8(StringRef(Start, End - Start), &ErrOffset)) 211e6057bc6SSam McCall return true; 212e6057bc6SSam McCall P = Start + ErrOffset; // For line/column calculation. 213e6057bc6SSam McCall return parseError("Invalid UTF-8 sequence"); 214e6057bc6SSam McCall } 215e6057bc6SSam McCall 2166be38247SSam McCall bool parseValue(Value &Out); 2176be38247SSam McCall 2186be38247SSam McCall bool assertEnd() { 2196be38247SSam McCall eatWhitespace(); 2206be38247SSam McCall if (P == End) 2216be38247SSam McCall return true; 2226be38247SSam McCall return parseError("Text after end of document"); 2236be38247SSam McCall } 2246be38247SSam McCall 2256be38247SSam McCall Error takeError() { 2266be38247SSam McCall assert(Err); 2276be38247SSam McCall return std::move(*Err); 2286be38247SSam McCall } 2296be38247SSam McCall 2306be38247SSam McCall private: 2316be38247SSam McCall void eatWhitespace() { 2326be38247SSam McCall while (P != End && (*P == ' ' || *P == '\r' || *P == '\n' || *P == '\t')) 2336be38247SSam McCall ++P; 2346be38247SSam McCall } 2356be38247SSam McCall 2366be38247SSam McCall // On invalid syntax, parseX() functions return false and set Err. 237d93eaeb7SSam McCall bool parseNumber(char First, Value &Out); 2386be38247SSam McCall bool parseString(std::string &Out); 2396be38247SSam McCall bool parseUnicode(std::string &Out); 2406be38247SSam McCall bool parseError(const char *Msg); // always returns false 2416be38247SSam McCall 2426be38247SSam McCall char next() { return P == End ? 0 : *P++; } 2436be38247SSam McCall char peek() { return P == End ? 0 : *P; } 2446be38247SSam McCall static bool isNumber(char C) { 2456be38247SSam McCall return C == '0' || C == '1' || C == '2' || C == '3' || C == '4' || 2466be38247SSam McCall C == '5' || C == '6' || C == '7' || C == '8' || C == '9' || 2476be38247SSam McCall C == 'e' || C == 'E' || C == '+' || C == '-' || C == '.'; 2486be38247SSam McCall } 2496be38247SSam McCall 2506be38247SSam McCall Optional<Error> Err; 2516be38247SSam McCall const char *Start, *P, *End; 2526be38247SSam McCall }; 2536be38247SSam McCall 2546be38247SSam McCall bool Parser::parseValue(Value &Out) { 2556be38247SSam McCall eatWhitespace(); 2566be38247SSam McCall if (P == End) 2576be38247SSam McCall return parseError("Unexpected EOF"); 2586be38247SSam McCall switch (char C = next()) { 2596be38247SSam McCall // Bare null/true/false are easy - first char identifies them. 2606be38247SSam McCall case 'n': 2616be38247SSam McCall Out = nullptr; 2626be38247SSam McCall return (next() == 'u' && next() == 'l' && next() == 'l') || 2636be38247SSam McCall parseError("Invalid JSON value (null?)"); 2646be38247SSam McCall case 't': 2656be38247SSam McCall Out = true; 2666be38247SSam McCall return (next() == 'r' && next() == 'u' && next() == 'e') || 2676be38247SSam McCall parseError("Invalid JSON value (true?)"); 2686be38247SSam McCall case 'f': 2696be38247SSam McCall Out = false; 2706be38247SSam McCall return (next() == 'a' && next() == 'l' && next() == 's' && next() == 'e') || 2716be38247SSam McCall parseError("Invalid JSON value (false?)"); 2726be38247SSam McCall case '"': { 2736be38247SSam McCall std::string S; 2746be38247SSam McCall if (parseString(S)) { 2756be38247SSam McCall Out = std::move(S); 2766be38247SSam McCall return true; 2776be38247SSam McCall } 2786be38247SSam McCall return false; 2796be38247SSam McCall } 2806be38247SSam McCall case '[': { 2816be38247SSam McCall Out = Array{}; 2826be38247SSam McCall Array &A = *Out.getAsArray(); 2836be38247SSam McCall eatWhitespace(); 2846be38247SSam McCall if (peek() == ']') { 2856be38247SSam McCall ++P; 2866be38247SSam McCall return true; 2876be38247SSam McCall } 2886be38247SSam McCall for (;;) { 2896be38247SSam McCall A.emplace_back(nullptr); 2906be38247SSam McCall if (!parseValue(A.back())) 2916be38247SSam McCall return false; 2926be38247SSam McCall eatWhitespace(); 2936be38247SSam McCall switch (next()) { 2946be38247SSam McCall case ',': 2956be38247SSam McCall eatWhitespace(); 2966be38247SSam McCall continue; 2976be38247SSam McCall case ']': 2986be38247SSam McCall return true; 2996be38247SSam McCall default: 3006be38247SSam McCall return parseError("Expected , or ] after array element"); 3016be38247SSam McCall } 3026be38247SSam McCall } 3036be38247SSam McCall } 3046be38247SSam McCall case '{': { 3056be38247SSam McCall Out = Object{}; 3066be38247SSam McCall Object &O = *Out.getAsObject(); 3076be38247SSam McCall eatWhitespace(); 3086be38247SSam McCall if (peek() == '}') { 3096be38247SSam McCall ++P; 3106be38247SSam McCall return true; 3116be38247SSam McCall } 3126be38247SSam McCall for (;;) { 3136be38247SSam McCall if (next() != '"') 3146be38247SSam McCall return parseError("Expected object key"); 3156be38247SSam McCall std::string K; 3166be38247SSam McCall if (!parseString(K)) 3176be38247SSam McCall return false; 3186be38247SSam McCall eatWhitespace(); 3196be38247SSam McCall if (next() != ':') 3206be38247SSam McCall return parseError("Expected : after object key"); 3216be38247SSam McCall eatWhitespace(); 3226be38247SSam McCall if (!parseValue(O[std::move(K)])) 3236be38247SSam McCall return false; 3246be38247SSam McCall eatWhitespace(); 3256be38247SSam McCall switch (next()) { 3266be38247SSam McCall case ',': 3276be38247SSam McCall eatWhitespace(); 3286be38247SSam McCall continue; 3296be38247SSam McCall case '}': 3306be38247SSam McCall return true; 3316be38247SSam McCall default: 3326be38247SSam McCall return parseError("Expected , or } after object property"); 3336be38247SSam McCall } 3346be38247SSam McCall } 3356be38247SSam McCall } 3366be38247SSam McCall default: 337d93eaeb7SSam McCall if (isNumber(C)) 338d93eaeb7SSam McCall return parseNumber(C, Out); 3396be38247SSam McCall return parseError("Invalid JSON value"); 3406be38247SSam McCall } 3416be38247SSam McCall } 3426be38247SSam McCall 343d93eaeb7SSam McCall bool Parser::parseNumber(char First, Value &Out) { 344d93eaeb7SSam McCall // Read the number into a string. (Must be null-terminated for strto*). 3456be38247SSam McCall SmallString<24> S; 3466be38247SSam McCall S.push_back(First); 3476be38247SSam McCall while (isNumber(peek())) 3486be38247SSam McCall S.push_back(next()); 3496be38247SSam McCall char *End; 350d93eaeb7SSam McCall // Try first to parse as integer, and if so preserve full 64 bits. 351d93eaeb7SSam McCall // strtoll returns long long >= 64 bits, so check it's in range too. 352d93eaeb7SSam McCall auto I = std::strtoll(S.c_str(), &End, 10); 353d93eaeb7SSam McCall if (End == S.end() && I >= std::numeric_limits<int64_t>::min() && 354d93eaeb7SSam McCall I <= std::numeric_limits<int64_t>::max()) { 355d93eaeb7SSam McCall Out = int64_t(I); 356d93eaeb7SSam McCall return true; 357d93eaeb7SSam McCall } 358d93eaeb7SSam McCall // If it's not an integer 3596be38247SSam McCall Out = std::strtod(S.c_str(), &End); 3606be38247SSam McCall return End == S.end() || parseError("Invalid JSON value (number?)"); 3616be38247SSam McCall } 3626be38247SSam McCall 3636be38247SSam McCall bool Parser::parseString(std::string &Out) { 3646be38247SSam McCall // leading quote was already consumed. 3656be38247SSam McCall for (char C = next(); C != '"'; C = next()) { 3666be38247SSam McCall if (LLVM_UNLIKELY(P == End)) 3676be38247SSam McCall return parseError("Unterminated string"); 3686be38247SSam McCall if (LLVM_UNLIKELY((C & 0x1f) == C)) 3696be38247SSam McCall return parseError("Control character in string"); 3706be38247SSam McCall if (LLVM_LIKELY(C != '\\')) { 3716be38247SSam McCall Out.push_back(C); 3726be38247SSam McCall continue; 3736be38247SSam McCall } 3746be38247SSam McCall // Handle escape sequence. 3756be38247SSam McCall switch (C = next()) { 3766be38247SSam McCall case '"': 3776be38247SSam McCall case '\\': 3786be38247SSam McCall case '/': 3796be38247SSam McCall Out.push_back(C); 3806be38247SSam McCall break; 3816be38247SSam McCall case 'b': 3826be38247SSam McCall Out.push_back('\b'); 3836be38247SSam McCall break; 3846be38247SSam McCall case 'f': 3856be38247SSam McCall Out.push_back('\f'); 3866be38247SSam McCall break; 3876be38247SSam McCall case 'n': 3886be38247SSam McCall Out.push_back('\n'); 3896be38247SSam McCall break; 3906be38247SSam McCall case 'r': 3916be38247SSam McCall Out.push_back('\r'); 3926be38247SSam McCall break; 3936be38247SSam McCall case 't': 3946be38247SSam McCall Out.push_back('\t'); 3956be38247SSam McCall break; 3966be38247SSam McCall case 'u': 3976be38247SSam McCall if (!parseUnicode(Out)) 3986be38247SSam McCall return false; 3996be38247SSam McCall break; 4006be38247SSam McCall default: 4016be38247SSam McCall return parseError("Invalid escape sequence"); 4026be38247SSam McCall } 4036be38247SSam McCall } 4046be38247SSam McCall return true; 4056be38247SSam McCall } 4066be38247SSam McCall 4076be38247SSam McCall static void encodeUtf8(uint32_t Rune, std::string &Out) { 4086be38247SSam McCall if (Rune < 0x80) { 4096be38247SSam McCall Out.push_back(Rune & 0x7F); 4106be38247SSam McCall } else if (Rune < 0x800) { 4116be38247SSam McCall uint8_t FirstByte = 0xC0 | ((Rune & 0x7C0) >> 6); 4126be38247SSam McCall uint8_t SecondByte = 0x80 | (Rune & 0x3F); 4136be38247SSam McCall Out.push_back(FirstByte); 4146be38247SSam McCall Out.push_back(SecondByte); 4156be38247SSam McCall } else if (Rune < 0x10000) { 4166be38247SSam McCall uint8_t FirstByte = 0xE0 | ((Rune & 0xF000) >> 12); 4176be38247SSam McCall uint8_t SecondByte = 0x80 | ((Rune & 0xFC0) >> 6); 4186be38247SSam McCall uint8_t ThirdByte = 0x80 | (Rune & 0x3F); 4196be38247SSam McCall Out.push_back(FirstByte); 4206be38247SSam McCall Out.push_back(SecondByte); 4216be38247SSam McCall Out.push_back(ThirdByte); 4226be38247SSam McCall } else if (Rune < 0x110000) { 4236be38247SSam McCall uint8_t FirstByte = 0xF0 | ((Rune & 0x1F0000) >> 18); 4246be38247SSam McCall uint8_t SecondByte = 0x80 | ((Rune & 0x3F000) >> 12); 4256be38247SSam McCall uint8_t ThirdByte = 0x80 | ((Rune & 0xFC0) >> 6); 4266be38247SSam McCall uint8_t FourthByte = 0x80 | (Rune & 0x3F); 4276be38247SSam McCall Out.push_back(FirstByte); 4286be38247SSam McCall Out.push_back(SecondByte); 4296be38247SSam McCall Out.push_back(ThirdByte); 4306be38247SSam McCall Out.push_back(FourthByte); 4316be38247SSam McCall } else { 4326be38247SSam McCall llvm_unreachable("Invalid codepoint"); 4336be38247SSam McCall } 4346be38247SSam McCall } 4356be38247SSam McCall 4366be38247SSam McCall // Parse a UTF-16 \uNNNN escape sequence. "\u" has already been consumed. 4376be38247SSam McCall // May parse several sequential escapes to ensure proper surrogate handling. 4386be38247SSam McCall // We do not use ConvertUTF.h, it can't accept and replace unpaired surrogates. 4396be38247SSam McCall // These are invalid Unicode but valid JSON (RFC 8259, section 8.2). 4406be38247SSam McCall bool Parser::parseUnicode(std::string &Out) { 4416be38247SSam McCall // Invalid UTF is not a JSON error (RFC 8529§8.2). It gets replaced by U+FFFD. 4426be38247SSam McCall auto Invalid = [&] { Out.append(/* UTF-8 */ {'\xef', '\xbf', '\xbd'}); }; 4436be38247SSam McCall // Decodes 4 hex digits from the stream into Out, returns false on error. 4446be38247SSam McCall auto Parse4Hex = [this](uint16_t &Out) -> bool { 4456be38247SSam McCall Out = 0; 4466be38247SSam McCall char Bytes[] = {next(), next(), next(), next()}; 4476be38247SSam McCall for (unsigned char C : Bytes) { 4486be38247SSam McCall if (!std::isxdigit(C)) 4496be38247SSam McCall return parseError("Invalid \\u escape sequence"); 4506be38247SSam McCall Out <<= 4; 4516be38247SSam McCall Out |= (C > '9') ? (C & ~0x20) - 'A' + 10 : (C - '0'); 4526be38247SSam McCall } 4536be38247SSam McCall return true; 4546be38247SSam McCall }; 4556be38247SSam McCall uint16_t First; // UTF-16 code unit from the first \u escape. 4566be38247SSam McCall if (!Parse4Hex(First)) 4576be38247SSam McCall return false; 4586be38247SSam McCall 4596be38247SSam McCall // We loop to allow proper surrogate-pair error handling. 4606be38247SSam McCall while (true) { 4616be38247SSam McCall // Case 1: the UTF-16 code unit is already a codepoint in the BMP. 4626be38247SSam McCall if (LLVM_LIKELY(First < 0xD800 || First >= 0xE000)) { 4636be38247SSam McCall encodeUtf8(First, Out); 4646be38247SSam McCall return true; 4656be38247SSam McCall } 4666be38247SSam McCall 4676be38247SSam McCall // Case 2: it's an (unpaired) trailing surrogate. 4686be38247SSam McCall if (LLVM_UNLIKELY(First >= 0xDC00)) { 4696be38247SSam McCall Invalid(); 4706be38247SSam McCall return true; 4716be38247SSam McCall } 4726be38247SSam McCall 4736be38247SSam McCall // Case 3: it's a leading surrogate. We expect a trailing one next. 4746be38247SSam McCall // Case 3a: there's no trailing \u escape. Don't advance in the stream. 475e6057bc6SSam McCall if (LLVM_UNLIKELY(P + 2 > End || *P != '\\' || *(P + 1) != 'u')) { 4766be38247SSam McCall Invalid(); // Leading surrogate was unpaired. 4776be38247SSam McCall return true; 4786be38247SSam McCall } 4796be38247SSam McCall P += 2; 4806be38247SSam McCall uint16_t Second; 4816be38247SSam McCall if (!Parse4Hex(Second)) 4826be38247SSam McCall return false; 4836be38247SSam McCall // Case 3b: there was another \u escape, but it wasn't a trailing surrogate. 4846be38247SSam McCall if (LLVM_UNLIKELY(Second < 0xDC00 || Second >= 0xE000)) { 4856be38247SSam McCall Invalid(); // Leading surrogate was unpaired. 4866be38247SSam McCall First = Second; // Second escape still needs to be processed. 4876be38247SSam McCall continue; 4886be38247SSam McCall } 4896be38247SSam McCall // Case 3c: a valid surrogate pair encoding an astral codepoint. 4906be38247SSam McCall encodeUtf8(0x10000 | ((First - 0xD800) << 10) | (Second - 0xDC00), Out); 4916be38247SSam McCall return true; 4926be38247SSam McCall } 4936be38247SSam McCall } 4946be38247SSam McCall 4956be38247SSam McCall bool Parser::parseError(const char *Msg) { 4966be38247SSam McCall int Line = 1; 4976be38247SSam McCall const char *StartOfLine = Start; 4986be38247SSam McCall for (const char *X = Start; X < P; ++X) { 4996be38247SSam McCall if (*X == 0x0A) { 5006be38247SSam McCall ++Line; 5016be38247SSam McCall StartOfLine = X + 1; 5026be38247SSam McCall } 5036be38247SSam McCall } 5046be38247SSam McCall Err.emplace( 505*0eaee545SJonas Devlieghere std::make_unique<ParseError>(Msg, Line, P - StartOfLine, P - Start)); 5066be38247SSam McCall return false; 5076be38247SSam McCall } 5086be38247SSam McCall } // namespace 5096be38247SSam McCall 5106be38247SSam McCall Expected<Value> parse(StringRef JSON) { 5116be38247SSam McCall Parser P(JSON); 5126be38247SSam McCall Value E = nullptr; 513e6057bc6SSam McCall if (P.checkUTF8()) 5146be38247SSam McCall if (P.parseValue(E)) 5156be38247SSam McCall if (P.assertEnd()) 5166be38247SSam McCall return std::move(E); 5176be38247SSam McCall return P.takeError(); 5186be38247SSam McCall } 5196be38247SSam McCall char ParseError::ID = 0; 5206be38247SSam McCall 5216be38247SSam McCall static std::vector<const Object::value_type *> sortedElements(const Object &O) { 5226be38247SSam McCall std::vector<const Object::value_type *> Elements; 5236be38247SSam McCall for (const auto &E : O) 5246be38247SSam McCall Elements.push_back(&E); 5250cac726aSFangrui Song llvm::sort(Elements, 5266be38247SSam McCall [](const Object::value_type *L, const Object::value_type *R) { 5276be38247SSam McCall return L->first < R->first; 5286be38247SSam McCall }); 5296be38247SSam McCall return Elements; 5306be38247SSam McCall } 5316be38247SSam McCall 532e6057bc6SSam McCall bool isUTF8(llvm::StringRef S, size_t *ErrOffset) { 533e6057bc6SSam McCall // Fast-path for ASCII, which is valid UTF-8. 534e6057bc6SSam McCall if (LLVM_LIKELY(isASCII(S))) 535e6057bc6SSam McCall return true; 536e6057bc6SSam McCall 537e6057bc6SSam McCall const UTF8 *Data = reinterpret_cast<const UTF8 *>(S.data()), *Rest = Data; 538e6057bc6SSam McCall if (LLVM_LIKELY(isLegalUTF8String(&Rest, Data + S.size()))) 539e6057bc6SSam McCall return true; 540e6057bc6SSam McCall 541e6057bc6SSam McCall if (ErrOffset) 542e6057bc6SSam McCall *ErrOffset = Rest - Data; 543e6057bc6SSam McCall return false; 544e6057bc6SSam McCall } 545e6057bc6SSam McCall 546e6057bc6SSam McCall std::string fixUTF8(llvm::StringRef S) { 547e6057bc6SSam McCall // This isn't particularly efficient, but is only for error-recovery. 548e6057bc6SSam McCall std::vector<UTF32> Codepoints(S.size()); // 1 codepoint per byte suffices. 549e6057bc6SSam McCall const UTF8 *In8 = reinterpret_cast<const UTF8 *>(S.data()); 550e6057bc6SSam McCall UTF32 *Out32 = Codepoints.data(); 551e6057bc6SSam McCall ConvertUTF8toUTF32(&In8, In8 + S.size(), &Out32, Out32 + Codepoints.size(), 552e6057bc6SSam McCall lenientConversion); 553e6057bc6SSam McCall Codepoints.resize(Out32 - Codepoints.data()); 554e6057bc6SSam McCall std::string Res(4 * Codepoints.size(), 0); // 4 bytes per codepoint suffice 555e6057bc6SSam McCall const UTF32 *In32 = Codepoints.data(); 556e6057bc6SSam McCall UTF8 *Out8 = reinterpret_cast<UTF8 *>(&Res[0]); 557e6057bc6SSam McCall ConvertUTF32toUTF8(&In32, In32 + Codepoints.size(), &Out8, Out8 + Res.size(), 558e6057bc6SSam McCall strictConversion); 559e6057bc6SSam McCall Res.resize(reinterpret_cast<char *>(Out8) - Res.data()); 560e6057bc6SSam McCall return Res; 561e6057bc6SSam McCall } 562e6057bc6SSam McCall 5636be38247SSam McCall static void quote(llvm::raw_ostream &OS, llvm::StringRef S) { 5646be38247SSam McCall OS << '\"'; 5656be38247SSam McCall for (unsigned char C : S) { 5666be38247SSam McCall if (C == 0x22 || C == 0x5C) 5676be38247SSam McCall OS << '\\'; 5686be38247SSam McCall if (C >= 0x20) { 5696be38247SSam McCall OS << C; 5706be38247SSam McCall continue; 5716be38247SSam McCall } 5726be38247SSam McCall OS << '\\'; 5736be38247SSam McCall switch (C) { 5746be38247SSam McCall // A few characters are common enough to make short escapes worthwhile. 5756be38247SSam McCall case '\t': 5766be38247SSam McCall OS << 't'; 5776be38247SSam McCall break; 5786be38247SSam McCall case '\n': 5796be38247SSam McCall OS << 'n'; 5806be38247SSam McCall break; 5816be38247SSam McCall case '\r': 5826be38247SSam McCall OS << 'r'; 5836be38247SSam McCall break; 5846be38247SSam McCall default: 5856be38247SSam McCall OS << 'u'; 5866be38247SSam McCall llvm::write_hex(OS, C, llvm::HexPrintStyle::Lower, 4); 5876be38247SSam McCall break; 5886be38247SSam McCall } 5896be38247SSam McCall } 5906be38247SSam McCall OS << '\"'; 5916be38247SSam McCall } 5926be38247SSam McCall 593a7edcfb5SSam McCall void llvm::json::OStream::value(const Value &V) { 594a7edcfb5SSam McCall switch (V.kind()) { 595a7edcfb5SSam McCall case Value::Null: 596a7edcfb5SSam McCall valueBegin(); 5976be38247SSam McCall OS << "null"; 598a7edcfb5SSam McCall return; 599a7edcfb5SSam McCall case Value::Boolean: 600a7edcfb5SSam McCall valueBegin(); 601a7edcfb5SSam McCall OS << (*V.getAsBoolean() ? "true" : "false"); 602a7edcfb5SSam McCall return; 603a7edcfb5SSam McCall case Value::Number: 604a7edcfb5SSam McCall valueBegin(); 605a7edcfb5SSam McCall if (V.Type == Value::T_Integer) 606a7edcfb5SSam McCall OS << *V.getAsInteger(); 607a7edcfb5SSam McCall else 608d93eaeb7SSam McCall OS << format("%.*g", std::numeric_limits<double>::max_digits10, 609a7edcfb5SSam McCall *V.getAsNumber()); 610a7edcfb5SSam McCall return; 611a7edcfb5SSam McCall case Value::String: 612a7edcfb5SSam McCall valueBegin(); 613a7edcfb5SSam McCall quote(OS, *V.getAsString()); 614a7edcfb5SSam McCall return; 615a7edcfb5SSam McCall case Value::Array: 616a7edcfb5SSam McCall return array([&] { 617a7edcfb5SSam McCall for (const Value &E : *V.getAsArray()) 618a7edcfb5SSam McCall value(E); 619a7edcfb5SSam McCall }); 620a7edcfb5SSam McCall case Value::Object: 621a7edcfb5SSam McCall return object([&] { 622a7edcfb5SSam McCall for (const Object::value_type *E : sortedElements(*V.getAsObject())) 623a7edcfb5SSam McCall attribute(E->first, E->second); 624a7edcfb5SSam McCall }); 625a7edcfb5SSam McCall } 626a7edcfb5SSam McCall } 627a7edcfb5SSam McCall 628a7edcfb5SSam McCall void llvm::json::OStream::valueBegin() { 629a7edcfb5SSam McCall assert(Stack.back().Ctx != Object && "Only attributes allowed here"); 630a7edcfb5SSam McCall if (Stack.back().HasValue) { 631a7edcfb5SSam McCall assert(Stack.back().Ctx != Singleton && "Only one value allowed here"); 6326be38247SSam McCall OS << ','; 6336be38247SSam McCall } 634a7edcfb5SSam McCall if (Stack.back().Ctx == Array) 635a7edcfb5SSam McCall newline(); 636a7edcfb5SSam McCall Stack.back().HasValue = true; 6376be38247SSam McCall } 638a7edcfb5SSam McCall 639a7edcfb5SSam McCall void llvm::json::OStream::newline() { 640a7edcfb5SSam McCall if (IndentSize) { 641a7edcfb5SSam McCall OS.write('\n'); 642a7edcfb5SSam McCall OS.indent(Indent); 643a7edcfb5SSam McCall } 644a7edcfb5SSam McCall } 645a7edcfb5SSam McCall 646a7edcfb5SSam McCall void llvm::json::OStream::arrayBegin() { 647a7edcfb5SSam McCall valueBegin(); 648a7edcfb5SSam McCall Stack.emplace_back(); 649a7edcfb5SSam McCall Stack.back().Ctx = Array; 650a7edcfb5SSam McCall Indent += IndentSize; 6516be38247SSam McCall OS << '['; 6526be38247SSam McCall } 653a7edcfb5SSam McCall 654a7edcfb5SSam McCall void llvm::json::OStream::arrayEnd() { 655a7edcfb5SSam McCall assert(Stack.back().Ctx == Array); 656a7edcfb5SSam McCall Indent -= IndentSize; 657a7edcfb5SSam McCall if (Stack.back().HasValue) 658a7edcfb5SSam McCall newline(); 6596be38247SSam McCall OS << ']'; 660a7edcfb5SSam McCall Stack.pop_back(); 661a7edcfb5SSam McCall assert(!Stack.empty()); 6626be38247SSam McCall } 663a7edcfb5SSam McCall 664a7edcfb5SSam McCall void llvm::json::OStream::objectBegin() { 665a7edcfb5SSam McCall valueBegin(); 666a7edcfb5SSam McCall Stack.emplace_back(); 667a7edcfb5SSam McCall Stack.back().Ctx = Object; 668a7edcfb5SSam McCall Indent += IndentSize; 669a7edcfb5SSam McCall OS << '{'; 6706be38247SSam McCall } 671a7edcfb5SSam McCall 672a7edcfb5SSam McCall void llvm::json::OStream::objectEnd() { 673a7edcfb5SSam McCall assert(Stack.back().Ctx == Object); 674a7edcfb5SSam McCall Indent -= IndentSize; 675a7edcfb5SSam McCall if (Stack.back().HasValue) 676a7edcfb5SSam McCall newline(); 677a7edcfb5SSam McCall OS << '}'; 678a7edcfb5SSam McCall Stack.pop_back(); 679a7edcfb5SSam McCall assert(!Stack.empty()); 6806be38247SSam McCall } 6816be38247SSam McCall 682a7edcfb5SSam McCall void llvm::json::OStream::attributeBegin(llvm::StringRef Key) { 683a7edcfb5SSam McCall assert(Stack.back().Ctx == Object); 684a7edcfb5SSam McCall if (Stack.back().HasValue) 685a7edcfb5SSam McCall OS << ','; 686a7edcfb5SSam McCall newline(); 687a7edcfb5SSam McCall Stack.back().HasValue = true; 688a7edcfb5SSam McCall Stack.emplace_back(); 689a7edcfb5SSam McCall Stack.back().Ctx = Singleton; 690a7edcfb5SSam McCall if (LLVM_LIKELY(isUTF8(Key))) { 691a7edcfb5SSam McCall quote(OS, Key); 692a7edcfb5SSam McCall } else { 693a7edcfb5SSam McCall assert(false && "Invalid UTF-8 in attribute key"); 694a7edcfb5SSam McCall quote(OS, fixUTF8(Key)); 695a7edcfb5SSam McCall } 696a7edcfb5SSam McCall OS.write(':'); 697a7edcfb5SSam McCall if (IndentSize) 698a7edcfb5SSam McCall OS.write(' '); 699a7edcfb5SSam McCall } 700a7edcfb5SSam McCall 701a7edcfb5SSam McCall void llvm::json::OStream::attributeEnd() { 702a7edcfb5SSam McCall assert(Stack.back().Ctx == Singleton); 703a7edcfb5SSam McCall assert(Stack.back().HasValue && "Attribute must have a value"); 704a7edcfb5SSam McCall Stack.pop_back(); 705a7edcfb5SSam McCall assert(Stack.back().Ctx == Object); 706a7edcfb5SSam McCall } 707a7edcfb5SSam McCall 708a7edcfb5SSam McCall } // namespace json 709a7edcfb5SSam McCall } // namespace llvm 710a7edcfb5SSam McCall 7116be38247SSam McCall void llvm::format_provider<llvm::json::Value>::format( 7126be38247SSam McCall const llvm::json::Value &E, raw_ostream &OS, StringRef Options) { 7136be38247SSam McCall unsigned IndentAmount = 0; 714a7edcfb5SSam McCall if (!Options.empty() && Options.getAsInteger(/*Radix=*/10, IndentAmount)) 7156be38247SSam McCall llvm_unreachable("json::Value format options should be an integer"); 716a7edcfb5SSam McCall json::OStream(OS, IndentAmount).value(E); 7176be38247SSam McCall } 7186be38247SSam McCall 719