1 //===--- YAMLParser.cpp - Simple YAML parser ------------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file implements a YAML parser. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "llvm/Support/YAMLParser.h" 15 16 #include "llvm/ADT/ilist.h" 17 #include "llvm/ADT/ilist_node.h" 18 #include "llvm/ADT/SmallVector.h" 19 #include "llvm/ADT/StringExtras.h" 20 #include "llvm/ADT/Twine.h" 21 #include "llvm/Support/ErrorHandling.h" 22 #include "llvm/Support/MemoryBuffer.h" 23 #include "llvm/Support/raw_ostream.h" 24 #include "llvm/Support/SourceMgr.h" 25 26 using namespace llvm; 27 using namespace yaml; 28 29 enum UnicodeEncodingForm { 30 UEF_UTF32_LE, ///< UTF-32 Little Endian 31 UEF_UTF32_BE, ///< UTF-32 Big Endian 32 UEF_UTF16_LE, ///< UTF-16 Little Endian 33 UEF_UTF16_BE, ///< UTF-16 Big Endian 34 UEF_UTF8, ///< UTF-8 or ascii. 35 UEF_Unknown ///< Not a valid Unicode encoding. 36 }; 37 38 /// EncodingInfo - Holds the encoding type and length of the byte order mark if 39 /// it exists. Length is in {0, 2, 3, 4}. 40 typedef std::pair<UnicodeEncodingForm, unsigned> EncodingInfo; 41 42 /// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode 43 /// encoding form of \a Input. 44 /// 45 /// @param Input A string of length 0 or more. 46 /// @returns An EncodingInfo indicating the Unicode encoding form of the input 47 /// and how long the byte order mark is if one exists. 48 static EncodingInfo getUnicodeEncoding(StringRef Input) { 49 if (Input.size() == 0) 50 return std::make_pair(UEF_Unknown, 0); 51 52 switch (uint8_t(Input[0])) { 53 case 0x00: 54 if (Input.size() >= 4) { 55 if ( Input[1] == 0 56 && uint8_t(Input[2]) == 0xFE 57 && uint8_t(Input[3]) == 0xFF) 58 return std::make_pair(UEF_UTF32_BE, 4); 59 if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0) 60 return std::make_pair(UEF_UTF32_BE, 0); 61 } 62 63 if (Input.size() >= 2 && Input[1] != 0) 64 return std::make_pair(UEF_UTF16_BE, 0); 65 return std::make_pair(UEF_Unknown, 0); 66 case 0xFF: 67 if ( Input.size() >= 4 68 && uint8_t(Input[1]) == 0xFE 69 && Input[2] == 0 70 && Input[3] == 0) 71 return std::make_pair(UEF_UTF32_LE, 4); 72 73 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE) 74 return std::make_pair(UEF_UTF16_LE, 2); 75 return std::make_pair(UEF_Unknown, 0); 76 case 0xFE: 77 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF) 78 return std::make_pair(UEF_UTF16_BE, 2); 79 return std::make_pair(UEF_Unknown, 0); 80 case 0xEF: 81 if ( Input.size() >= 3 82 && uint8_t(Input[1]) == 0xBB 83 && uint8_t(Input[2]) == 0xBF) 84 return std::make_pair(UEF_UTF8, 3); 85 return std::make_pair(UEF_Unknown, 0); 86 } 87 88 // It could still be utf-32 or utf-16. 89 if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0) 90 return std::make_pair(UEF_UTF32_LE, 0); 91 92 if (Input.size() >= 2 && Input[1] == 0) 93 return std::make_pair(UEF_UTF16_LE, 0); 94 95 return std::make_pair(UEF_UTF8, 0); 96 } 97 98 namespace llvm { 99 namespace yaml { 100 /// Token - A single YAML token. 101 struct Token : ilist_node<Token> { 102 enum TokenKind { 103 TK_Error, // Uninitialized token. 104 TK_StreamStart, 105 TK_StreamEnd, 106 TK_VersionDirective, 107 TK_TagDirective, 108 TK_DocumentStart, 109 TK_DocumentEnd, 110 TK_BlockEntry, 111 TK_BlockEnd, 112 TK_BlockSequenceStart, 113 TK_BlockMappingStart, 114 TK_FlowEntry, 115 TK_FlowSequenceStart, 116 TK_FlowSequenceEnd, 117 TK_FlowMappingStart, 118 TK_FlowMappingEnd, 119 TK_Key, 120 TK_Value, 121 TK_Scalar, 122 TK_Alias, 123 TK_Anchor, 124 TK_Tag 125 } Kind; 126 127 /// A string of length 0 or more whose begin() points to the logical location 128 /// of the token in the input. 129 StringRef Range; 130 131 Token() : Kind(TK_Error) {} 132 }; 133 } 134 } 135 136 namespace llvm { 137 template<> 138 struct ilist_sentinel_traits<Token> { 139 Token *createSentinel() const { 140 return &Sentinel; 141 } 142 static void destroySentinel(Token*) {} 143 144 Token *provideInitialHead() const { return createSentinel(); } 145 Token *ensureHead(Token*) const { return createSentinel(); } 146 static void noteHead(Token*, Token*) {} 147 148 private: 149 mutable Token Sentinel; 150 }; 151 152 template<> 153 struct ilist_node_traits<Token> { 154 Token *createNode(const Token &V) { 155 return new (Alloc.Allocate<Token>()) Token(V); 156 } 157 static void deleteNode(Token *V) {} 158 159 void addNodeToList(Token *) {} 160 void removeNodeFromList(Token *) {} 161 void transferNodesFromList(ilist_node_traits & /*SrcTraits*/, 162 ilist_iterator<Token> /*first*/, 163 ilist_iterator<Token> /*last*/) {} 164 165 BumpPtrAllocator Alloc; 166 }; 167 } 168 169 typedef ilist<Token> TokenQueueT; 170 171 namespace { 172 /// @brief This struct is used to track simple keys. 173 /// 174 /// Simple keys are handled by creating an entry in SimpleKeys for each Token 175 /// which could legally be the start of a simple key. When peekNext is called, 176 /// if the Token To be returned is referenced by a SimpleKey, we continue 177 /// tokenizing until that potential simple key has either been found to not be 178 /// a simple key (we moved on to the next line or went further than 1024 chars). 179 /// Or when we run into a Value, and then insert a Key token (and possibly 180 /// others) before the SimpleKey's Tok. 181 struct SimpleKey { 182 TokenQueueT::iterator Tok; 183 unsigned Column; 184 unsigned Line; 185 unsigned FlowLevel; 186 bool IsRequired; 187 188 bool operator ==(const SimpleKey &Other) { 189 return Tok == Other.Tok; 190 } 191 }; 192 } 193 194 /// @brief The Unicode scalar value of a UTF-8 minimal well-formed code unit 195 /// subsequence and the subsequence's length in code units (uint8_t). 196 /// A length of 0 represents an error. 197 typedef std::pair<uint32_t, unsigned> UTF8Decoded; 198 199 static UTF8Decoded decodeUTF8(StringRef Range) { 200 StringRef::iterator Position= Range.begin(); 201 StringRef::iterator End = Range.end(); 202 // 1 byte: [0x00, 0x7f] 203 // Bit pattern: 0xxxxxxx 204 if ((*Position & 0x80) == 0) { 205 return std::make_pair(*Position, 1); 206 } 207 // 2 bytes: [0x80, 0x7ff] 208 // Bit pattern: 110xxxxx 10xxxxxx 209 if (Position + 1 != End && 210 ((*Position & 0xE0) == 0xC0) && 211 ((*(Position + 1) & 0xC0) == 0x80)) { 212 uint32_t codepoint = ((*Position & 0x1F) << 6) | 213 (*(Position + 1) & 0x3F); 214 if (codepoint >= 0x80) 215 return std::make_pair(codepoint, 2); 216 } 217 // 3 bytes: [0x8000, 0xffff] 218 // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx 219 if (Position + 2 != End && 220 ((*Position & 0xF0) == 0xE0) && 221 ((*(Position + 1) & 0xC0) == 0x80) && 222 ((*(Position + 2) & 0xC0) == 0x80)) { 223 uint32_t codepoint = ((*Position & 0x0F) << 12) | 224 ((*(Position + 1) & 0x3F) << 6) | 225 (*(Position + 2) & 0x3F); 226 // Codepoints between 0xD800 and 0xDFFF are invalid, as 227 // they are high / low surrogate halves used by UTF-16. 228 if (codepoint >= 0x800 && 229 (codepoint < 0xD800 || codepoint > 0xDFFF)) 230 return std::make_pair(codepoint, 3); 231 } 232 // 4 bytes: [0x10000, 0x10FFFF] 233 // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 234 if (Position + 3 != End && 235 ((*Position & 0xF8) == 0xF0) && 236 ((*(Position + 1) & 0xC0) == 0x80) && 237 ((*(Position + 2) & 0xC0) == 0x80) && 238 ((*(Position + 3) & 0xC0) == 0x80)) { 239 uint32_t codepoint = ((*Position & 0x07) << 18) | 240 ((*(Position + 1) & 0x3F) << 12) | 241 ((*(Position + 2) & 0x3F) << 6) | 242 (*(Position + 3) & 0x3F); 243 if (codepoint >= 0x10000 && codepoint <= 0x10FFFF) 244 return std::make_pair(codepoint, 4); 245 } 246 return std::make_pair(0, 0); 247 } 248 249 namespace llvm { 250 namespace yaml { 251 /// @brief Scans YAML tokens from a MemoryBuffer. 252 class Scanner { 253 public: 254 Scanner(const StringRef Input, SourceMgr &SM); 255 Scanner(MemoryBuffer *Buffer, SourceMgr &SM_); 256 257 /// @brief Parse the next token and return it without popping it. 258 Token &peekNext(); 259 260 /// @brief Parse the next token and pop it from the queue. 261 Token getNext(); 262 263 void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message, 264 ArrayRef<SMRange> Ranges = ArrayRef<SMRange>()) { 265 SM.PrintMessage(Loc, Kind, Message, Ranges); 266 } 267 268 void setError(const Twine &Message, StringRef::iterator Position) { 269 if (Current >= End) 270 Current = End - 1; 271 272 // Don't print out more errors after the first one we encounter. The rest 273 // are just the result of the first, and have no meaning. 274 if (!Failed) 275 printError(SMLoc::getFromPointer(Current), SourceMgr::DK_Error, Message); 276 Failed = true; 277 } 278 279 void setError(const Twine &Message) { 280 setError(Message, Current); 281 } 282 283 /// @brief Returns true if an error occurred while parsing. 284 bool failed() { 285 return Failed; 286 } 287 288 private: 289 StringRef currentInput() { 290 return StringRef(Current, End - Current); 291 } 292 293 /// @brief Decode a UTF-8 minimal well-formed code unit subsequence starting 294 /// at \a Position. 295 /// 296 /// If the UTF-8 code units starting at Position do not form a well-formed 297 /// code unit subsequence, then the Unicode scalar value is 0, and the length 298 /// is 0. 299 UTF8Decoded decodeUTF8(StringRef::iterator Position) { 300 return ::decodeUTF8(StringRef(Position, End - Position)); 301 } 302 303 // The following functions are based on the gramar rules in the YAML spec. The 304 // style of the function names it meant to closely match how they are written 305 // in the spec. The number within the [] is the number of the grammar rule in 306 // the spec. 307 // 308 // See 4.2 [Production Naming Conventions] for the meaning of the prefixes. 309 // 310 // c- 311 // A production starting and ending with a special character. 312 // b- 313 // A production matching a single line break. 314 // nb- 315 // A production starting and ending with a non-break character. 316 // s- 317 // A production starting and ending with a white space character. 318 // ns- 319 // A production starting and ending with a non-space character. 320 // l- 321 // A production matching complete line(s). 322 323 /// @brief Skip a single nb-char[27] starting at Position. 324 /// 325 /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE] 326 /// | [0xFF00-0xFFFD] | [0x10000-0x10FFFF] 327 /// 328 /// @returns The code unit after the nb-char, or Position if it's not an 329 /// nb-char. 330 StringRef::iterator skip_nb_char(StringRef::iterator Position); 331 332 /// @brief Skip a single b-break[28] starting at Position. 333 /// 334 /// A b-break is 0xD 0xA | 0xD | 0xA 335 /// 336 /// @returns The code unit after the b-break, or Position if it's not a 337 /// b-break. 338 StringRef::iterator skip_b_break(StringRef::iterator Position); 339 340 /// @brief Skip a single s-white[33] starting at Position. 341 /// 342 /// A s-white is 0x20 | 0x9 343 /// 344 /// @returns The code unit after the s-white, or Position if it's not a 345 /// s-white. 346 StringRef::iterator skip_s_white(StringRef::iterator Position); 347 348 /// @brief Skip a single ns-char[34] starting at Position. 349 /// 350 /// A ns-char is nb-char - s-white 351 /// 352 /// @returns The code unit after the ns-char, or Position if it's not a 353 /// ns-char. 354 StringRef::iterator skip_ns_char(StringRef::iterator Position); 355 356 typedef StringRef::iterator (Scanner::*SkipWhileFunc)(StringRef::iterator); 357 /// @brief Skip minimal well-formed code unit subsequences until Func 358 /// returns its input. 359 /// 360 /// @returns The code unit after the last minimal well-formed code unit 361 /// subsequence that Func accepted. 362 StringRef::iterator skip_while( SkipWhileFunc Func 363 , StringRef::iterator Position); 364 365 /// @brief Scan ns-uri-char[39]s starting at Cur. 366 /// 367 /// This updates Cur and Column while scanning. 368 /// 369 /// @returns A StringRef starting at Cur which covers the longest contiguous 370 /// sequence of ns-uri-char. 371 StringRef scan_ns_uri_char(); 372 373 /// @brief Scan ns-plain-one-line[133] starting at \a Cur. 374 StringRef scan_ns_plain_one_line(); 375 376 /// @brief Consume a minimal well-formed code unit subsequence starting at 377 /// \a Cur. Return false if it is not the same Unicode scalar value as 378 /// \a Expected. This updates \a Column. 379 bool consume(uint32_t Expected); 380 381 /// @brief Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column. 382 void skip(uint32_t Distance); 383 384 /// @brief Return true if the minimal well-formed code unit subsequence at 385 /// Pos is whitespace or a new line 386 bool isBlankOrBreak(StringRef::iterator Position); 387 388 /// @brief If IsSimpleKeyAllowed, create and push_back a new SimpleKey. 389 void saveSimpleKeyCandidate( TokenQueueT::iterator Tok 390 , unsigned AtColumn 391 , bool IsRequired); 392 393 /// @brief Remove simple keys that can no longer be valid simple keys. 394 /// 395 /// Invalid simple keys are not on the current line or are further than 1024 396 /// columns back. 397 void removeStaleSimpleKeyCandidates(); 398 399 /// @brief Remove all simple keys on FlowLevel \a Level. 400 void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level); 401 402 /// @brief Unroll indentation in \a Indents back to \a Col. Creates BlockEnd 403 /// tokens if needed. 404 bool unrollIndent(int ToColumn); 405 406 /// @brief Increase indent to \a Col. Creates \a Kind token at \a InsertPoint 407 /// if needed. 408 bool rollIndent( int ToColumn 409 , Token::TokenKind Kind 410 , TokenQueueT::iterator InsertPoint); 411 412 /// @brief Skip whitespace and comments until the start of the next token. 413 void scanToNextToken(); 414 415 /// @brief Must be the first token generated. 416 bool scanStreamStart(); 417 418 /// @brief Generate tokens needed to close out the stream. 419 bool scanStreamEnd(); 420 421 /// @brief Scan a %BLAH directive. 422 bool scanDirective(); 423 424 /// @brief Scan a ... or ---. 425 bool scanDocumentIndicator(bool IsStart); 426 427 /// @brief Scan a [ or { and generate the proper flow collection start token. 428 bool scanFlowCollectionStart(bool IsSequence); 429 430 /// @brief Scan a ] or } and generate the proper flow collection end token. 431 bool scanFlowCollectionEnd(bool IsSequence); 432 433 /// @brief Scan the , that separates entries in a flow collection. 434 bool scanFlowEntry(); 435 436 /// @brief Scan the - that starts block sequence entries. 437 bool scanBlockEntry(); 438 439 /// @brief Scan an explicit ? indicating a key. 440 bool scanKey(); 441 442 /// @brief Scan an explicit : indicating a value. 443 bool scanValue(); 444 445 /// @brief Scan a quoted scalar. 446 bool scanFlowScalar(bool IsDoubleQuoted); 447 448 /// @brief Scan an unquoted scalar. 449 bool scanPlainScalar(); 450 451 /// @brief Scan an Alias or Anchor starting with * or &. 452 bool scanAliasOrAnchor(bool IsAlias); 453 454 /// @brief Scan a block scalar starting with | or >. 455 bool scanBlockScalar(bool IsLiteral); 456 457 /// @brief Scan a tag of the form !stuff. 458 bool scanTag(); 459 460 /// @brief Dispatch to the next scanning function based on \a *Cur. 461 bool fetchMoreTokens(); 462 463 /// @brief The SourceMgr used for diagnostics and buffer management. 464 SourceMgr &SM; 465 466 /// @brief The original input. 467 MemoryBuffer *InputBuffer; 468 469 /// @brief The current position of the scanner. 470 StringRef::iterator Current; 471 472 /// @brief The end of the input (one past the last character). 473 StringRef::iterator End; 474 475 /// @brief Current YAML indentation level in spaces. 476 int Indent; 477 478 /// @brief Current column number in Unicode code points. 479 unsigned Column; 480 481 /// @brief Current line number. 482 unsigned Line; 483 484 /// @brief How deep we are in flow style containers. 0 Means at block level. 485 unsigned FlowLevel; 486 487 /// @brief Are we at the start of the stream? 488 bool IsStartOfStream; 489 490 /// @brief Can the next token be the start of a simple key? 491 bool IsSimpleKeyAllowed; 492 493 /// @brief True if an error has occurred. 494 bool Failed; 495 496 /// @brief Queue of tokens. This is required to queue up tokens while looking 497 /// for the end of a simple key. And for cases where a single character 498 /// can produce multiple tokens (e.g. BlockEnd). 499 TokenQueueT TokenQueue; 500 501 /// @brief Indentation levels. 502 SmallVector<int, 4> Indents; 503 504 /// @brief Potential simple keys. 505 SmallVector<SimpleKey, 4> SimpleKeys; 506 }; 507 508 } // end namespace yaml 509 } // end namespace llvm 510 511 /// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result. 512 static void encodeUTF8( uint32_t UnicodeScalarValue 513 , SmallVectorImpl<char> &Result) { 514 if (UnicodeScalarValue <= 0x7F) { 515 Result.push_back(UnicodeScalarValue & 0x7F); 516 } else if (UnicodeScalarValue <= 0x7FF) { 517 uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6); 518 uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F); 519 Result.push_back(FirstByte); 520 Result.push_back(SecondByte); 521 } else if (UnicodeScalarValue <= 0xFFFF) { 522 uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12); 523 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 524 uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F); 525 Result.push_back(FirstByte); 526 Result.push_back(SecondByte); 527 Result.push_back(ThirdByte); 528 } else if (UnicodeScalarValue <= 0x10FFFF) { 529 uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18); 530 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12); 531 uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 532 uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F); 533 Result.push_back(FirstByte); 534 Result.push_back(SecondByte); 535 Result.push_back(ThirdByte); 536 Result.push_back(FourthByte); 537 } 538 } 539 540 bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) { 541 SourceMgr SM; 542 Scanner scanner(Input, SM); 543 while (true) { 544 Token T = scanner.getNext(); 545 switch (T.Kind) { 546 case Token::TK_StreamStart: 547 OS << "Stream-Start: "; 548 break; 549 case Token::TK_StreamEnd: 550 OS << "Stream-End: "; 551 break; 552 case Token::TK_VersionDirective: 553 OS << "Version-Directive: "; 554 break; 555 case Token::TK_TagDirective: 556 OS << "Tag-Directive: "; 557 break; 558 case Token::TK_DocumentStart: 559 OS << "Document-Start: "; 560 break; 561 case Token::TK_DocumentEnd: 562 OS << "Document-End: "; 563 break; 564 case Token::TK_BlockEntry: 565 OS << "Block-Entry: "; 566 break; 567 case Token::TK_BlockEnd: 568 OS << "Block-End: "; 569 break; 570 case Token::TK_BlockSequenceStart: 571 OS << "Block-Sequence-Start: "; 572 break; 573 case Token::TK_BlockMappingStart: 574 OS << "Block-Mapping-Start: "; 575 break; 576 case Token::TK_FlowEntry: 577 OS << "Flow-Entry: "; 578 break; 579 case Token::TK_FlowSequenceStart: 580 OS << "Flow-Sequence-Start: "; 581 break; 582 case Token::TK_FlowSequenceEnd: 583 OS << "Flow-Sequence-End: "; 584 break; 585 case Token::TK_FlowMappingStart: 586 OS << "Flow-Mapping-Start: "; 587 break; 588 case Token::TK_FlowMappingEnd: 589 OS << "Flow-Mapping-End: "; 590 break; 591 case Token::TK_Key: 592 OS << "Key: "; 593 break; 594 case Token::TK_Value: 595 OS << "Value: "; 596 break; 597 case Token::TK_Scalar: 598 OS << "Scalar: "; 599 break; 600 case Token::TK_Alias: 601 OS << "Alias: "; 602 break; 603 case Token::TK_Anchor: 604 OS << "Anchor: "; 605 break; 606 case Token::TK_Tag: 607 OS << "Tag: "; 608 break; 609 case Token::TK_Error: 610 break; 611 } 612 OS << T.Range << "\n"; 613 if (T.Kind == Token::TK_StreamEnd) 614 break; 615 else if (T.Kind == Token::TK_Error) 616 return false; 617 } 618 return true; 619 } 620 621 bool yaml::scanTokens(StringRef Input) { 622 llvm::SourceMgr SM; 623 llvm::yaml::Scanner scanner(Input, SM); 624 for (;;) { 625 llvm::yaml::Token T = scanner.getNext(); 626 if (T.Kind == Token::TK_StreamEnd) 627 break; 628 else if (T.Kind == Token::TK_Error) 629 return false; 630 } 631 return true; 632 } 633 634 std::string yaml::escape(StringRef Input) { 635 std::string EscapedInput; 636 for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) { 637 if (*i == '\\') 638 EscapedInput += "\\\\"; 639 else if (*i == '"') 640 EscapedInput += "\\\""; 641 else if (*i == 0) 642 EscapedInput += "\\0"; 643 else if (*i == 0x07) 644 EscapedInput += "\\a"; 645 else if (*i == 0x08) 646 EscapedInput += "\\b"; 647 else if (*i == 0x09) 648 EscapedInput += "\\t"; 649 else if (*i == 0x0A) 650 EscapedInput += "\\n"; 651 else if (*i == 0x0B) 652 EscapedInput += "\\v"; 653 else if (*i == 0x0C) 654 EscapedInput += "\\f"; 655 else if (*i == 0x0D) 656 EscapedInput += "\\r"; 657 else if (*i == 0x1B) 658 EscapedInput += "\\e"; 659 else if ((unsigned char)*i < 0x20) { // Control characters not handled above. 660 std::string HexStr = utohexstr(*i); 661 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 662 } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence. 663 UTF8Decoded UnicodeScalarValue 664 = decodeUTF8(StringRef(i, Input.end() - i)); 665 if (UnicodeScalarValue.second == 0) { 666 // Found invalid char. 667 SmallString<4> Val; 668 encodeUTF8(0xFFFD, Val); 669 EscapedInput.insert(EscapedInput.end(), Val.begin(), Val.end()); 670 // FIXME: Error reporting. 671 return EscapedInput; 672 } 673 if (UnicodeScalarValue.first == 0x85) 674 EscapedInput += "\\N"; 675 else if (UnicodeScalarValue.first == 0xA0) 676 EscapedInput += "\\_"; 677 else if (UnicodeScalarValue.first == 0x2028) 678 EscapedInput += "\\L"; 679 else if (UnicodeScalarValue.first == 0x2029) 680 EscapedInput += "\\P"; 681 else { 682 std::string HexStr = utohexstr(UnicodeScalarValue.first); 683 if (HexStr.size() <= 2) 684 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 685 else if (HexStr.size() <= 4) 686 EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr; 687 else if (HexStr.size() <= 8) 688 EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr; 689 } 690 i += UnicodeScalarValue.second - 1; 691 } else 692 EscapedInput.push_back(*i); 693 } 694 return EscapedInput; 695 } 696 697 Scanner::Scanner(StringRef Input, SourceMgr &sm) 698 : SM(sm) 699 , Indent(-1) 700 , Column(0) 701 , Line(0) 702 , FlowLevel(0) 703 , IsStartOfStream(true) 704 , IsSimpleKeyAllowed(true) 705 , Failed(false) { 706 InputBuffer = MemoryBuffer::getMemBuffer(Input, "YAML"); 707 SM.AddNewSourceBuffer(InputBuffer, SMLoc()); 708 Current = InputBuffer->getBufferStart(); 709 End = InputBuffer->getBufferEnd(); 710 } 711 712 Scanner::Scanner(MemoryBuffer *Buffer, SourceMgr &SM_) 713 : SM(SM_) 714 , InputBuffer(Buffer) 715 , Current(InputBuffer->getBufferStart()) 716 , End(InputBuffer->getBufferEnd()) 717 , Indent(-1) 718 , Column(0) 719 , Line(0) 720 , FlowLevel(0) 721 , IsStartOfStream(true) 722 , IsSimpleKeyAllowed(true) 723 , Failed(false) { 724 SM.AddNewSourceBuffer(InputBuffer, SMLoc()); 725 } 726 727 Token &Scanner::peekNext() { 728 // If the current token is a possible simple key, keep parsing until we 729 // can confirm. 730 bool NeedMore = false; 731 while (true) { 732 if (TokenQueue.empty() || NeedMore) { 733 if (!fetchMoreTokens()) { 734 TokenQueue.clear(); 735 TokenQueue.push_back(Token()); 736 return TokenQueue.front(); 737 } 738 } 739 assert(!TokenQueue.empty() && 740 "fetchMoreTokens lied about getting tokens!"); 741 742 removeStaleSimpleKeyCandidates(); 743 SimpleKey SK; 744 SK.Tok = TokenQueue.front(); 745 if (std::find(SimpleKeys.begin(), SimpleKeys.end(), SK) 746 == SimpleKeys.end()) 747 break; 748 else 749 NeedMore = true; 750 } 751 return TokenQueue.front(); 752 } 753 754 Token Scanner::getNext() { 755 Token Ret = peekNext(); 756 // TokenQueue can be empty if there was an error getting the next token. 757 if (!TokenQueue.empty()) 758 TokenQueue.pop_front(); 759 760 // There cannot be any referenced Token's if the TokenQueue is empty. So do a 761 // quick deallocation of them all. 762 if (TokenQueue.empty()) { 763 TokenQueue.Alloc.Reset(); 764 } 765 766 return Ret; 767 } 768 769 StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) { 770 if (Position == End) 771 return Position; 772 // Check 7 bit c-printable - b-char. 773 if ( *Position == 0x09 774 || (*Position >= 0x20 && *Position <= 0x7E)) 775 return Position + 1; 776 777 // Check for valid UTF-8. 778 if (uint8_t(*Position) & 0x80) { 779 UTF8Decoded u8d = decodeUTF8(Position); 780 if ( u8d.second != 0 781 && u8d.first != 0xFEFF 782 && ( u8d.first == 0x85 783 || ( u8d.first >= 0xA0 784 && u8d.first <= 0xD7FF) 785 || ( u8d.first >= 0xE000 786 && u8d.first <= 0xFFFD) 787 || ( u8d.first >= 0x10000 788 && u8d.first <= 0x10FFFF))) 789 return Position + u8d.second; 790 } 791 return Position; 792 } 793 794 StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) { 795 if (Position == End) 796 return Position; 797 if (*Position == 0x0D) { 798 if (Position + 1 != End && *(Position + 1) == 0x0A) 799 return Position + 2; 800 return Position + 1; 801 } 802 803 if (*Position == 0x0A) 804 return Position + 1; 805 return Position; 806 } 807 808 809 StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) { 810 if (Position == End) 811 return Position; 812 if (*Position == ' ' || *Position == '\t') 813 return Position + 1; 814 return Position; 815 } 816 817 StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) { 818 if (Position == End) 819 return Position; 820 if (*Position == ' ' || *Position == '\t') 821 return Position; 822 return skip_nb_char(Position); 823 } 824 825 StringRef::iterator Scanner::skip_while( SkipWhileFunc Func 826 , StringRef::iterator Position) { 827 while (true) { 828 StringRef::iterator i = (this->*Func)(Position); 829 if (i == Position) 830 break; 831 Position = i; 832 } 833 return Position; 834 } 835 836 static bool is_ns_hex_digit(const char C) { 837 return (C >= '0' && C <= '9') 838 || (C >= 'a' && C <= 'z') 839 || (C >= 'A' && C <= 'Z'); 840 } 841 842 static bool is_ns_word_char(const char C) { 843 return C == '-' 844 || (C >= 'a' && C <= 'z') 845 || (C >= 'A' && C <= 'Z'); 846 } 847 848 StringRef Scanner::scan_ns_uri_char() { 849 StringRef::iterator Start = Current; 850 while (true) { 851 if (Current == End) 852 break; 853 if (( *Current == '%' 854 && Current + 2 < End 855 && is_ns_hex_digit(*(Current + 1)) 856 && is_ns_hex_digit(*(Current + 2))) 857 || is_ns_word_char(*Current) 858 || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]") 859 != StringRef::npos) { 860 ++Current; 861 ++Column; 862 } else 863 break; 864 } 865 return StringRef(Start, Current - Start); 866 } 867 868 StringRef Scanner::scan_ns_plain_one_line() { 869 StringRef::iterator start = Current; 870 // The first character must already be verified. 871 ++Current; 872 while (true) { 873 if (Current == End) { 874 break; 875 } else if (*Current == ':') { 876 // Check if the next character is a ns-char. 877 if (Current + 1 == End) 878 break; 879 StringRef::iterator i = skip_ns_char(Current + 1); 880 if (Current + 1 != i) { 881 Current = i; 882 Column += 2; // Consume both the ':' and ns-char. 883 } else 884 break; 885 } else if (*Current == '#') { 886 // Check if the previous character was a ns-char. 887 // The & 0x80 check is to check for the trailing byte of a utf-8 888 if (*(Current - 1) & 0x80 || skip_ns_char(Current - 1) == Current) { 889 ++Current; 890 ++Column; 891 } else 892 break; 893 } else { 894 StringRef::iterator i = skip_nb_char(Current); 895 if (i == Current) 896 break; 897 Current = i; 898 ++Column; 899 } 900 } 901 return StringRef(start, Current - start); 902 } 903 904 bool Scanner::consume(uint32_t Expected) { 905 if (Expected >= 0x80) 906 report_fatal_error("Not dealing with this yet"); 907 if (Current == End) 908 return false; 909 if (uint8_t(*Current) >= 0x80) 910 report_fatal_error("Not dealing with this yet"); 911 if (uint8_t(*Current) == Expected) { 912 ++Current; 913 ++Column; 914 return true; 915 } 916 return false; 917 } 918 919 void Scanner::skip(uint32_t Distance) { 920 Current += Distance; 921 Column += Distance; 922 assert(Current <= End && "Skipped past the end"); 923 } 924 925 bool Scanner::isBlankOrBreak(StringRef::iterator Position) { 926 if (Position == End) 927 return false; 928 if ( *Position == ' ' || *Position == '\t' 929 || *Position == '\r' || *Position == '\n') 930 return true; 931 return false; 932 } 933 934 void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok 935 , unsigned AtColumn 936 , bool IsRequired) { 937 if (IsSimpleKeyAllowed) { 938 SimpleKey SK; 939 SK.Tok = Tok; 940 SK.Line = Line; 941 SK.Column = AtColumn; 942 SK.IsRequired = IsRequired; 943 SK.FlowLevel = FlowLevel; 944 SimpleKeys.push_back(SK); 945 } 946 } 947 948 void Scanner::removeStaleSimpleKeyCandidates() { 949 for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin(); 950 i != SimpleKeys.end();) { 951 if (i->Line != Line || i->Column + 1024 < Column) { 952 if (i->IsRequired) 953 setError( "Could not find expected : for simple key" 954 , i->Tok->Range.begin()); 955 i = SimpleKeys.erase(i); 956 } else 957 ++i; 958 } 959 } 960 961 void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) { 962 if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level) 963 SimpleKeys.pop_back(); 964 } 965 966 bool Scanner::unrollIndent(int ToColumn) { 967 Token T; 968 // Indentation is ignored in flow. 969 if (FlowLevel != 0) 970 return true; 971 972 while (Indent > ToColumn) { 973 T.Kind = Token::TK_BlockEnd; 974 T.Range = StringRef(Current, 1); 975 TokenQueue.push_back(T); 976 Indent = Indents.pop_back_val(); 977 } 978 979 return true; 980 } 981 982 bool Scanner::rollIndent( int ToColumn 983 , Token::TokenKind Kind 984 , TokenQueueT::iterator InsertPoint) { 985 if (FlowLevel) 986 return true; 987 if (Indent < ToColumn) { 988 Indents.push_back(Indent); 989 Indent = ToColumn; 990 991 Token T; 992 T.Kind = Kind; 993 T.Range = StringRef(Current, 0); 994 TokenQueue.insert(InsertPoint, T); 995 } 996 return true; 997 } 998 999 void Scanner::scanToNextToken() { 1000 while (true) { 1001 while (*Current == ' ' || *Current == '\t') { 1002 skip(1); 1003 } 1004 1005 // Skip comment. 1006 if (*Current == '#') { 1007 while (true) { 1008 // This may skip more than one byte, thus Column is only incremented 1009 // for code points. 1010 StringRef::iterator i = skip_nb_char(Current); 1011 if (i == Current) 1012 break; 1013 Current = i; 1014 ++Column; 1015 } 1016 } 1017 1018 // Skip EOL. 1019 StringRef::iterator i = skip_b_break(Current); 1020 if (i == Current) 1021 break; 1022 Current = i; 1023 ++Line; 1024 Column = 0; 1025 // New lines may start a simple key. 1026 if (!FlowLevel) 1027 IsSimpleKeyAllowed = true; 1028 } 1029 } 1030 1031 bool Scanner::scanStreamStart() { 1032 IsStartOfStream = false; 1033 1034 EncodingInfo EI = getUnicodeEncoding(currentInput()); 1035 1036 Token T; 1037 T.Kind = Token::TK_StreamStart; 1038 T.Range = StringRef(Current, EI.second); 1039 TokenQueue.push_back(T); 1040 Current += EI.second; 1041 return true; 1042 } 1043 1044 bool Scanner::scanStreamEnd() { 1045 // Force an ending new line if one isn't present. 1046 if (Column != 0) { 1047 Column = 0; 1048 ++Line; 1049 } 1050 1051 unrollIndent(-1); 1052 SimpleKeys.clear(); 1053 IsSimpleKeyAllowed = false; 1054 1055 Token T; 1056 T.Kind = Token::TK_StreamEnd; 1057 T.Range = StringRef(Current, 0); 1058 TokenQueue.push_back(T); 1059 return true; 1060 } 1061 1062 bool Scanner::scanDirective() { 1063 // Reset the indentation level. 1064 unrollIndent(-1); 1065 SimpleKeys.clear(); 1066 IsSimpleKeyAllowed = false; 1067 1068 StringRef::iterator Start = Current; 1069 consume('%'); 1070 StringRef::iterator NameStart = Current; 1071 Current = skip_while(&Scanner::skip_ns_char, Current); 1072 StringRef Name(NameStart, Current - NameStart); 1073 Current = skip_while(&Scanner::skip_s_white, Current); 1074 1075 if (Name == "YAML") { 1076 Current = skip_while(&Scanner::skip_ns_char, Current); 1077 Token T; 1078 T.Kind = Token::TK_VersionDirective; 1079 T.Range = StringRef(Start, Current - Start); 1080 TokenQueue.push_back(T); 1081 return true; 1082 } 1083 return false; 1084 } 1085 1086 bool Scanner::scanDocumentIndicator(bool IsStart) { 1087 unrollIndent(-1); 1088 SimpleKeys.clear(); 1089 IsSimpleKeyAllowed = false; 1090 1091 Token T; 1092 T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd; 1093 T.Range = StringRef(Current, 3); 1094 skip(3); 1095 TokenQueue.push_back(T); 1096 return true; 1097 } 1098 1099 bool Scanner::scanFlowCollectionStart(bool IsSequence) { 1100 Token T; 1101 T.Kind = IsSequence ? Token::TK_FlowSequenceStart 1102 : Token::TK_FlowMappingStart; 1103 T.Range = StringRef(Current, 1); 1104 skip(1); 1105 TokenQueue.push_back(T); 1106 1107 // [ and { may begin a simple key. 1108 saveSimpleKeyCandidate(TokenQueue.back(), Column - 1, false); 1109 1110 // And may also be followed by a simple key. 1111 IsSimpleKeyAllowed = true; 1112 ++FlowLevel; 1113 return true; 1114 } 1115 1116 bool Scanner::scanFlowCollectionEnd(bool IsSequence) { 1117 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1118 IsSimpleKeyAllowed = false; 1119 Token T; 1120 T.Kind = IsSequence ? Token::TK_FlowSequenceEnd 1121 : Token::TK_FlowMappingEnd; 1122 T.Range = StringRef(Current, 1); 1123 skip(1); 1124 TokenQueue.push_back(T); 1125 if (FlowLevel) 1126 --FlowLevel; 1127 return true; 1128 } 1129 1130 bool Scanner::scanFlowEntry() { 1131 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1132 IsSimpleKeyAllowed = true; 1133 Token T; 1134 T.Kind = Token::TK_FlowEntry; 1135 T.Range = StringRef(Current, 1); 1136 skip(1); 1137 TokenQueue.push_back(T); 1138 return true; 1139 } 1140 1141 bool Scanner::scanBlockEntry() { 1142 rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end()); 1143 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1144 IsSimpleKeyAllowed = true; 1145 Token T; 1146 T.Kind = Token::TK_BlockEntry; 1147 T.Range = StringRef(Current, 1); 1148 skip(1); 1149 TokenQueue.push_back(T); 1150 return true; 1151 } 1152 1153 bool Scanner::scanKey() { 1154 if (!FlowLevel) 1155 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1156 1157 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1158 IsSimpleKeyAllowed = !FlowLevel; 1159 1160 Token T; 1161 T.Kind = Token::TK_Key; 1162 T.Range = StringRef(Current, 1); 1163 skip(1); 1164 TokenQueue.push_back(T); 1165 return true; 1166 } 1167 1168 bool Scanner::scanValue() { 1169 // If the previous token could have been a simple key, insert the key token 1170 // into the token queue. 1171 if (!SimpleKeys.empty()) { 1172 SimpleKey SK = SimpleKeys.pop_back_val(); 1173 Token T; 1174 T.Kind = Token::TK_Key; 1175 T.Range = SK.Tok->Range; 1176 TokenQueueT::iterator i, e; 1177 for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) { 1178 if (i == SK.Tok) 1179 break; 1180 } 1181 assert(i != e && "SimpleKey not in token queue!"); 1182 i = TokenQueue.insert(i, T); 1183 1184 // We may also need to add a Block-Mapping-Start token. 1185 rollIndent(SK.Column, Token::TK_BlockMappingStart, i); 1186 1187 IsSimpleKeyAllowed = false; 1188 } else { 1189 if (!FlowLevel) 1190 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1191 IsSimpleKeyAllowed = !FlowLevel; 1192 } 1193 1194 Token T; 1195 T.Kind = Token::TK_Value; 1196 T.Range = StringRef(Current, 1); 1197 skip(1); 1198 TokenQueue.push_back(T); 1199 return true; 1200 } 1201 1202 // Forbidding inlining improves performance by roughly 20%. 1203 // FIXME: Remove once llvm optimizes this to the faster version without hints. 1204 LLVM_ATTRIBUTE_NOINLINE static bool 1205 wasEscaped(StringRef::iterator First, StringRef::iterator Position); 1206 1207 // Returns whether a character at 'Position' was escaped with a leading '\'. 1208 // 'First' specifies the position of the first character in the string. 1209 static bool wasEscaped(StringRef::iterator First, 1210 StringRef::iterator Position) { 1211 assert(Position - 1 >= First); 1212 StringRef::iterator I = Position - 1; 1213 // We calculate the number of consecutive '\'s before the current position 1214 // by iterating backwards through our string. 1215 while (I >= First && *I == '\\') --I; 1216 // (Position - 1 - I) now contains the number of '\'s before the current 1217 // position. If it is odd, the character at 'Position' was escaped. 1218 return (Position - 1 - I) % 2 == 1; 1219 } 1220 1221 bool Scanner::scanFlowScalar(bool IsDoubleQuoted) { 1222 StringRef::iterator Start = Current; 1223 unsigned ColStart = Column; 1224 if (IsDoubleQuoted) { 1225 do { 1226 ++Current; 1227 while (Current != End && *Current != '"') 1228 ++Current; 1229 // Repeat until the previous character was not a '\' or was an escaped 1230 // backslash. 1231 } while ( Current != End 1232 && *(Current - 1) == '\\' 1233 && wasEscaped(Start + 1, Current)); 1234 } else { 1235 skip(1); 1236 while (true) { 1237 // Skip a ' followed by another '. 1238 if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') { 1239 skip(2); 1240 continue; 1241 } else if (*Current == '\'') 1242 break; 1243 StringRef::iterator i = skip_nb_char(Current); 1244 if (i == Current) { 1245 i = skip_b_break(Current); 1246 if (i == Current) 1247 break; 1248 Current = i; 1249 Column = 0; 1250 ++Line; 1251 } else { 1252 if (i == End) 1253 break; 1254 Current = i; 1255 ++Column; 1256 } 1257 } 1258 } 1259 1260 if (Current == End) { 1261 setError("Expected quote at end of scalar", Current); 1262 return false; 1263 } 1264 1265 skip(1); // Skip ending quote. 1266 Token T; 1267 T.Kind = Token::TK_Scalar; 1268 T.Range = StringRef(Start, Current - Start); 1269 TokenQueue.push_back(T); 1270 1271 saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); 1272 1273 IsSimpleKeyAllowed = false; 1274 1275 return true; 1276 } 1277 1278 bool Scanner::scanPlainScalar() { 1279 StringRef::iterator Start = Current; 1280 unsigned ColStart = Column; 1281 unsigned LeadingBlanks = 0; 1282 assert(Indent >= -1 && "Indent must be >= -1 !"); 1283 unsigned indent = static_cast<unsigned>(Indent + 1); 1284 while (true) { 1285 if (*Current == '#') 1286 break; 1287 1288 while (!isBlankOrBreak(Current)) { 1289 if ( FlowLevel && *Current == ':' 1290 && !(isBlankOrBreak(Current + 1) || *(Current + 1) == ',')) { 1291 setError("Found unexpected ':' while scanning a plain scalar", Current); 1292 return false; 1293 } 1294 1295 // Check for the end of the plain scalar. 1296 if ( (*Current == ':' && isBlankOrBreak(Current + 1)) 1297 || ( FlowLevel 1298 && (StringRef(Current, 1).find_first_of(",:?[]{}") 1299 != StringRef::npos))) 1300 break; 1301 1302 StringRef::iterator i = skip_nb_char(Current); 1303 if (i == Current) 1304 break; 1305 Current = i; 1306 ++Column; 1307 } 1308 1309 // Are we at the end? 1310 if (!isBlankOrBreak(Current)) 1311 break; 1312 1313 // Eat blanks. 1314 StringRef::iterator Tmp = Current; 1315 while (isBlankOrBreak(Tmp)) { 1316 StringRef::iterator i = skip_s_white(Tmp); 1317 if (i != Tmp) { 1318 if (LeadingBlanks && (Column < indent) && *Tmp == '\t') { 1319 setError("Found invalid tab character in indentation", Tmp); 1320 return false; 1321 } 1322 Tmp = i; 1323 ++Column; 1324 } else { 1325 i = skip_b_break(Tmp); 1326 if (!LeadingBlanks) 1327 LeadingBlanks = 1; 1328 Tmp = i; 1329 Column = 0; 1330 ++Line; 1331 } 1332 } 1333 1334 if (!FlowLevel && Column < indent) 1335 break; 1336 1337 Current = Tmp; 1338 } 1339 if (Start == Current) { 1340 setError("Got empty plain scalar", Start); 1341 return false; 1342 } 1343 Token T; 1344 T.Kind = Token::TK_Scalar; 1345 T.Range = StringRef(Start, Current - Start); 1346 TokenQueue.push_back(T); 1347 1348 // Plain scalars can be simple keys. 1349 saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); 1350 1351 IsSimpleKeyAllowed = false; 1352 1353 return true; 1354 } 1355 1356 bool Scanner::scanAliasOrAnchor(bool IsAlias) { 1357 StringRef::iterator Start = Current; 1358 unsigned ColStart = Column; 1359 skip(1); 1360 while(true) { 1361 if ( *Current == '[' || *Current == ']' 1362 || *Current == '{' || *Current == '}' 1363 || *Current == ',' 1364 || *Current == ':') 1365 break; 1366 StringRef::iterator i = skip_ns_char(Current); 1367 if (i == Current) 1368 break; 1369 Current = i; 1370 ++Column; 1371 } 1372 1373 if (Start == Current) { 1374 setError("Got empty alias or anchor", Start); 1375 return false; 1376 } 1377 1378 Token T; 1379 T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor; 1380 T.Range = StringRef(Start, Current - Start); 1381 TokenQueue.push_back(T); 1382 1383 // Alias and anchors can be simple keys. 1384 saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); 1385 1386 IsSimpleKeyAllowed = false; 1387 1388 return true; 1389 } 1390 1391 bool Scanner::scanBlockScalar(bool IsLiteral) { 1392 StringRef::iterator Start = Current; 1393 skip(1); // Eat | or > 1394 while(true) { 1395 StringRef::iterator i = skip_nb_char(Current); 1396 if (i == Current) { 1397 if (Column == 0) 1398 break; 1399 i = skip_b_break(Current); 1400 if (i != Current) { 1401 // We got a line break. 1402 Column = 0; 1403 ++Line; 1404 Current = i; 1405 continue; 1406 } else { 1407 // There was an error, which should already have been printed out. 1408 return false; 1409 } 1410 } 1411 Current = i; 1412 ++Column; 1413 } 1414 1415 if (Start == Current) { 1416 setError("Got empty block scalar", Start); 1417 return false; 1418 } 1419 1420 Token T; 1421 T.Kind = Token::TK_Scalar; 1422 T.Range = StringRef(Start, Current - Start); 1423 TokenQueue.push_back(T); 1424 return true; 1425 } 1426 1427 bool Scanner::scanTag() { 1428 StringRef::iterator Start = Current; 1429 unsigned ColStart = Column; 1430 skip(1); // Eat !. 1431 if (Current == End || isBlankOrBreak(Current)); // An empty tag. 1432 else if (*Current == '<') { 1433 skip(1); 1434 scan_ns_uri_char(); 1435 if (!consume('>')) 1436 return false; 1437 } else { 1438 // FIXME: Actually parse the c-ns-shorthand-tag rule. 1439 Current = skip_while(&Scanner::skip_ns_char, Current); 1440 } 1441 1442 Token T; 1443 T.Kind = Token::TK_Tag; 1444 T.Range = StringRef(Start, Current - Start); 1445 TokenQueue.push_back(T); 1446 1447 // Tags can be simple keys. 1448 saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); 1449 1450 IsSimpleKeyAllowed = false; 1451 1452 return true; 1453 } 1454 1455 bool Scanner::fetchMoreTokens() { 1456 if (IsStartOfStream) 1457 return scanStreamStart(); 1458 1459 scanToNextToken(); 1460 1461 if (Current == End) 1462 return scanStreamEnd(); 1463 1464 removeStaleSimpleKeyCandidates(); 1465 1466 unrollIndent(Column); 1467 1468 if (Column == 0 && *Current == '%') 1469 return scanDirective(); 1470 1471 if (Column == 0 && Current + 4 <= End 1472 && *Current == '-' 1473 && *(Current + 1) == '-' 1474 && *(Current + 2) == '-' 1475 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1476 return scanDocumentIndicator(true); 1477 1478 if (Column == 0 && Current + 4 <= End 1479 && *Current == '.' 1480 && *(Current + 1) == '.' 1481 && *(Current + 2) == '.' 1482 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1483 return scanDocumentIndicator(false); 1484 1485 if (*Current == '[') 1486 return scanFlowCollectionStart(true); 1487 1488 if (*Current == '{') 1489 return scanFlowCollectionStart(false); 1490 1491 if (*Current == ']') 1492 return scanFlowCollectionEnd(true); 1493 1494 if (*Current == '}') 1495 return scanFlowCollectionEnd(false); 1496 1497 if (*Current == ',') 1498 return scanFlowEntry(); 1499 1500 if (*Current == '-' && isBlankOrBreak(Current + 1)) 1501 return scanBlockEntry(); 1502 1503 if (*Current == '?' && (FlowLevel || isBlankOrBreak(Current + 1))) 1504 return scanKey(); 1505 1506 if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1))) 1507 return scanValue(); 1508 1509 if (*Current == '*') 1510 return scanAliasOrAnchor(true); 1511 1512 if (*Current == '&') 1513 return scanAliasOrAnchor(false); 1514 1515 if (*Current == '!') 1516 return scanTag(); 1517 1518 if (*Current == '|' && !FlowLevel) 1519 return scanBlockScalar(true); 1520 1521 if (*Current == '>' && !FlowLevel) 1522 return scanBlockScalar(false); 1523 1524 if (*Current == '\'') 1525 return scanFlowScalar(false); 1526 1527 if (*Current == '"') 1528 return scanFlowScalar(true); 1529 1530 // Get a plain scalar. 1531 StringRef FirstChar(Current, 1); 1532 if (!(isBlankOrBreak(Current) 1533 || FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos) 1534 || (*Current == '-' && !isBlankOrBreak(Current + 1)) 1535 || (!FlowLevel && (*Current == '?' || *Current == ':') 1536 && isBlankOrBreak(Current + 1)) 1537 || (!FlowLevel && *Current == ':' 1538 && Current + 2 < End 1539 && *(Current + 1) == ':' 1540 && !isBlankOrBreak(Current + 2))) 1541 return scanPlainScalar(); 1542 1543 setError("Unrecognized character while tokenizing."); 1544 return false; 1545 } 1546 1547 Stream::Stream(StringRef Input, SourceMgr &SM) 1548 : scanner(new Scanner(Input, SM)) 1549 , CurrentDoc(0) {} 1550 1551 Stream::Stream(MemoryBuffer *InputBuffer, SourceMgr &SM) 1552 : scanner(new Scanner(InputBuffer, SM)) 1553 , CurrentDoc(0) {} 1554 1555 Stream::~Stream() {} 1556 1557 bool Stream::failed() { return scanner->failed(); } 1558 1559 void Stream::printError(Node *N, const Twine &Msg) { 1560 SmallVector<SMRange, 1> Ranges; 1561 Ranges.push_back(N->getSourceRange()); 1562 scanner->printError( N->getSourceRange().Start 1563 , SourceMgr::DK_Error 1564 , Msg 1565 , Ranges); 1566 } 1567 1568 void Stream::handleYAMLDirective(const Token &t) { 1569 // TODO: Ensure version is 1.x. 1570 } 1571 1572 document_iterator Stream::begin() { 1573 if (CurrentDoc) 1574 report_fatal_error("Can only iterate over the stream once"); 1575 1576 // Skip Stream-Start. 1577 scanner->getNext(); 1578 1579 CurrentDoc.reset(new Document(*this)); 1580 return document_iterator(CurrentDoc); 1581 } 1582 1583 document_iterator Stream::end() { 1584 return document_iterator(); 1585 } 1586 1587 void Stream::skip() { 1588 for (document_iterator i = begin(), e = end(); i != e; ++i) 1589 i->skip(); 1590 } 1591 1592 Node::Node(unsigned int Type, OwningPtr<Document> &D, StringRef A) 1593 : Doc(D) 1594 , TypeID(Type) 1595 , Anchor(A) { 1596 SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin()); 1597 SourceRange = SMRange(Start, Start); 1598 } 1599 1600 Token &Node::peekNext() { 1601 return Doc->peekNext(); 1602 } 1603 1604 Token Node::getNext() { 1605 return Doc->getNext(); 1606 } 1607 1608 Node *Node::parseBlockNode() { 1609 return Doc->parseBlockNode(); 1610 } 1611 1612 BumpPtrAllocator &Node::getAllocator() { 1613 return Doc->NodeAllocator; 1614 } 1615 1616 void Node::setError(const Twine &Msg, Token &Tok) const { 1617 Doc->setError(Msg, Tok); 1618 } 1619 1620 bool Node::failed() const { 1621 return Doc->failed(); 1622 } 1623 1624 1625 1626 StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const { 1627 // TODO: Handle newlines properly. We need to remove leading whitespace. 1628 if (Value[0] == '"') { // Double quoted. 1629 // Pull off the leading and trailing "s. 1630 StringRef UnquotedValue = Value.substr(1, Value.size() - 2); 1631 // Search for characters that would require unescaping the value. 1632 StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n"); 1633 if (i != StringRef::npos) 1634 return unescapeDoubleQuoted(UnquotedValue, i, Storage); 1635 return UnquotedValue; 1636 } else if (Value[0] == '\'') { // Single quoted. 1637 // Pull off the leading and trailing 's. 1638 StringRef UnquotedValue = Value.substr(1, Value.size() - 2); 1639 StringRef::size_type i = UnquotedValue.find('\''); 1640 if (i != StringRef::npos) { 1641 // We're going to need Storage. 1642 Storage.clear(); 1643 Storage.reserve(UnquotedValue.size()); 1644 for (; i != StringRef::npos; i = UnquotedValue.find('\'')) { 1645 StringRef Valid(UnquotedValue.begin(), i); 1646 Storage.insert(Storage.end(), Valid.begin(), Valid.end()); 1647 Storage.push_back('\''); 1648 UnquotedValue = UnquotedValue.substr(i + 2); 1649 } 1650 Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); 1651 return StringRef(Storage.begin(), Storage.size()); 1652 } 1653 return UnquotedValue; 1654 } 1655 // Plain or block. 1656 return Value.rtrim(" "); 1657 } 1658 1659 StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue 1660 , StringRef::size_type i 1661 , SmallVectorImpl<char> &Storage) 1662 const { 1663 // Use Storage to build proper value. 1664 Storage.clear(); 1665 Storage.reserve(UnquotedValue.size()); 1666 for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) { 1667 // Insert all previous chars into Storage. 1668 StringRef Valid(UnquotedValue.begin(), i); 1669 Storage.insert(Storage.end(), Valid.begin(), Valid.end()); 1670 // Chop off inserted chars. 1671 UnquotedValue = UnquotedValue.substr(i); 1672 1673 assert(!UnquotedValue.empty() && "Can't be empty!"); 1674 1675 // Parse escape or line break. 1676 switch (UnquotedValue[0]) { 1677 case '\r': 1678 case '\n': 1679 Storage.push_back('\n'); 1680 if ( UnquotedValue.size() > 1 1681 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 1682 UnquotedValue = UnquotedValue.substr(1); 1683 UnquotedValue = UnquotedValue.substr(1); 1684 break; 1685 default: 1686 if (UnquotedValue.size() == 1) 1687 // TODO: Report error. 1688 break; 1689 UnquotedValue = UnquotedValue.substr(1); 1690 switch (UnquotedValue[0]) { 1691 default: { 1692 Token T; 1693 T.Range = StringRef(UnquotedValue.begin(), 1); 1694 setError("Unrecognized escape code!", T); 1695 return ""; 1696 } 1697 case '\r': 1698 case '\n': 1699 // Remove the new line. 1700 if ( UnquotedValue.size() > 1 1701 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 1702 UnquotedValue = UnquotedValue.substr(1); 1703 // If this was just a single byte newline, it will get skipped 1704 // below. 1705 break; 1706 case '0': 1707 Storage.push_back(0x00); 1708 break; 1709 case 'a': 1710 Storage.push_back(0x07); 1711 break; 1712 case 'b': 1713 Storage.push_back(0x08); 1714 break; 1715 case 't': 1716 case 0x09: 1717 Storage.push_back(0x09); 1718 break; 1719 case 'n': 1720 Storage.push_back(0x0A); 1721 break; 1722 case 'v': 1723 Storage.push_back(0x0B); 1724 break; 1725 case 'f': 1726 Storage.push_back(0x0C); 1727 break; 1728 case 'r': 1729 Storage.push_back(0x0D); 1730 break; 1731 case 'e': 1732 Storage.push_back(0x1B); 1733 break; 1734 case ' ': 1735 Storage.push_back(0x20); 1736 break; 1737 case '"': 1738 Storage.push_back(0x22); 1739 break; 1740 case '/': 1741 Storage.push_back(0x2F); 1742 break; 1743 case '\\': 1744 Storage.push_back(0x5C); 1745 break; 1746 case 'N': 1747 encodeUTF8(0x85, Storage); 1748 break; 1749 case '_': 1750 encodeUTF8(0xA0, Storage); 1751 break; 1752 case 'L': 1753 encodeUTF8(0x2028, Storage); 1754 break; 1755 case 'P': 1756 encodeUTF8(0x2029, Storage); 1757 break; 1758 case 'x': { 1759 if (UnquotedValue.size() < 3) 1760 // TODO: Report error. 1761 break; 1762 unsigned int UnicodeScalarValue; 1763 if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue)) 1764 // TODO: Report error. 1765 UnicodeScalarValue = 0xFFFD; 1766 encodeUTF8(UnicodeScalarValue, Storage); 1767 UnquotedValue = UnquotedValue.substr(2); 1768 break; 1769 } 1770 case 'u': { 1771 if (UnquotedValue.size() < 5) 1772 // TODO: Report error. 1773 break; 1774 unsigned int UnicodeScalarValue; 1775 if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue)) 1776 // TODO: Report error. 1777 UnicodeScalarValue = 0xFFFD; 1778 encodeUTF8(UnicodeScalarValue, Storage); 1779 UnquotedValue = UnquotedValue.substr(4); 1780 break; 1781 } 1782 case 'U': { 1783 if (UnquotedValue.size() < 9) 1784 // TODO: Report error. 1785 break; 1786 unsigned int UnicodeScalarValue; 1787 if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue)) 1788 // TODO: Report error. 1789 UnicodeScalarValue = 0xFFFD; 1790 encodeUTF8(UnicodeScalarValue, Storage); 1791 UnquotedValue = UnquotedValue.substr(8); 1792 break; 1793 } 1794 } 1795 UnquotedValue = UnquotedValue.substr(1); 1796 } 1797 } 1798 Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); 1799 return StringRef(Storage.begin(), Storage.size()); 1800 } 1801 1802 Node *KeyValueNode::getKey() { 1803 if (Key) 1804 return Key; 1805 // Handle implicit null keys. 1806 { 1807 Token &t = peekNext(); 1808 if ( t.Kind == Token::TK_BlockEnd 1809 || t.Kind == Token::TK_Value 1810 || t.Kind == Token::TK_Error) { 1811 return Key = new (getAllocator()) NullNode(Doc); 1812 } 1813 if (t.Kind == Token::TK_Key) 1814 getNext(); // skip TK_Key. 1815 } 1816 1817 // Handle explicit null keys. 1818 Token &t = peekNext(); 1819 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) { 1820 return Key = new (getAllocator()) NullNode(Doc); 1821 } 1822 1823 // We've got a normal key. 1824 return Key = parseBlockNode(); 1825 } 1826 1827 Node *KeyValueNode::getValue() { 1828 if (Value) 1829 return Value; 1830 getKey()->skip(); 1831 if (failed()) 1832 return Value = new (getAllocator()) NullNode(Doc); 1833 1834 // Handle implicit null values. 1835 { 1836 Token &t = peekNext(); 1837 if ( t.Kind == Token::TK_BlockEnd 1838 || t.Kind == Token::TK_FlowMappingEnd 1839 || t.Kind == Token::TK_Key 1840 || t.Kind == Token::TK_FlowEntry 1841 || t.Kind == Token::TK_Error) { 1842 return Value = new (getAllocator()) NullNode(Doc); 1843 } 1844 1845 if (t.Kind != Token::TK_Value) { 1846 setError("Unexpected token in Key Value.", t); 1847 return Value = new (getAllocator()) NullNode(Doc); 1848 } 1849 getNext(); // skip TK_Value. 1850 } 1851 1852 // Handle explicit null values. 1853 Token &t = peekNext(); 1854 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) { 1855 return Value = new (getAllocator()) NullNode(Doc); 1856 } 1857 1858 // We got a normal value. 1859 return Value = parseBlockNode(); 1860 } 1861 1862 void MappingNode::increment() { 1863 if (failed()) { 1864 IsAtEnd = true; 1865 CurrentEntry = 0; 1866 return; 1867 } 1868 if (CurrentEntry) { 1869 CurrentEntry->skip(); 1870 if (Type == MT_Inline) { 1871 IsAtEnd = true; 1872 CurrentEntry = 0; 1873 return; 1874 } 1875 } 1876 Token T = peekNext(); 1877 if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) { 1878 // KeyValueNode eats the TK_Key. That way it can detect null keys. 1879 CurrentEntry = new (getAllocator()) KeyValueNode(Doc); 1880 } else if (Type == MT_Block) { 1881 switch (T.Kind) { 1882 case Token::TK_BlockEnd: 1883 getNext(); 1884 IsAtEnd = true; 1885 CurrentEntry = 0; 1886 break; 1887 default: 1888 setError("Unexpected token. Expected Key or Block End", T); 1889 case Token::TK_Error: 1890 IsAtEnd = true; 1891 CurrentEntry = 0; 1892 } 1893 } else { 1894 switch (T.Kind) { 1895 case Token::TK_FlowEntry: 1896 // Eat the flow entry and recurse. 1897 getNext(); 1898 return increment(); 1899 case Token::TK_FlowMappingEnd: 1900 getNext(); 1901 case Token::TK_Error: 1902 // Set this to end iterator. 1903 IsAtEnd = true; 1904 CurrentEntry = 0; 1905 break; 1906 default: 1907 setError( "Unexpected token. Expected Key, Flow Entry, or Flow " 1908 "Mapping End." 1909 , T); 1910 IsAtEnd = true; 1911 CurrentEntry = 0; 1912 } 1913 } 1914 } 1915 1916 void SequenceNode::increment() { 1917 if (failed()) { 1918 IsAtEnd = true; 1919 CurrentEntry = 0; 1920 return; 1921 } 1922 if (CurrentEntry) 1923 CurrentEntry->skip(); 1924 Token T = peekNext(); 1925 if (SeqType == ST_Block) { 1926 switch (T.Kind) { 1927 case Token::TK_BlockEntry: 1928 getNext(); 1929 CurrentEntry = parseBlockNode(); 1930 if (CurrentEntry == 0) { // An error occurred. 1931 IsAtEnd = true; 1932 CurrentEntry = 0; 1933 } 1934 break; 1935 case Token::TK_BlockEnd: 1936 getNext(); 1937 IsAtEnd = true; 1938 CurrentEntry = 0; 1939 break; 1940 default: 1941 setError( "Unexpected token. Expected Block Entry or Block End." 1942 , T); 1943 case Token::TK_Error: 1944 IsAtEnd = true; 1945 CurrentEntry = 0; 1946 } 1947 } else if (SeqType == ST_Indentless) { 1948 switch (T.Kind) { 1949 case Token::TK_BlockEntry: 1950 getNext(); 1951 CurrentEntry = parseBlockNode(); 1952 if (CurrentEntry == 0) { // An error occurred. 1953 IsAtEnd = true; 1954 CurrentEntry = 0; 1955 } 1956 break; 1957 default: 1958 case Token::TK_Error: 1959 IsAtEnd = true; 1960 CurrentEntry = 0; 1961 } 1962 } else if (SeqType == ST_Flow) { 1963 switch (T.Kind) { 1964 case Token::TK_FlowEntry: 1965 // Eat the flow entry and recurse. 1966 getNext(); 1967 WasPreviousTokenFlowEntry = true; 1968 return increment(); 1969 case Token::TK_FlowSequenceEnd: 1970 getNext(); 1971 case Token::TK_Error: 1972 // Set this to end iterator. 1973 IsAtEnd = true; 1974 CurrentEntry = 0; 1975 break; 1976 case Token::TK_StreamEnd: 1977 case Token::TK_DocumentEnd: 1978 case Token::TK_DocumentStart: 1979 setError("Could not find closing ]!", T); 1980 // Set this to end iterator. 1981 IsAtEnd = true; 1982 CurrentEntry = 0; 1983 break; 1984 default: 1985 if (!WasPreviousTokenFlowEntry) { 1986 setError("Expected , between entries!", T); 1987 IsAtEnd = true; 1988 CurrentEntry = 0; 1989 break; 1990 } 1991 // Otherwise it must be a flow entry. 1992 CurrentEntry = parseBlockNode(); 1993 if (!CurrentEntry) { 1994 IsAtEnd = true; 1995 } 1996 WasPreviousTokenFlowEntry = false; 1997 break; 1998 } 1999 } 2000 } 2001 2002 Document::Document(Stream &S) : stream(S), Root(0) { 2003 if (parseDirectives()) 2004 expectToken(Token::TK_DocumentStart); 2005 Token &T = peekNext(); 2006 if (T.Kind == Token::TK_DocumentStart) 2007 getNext(); 2008 } 2009 2010 bool Document::skip() { 2011 if (stream.scanner->failed()) 2012 return false; 2013 if (!Root) 2014 getRoot(); 2015 Root->skip(); 2016 Token &T = peekNext(); 2017 if (T.Kind == Token::TK_StreamEnd) 2018 return false; 2019 if (T.Kind == Token::TK_DocumentEnd) { 2020 getNext(); 2021 return skip(); 2022 } 2023 return true; 2024 } 2025 2026 Token &Document::peekNext() { 2027 return stream.scanner->peekNext(); 2028 } 2029 2030 Token Document::getNext() { 2031 return stream.scanner->getNext(); 2032 } 2033 2034 void Document::setError(const Twine &Message, Token &Location) const { 2035 stream.scanner->setError(Message, Location.Range.begin()); 2036 } 2037 2038 bool Document::failed() const { 2039 return stream.scanner->failed(); 2040 } 2041 2042 Node *Document::parseBlockNode() { 2043 Token T = peekNext(); 2044 // Handle properties. 2045 Token AnchorInfo; 2046 parse_property: 2047 switch (T.Kind) { 2048 case Token::TK_Alias: 2049 getNext(); 2050 return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1)); 2051 case Token::TK_Anchor: 2052 if (AnchorInfo.Kind == Token::TK_Anchor) { 2053 setError("Already encountered an anchor for this node!", T); 2054 return 0; 2055 } 2056 AnchorInfo = getNext(); // Consume TK_Anchor. 2057 T = peekNext(); 2058 goto parse_property; 2059 case Token::TK_Tag: 2060 getNext(); // Skip TK_Tag. 2061 T = peekNext(); 2062 goto parse_property; 2063 default: 2064 break; 2065 } 2066 2067 switch (T.Kind) { 2068 case Token::TK_BlockEntry: 2069 // We got an unindented BlockEntry sequence. This is not terminated with 2070 // a BlockEnd. 2071 // Don't eat the TK_BlockEntry, SequenceNode needs it. 2072 return new (NodeAllocator) SequenceNode( stream.CurrentDoc 2073 , AnchorInfo.Range.substr(1) 2074 , SequenceNode::ST_Indentless); 2075 case Token::TK_BlockSequenceStart: 2076 getNext(); 2077 return new (NodeAllocator) 2078 SequenceNode( stream.CurrentDoc 2079 , AnchorInfo.Range.substr(1) 2080 , SequenceNode::ST_Block); 2081 case Token::TK_BlockMappingStart: 2082 getNext(); 2083 return new (NodeAllocator) 2084 MappingNode( stream.CurrentDoc 2085 , AnchorInfo.Range.substr(1) 2086 , MappingNode::MT_Block); 2087 case Token::TK_FlowSequenceStart: 2088 getNext(); 2089 return new (NodeAllocator) 2090 SequenceNode( stream.CurrentDoc 2091 , AnchorInfo.Range.substr(1) 2092 , SequenceNode::ST_Flow); 2093 case Token::TK_FlowMappingStart: 2094 getNext(); 2095 return new (NodeAllocator) 2096 MappingNode( stream.CurrentDoc 2097 , AnchorInfo.Range.substr(1) 2098 , MappingNode::MT_Flow); 2099 case Token::TK_Scalar: 2100 getNext(); 2101 return new (NodeAllocator) 2102 ScalarNode( stream.CurrentDoc 2103 , AnchorInfo.Range.substr(1) 2104 , T.Range); 2105 case Token::TK_Key: 2106 // Don't eat the TK_Key, KeyValueNode expects it. 2107 return new (NodeAllocator) 2108 MappingNode( stream.CurrentDoc 2109 , AnchorInfo.Range.substr(1) 2110 , MappingNode::MT_Inline); 2111 case Token::TK_DocumentStart: 2112 case Token::TK_DocumentEnd: 2113 case Token::TK_StreamEnd: 2114 default: 2115 // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not 2116 // !!null null. 2117 return new (NodeAllocator) NullNode(stream.CurrentDoc); 2118 case Token::TK_Error: 2119 return 0; 2120 } 2121 llvm_unreachable("Control flow shouldn't reach here."); 2122 return 0; 2123 } 2124 2125 bool Document::parseDirectives() { 2126 bool isDirective = false; 2127 while (true) { 2128 Token T = peekNext(); 2129 if (T.Kind == Token::TK_TagDirective) { 2130 handleTagDirective(getNext()); 2131 isDirective = true; 2132 } else if (T.Kind == Token::TK_VersionDirective) { 2133 stream.handleYAMLDirective(getNext()); 2134 isDirective = true; 2135 } else 2136 break; 2137 } 2138 return isDirective; 2139 } 2140 2141 bool Document::expectToken(int TK) { 2142 Token T = getNext(); 2143 if (T.Kind != TK) { 2144 setError("Unexpected token", T); 2145 return false; 2146 } 2147 return true; 2148 } 2149