1 //===--- YAMLParser.cpp - Simple YAML parser ------------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file implements a YAML parser. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "llvm/Support/YAMLParser.h" 15 #include "llvm/ADT/SmallVector.h" 16 #include "llvm/ADT/StringExtras.h" 17 #include "llvm/ADT/Twine.h" 18 #include "llvm/ADT/ilist.h" 19 #include "llvm/ADT/ilist_node.h" 20 #include "llvm/Support/ErrorHandling.h" 21 #include "llvm/Support/MemoryBuffer.h" 22 #include "llvm/Support/SourceMgr.h" 23 #include "llvm/Support/raw_ostream.h" 24 25 using namespace llvm; 26 using namespace yaml; 27 28 enum UnicodeEncodingForm { 29 UEF_UTF32_LE, ///< UTF-32 Little Endian 30 UEF_UTF32_BE, ///< UTF-32 Big Endian 31 UEF_UTF16_LE, ///< UTF-16 Little Endian 32 UEF_UTF16_BE, ///< UTF-16 Big Endian 33 UEF_UTF8, ///< UTF-8 or ascii. 34 UEF_Unknown ///< Not a valid Unicode encoding. 35 }; 36 37 /// EncodingInfo - Holds the encoding type and length of the byte order mark if 38 /// it exists. Length is in {0, 2, 3, 4}. 39 typedef std::pair<UnicodeEncodingForm, unsigned> EncodingInfo; 40 41 /// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode 42 /// encoding form of \a Input. 43 /// 44 /// @param Input A string of length 0 or more. 45 /// @returns An EncodingInfo indicating the Unicode encoding form of the input 46 /// and how long the byte order mark is if one exists. 47 static EncodingInfo getUnicodeEncoding(StringRef Input) { 48 if (Input.size() == 0) 49 return std::make_pair(UEF_Unknown, 0); 50 51 switch (uint8_t(Input[0])) { 52 case 0x00: 53 if (Input.size() >= 4) { 54 if ( Input[1] == 0 55 && uint8_t(Input[2]) == 0xFE 56 && uint8_t(Input[3]) == 0xFF) 57 return std::make_pair(UEF_UTF32_BE, 4); 58 if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0) 59 return std::make_pair(UEF_UTF32_BE, 0); 60 } 61 62 if (Input.size() >= 2 && Input[1] != 0) 63 return std::make_pair(UEF_UTF16_BE, 0); 64 return std::make_pair(UEF_Unknown, 0); 65 case 0xFF: 66 if ( Input.size() >= 4 67 && uint8_t(Input[1]) == 0xFE 68 && Input[2] == 0 69 && Input[3] == 0) 70 return std::make_pair(UEF_UTF32_LE, 4); 71 72 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE) 73 return std::make_pair(UEF_UTF16_LE, 2); 74 return std::make_pair(UEF_Unknown, 0); 75 case 0xFE: 76 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF) 77 return std::make_pair(UEF_UTF16_BE, 2); 78 return std::make_pair(UEF_Unknown, 0); 79 case 0xEF: 80 if ( Input.size() >= 3 81 && uint8_t(Input[1]) == 0xBB 82 && uint8_t(Input[2]) == 0xBF) 83 return std::make_pair(UEF_UTF8, 3); 84 return std::make_pair(UEF_Unknown, 0); 85 } 86 87 // It could still be utf-32 or utf-16. 88 if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0) 89 return std::make_pair(UEF_UTF32_LE, 0); 90 91 if (Input.size() >= 2 && Input[1] == 0) 92 return std::make_pair(UEF_UTF16_LE, 0); 93 94 return std::make_pair(UEF_UTF8, 0); 95 } 96 97 namespace llvm { 98 namespace yaml { 99 /// Pin the vtables to this file. 100 void Node::anchor() {} 101 void NullNode::anchor() {} 102 void ScalarNode::anchor() {} 103 void KeyValueNode::anchor() {} 104 void MappingNode::anchor() {} 105 void SequenceNode::anchor() {} 106 void AliasNode::anchor() {} 107 108 /// Token - A single YAML token. 109 struct Token : ilist_node<Token> { 110 enum TokenKind { 111 TK_Error, // Uninitialized token. 112 TK_StreamStart, 113 TK_StreamEnd, 114 TK_VersionDirective, 115 TK_TagDirective, 116 TK_DocumentStart, 117 TK_DocumentEnd, 118 TK_BlockEntry, 119 TK_BlockEnd, 120 TK_BlockSequenceStart, 121 TK_BlockMappingStart, 122 TK_FlowEntry, 123 TK_FlowSequenceStart, 124 TK_FlowSequenceEnd, 125 TK_FlowMappingStart, 126 TK_FlowMappingEnd, 127 TK_Key, 128 TK_Value, 129 TK_Scalar, 130 TK_Alias, 131 TK_Anchor, 132 TK_Tag 133 } Kind; 134 135 /// A string of length 0 or more whose begin() points to the logical location 136 /// of the token in the input. 137 StringRef Range; 138 139 Token() : Kind(TK_Error) {} 140 }; 141 } 142 } 143 144 namespace llvm { 145 template<> 146 struct ilist_sentinel_traits<Token> { 147 Token *createSentinel() const { 148 return &Sentinel; 149 } 150 static void destroySentinel(Token*) {} 151 152 Token *provideInitialHead() const { return createSentinel(); } 153 Token *ensureHead(Token*) const { return createSentinel(); } 154 static void noteHead(Token*, Token*) {} 155 156 private: 157 mutable Token Sentinel; 158 }; 159 160 template<> 161 struct ilist_node_traits<Token> { 162 Token *createNode(const Token &V) { 163 return new (Alloc.Allocate<Token>()) Token(V); 164 } 165 static void deleteNode(Token *V) {} 166 167 void addNodeToList(Token *) {} 168 void removeNodeFromList(Token *) {} 169 void transferNodesFromList(ilist_node_traits & /*SrcTraits*/, 170 ilist_iterator<Token> /*first*/, 171 ilist_iterator<Token> /*last*/) {} 172 173 BumpPtrAllocator Alloc; 174 }; 175 } 176 177 typedef ilist<Token> TokenQueueT; 178 179 namespace { 180 /// @brief This struct is used to track simple keys. 181 /// 182 /// Simple keys are handled by creating an entry in SimpleKeys for each Token 183 /// which could legally be the start of a simple key. When peekNext is called, 184 /// if the Token To be returned is referenced by a SimpleKey, we continue 185 /// tokenizing until that potential simple key has either been found to not be 186 /// a simple key (we moved on to the next line or went further than 1024 chars). 187 /// Or when we run into a Value, and then insert a Key token (and possibly 188 /// others) before the SimpleKey's Tok. 189 struct SimpleKey { 190 TokenQueueT::iterator Tok; 191 unsigned Column; 192 unsigned Line; 193 unsigned FlowLevel; 194 bool IsRequired; 195 196 bool operator ==(const SimpleKey &Other) { 197 return Tok == Other.Tok; 198 } 199 }; 200 } 201 202 /// @brief The Unicode scalar value of a UTF-8 minimal well-formed code unit 203 /// subsequence and the subsequence's length in code units (uint8_t). 204 /// A length of 0 represents an error. 205 typedef std::pair<uint32_t, unsigned> UTF8Decoded; 206 207 static UTF8Decoded decodeUTF8(StringRef Range) { 208 StringRef::iterator Position= Range.begin(); 209 StringRef::iterator End = Range.end(); 210 // 1 byte: [0x00, 0x7f] 211 // Bit pattern: 0xxxxxxx 212 if ((*Position & 0x80) == 0) { 213 return std::make_pair(*Position, 1); 214 } 215 // 2 bytes: [0x80, 0x7ff] 216 // Bit pattern: 110xxxxx 10xxxxxx 217 if (Position + 1 != End && 218 ((*Position & 0xE0) == 0xC0) && 219 ((*(Position + 1) & 0xC0) == 0x80)) { 220 uint32_t codepoint = ((*Position & 0x1F) << 6) | 221 (*(Position + 1) & 0x3F); 222 if (codepoint >= 0x80) 223 return std::make_pair(codepoint, 2); 224 } 225 // 3 bytes: [0x8000, 0xffff] 226 // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx 227 if (Position + 2 != End && 228 ((*Position & 0xF0) == 0xE0) && 229 ((*(Position + 1) & 0xC0) == 0x80) && 230 ((*(Position + 2) & 0xC0) == 0x80)) { 231 uint32_t codepoint = ((*Position & 0x0F) << 12) | 232 ((*(Position + 1) & 0x3F) << 6) | 233 (*(Position + 2) & 0x3F); 234 // Codepoints between 0xD800 and 0xDFFF are invalid, as 235 // they are high / low surrogate halves used by UTF-16. 236 if (codepoint >= 0x800 && 237 (codepoint < 0xD800 || codepoint > 0xDFFF)) 238 return std::make_pair(codepoint, 3); 239 } 240 // 4 bytes: [0x10000, 0x10FFFF] 241 // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 242 if (Position + 3 != End && 243 ((*Position & 0xF8) == 0xF0) && 244 ((*(Position + 1) & 0xC0) == 0x80) && 245 ((*(Position + 2) & 0xC0) == 0x80) && 246 ((*(Position + 3) & 0xC0) == 0x80)) { 247 uint32_t codepoint = ((*Position & 0x07) << 18) | 248 ((*(Position + 1) & 0x3F) << 12) | 249 ((*(Position + 2) & 0x3F) << 6) | 250 (*(Position + 3) & 0x3F); 251 if (codepoint >= 0x10000 && codepoint <= 0x10FFFF) 252 return std::make_pair(codepoint, 4); 253 } 254 return std::make_pair(0, 0); 255 } 256 257 namespace llvm { 258 namespace yaml { 259 /// @brief Scans YAML tokens from a MemoryBuffer. 260 class Scanner { 261 public: 262 Scanner(const StringRef Input, SourceMgr &SM); 263 Scanner(std::unique_ptr<MemoryBuffer> Buffer, SourceMgr &SM_); 264 265 /// @brief Parse the next token and return it without popping it. 266 Token &peekNext(); 267 268 /// @brief Parse the next token and pop it from the queue. 269 Token getNext(); 270 271 void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message, 272 ArrayRef<SMRange> Ranges = None) { 273 SM.PrintMessage(Loc, Kind, Message, Ranges); 274 } 275 276 void setError(const Twine &Message, StringRef::iterator Position) { 277 if (Current >= End) 278 Current = End - 1; 279 280 // Don't print out more errors after the first one we encounter. The rest 281 // are just the result of the first, and have no meaning. 282 if (!Failed) 283 printError(SMLoc::getFromPointer(Current), SourceMgr::DK_Error, Message); 284 Failed = true; 285 } 286 287 void setError(const Twine &Message) { 288 setError(Message, Current); 289 } 290 291 /// @brief Returns true if an error occurred while parsing. 292 bool failed() { 293 return Failed; 294 } 295 296 private: 297 StringRef currentInput() { 298 return StringRef(Current, End - Current); 299 } 300 301 /// @brief Decode a UTF-8 minimal well-formed code unit subsequence starting 302 /// at \a Position. 303 /// 304 /// If the UTF-8 code units starting at Position do not form a well-formed 305 /// code unit subsequence, then the Unicode scalar value is 0, and the length 306 /// is 0. 307 UTF8Decoded decodeUTF8(StringRef::iterator Position) { 308 return ::decodeUTF8(StringRef(Position, End - Position)); 309 } 310 311 // The following functions are based on the gramar rules in the YAML spec. The 312 // style of the function names it meant to closely match how they are written 313 // in the spec. The number within the [] is the number of the grammar rule in 314 // the spec. 315 // 316 // See 4.2 [Production Naming Conventions] for the meaning of the prefixes. 317 // 318 // c- 319 // A production starting and ending with a special character. 320 // b- 321 // A production matching a single line break. 322 // nb- 323 // A production starting and ending with a non-break character. 324 // s- 325 // A production starting and ending with a white space character. 326 // ns- 327 // A production starting and ending with a non-space character. 328 // l- 329 // A production matching complete line(s). 330 331 /// @brief Skip a single nb-char[27] starting at Position. 332 /// 333 /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE] 334 /// | [0xFF00-0xFFFD] | [0x10000-0x10FFFF] 335 /// 336 /// @returns The code unit after the nb-char, or Position if it's not an 337 /// nb-char. 338 StringRef::iterator skip_nb_char(StringRef::iterator Position); 339 340 /// @brief Skip a single b-break[28] starting at Position. 341 /// 342 /// A b-break is 0xD 0xA | 0xD | 0xA 343 /// 344 /// @returns The code unit after the b-break, or Position if it's not a 345 /// b-break. 346 StringRef::iterator skip_b_break(StringRef::iterator Position); 347 348 /// @brief Skip a single s-white[33] starting at Position. 349 /// 350 /// A s-white is 0x20 | 0x9 351 /// 352 /// @returns The code unit after the s-white, or Position if it's not a 353 /// s-white. 354 StringRef::iterator skip_s_white(StringRef::iterator Position); 355 356 /// @brief Skip a single ns-char[34] starting at Position. 357 /// 358 /// A ns-char is nb-char - s-white 359 /// 360 /// @returns The code unit after the ns-char, or Position if it's not a 361 /// ns-char. 362 StringRef::iterator skip_ns_char(StringRef::iterator Position); 363 364 typedef StringRef::iterator (Scanner::*SkipWhileFunc)(StringRef::iterator); 365 /// @brief Skip minimal well-formed code unit subsequences until Func 366 /// returns its input. 367 /// 368 /// @returns The code unit after the last minimal well-formed code unit 369 /// subsequence that Func accepted. 370 StringRef::iterator skip_while( SkipWhileFunc Func 371 , StringRef::iterator Position); 372 373 /// @brief Scan ns-uri-char[39]s starting at Cur. 374 /// 375 /// This updates Cur and Column while scanning. 376 /// 377 /// @returns A StringRef starting at Cur which covers the longest contiguous 378 /// sequence of ns-uri-char. 379 StringRef scan_ns_uri_char(); 380 381 /// @brief Consume a minimal well-formed code unit subsequence starting at 382 /// \a Cur. Return false if it is not the same Unicode scalar value as 383 /// \a Expected. This updates \a Column. 384 bool consume(uint32_t Expected); 385 386 /// @brief Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column. 387 void skip(uint32_t Distance); 388 389 /// @brief Return true if the minimal well-formed code unit subsequence at 390 /// Pos is whitespace or a new line 391 bool isBlankOrBreak(StringRef::iterator Position); 392 393 /// @brief If IsSimpleKeyAllowed, create and push_back a new SimpleKey. 394 void saveSimpleKeyCandidate( TokenQueueT::iterator Tok 395 , unsigned AtColumn 396 , bool IsRequired); 397 398 /// @brief Remove simple keys that can no longer be valid simple keys. 399 /// 400 /// Invalid simple keys are not on the current line or are further than 1024 401 /// columns back. 402 void removeStaleSimpleKeyCandidates(); 403 404 /// @brief Remove all simple keys on FlowLevel \a Level. 405 void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level); 406 407 /// @brief Unroll indentation in \a Indents back to \a Col. Creates BlockEnd 408 /// tokens if needed. 409 bool unrollIndent(int ToColumn); 410 411 /// @brief Increase indent to \a Col. Creates \a Kind token at \a InsertPoint 412 /// if needed. 413 bool rollIndent( int ToColumn 414 , Token::TokenKind Kind 415 , TokenQueueT::iterator InsertPoint); 416 417 /// @brief Skip whitespace and comments until the start of the next token. 418 void scanToNextToken(); 419 420 /// @brief Must be the first token generated. 421 bool scanStreamStart(); 422 423 /// @brief Generate tokens needed to close out the stream. 424 bool scanStreamEnd(); 425 426 /// @brief Scan a %BLAH directive. 427 bool scanDirective(); 428 429 /// @brief Scan a ... or ---. 430 bool scanDocumentIndicator(bool IsStart); 431 432 /// @brief Scan a [ or { and generate the proper flow collection start token. 433 bool scanFlowCollectionStart(bool IsSequence); 434 435 /// @brief Scan a ] or } and generate the proper flow collection end token. 436 bool scanFlowCollectionEnd(bool IsSequence); 437 438 /// @brief Scan the , that separates entries in a flow collection. 439 bool scanFlowEntry(); 440 441 /// @brief Scan the - that starts block sequence entries. 442 bool scanBlockEntry(); 443 444 /// @brief Scan an explicit ? indicating a key. 445 bool scanKey(); 446 447 /// @brief Scan an explicit : indicating a value. 448 bool scanValue(); 449 450 /// @brief Scan a quoted scalar. 451 bool scanFlowScalar(bool IsDoubleQuoted); 452 453 /// @brief Scan an unquoted scalar. 454 bool scanPlainScalar(); 455 456 /// @brief Scan an Alias or Anchor starting with * or &. 457 bool scanAliasOrAnchor(bool IsAlias); 458 459 /// @brief Scan a block scalar starting with | or >. 460 bool scanBlockScalar(bool IsLiteral); 461 462 /// @brief Scan a tag of the form !stuff. 463 bool scanTag(); 464 465 /// @brief Dispatch to the next scanning function based on \a *Cur. 466 bool fetchMoreTokens(); 467 468 /// @brief The SourceMgr used for diagnostics and buffer management. 469 SourceMgr &SM; 470 471 /// @brief The original input. 472 MemoryBuffer *InputBuffer; 473 474 /// @brief The current position of the scanner. 475 StringRef::iterator Current; 476 477 /// @brief The end of the input (one past the last character). 478 StringRef::iterator End; 479 480 /// @brief Current YAML indentation level in spaces. 481 int Indent; 482 483 /// @brief Current column number in Unicode code points. 484 unsigned Column; 485 486 /// @brief Current line number. 487 unsigned Line; 488 489 /// @brief How deep we are in flow style containers. 0 Means at block level. 490 unsigned FlowLevel; 491 492 /// @brief Are we at the start of the stream? 493 bool IsStartOfStream; 494 495 /// @brief Can the next token be the start of a simple key? 496 bool IsSimpleKeyAllowed; 497 498 /// @brief True if an error has occurred. 499 bool Failed; 500 501 /// @brief Queue of tokens. This is required to queue up tokens while looking 502 /// for the end of a simple key. And for cases where a single character 503 /// can produce multiple tokens (e.g. BlockEnd). 504 TokenQueueT TokenQueue; 505 506 /// @brief Indentation levels. 507 SmallVector<int, 4> Indents; 508 509 /// @brief Potential simple keys. 510 SmallVector<SimpleKey, 4> SimpleKeys; 511 }; 512 513 } // end namespace yaml 514 } // end namespace llvm 515 516 /// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result. 517 static void encodeUTF8( uint32_t UnicodeScalarValue 518 , SmallVectorImpl<char> &Result) { 519 if (UnicodeScalarValue <= 0x7F) { 520 Result.push_back(UnicodeScalarValue & 0x7F); 521 } else if (UnicodeScalarValue <= 0x7FF) { 522 uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6); 523 uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F); 524 Result.push_back(FirstByte); 525 Result.push_back(SecondByte); 526 } else if (UnicodeScalarValue <= 0xFFFF) { 527 uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12); 528 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 529 uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F); 530 Result.push_back(FirstByte); 531 Result.push_back(SecondByte); 532 Result.push_back(ThirdByte); 533 } else if (UnicodeScalarValue <= 0x10FFFF) { 534 uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18); 535 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12); 536 uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 537 uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F); 538 Result.push_back(FirstByte); 539 Result.push_back(SecondByte); 540 Result.push_back(ThirdByte); 541 Result.push_back(FourthByte); 542 } 543 } 544 545 bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) { 546 SourceMgr SM; 547 Scanner scanner(Input, SM); 548 while (true) { 549 Token T = scanner.getNext(); 550 switch (T.Kind) { 551 case Token::TK_StreamStart: 552 OS << "Stream-Start: "; 553 break; 554 case Token::TK_StreamEnd: 555 OS << "Stream-End: "; 556 break; 557 case Token::TK_VersionDirective: 558 OS << "Version-Directive: "; 559 break; 560 case Token::TK_TagDirective: 561 OS << "Tag-Directive: "; 562 break; 563 case Token::TK_DocumentStart: 564 OS << "Document-Start: "; 565 break; 566 case Token::TK_DocumentEnd: 567 OS << "Document-End: "; 568 break; 569 case Token::TK_BlockEntry: 570 OS << "Block-Entry: "; 571 break; 572 case Token::TK_BlockEnd: 573 OS << "Block-End: "; 574 break; 575 case Token::TK_BlockSequenceStart: 576 OS << "Block-Sequence-Start: "; 577 break; 578 case Token::TK_BlockMappingStart: 579 OS << "Block-Mapping-Start: "; 580 break; 581 case Token::TK_FlowEntry: 582 OS << "Flow-Entry: "; 583 break; 584 case Token::TK_FlowSequenceStart: 585 OS << "Flow-Sequence-Start: "; 586 break; 587 case Token::TK_FlowSequenceEnd: 588 OS << "Flow-Sequence-End: "; 589 break; 590 case Token::TK_FlowMappingStart: 591 OS << "Flow-Mapping-Start: "; 592 break; 593 case Token::TK_FlowMappingEnd: 594 OS << "Flow-Mapping-End: "; 595 break; 596 case Token::TK_Key: 597 OS << "Key: "; 598 break; 599 case Token::TK_Value: 600 OS << "Value: "; 601 break; 602 case Token::TK_Scalar: 603 OS << "Scalar: "; 604 break; 605 case Token::TK_Alias: 606 OS << "Alias: "; 607 break; 608 case Token::TK_Anchor: 609 OS << "Anchor: "; 610 break; 611 case Token::TK_Tag: 612 OS << "Tag: "; 613 break; 614 case Token::TK_Error: 615 break; 616 } 617 OS << T.Range << "\n"; 618 if (T.Kind == Token::TK_StreamEnd) 619 break; 620 else if (T.Kind == Token::TK_Error) 621 return false; 622 } 623 return true; 624 } 625 626 bool yaml::scanTokens(StringRef Input) { 627 llvm::SourceMgr SM; 628 llvm::yaml::Scanner scanner(Input, SM); 629 for (;;) { 630 llvm::yaml::Token T = scanner.getNext(); 631 if (T.Kind == Token::TK_StreamEnd) 632 break; 633 else if (T.Kind == Token::TK_Error) 634 return false; 635 } 636 return true; 637 } 638 639 std::string yaml::escape(StringRef Input) { 640 std::string EscapedInput; 641 for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) { 642 if (*i == '\\') 643 EscapedInput += "\\\\"; 644 else if (*i == '"') 645 EscapedInput += "\\\""; 646 else if (*i == 0) 647 EscapedInput += "\\0"; 648 else if (*i == 0x07) 649 EscapedInput += "\\a"; 650 else if (*i == 0x08) 651 EscapedInput += "\\b"; 652 else if (*i == 0x09) 653 EscapedInput += "\\t"; 654 else if (*i == 0x0A) 655 EscapedInput += "\\n"; 656 else if (*i == 0x0B) 657 EscapedInput += "\\v"; 658 else if (*i == 0x0C) 659 EscapedInput += "\\f"; 660 else if (*i == 0x0D) 661 EscapedInput += "\\r"; 662 else if (*i == 0x1B) 663 EscapedInput += "\\e"; 664 else if ((unsigned char)*i < 0x20) { // Control characters not handled above. 665 std::string HexStr = utohexstr(*i); 666 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 667 } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence. 668 UTF8Decoded UnicodeScalarValue 669 = decodeUTF8(StringRef(i, Input.end() - i)); 670 if (UnicodeScalarValue.second == 0) { 671 // Found invalid char. 672 SmallString<4> Val; 673 encodeUTF8(0xFFFD, Val); 674 EscapedInput.insert(EscapedInput.end(), Val.begin(), Val.end()); 675 // FIXME: Error reporting. 676 return EscapedInput; 677 } 678 if (UnicodeScalarValue.first == 0x85) 679 EscapedInput += "\\N"; 680 else if (UnicodeScalarValue.first == 0xA0) 681 EscapedInput += "\\_"; 682 else if (UnicodeScalarValue.first == 0x2028) 683 EscapedInput += "\\L"; 684 else if (UnicodeScalarValue.first == 0x2029) 685 EscapedInput += "\\P"; 686 else { 687 std::string HexStr = utohexstr(UnicodeScalarValue.first); 688 if (HexStr.size() <= 2) 689 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 690 else if (HexStr.size() <= 4) 691 EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr; 692 else if (HexStr.size() <= 8) 693 EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr; 694 } 695 i += UnicodeScalarValue.second - 1; 696 } else 697 EscapedInput.push_back(*i); 698 } 699 return EscapedInput; 700 } 701 702 Scanner::Scanner(StringRef Input, SourceMgr &sm) 703 : SM(sm) 704 , Indent(-1) 705 , Column(0) 706 , Line(0) 707 , FlowLevel(0) 708 , IsStartOfStream(true) 709 , IsSimpleKeyAllowed(true) 710 , Failed(false) { 711 InputBuffer = MemoryBuffer::getMemBuffer(Input, "YAML"); 712 SM.AddNewSourceBuffer(InputBuffer, SMLoc()); 713 Current = InputBuffer->getBufferStart(); 714 End = InputBuffer->getBufferEnd(); 715 } 716 717 Scanner::Scanner(std::unique_ptr<MemoryBuffer> Buffer, SourceMgr &SM_) 718 : SM(SM_), InputBuffer(Buffer.get()), 719 Current(InputBuffer->getBufferStart()), End(InputBuffer->getBufferEnd()), 720 Indent(-1), Column(0), Line(0), FlowLevel(0), IsStartOfStream(true), 721 IsSimpleKeyAllowed(true), Failed(false) { 722 SM.AddNewSourceBuffer(Buffer.release(), SMLoc()); 723 } 724 725 Token &Scanner::peekNext() { 726 // If the current token is a possible simple key, keep parsing until we 727 // can confirm. 728 bool NeedMore = false; 729 while (true) { 730 if (TokenQueue.empty() || NeedMore) { 731 if (!fetchMoreTokens()) { 732 TokenQueue.clear(); 733 TokenQueue.push_back(Token()); 734 return TokenQueue.front(); 735 } 736 } 737 assert(!TokenQueue.empty() && 738 "fetchMoreTokens lied about getting tokens!"); 739 740 removeStaleSimpleKeyCandidates(); 741 SimpleKey SK; 742 SK.Tok = TokenQueue.front(); 743 if (std::find(SimpleKeys.begin(), SimpleKeys.end(), SK) 744 == SimpleKeys.end()) 745 break; 746 else 747 NeedMore = true; 748 } 749 return TokenQueue.front(); 750 } 751 752 Token Scanner::getNext() { 753 Token Ret = peekNext(); 754 // TokenQueue can be empty if there was an error getting the next token. 755 if (!TokenQueue.empty()) 756 TokenQueue.pop_front(); 757 758 // There cannot be any referenced Token's if the TokenQueue is empty. So do a 759 // quick deallocation of them all. 760 if (TokenQueue.empty()) { 761 TokenQueue.Alloc.Reset(); 762 } 763 764 return Ret; 765 } 766 767 StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) { 768 if (Position == End) 769 return Position; 770 // Check 7 bit c-printable - b-char. 771 if ( *Position == 0x09 772 || (*Position >= 0x20 && *Position <= 0x7E)) 773 return Position + 1; 774 775 // Check for valid UTF-8. 776 if (uint8_t(*Position) & 0x80) { 777 UTF8Decoded u8d = decodeUTF8(Position); 778 if ( u8d.second != 0 779 && u8d.first != 0xFEFF 780 && ( u8d.first == 0x85 781 || ( u8d.first >= 0xA0 782 && u8d.first <= 0xD7FF) 783 || ( u8d.first >= 0xE000 784 && u8d.first <= 0xFFFD) 785 || ( u8d.first >= 0x10000 786 && u8d.first <= 0x10FFFF))) 787 return Position + u8d.second; 788 } 789 return Position; 790 } 791 792 StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) { 793 if (Position == End) 794 return Position; 795 if (*Position == 0x0D) { 796 if (Position + 1 != End && *(Position + 1) == 0x0A) 797 return Position + 2; 798 return Position + 1; 799 } 800 801 if (*Position == 0x0A) 802 return Position + 1; 803 return Position; 804 } 805 806 807 StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) { 808 if (Position == End) 809 return Position; 810 if (*Position == ' ' || *Position == '\t') 811 return Position + 1; 812 return Position; 813 } 814 815 StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) { 816 if (Position == End) 817 return Position; 818 if (*Position == ' ' || *Position == '\t') 819 return Position; 820 return skip_nb_char(Position); 821 } 822 823 StringRef::iterator Scanner::skip_while( SkipWhileFunc Func 824 , StringRef::iterator Position) { 825 while (true) { 826 StringRef::iterator i = (this->*Func)(Position); 827 if (i == Position) 828 break; 829 Position = i; 830 } 831 return Position; 832 } 833 834 static bool is_ns_hex_digit(const char C) { 835 return (C >= '0' && C <= '9') 836 || (C >= 'a' && C <= 'z') 837 || (C >= 'A' && C <= 'Z'); 838 } 839 840 static bool is_ns_word_char(const char C) { 841 return C == '-' 842 || (C >= 'a' && C <= 'z') 843 || (C >= 'A' && C <= 'Z'); 844 } 845 846 StringRef Scanner::scan_ns_uri_char() { 847 StringRef::iterator Start = Current; 848 while (true) { 849 if (Current == End) 850 break; 851 if (( *Current == '%' 852 && Current + 2 < End 853 && is_ns_hex_digit(*(Current + 1)) 854 && is_ns_hex_digit(*(Current + 2))) 855 || is_ns_word_char(*Current) 856 || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]") 857 != StringRef::npos) { 858 ++Current; 859 ++Column; 860 } else 861 break; 862 } 863 return StringRef(Start, Current - Start); 864 } 865 866 bool Scanner::consume(uint32_t Expected) { 867 if (Expected >= 0x80) 868 report_fatal_error("Not dealing with this yet"); 869 if (Current == End) 870 return false; 871 if (uint8_t(*Current) >= 0x80) 872 report_fatal_error("Not dealing with this yet"); 873 if (uint8_t(*Current) == Expected) { 874 ++Current; 875 ++Column; 876 return true; 877 } 878 return false; 879 } 880 881 void Scanner::skip(uint32_t Distance) { 882 Current += Distance; 883 Column += Distance; 884 assert(Current <= End && "Skipped past the end"); 885 } 886 887 bool Scanner::isBlankOrBreak(StringRef::iterator Position) { 888 if (Position == End) 889 return false; 890 if ( *Position == ' ' || *Position == '\t' 891 || *Position == '\r' || *Position == '\n') 892 return true; 893 return false; 894 } 895 896 void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok 897 , unsigned AtColumn 898 , bool IsRequired) { 899 if (IsSimpleKeyAllowed) { 900 SimpleKey SK; 901 SK.Tok = Tok; 902 SK.Line = Line; 903 SK.Column = AtColumn; 904 SK.IsRequired = IsRequired; 905 SK.FlowLevel = FlowLevel; 906 SimpleKeys.push_back(SK); 907 } 908 } 909 910 void Scanner::removeStaleSimpleKeyCandidates() { 911 for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin(); 912 i != SimpleKeys.end();) { 913 if (i->Line != Line || i->Column + 1024 < Column) { 914 if (i->IsRequired) 915 setError( "Could not find expected : for simple key" 916 , i->Tok->Range.begin()); 917 i = SimpleKeys.erase(i); 918 } else 919 ++i; 920 } 921 } 922 923 void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) { 924 if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level) 925 SimpleKeys.pop_back(); 926 } 927 928 bool Scanner::unrollIndent(int ToColumn) { 929 Token T; 930 // Indentation is ignored in flow. 931 if (FlowLevel != 0) 932 return true; 933 934 while (Indent > ToColumn) { 935 T.Kind = Token::TK_BlockEnd; 936 T.Range = StringRef(Current, 1); 937 TokenQueue.push_back(T); 938 Indent = Indents.pop_back_val(); 939 } 940 941 return true; 942 } 943 944 bool Scanner::rollIndent( int ToColumn 945 , Token::TokenKind Kind 946 , TokenQueueT::iterator InsertPoint) { 947 if (FlowLevel) 948 return true; 949 if (Indent < ToColumn) { 950 Indents.push_back(Indent); 951 Indent = ToColumn; 952 953 Token T; 954 T.Kind = Kind; 955 T.Range = StringRef(Current, 0); 956 TokenQueue.insert(InsertPoint, T); 957 } 958 return true; 959 } 960 961 void Scanner::scanToNextToken() { 962 while (true) { 963 while (*Current == ' ' || *Current == '\t') { 964 skip(1); 965 } 966 967 // Skip comment. 968 if (*Current == '#') { 969 while (true) { 970 // This may skip more than one byte, thus Column is only incremented 971 // for code points. 972 StringRef::iterator i = skip_nb_char(Current); 973 if (i == Current) 974 break; 975 Current = i; 976 ++Column; 977 } 978 } 979 980 // Skip EOL. 981 StringRef::iterator i = skip_b_break(Current); 982 if (i == Current) 983 break; 984 Current = i; 985 ++Line; 986 Column = 0; 987 // New lines may start a simple key. 988 if (!FlowLevel) 989 IsSimpleKeyAllowed = true; 990 } 991 } 992 993 bool Scanner::scanStreamStart() { 994 IsStartOfStream = false; 995 996 EncodingInfo EI = getUnicodeEncoding(currentInput()); 997 998 Token T; 999 T.Kind = Token::TK_StreamStart; 1000 T.Range = StringRef(Current, EI.second); 1001 TokenQueue.push_back(T); 1002 Current += EI.second; 1003 return true; 1004 } 1005 1006 bool Scanner::scanStreamEnd() { 1007 // Force an ending new line if one isn't present. 1008 if (Column != 0) { 1009 Column = 0; 1010 ++Line; 1011 } 1012 1013 unrollIndent(-1); 1014 SimpleKeys.clear(); 1015 IsSimpleKeyAllowed = false; 1016 1017 Token T; 1018 T.Kind = Token::TK_StreamEnd; 1019 T.Range = StringRef(Current, 0); 1020 TokenQueue.push_back(T); 1021 return true; 1022 } 1023 1024 bool Scanner::scanDirective() { 1025 // Reset the indentation level. 1026 unrollIndent(-1); 1027 SimpleKeys.clear(); 1028 IsSimpleKeyAllowed = false; 1029 1030 StringRef::iterator Start = Current; 1031 consume('%'); 1032 StringRef::iterator NameStart = Current; 1033 Current = skip_while(&Scanner::skip_ns_char, Current); 1034 StringRef Name(NameStart, Current - NameStart); 1035 Current = skip_while(&Scanner::skip_s_white, Current); 1036 1037 Token T; 1038 if (Name == "YAML") { 1039 Current = skip_while(&Scanner::skip_ns_char, Current); 1040 T.Kind = Token::TK_VersionDirective; 1041 T.Range = StringRef(Start, Current - Start); 1042 TokenQueue.push_back(T); 1043 return true; 1044 } else if(Name == "TAG") { 1045 Current = skip_while(&Scanner::skip_ns_char, Current); 1046 Current = skip_while(&Scanner::skip_s_white, Current); 1047 Current = skip_while(&Scanner::skip_ns_char, Current); 1048 T.Kind = Token::TK_TagDirective; 1049 T.Range = StringRef(Start, Current - Start); 1050 TokenQueue.push_back(T); 1051 return true; 1052 } 1053 return false; 1054 } 1055 1056 bool Scanner::scanDocumentIndicator(bool IsStart) { 1057 unrollIndent(-1); 1058 SimpleKeys.clear(); 1059 IsSimpleKeyAllowed = false; 1060 1061 Token T; 1062 T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd; 1063 T.Range = StringRef(Current, 3); 1064 skip(3); 1065 TokenQueue.push_back(T); 1066 return true; 1067 } 1068 1069 bool Scanner::scanFlowCollectionStart(bool IsSequence) { 1070 Token T; 1071 T.Kind = IsSequence ? Token::TK_FlowSequenceStart 1072 : Token::TK_FlowMappingStart; 1073 T.Range = StringRef(Current, 1); 1074 skip(1); 1075 TokenQueue.push_back(T); 1076 1077 // [ and { may begin a simple key. 1078 saveSimpleKeyCandidate(TokenQueue.back(), Column - 1, false); 1079 1080 // And may also be followed by a simple key. 1081 IsSimpleKeyAllowed = true; 1082 ++FlowLevel; 1083 return true; 1084 } 1085 1086 bool Scanner::scanFlowCollectionEnd(bool IsSequence) { 1087 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1088 IsSimpleKeyAllowed = false; 1089 Token T; 1090 T.Kind = IsSequence ? Token::TK_FlowSequenceEnd 1091 : Token::TK_FlowMappingEnd; 1092 T.Range = StringRef(Current, 1); 1093 skip(1); 1094 TokenQueue.push_back(T); 1095 if (FlowLevel) 1096 --FlowLevel; 1097 return true; 1098 } 1099 1100 bool Scanner::scanFlowEntry() { 1101 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1102 IsSimpleKeyAllowed = true; 1103 Token T; 1104 T.Kind = Token::TK_FlowEntry; 1105 T.Range = StringRef(Current, 1); 1106 skip(1); 1107 TokenQueue.push_back(T); 1108 return true; 1109 } 1110 1111 bool Scanner::scanBlockEntry() { 1112 rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end()); 1113 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1114 IsSimpleKeyAllowed = true; 1115 Token T; 1116 T.Kind = Token::TK_BlockEntry; 1117 T.Range = StringRef(Current, 1); 1118 skip(1); 1119 TokenQueue.push_back(T); 1120 return true; 1121 } 1122 1123 bool Scanner::scanKey() { 1124 if (!FlowLevel) 1125 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1126 1127 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1128 IsSimpleKeyAllowed = !FlowLevel; 1129 1130 Token T; 1131 T.Kind = Token::TK_Key; 1132 T.Range = StringRef(Current, 1); 1133 skip(1); 1134 TokenQueue.push_back(T); 1135 return true; 1136 } 1137 1138 bool Scanner::scanValue() { 1139 // If the previous token could have been a simple key, insert the key token 1140 // into the token queue. 1141 if (!SimpleKeys.empty()) { 1142 SimpleKey SK = SimpleKeys.pop_back_val(); 1143 Token T; 1144 T.Kind = Token::TK_Key; 1145 T.Range = SK.Tok->Range; 1146 TokenQueueT::iterator i, e; 1147 for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) { 1148 if (i == SK.Tok) 1149 break; 1150 } 1151 assert(i != e && "SimpleKey not in token queue!"); 1152 i = TokenQueue.insert(i, T); 1153 1154 // We may also need to add a Block-Mapping-Start token. 1155 rollIndent(SK.Column, Token::TK_BlockMappingStart, i); 1156 1157 IsSimpleKeyAllowed = false; 1158 } else { 1159 if (!FlowLevel) 1160 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1161 IsSimpleKeyAllowed = !FlowLevel; 1162 } 1163 1164 Token T; 1165 T.Kind = Token::TK_Value; 1166 T.Range = StringRef(Current, 1); 1167 skip(1); 1168 TokenQueue.push_back(T); 1169 return true; 1170 } 1171 1172 // Forbidding inlining improves performance by roughly 20%. 1173 // FIXME: Remove once llvm optimizes this to the faster version without hints. 1174 LLVM_ATTRIBUTE_NOINLINE static bool 1175 wasEscaped(StringRef::iterator First, StringRef::iterator Position); 1176 1177 // Returns whether a character at 'Position' was escaped with a leading '\'. 1178 // 'First' specifies the position of the first character in the string. 1179 static bool wasEscaped(StringRef::iterator First, 1180 StringRef::iterator Position) { 1181 assert(Position - 1 >= First); 1182 StringRef::iterator I = Position - 1; 1183 // We calculate the number of consecutive '\'s before the current position 1184 // by iterating backwards through our string. 1185 while (I >= First && *I == '\\') --I; 1186 // (Position - 1 - I) now contains the number of '\'s before the current 1187 // position. If it is odd, the character at 'Position' was escaped. 1188 return (Position - 1 - I) % 2 == 1; 1189 } 1190 1191 bool Scanner::scanFlowScalar(bool IsDoubleQuoted) { 1192 StringRef::iterator Start = Current; 1193 unsigned ColStart = Column; 1194 if (IsDoubleQuoted) { 1195 do { 1196 ++Current; 1197 while (Current != End && *Current != '"') 1198 ++Current; 1199 // Repeat until the previous character was not a '\' or was an escaped 1200 // backslash. 1201 } while ( Current != End 1202 && *(Current - 1) == '\\' 1203 && wasEscaped(Start + 1, Current)); 1204 } else { 1205 skip(1); 1206 while (true) { 1207 // Skip a ' followed by another '. 1208 if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') { 1209 skip(2); 1210 continue; 1211 } else if (*Current == '\'') 1212 break; 1213 StringRef::iterator i = skip_nb_char(Current); 1214 if (i == Current) { 1215 i = skip_b_break(Current); 1216 if (i == Current) 1217 break; 1218 Current = i; 1219 Column = 0; 1220 ++Line; 1221 } else { 1222 if (i == End) 1223 break; 1224 Current = i; 1225 ++Column; 1226 } 1227 } 1228 } 1229 1230 if (Current == End) { 1231 setError("Expected quote at end of scalar", Current); 1232 return false; 1233 } 1234 1235 skip(1); // Skip ending quote. 1236 Token T; 1237 T.Kind = Token::TK_Scalar; 1238 T.Range = StringRef(Start, Current - Start); 1239 TokenQueue.push_back(T); 1240 1241 saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); 1242 1243 IsSimpleKeyAllowed = false; 1244 1245 return true; 1246 } 1247 1248 bool Scanner::scanPlainScalar() { 1249 StringRef::iterator Start = Current; 1250 unsigned ColStart = Column; 1251 unsigned LeadingBlanks = 0; 1252 assert(Indent >= -1 && "Indent must be >= -1 !"); 1253 unsigned indent = static_cast<unsigned>(Indent + 1); 1254 while (true) { 1255 if (*Current == '#') 1256 break; 1257 1258 while (!isBlankOrBreak(Current)) { 1259 if ( FlowLevel && *Current == ':' 1260 && !(isBlankOrBreak(Current + 1) || *(Current + 1) == ',')) { 1261 setError("Found unexpected ':' while scanning a plain scalar", Current); 1262 return false; 1263 } 1264 1265 // Check for the end of the plain scalar. 1266 if ( (*Current == ':' && isBlankOrBreak(Current + 1)) 1267 || ( FlowLevel 1268 && (StringRef(Current, 1).find_first_of(",:?[]{}") 1269 != StringRef::npos))) 1270 break; 1271 1272 StringRef::iterator i = skip_nb_char(Current); 1273 if (i == Current) 1274 break; 1275 Current = i; 1276 ++Column; 1277 } 1278 1279 // Are we at the end? 1280 if (!isBlankOrBreak(Current)) 1281 break; 1282 1283 // Eat blanks. 1284 StringRef::iterator Tmp = Current; 1285 while (isBlankOrBreak(Tmp)) { 1286 StringRef::iterator i = skip_s_white(Tmp); 1287 if (i != Tmp) { 1288 if (LeadingBlanks && (Column < indent) && *Tmp == '\t') { 1289 setError("Found invalid tab character in indentation", Tmp); 1290 return false; 1291 } 1292 Tmp = i; 1293 ++Column; 1294 } else { 1295 i = skip_b_break(Tmp); 1296 if (!LeadingBlanks) 1297 LeadingBlanks = 1; 1298 Tmp = i; 1299 Column = 0; 1300 ++Line; 1301 } 1302 } 1303 1304 if (!FlowLevel && Column < indent) 1305 break; 1306 1307 Current = Tmp; 1308 } 1309 if (Start == Current) { 1310 setError("Got empty plain scalar", Start); 1311 return false; 1312 } 1313 Token T; 1314 T.Kind = Token::TK_Scalar; 1315 T.Range = StringRef(Start, Current - Start); 1316 TokenQueue.push_back(T); 1317 1318 // Plain scalars can be simple keys. 1319 saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); 1320 1321 IsSimpleKeyAllowed = false; 1322 1323 return true; 1324 } 1325 1326 bool Scanner::scanAliasOrAnchor(bool IsAlias) { 1327 StringRef::iterator Start = Current; 1328 unsigned ColStart = Column; 1329 skip(1); 1330 while(true) { 1331 if ( *Current == '[' || *Current == ']' 1332 || *Current == '{' || *Current == '}' 1333 || *Current == ',' 1334 || *Current == ':') 1335 break; 1336 StringRef::iterator i = skip_ns_char(Current); 1337 if (i == Current) 1338 break; 1339 Current = i; 1340 ++Column; 1341 } 1342 1343 if (Start == Current) { 1344 setError("Got empty alias or anchor", Start); 1345 return false; 1346 } 1347 1348 Token T; 1349 T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor; 1350 T.Range = StringRef(Start, Current - Start); 1351 TokenQueue.push_back(T); 1352 1353 // Alias and anchors can be simple keys. 1354 saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); 1355 1356 IsSimpleKeyAllowed = false; 1357 1358 return true; 1359 } 1360 1361 bool Scanner::scanBlockScalar(bool IsLiteral) { 1362 StringRef::iterator Start = Current; 1363 skip(1); // Eat | or > 1364 while(true) { 1365 StringRef::iterator i = skip_nb_char(Current); 1366 if (i == Current) { 1367 if (Column == 0) 1368 break; 1369 i = skip_b_break(Current); 1370 if (i != Current) { 1371 // We got a line break. 1372 Column = 0; 1373 ++Line; 1374 Current = i; 1375 continue; 1376 } else { 1377 // There was an error, which should already have been printed out. 1378 return false; 1379 } 1380 } 1381 Current = i; 1382 ++Column; 1383 } 1384 1385 if (Start == Current) { 1386 setError("Got empty block scalar", Start); 1387 return false; 1388 } 1389 1390 Token T; 1391 T.Kind = Token::TK_Scalar; 1392 T.Range = StringRef(Start, Current - Start); 1393 TokenQueue.push_back(T); 1394 return true; 1395 } 1396 1397 bool Scanner::scanTag() { 1398 StringRef::iterator Start = Current; 1399 unsigned ColStart = Column; 1400 skip(1); // Eat !. 1401 if (Current == End || isBlankOrBreak(Current)); // An empty tag. 1402 else if (*Current == '<') { 1403 skip(1); 1404 scan_ns_uri_char(); 1405 if (!consume('>')) 1406 return false; 1407 } else { 1408 // FIXME: Actually parse the c-ns-shorthand-tag rule. 1409 Current = skip_while(&Scanner::skip_ns_char, Current); 1410 } 1411 1412 Token T; 1413 T.Kind = Token::TK_Tag; 1414 T.Range = StringRef(Start, Current - Start); 1415 TokenQueue.push_back(T); 1416 1417 // Tags can be simple keys. 1418 saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); 1419 1420 IsSimpleKeyAllowed = false; 1421 1422 return true; 1423 } 1424 1425 bool Scanner::fetchMoreTokens() { 1426 if (IsStartOfStream) 1427 return scanStreamStart(); 1428 1429 scanToNextToken(); 1430 1431 if (Current == End) 1432 return scanStreamEnd(); 1433 1434 removeStaleSimpleKeyCandidates(); 1435 1436 unrollIndent(Column); 1437 1438 if (Column == 0 && *Current == '%') 1439 return scanDirective(); 1440 1441 if (Column == 0 && Current + 4 <= End 1442 && *Current == '-' 1443 && *(Current + 1) == '-' 1444 && *(Current + 2) == '-' 1445 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1446 return scanDocumentIndicator(true); 1447 1448 if (Column == 0 && Current + 4 <= End 1449 && *Current == '.' 1450 && *(Current + 1) == '.' 1451 && *(Current + 2) == '.' 1452 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1453 return scanDocumentIndicator(false); 1454 1455 if (*Current == '[') 1456 return scanFlowCollectionStart(true); 1457 1458 if (*Current == '{') 1459 return scanFlowCollectionStart(false); 1460 1461 if (*Current == ']') 1462 return scanFlowCollectionEnd(true); 1463 1464 if (*Current == '}') 1465 return scanFlowCollectionEnd(false); 1466 1467 if (*Current == ',') 1468 return scanFlowEntry(); 1469 1470 if (*Current == '-' && isBlankOrBreak(Current + 1)) 1471 return scanBlockEntry(); 1472 1473 if (*Current == '?' && (FlowLevel || isBlankOrBreak(Current + 1))) 1474 return scanKey(); 1475 1476 if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1))) 1477 return scanValue(); 1478 1479 if (*Current == '*') 1480 return scanAliasOrAnchor(true); 1481 1482 if (*Current == '&') 1483 return scanAliasOrAnchor(false); 1484 1485 if (*Current == '!') 1486 return scanTag(); 1487 1488 if (*Current == '|' && !FlowLevel) 1489 return scanBlockScalar(true); 1490 1491 if (*Current == '>' && !FlowLevel) 1492 return scanBlockScalar(false); 1493 1494 if (*Current == '\'') 1495 return scanFlowScalar(false); 1496 1497 if (*Current == '"') 1498 return scanFlowScalar(true); 1499 1500 // Get a plain scalar. 1501 StringRef FirstChar(Current, 1); 1502 if (!(isBlankOrBreak(Current) 1503 || FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos) 1504 || (*Current == '-' && !isBlankOrBreak(Current + 1)) 1505 || (!FlowLevel && (*Current == '?' || *Current == ':') 1506 && isBlankOrBreak(Current + 1)) 1507 || (!FlowLevel && *Current == ':' 1508 && Current + 2 < End 1509 && *(Current + 1) == ':' 1510 && !isBlankOrBreak(Current + 2))) 1511 return scanPlainScalar(); 1512 1513 setError("Unrecognized character while tokenizing."); 1514 return false; 1515 } 1516 1517 Stream::Stream(StringRef Input, SourceMgr &SM) 1518 : scanner(new Scanner(Input, SM)), CurrentDoc() {} 1519 1520 Stream::Stream(std::unique_ptr<MemoryBuffer> InputBuffer, SourceMgr &SM) 1521 : scanner(new Scanner(std::move(InputBuffer), SM)), CurrentDoc() {} 1522 1523 Stream::~Stream() {} 1524 1525 bool Stream::failed() { return scanner->failed(); } 1526 1527 void Stream::printError(Node *N, const Twine &Msg) { 1528 SmallVector<SMRange, 1> Ranges; 1529 Ranges.push_back(N->getSourceRange()); 1530 scanner->printError( N->getSourceRange().Start 1531 , SourceMgr::DK_Error 1532 , Msg 1533 , Ranges); 1534 } 1535 1536 document_iterator Stream::begin() { 1537 if (CurrentDoc) 1538 report_fatal_error("Can only iterate over the stream once"); 1539 1540 // Skip Stream-Start. 1541 scanner->getNext(); 1542 1543 CurrentDoc.reset(new Document(*this)); 1544 return document_iterator(CurrentDoc); 1545 } 1546 1547 document_iterator Stream::end() { 1548 return document_iterator(); 1549 } 1550 1551 void Stream::skip() { 1552 for (document_iterator i = begin(), e = end(); i != e; ++i) 1553 i->skip(); 1554 } 1555 1556 Node::Node(unsigned int Type, std::unique_ptr<Document> &D, StringRef A, 1557 StringRef T) 1558 : Doc(D), TypeID(Type), Anchor(A), Tag(T) { 1559 SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin()); 1560 SourceRange = SMRange(Start, Start); 1561 } 1562 1563 std::string Node::getVerbatimTag() const { 1564 StringRef Raw = getRawTag(); 1565 if (!Raw.empty() && Raw != "!") { 1566 std::string Ret; 1567 if (Raw.find_last_of('!') == 0) { 1568 Ret = Doc->getTagMap().find("!")->second; 1569 Ret += Raw.substr(1); 1570 return std::move(Ret); 1571 } else if (Raw.startswith("!!")) { 1572 Ret = Doc->getTagMap().find("!!")->second; 1573 Ret += Raw.substr(2); 1574 return std::move(Ret); 1575 } else { 1576 StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1); 1577 std::map<StringRef, StringRef>::const_iterator It = 1578 Doc->getTagMap().find(TagHandle); 1579 if (It != Doc->getTagMap().end()) 1580 Ret = It->second; 1581 else { 1582 Token T; 1583 T.Kind = Token::TK_Tag; 1584 T.Range = TagHandle; 1585 setError(Twine("Unknown tag handle ") + TagHandle, T); 1586 } 1587 Ret += Raw.substr(Raw.find_last_of('!') + 1); 1588 return std::move(Ret); 1589 } 1590 } 1591 1592 switch (getType()) { 1593 case NK_Null: 1594 return "tag:yaml.org,2002:null"; 1595 case NK_Scalar: 1596 // TODO: Tag resolution. 1597 return "tag:yaml.org,2002:str"; 1598 case NK_Mapping: 1599 return "tag:yaml.org,2002:map"; 1600 case NK_Sequence: 1601 return "tag:yaml.org,2002:seq"; 1602 } 1603 1604 return ""; 1605 } 1606 1607 Token &Node::peekNext() { 1608 return Doc->peekNext(); 1609 } 1610 1611 Token Node::getNext() { 1612 return Doc->getNext(); 1613 } 1614 1615 Node *Node::parseBlockNode() { 1616 return Doc->parseBlockNode(); 1617 } 1618 1619 BumpPtrAllocator &Node::getAllocator() { 1620 return Doc->NodeAllocator; 1621 } 1622 1623 void Node::setError(const Twine &Msg, Token &Tok) const { 1624 Doc->setError(Msg, Tok); 1625 } 1626 1627 bool Node::failed() const { 1628 return Doc->failed(); 1629 } 1630 1631 1632 1633 StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const { 1634 // TODO: Handle newlines properly. We need to remove leading whitespace. 1635 if (Value[0] == '"') { // Double quoted. 1636 // Pull off the leading and trailing "s. 1637 StringRef UnquotedValue = Value.substr(1, Value.size() - 2); 1638 // Search for characters that would require unescaping the value. 1639 StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n"); 1640 if (i != StringRef::npos) 1641 return unescapeDoubleQuoted(UnquotedValue, i, Storage); 1642 return UnquotedValue; 1643 } else if (Value[0] == '\'') { // Single quoted. 1644 // Pull off the leading and trailing 's. 1645 StringRef UnquotedValue = Value.substr(1, Value.size() - 2); 1646 StringRef::size_type i = UnquotedValue.find('\''); 1647 if (i != StringRef::npos) { 1648 // We're going to need Storage. 1649 Storage.clear(); 1650 Storage.reserve(UnquotedValue.size()); 1651 for (; i != StringRef::npos; i = UnquotedValue.find('\'')) { 1652 StringRef Valid(UnquotedValue.begin(), i); 1653 Storage.insert(Storage.end(), Valid.begin(), Valid.end()); 1654 Storage.push_back('\''); 1655 UnquotedValue = UnquotedValue.substr(i + 2); 1656 } 1657 Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); 1658 return StringRef(Storage.begin(), Storage.size()); 1659 } 1660 return UnquotedValue; 1661 } 1662 // Plain or block. 1663 return Value.rtrim(" "); 1664 } 1665 1666 StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue 1667 , StringRef::size_type i 1668 , SmallVectorImpl<char> &Storage) 1669 const { 1670 // Use Storage to build proper value. 1671 Storage.clear(); 1672 Storage.reserve(UnquotedValue.size()); 1673 for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) { 1674 // Insert all previous chars into Storage. 1675 StringRef Valid(UnquotedValue.begin(), i); 1676 Storage.insert(Storage.end(), Valid.begin(), Valid.end()); 1677 // Chop off inserted chars. 1678 UnquotedValue = UnquotedValue.substr(i); 1679 1680 assert(!UnquotedValue.empty() && "Can't be empty!"); 1681 1682 // Parse escape or line break. 1683 switch (UnquotedValue[0]) { 1684 case '\r': 1685 case '\n': 1686 Storage.push_back('\n'); 1687 if ( UnquotedValue.size() > 1 1688 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 1689 UnquotedValue = UnquotedValue.substr(1); 1690 UnquotedValue = UnquotedValue.substr(1); 1691 break; 1692 default: 1693 if (UnquotedValue.size() == 1) 1694 // TODO: Report error. 1695 break; 1696 UnquotedValue = UnquotedValue.substr(1); 1697 switch (UnquotedValue[0]) { 1698 default: { 1699 Token T; 1700 T.Range = StringRef(UnquotedValue.begin(), 1); 1701 setError("Unrecognized escape code!", T); 1702 return ""; 1703 } 1704 case '\r': 1705 case '\n': 1706 // Remove the new line. 1707 if ( UnquotedValue.size() > 1 1708 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 1709 UnquotedValue = UnquotedValue.substr(1); 1710 // If this was just a single byte newline, it will get skipped 1711 // below. 1712 break; 1713 case '0': 1714 Storage.push_back(0x00); 1715 break; 1716 case 'a': 1717 Storage.push_back(0x07); 1718 break; 1719 case 'b': 1720 Storage.push_back(0x08); 1721 break; 1722 case 't': 1723 case 0x09: 1724 Storage.push_back(0x09); 1725 break; 1726 case 'n': 1727 Storage.push_back(0x0A); 1728 break; 1729 case 'v': 1730 Storage.push_back(0x0B); 1731 break; 1732 case 'f': 1733 Storage.push_back(0x0C); 1734 break; 1735 case 'r': 1736 Storage.push_back(0x0D); 1737 break; 1738 case 'e': 1739 Storage.push_back(0x1B); 1740 break; 1741 case ' ': 1742 Storage.push_back(0x20); 1743 break; 1744 case '"': 1745 Storage.push_back(0x22); 1746 break; 1747 case '/': 1748 Storage.push_back(0x2F); 1749 break; 1750 case '\\': 1751 Storage.push_back(0x5C); 1752 break; 1753 case 'N': 1754 encodeUTF8(0x85, Storage); 1755 break; 1756 case '_': 1757 encodeUTF8(0xA0, Storage); 1758 break; 1759 case 'L': 1760 encodeUTF8(0x2028, Storage); 1761 break; 1762 case 'P': 1763 encodeUTF8(0x2029, Storage); 1764 break; 1765 case 'x': { 1766 if (UnquotedValue.size() < 3) 1767 // TODO: Report error. 1768 break; 1769 unsigned int UnicodeScalarValue; 1770 if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue)) 1771 // TODO: Report error. 1772 UnicodeScalarValue = 0xFFFD; 1773 encodeUTF8(UnicodeScalarValue, Storage); 1774 UnquotedValue = UnquotedValue.substr(2); 1775 break; 1776 } 1777 case 'u': { 1778 if (UnquotedValue.size() < 5) 1779 // TODO: Report error. 1780 break; 1781 unsigned int UnicodeScalarValue; 1782 if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue)) 1783 // TODO: Report error. 1784 UnicodeScalarValue = 0xFFFD; 1785 encodeUTF8(UnicodeScalarValue, Storage); 1786 UnquotedValue = UnquotedValue.substr(4); 1787 break; 1788 } 1789 case 'U': { 1790 if (UnquotedValue.size() < 9) 1791 // TODO: Report error. 1792 break; 1793 unsigned int UnicodeScalarValue; 1794 if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue)) 1795 // TODO: Report error. 1796 UnicodeScalarValue = 0xFFFD; 1797 encodeUTF8(UnicodeScalarValue, Storage); 1798 UnquotedValue = UnquotedValue.substr(8); 1799 break; 1800 } 1801 } 1802 UnquotedValue = UnquotedValue.substr(1); 1803 } 1804 } 1805 Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); 1806 return StringRef(Storage.begin(), Storage.size()); 1807 } 1808 1809 Node *KeyValueNode::getKey() { 1810 if (Key) 1811 return Key; 1812 // Handle implicit null keys. 1813 { 1814 Token &t = peekNext(); 1815 if ( t.Kind == Token::TK_BlockEnd 1816 || t.Kind == Token::TK_Value 1817 || t.Kind == Token::TK_Error) { 1818 return Key = new (getAllocator()) NullNode(Doc); 1819 } 1820 if (t.Kind == Token::TK_Key) 1821 getNext(); // skip TK_Key. 1822 } 1823 1824 // Handle explicit null keys. 1825 Token &t = peekNext(); 1826 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) { 1827 return Key = new (getAllocator()) NullNode(Doc); 1828 } 1829 1830 // We've got a normal key. 1831 return Key = parseBlockNode(); 1832 } 1833 1834 Node *KeyValueNode::getValue() { 1835 if (Value) 1836 return Value; 1837 getKey()->skip(); 1838 if (failed()) 1839 return Value = new (getAllocator()) NullNode(Doc); 1840 1841 // Handle implicit null values. 1842 { 1843 Token &t = peekNext(); 1844 if ( t.Kind == Token::TK_BlockEnd 1845 || t.Kind == Token::TK_FlowMappingEnd 1846 || t.Kind == Token::TK_Key 1847 || t.Kind == Token::TK_FlowEntry 1848 || t.Kind == Token::TK_Error) { 1849 return Value = new (getAllocator()) NullNode(Doc); 1850 } 1851 1852 if (t.Kind != Token::TK_Value) { 1853 setError("Unexpected token in Key Value.", t); 1854 return Value = new (getAllocator()) NullNode(Doc); 1855 } 1856 getNext(); // skip TK_Value. 1857 } 1858 1859 // Handle explicit null values. 1860 Token &t = peekNext(); 1861 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) { 1862 return Value = new (getAllocator()) NullNode(Doc); 1863 } 1864 1865 // We got a normal value. 1866 return Value = parseBlockNode(); 1867 } 1868 1869 void MappingNode::increment() { 1870 if (failed()) { 1871 IsAtEnd = true; 1872 CurrentEntry = nullptr; 1873 return; 1874 } 1875 if (CurrentEntry) { 1876 CurrentEntry->skip(); 1877 if (Type == MT_Inline) { 1878 IsAtEnd = true; 1879 CurrentEntry = nullptr; 1880 return; 1881 } 1882 } 1883 Token T = peekNext(); 1884 if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) { 1885 // KeyValueNode eats the TK_Key. That way it can detect null keys. 1886 CurrentEntry = new (getAllocator()) KeyValueNode(Doc); 1887 } else if (Type == MT_Block) { 1888 switch (T.Kind) { 1889 case Token::TK_BlockEnd: 1890 getNext(); 1891 IsAtEnd = true; 1892 CurrentEntry = nullptr; 1893 break; 1894 default: 1895 setError("Unexpected token. Expected Key or Block End", T); 1896 case Token::TK_Error: 1897 IsAtEnd = true; 1898 CurrentEntry = nullptr; 1899 } 1900 } else { 1901 switch (T.Kind) { 1902 case Token::TK_FlowEntry: 1903 // Eat the flow entry and recurse. 1904 getNext(); 1905 return increment(); 1906 case Token::TK_FlowMappingEnd: 1907 getNext(); 1908 case Token::TK_Error: 1909 // Set this to end iterator. 1910 IsAtEnd = true; 1911 CurrentEntry = nullptr; 1912 break; 1913 default: 1914 setError( "Unexpected token. Expected Key, Flow Entry, or Flow " 1915 "Mapping End." 1916 , T); 1917 IsAtEnd = true; 1918 CurrentEntry = nullptr; 1919 } 1920 } 1921 } 1922 1923 void SequenceNode::increment() { 1924 if (failed()) { 1925 IsAtEnd = true; 1926 CurrentEntry = nullptr; 1927 return; 1928 } 1929 if (CurrentEntry) 1930 CurrentEntry->skip(); 1931 Token T = peekNext(); 1932 if (SeqType == ST_Block) { 1933 switch (T.Kind) { 1934 case Token::TK_BlockEntry: 1935 getNext(); 1936 CurrentEntry = parseBlockNode(); 1937 if (!CurrentEntry) { // An error occurred. 1938 IsAtEnd = true; 1939 CurrentEntry = nullptr; 1940 } 1941 break; 1942 case Token::TK_BlockEnd: 1943 getNext(); 1944 IsAtEnd = true; 1945 CurrentEntry = nullptr; 1946 break; 1947 default: 1948 setError( "Unexpected token. Expected Block Entry or Block End." 1949 , T); 1950 case Token::TK_Error: 1951 IsAtEnd = true; 1952 CurrentEntry = nullptr; 1953 } 1954 } else if (SeqType == ST_Indentless) { 1955 switch (T.Kind) { 1956 case Token::TK_BlockEntry: 1957 getNext(); 1958 CurrentEntry = parseBlockNode(); 1959 if (!CurrentEntry) { // An error occurred. 1960 IsAtEnd = true; 1961 CurrentEntry = nullptr; 1962 } 1963 break; 1964 default: 1965 case Token::TK_Error: 1966 IsAtEnd = true; 1967 CurrentEntry = nullptr; 1968 } 1969 } else if (SeqType == ST_Flow) { 1970 switch (T.Kind) { 1971 case Token::TK_FlowEntry: 1972 // Eat the flow entry and recurse. 1973 getNext(); 1974 WasPreviousTokenFlowEntry = true; 1975 return increment(); 1976 case Token::TK_FlowSequenceEnd: 1977 getNext(); 1978 case Token::TK_Error: 1979 // Set this to end iterator. 1980 IsAtEnd = true; 1981 CurrentEntry = nullptr; 1982 break; 1983 case Token::TK_StreamEnd: 1984 case Token::TK_DocumentEnd: 1985 case Token::TK_DocumentStart: 1986 setError("Could not find closing ]!", T); 1987 // Set this to end iterator. 1988 IsAtEnd = true; 1989 CurrentEntry = nullptr; 1990 break; 1991 default: 1992 if (!WasPreviousTokenFlowEntry) { 1993 setError("Expected , between entries!", T); 1994 IsAtEnd = true; 1995 CurrentEntry = nullptr; 1996 break; 1997 } 1998 // Otherwise it must be a flow entry. 1999 CurrentEntry = parseBlockNode(); 2000 if (!CurrentEntry) { 2001 IsAtEnd = true; 2002 } 2003 WasPreviousTokenFlowEntry = false; 2004 break; 2005 } 2006 } 2007 } 2008 2009 Document::Document(Stream &S) : stream(S), Root(nullptr) { 2010 // Tag maps starts with two default mappings. 2011 TagMap["!"] = "!"; 2012 TagMap["!!"] = "tag:yaml.org,2002:"; 2013 2014 if (parseDirectives()) 2015 expectToken(Token::TK_DocumentStart); 2016 Token &T = peekNext(); 2017 if (T.Kind == Token::TK_DocumentStart) 2018 getNext(); 2019 } 2020 2021 bool Document::skip() { 2022 if (stream.scanner->failed()) 2023 return false; 2024 if (!Root) 2025 getRoot(); 2026 Root->skip(); 2027 Token &T = peekNext(); 2028 if (T.Kind == Token::TK_StreamEnd) 2029 return false; 2030 if (T.Kind == Token::TK_DocumentEnd) { 2031 getNext(); 2032 return skip(); 2033 } 2034 return true; 2035 } 2036 2037 Token &Document::peekNext() { 2038 return stream.scanner->peekNext(); 2039 } 2040 2041 Token Document::getNext() { 2042 return stream.scanner->getNext(); 2043 } 2044 2045 void Document::setError(const Twine &Message, Token &Location) const { 2046 stream.scanner->setError(Message, Location.Range.begin()); 2047 } 2048 2049 bool Document::failed() const { 2050 return stream.scanner->failed(); 2051 } 2052 2053 Node *Document::parseBlockNode() { 2054 Token T = peekNext(); 2055 // Handle properties. 2056 Token AnchorInfo; 2057 Token TagInfo; 2058 parse_property: 2059 switch (T.Kind) { 2060 case Token::TK_Alias: 2061 getNext(); 2062 return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1)); 2063 case Token::TK_Anchor: 2064 if (AnchorInfo.Kind == Token::TK_Anchor) { 2065 setError("Already encountered an anchor for this node!", T); 2066 return nullptr; 2067 } 2068 AnchorInfo = getNext(); // Consume TK_Anchor. 2069 T = peekNext(); 2070 goto parse_property; 2071 case Token::TK_Tag: 2072 if (TagInfo.Kind == Token::TK_Tag) { 2073 setError("Already encountered a tag for this node!", T); 2074 return nullptr; 2075 } 2076 TagInfo = getNext(); // Consume TK_Tag. 2077 T = peekNext(); 2078 goto parse_property; 2079 default: 2080 break; 2081 } 2082 2083 switch (T.Kind) { 2084 case Token::TK_BlockEntry: 2085 // We got an unindented BlockEntry sequence. This is not terminated with 2086 // a BlockEnd. 2087 // Don't eat the TK_BlockEntry, SequenceNode needs it. 2088 return new (NodeAllocator) SequenceNode( stream.CurrentDoc 2089 , AnchorInfo.Range.substr(1) 2090 , TagInfo.Range 2091 , SequenceNode::ST_Indentless); 2092 case Token::TK_BlockSequenceStart: 2093 getNext(); 2094 return new (NodeAllocator) 2095 SequenceNode( stream.CurrentDoc 2096 , AnchorInfo.Range.substr(1) 2097 , TagInfo.Range 2098 , SequenceNode::ST_Block); 2099 case Token::TK_BlockMappingStart: 2100 getNext(); 2101 return new (NodeAllocator) 2102 MappingNode( stream.CurrentDoc 2103 , AnchorInfo.Range.substr(1) 2104 , TagInfo.Range 2105 , MappingNode::MT_Block); 2106 case Token::TK_FlowSequenceStart: 2107 getNext(); 2108 return new (NodeAllocator) 2109 SequenceNode( stream.CurrentDoc 2110 , AnchorInfo.Range.substr(1) 2111 , TagInfo.Range 2112 , SequenceNode::ST_Flow); 2113 case Token::TK_FlowMappingStart: 2114 getNext(); 2115 return new (NodeAllocator) 2116 MappingNode( stream.CurrentDoc 2117 , AnchorInfo.Range.substr(1) 2118 , TagInfo.Range 2119 , MappingNode::MT_Flow); 2120 case Token::TK_Scalar: 2121 getNext(); 2122 return new (NodeAllocator) 2123 ScalarNode( stream.CurrentDoc 2124 , AnchorInfo.Range.substr(1) 2125 , TagInfo.Range 2126 , T.Range); 2127 case Token::TK_Key: 2128 // Don't eat the TK_Key, KeyValueNode expects it. 2129 return new (NodeAllocator) 2130 MappingNode( stream.CurrentDoc 2131 , AnchorInfo.Range.substr(1) 2132 , TagInfo.Range 2133 , MappingNode::MT_Inline); 2134 case Token::TK_DocumentStart: 2135 case Token::TK_DocumentEnd: 2136 case Token::TK_StreamEnd: 2137 default: 2138 // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not 2139 // !!null null. 2140 return new (NodeAllocator) NullNode(stream.CurrentDoc); 2141 case Token::TK_Error: 2142 return nullptr; 2143 } 2144 llvm_unreachable("Control flow shouldn't reach here."); 2145 return nullptr; 2146 } 2147 2148 bool Document::parseDirectives() { 2149 bool isDirective = false; 2150 while (true) { 2151 Token T = peekNext(); 2152 if (T.Kind == Token::TK_TagDirective) { 2153 parseTAGDirective(); 2154 isDirective = true; 2155 } else if (T.Kind == Token::TK_VersionDirective) { 2156 parseYAMLDirective(); 2157 isDirective = true; 2158 } else 2159 break; 2160 } 2161 return isDirective; 2162 } 2163 2164 void Document::parseYAMLDirective() { 2165 getNext(); // Eat %YAML <version> 2166 } 2167 2168 void Document::parseTAGDirective() { 2169 Token Tag = getNext(); // %TAG <handle> <prefix> 2170 StringRef T = Tag.Range; 2171 // Strip %TAG 2172 T = T.substr(T.find_first_of(" \t")).ltrim(" \t"); 2173 std::size_t HandleEnd = T.find_first_of(" \t"); 2174 StringRef TagHandle = T.substr(0, HandleEnd); 2175 StringRef TagPrefix = T.substr(HandleEnd).ltrim(" \t"); 2176 TagMap[TagHandle] = TagPrefix; 2177 } 2178 2179 bool Document::expectToken(int TK) { 2180 Token T = getNext(); 2181 if (T.Kind != TK) { 2182 setError("Unexpected token", T); 2183 return false; 2184 } 2185 return true; 2186 } 2187