1 //===--- YAMLParser.cpp - Simple YAML parser ------------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file implements a YAML parser. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "llvm/Support/YAMLParser.h" 15 #include "llvm/ADT/SmallVector.h" 16 #include "llvm/ADT/StringExtras.h" 17 #include "llvm/ADT/Twine.h" 18 #include "llvm/ADT/ilist.h" 19 #include "llvm/ADT/ilist_node.h" 20 #include "llvm/Support/ErrorHandling.h" 21 #include "llvm/Support/MemoryBuffer.h" 22 #include "llvm/Support/SourceMgr.h" 23 #include "llvm/Support/raw_ostream.h" 24 25 using namespace llvm; 26 using namespace yaml; 27 28 enum UnicodeEncodingForm { 29 UEF_UTF32_LE, ///< UTF-32 Little Endian 30 UEF_UTF32_BE, ///< UTF-32 Big Endian 31 UEF_UTF16_LE, ///< UTF-16 Little Endian 32 UEF_UTF16_BE, ///< UTF-16 Big Endian 33 UEF_UTF8, ///< UTF-8 or ascii. 34 UEF_Unknown ///< Not a valid Unicode encoding. 35 }; 36 37 /// EncodingInfo - Holds the encoding type and length of the byte order mark if 38 /// it exists. Length is in {0, 2, 3, 4}. 39 typedef std::pair<UnicodeEncodingForm, unsigned> EncodingInfo; 40 41 /// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode 42 /// encoding form of \a Input. 43 /// 44 /// @param Input A string of length 0 or more. 45 /// @returns An EncodingInfo indicating the Unicode encoding form of the input 46 /// and how long the byte order mark is if one exists. 47 static EncodingInfo getUnicodeEncoding(StringRef Input) { 48 if (Input.size() == 0) 49 return std::make_pair(UEF_Unknown, 0); 50 51 switch (uint8_t(Input[0])) { 52 case 0x00: 53 if (Input.size() >= 4) { 54 if ( Input[1] == 0 55 && uint8_t(Input[2]) == 0xFE 56 && uint8_t(Input[3]) == 0xFF) 57 return std::make_pair(UEF_UTF32_BE, 4); 58 if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0) 59 return std::make_pair(UEF_UTF32_BE, 0); 60 } 61 62 if (Input.size() >= 2 && Input[1] != 0) 63 return std::make_pair(UEF_UTF16_BE, 0); 64 return std::make_pair(UEF_Unknown, 0); 65 case 0xFF: 66 if ( Input.size() >= 4 67 && uint8_t(Input[1]) == 0xFE 68 && Input[2] == 0 69 && Input[3] == 0) 70 return std::make_pair(UEF_UTF32_LE, 4); 71 72 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE) 73 return std::make_pair(UEF_UTF16_LE, 2); 74 return std::make_pair(UEF_Unknown, 0); 75 case 0xFE: 76 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF) 77 return std::make_pair(UEF_UTF16_BE, 2); 78 return std::make_pair(UEF_Unknown, 0); 79 case 0xEF: 80 if ( Input.size() >= 3 81 && uint8_t(Input[1]) == 0xBB 82 && uint8_t(Input[2]) == 0xBF) 83 return std::make_pair(UEF_UTF8, 3); 84 return std::make_pair(UEF_Unknown, 0); 85 } 86 87 // It could still be utf-32 or utf-16. 88 if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0) 89 return std::make_pair(UEF_UTF32_LE, 0); 90 91 if (Input.size() >= 2 && Input[1] == 0) 92 return std::make_pair(UEF_UTF16_LE, 0); 93 94 return std::make_pair(UEF_UTF8, 0); 95 } 96 97 namespace llvm { 98 namespace yaml { 99 /// Pin the vtables to this file. 100 void Node::anchor() {} 101 void NullNode::anchor() {} 102 void ScalarNode::anchor() {} 103 void KeyValueNode::anchor() {} 104 void MappingNode::anchor() {} 105 void SequenceNode::anchor() {} 106 void AliasNode::anchor() {} 107 108 /// Token - A single YAML token. 109 struct Token : ilist_node<Token> { 110 enum TokenKind { 111 TK_Error, // Uninitialized token. 112 TK_StreamStart, 113 TK_StreamEnd, 114 TK_VersionDirective, 115 TK_TagDirective, 116 TK_DocumentStart, 117 TK_DocumentEnd, 118 TK_BlockEntry, 119 TK_BlockEnd, 120 TK_BlockSequenceStart, 121 TK_BlockMappingStart, 122 TK_FlowEntry, 123 TK_FlowSequenceStart, 124 TK_FlowSequenceEnd, 125 TK_FlowMappingStart, 126 TK_FlowMappingEnd, 127 TK_Key, 128 TK_Value, 129 TK_Scalar, 130 TK_Alias, 131 TK_Anchor, 132 TK_Tag 133 } Kind; 134 135 /// A string of length 0 or more whose begin() points to the logical location 136 /// of the token in the input. 137 StringRef Range; 138 139 Token() : Kind(TK_Error) {} 140 }; 141 } 142 } 143 144 namespace llvm { 145 template<> 146 struct ilist_sentinel_traits<Token> { 147 Token *createSentinel() const { 148 return &Sentinel; 149 } 150 static void destroySentinel(Token*) {} 151 152 Token *provideInitialHead() const { return createSentinel(); } 153 Token *ensureHead(Token*) const { return createSentinel(); } 154 static void noteHead(Token*, Token*) {} 155 156 private: 157 mutable Token Sentinel; 158 }; 159 160 template<> 161 struct ilist_node_traits<Token> { 162 Token *createNode(const Token &V) { 163 return new (Alloc.Allocate<Token>()) Token(V); 164 } 165 static void deleteNode(Token *V) {} 166 167 void addNodeToList(Token *) {} 168 void removeNodeFromList(Token *) {} 169 void transferNodesFromList(ilist_node_traits & /*SrcTraits*/, 170 ilist_iterator<Token> /*first*/, 171 ilist_iterator<Token> /*last*/) {} 172 173 BumpPtrAllocator Alloc; 174 }; 175 } 176 177 typedef ilist<Token> TokenQueueT; 178 179 namespace { 180 /// @brief This struct is used to track simple keys. 181 /// 182 /// Simple keys are handled by creating an entry in SimpleKeys for each Token 183 /// which could legally be the start of a simple key. When peekNext is called, 184 /// if the Token To be returned is referenced by a SimpleKey, we continue 185 /// tokenizing until that potential simple key has either been found to not be 186 /// a simple key (we moved on to the next line or went further than 1024 chars). 187 /// Or when we run into a Value, and then insert a Key token (and possibly 188 /// others) before the SimpleKey's Tok. 189 struct SimpleKey { 190 TokenQueueT::iterator Tok; 191 unsigned Column; 192 unsigned Line; 193 unsigned FlowLevel; 194 bool IsRequired; 195 196 bool operator ==(const SimpleKey &Other) { 197 return Tok == Other.Tok; 198 } 199 }; 200 } 201 202 /// @brief The Unicode scalar value of a UTF-8 minimal well-formed code unit 203 /// subsequence and the subsequence's length in code units (uint8_t). 204 /// A length of 0 represents an error. 205 typedef std::pair<uint32_t, unsigned> UTF8Decoded; 206 207 static UTF8Decoded decodeUTF8(StringRef Range) { 208 StringRef::iterator Position= Range.begin(); 209 StringRef::iterator End = Range.end(); 210 // 1 byte: [0x00, 0x7f] 211 // Bit pattern: 0xxxxxxx 212 if ((*Position & 0x80) == 0) { 213 return std::make_pair(*Position, 1); 214 } 215 // 2 bytes: [0x80, 0x7ff] 216 // Bit pattern: 110xxxxx 10xxxxxx 217 if (Position + 1 != End && 218 ((*Position & 0xE0) == 0xC0) && 219 ((*(Position + 1) & 0xC0) == 0x80)) { 220 uint32_t codepoint = ((*Position & 0x1F) << 6) | 221 (*(Position + 1) & 0x3F); 222 if (codepoint >= 0x80) 223 return std::make_pair(codepoint, 2); 224 } 225 // 3 bytes: [0x8000, 0xffff] 226 // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx 227 if (Position + 2 != End && 228 ((*Position & 0xF0) == 0xE0) && 229 ((*(Position + 1) & 0xC0) == 0x80) && 230 ((*(Position + 2) & 0xC0) == 0x80)) { 231 uint32_t codepoint = ((*Position & 0x0F) << 12) | 232 ((*(Position + 1) & 0x3F) << 6) | 233 (*(Position + 2) & 0x3F); 234 // Codepoints between 0xD800 and 0xDFFF are invalid, as 235 // they are high / low surrogate halves used by UTF-16. 236 if (codepoint >= 0x800 && 237 (codepoint < 0xD800 || codepoint > 0xDFFF)) 238 return std::make_pair(codepoint, 3); 239 } 240 // 4 bytes: [0x10000, 0x10FFFF] 241 // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 242 if (Position + 3 != End && 243 ((*Position & 0xF8) == 0xF0) && 244 ((*(Position + 1) & 0xC0) == 0x80) && 245 ((*(Position + 2) & 0xC0) == 0x80) && 246 ((*(Position + 3) & 0xC0) == 0x80)) { 247 uint32_t codepoint = ((*Position & 0x07) << 18) | 248 ((*(Position + 1) & 0x3F) << 12) | 249 ((*(Position + 2) & 0x3F) << 6) | 250 (*(Position + 3) & 0x3F); 251 if (codepoint >= 0x10000 && codepoint <= 0x10FFFF) 252 return std::make_pair(codepoint, 4); 253 } 254 return std::make_pair(0, 0); 255 } 256 257 namespace llvm { 258 namespace yaml { 259 /// @brief Scans YAML tokens from a MemoryBuffer. 260 class Scanner { 261 public: 262 Scanner(const StringRef Input, SourceMgr &SM); 263 Scanner(std::unique_ptr<MemoryBuffer> Buffer, SourceMgr &SM_); 264 265 /// @brief Parse the next token and return it without popping it. 266 Token &peekNext(); 267 268 /// @brief Parse the next token and pop it from the queue. 269 Token getNext(); 270 271 void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message, 272 ArrayRef<SMRange> Ranges = None) { 273 SM.PrintMessage(Loc, Kind, Message, Ranges); 274 } 275 276 void setError(const Twine &Message, StringRef::iterator Position) { 277 if (Current >= End) 278 Current = End - 1; 279 280 // Don't print out more errors after the first one we encounter. The rest 281 // are just the result of the first, and have no meaning. 282 if (!Failed) 283 printError(SMLoc::getFromPointer(Current), SourceMgr::DK_Error, Message); 284 Failed = true; 285 } 286 287 void setError(const Twine &Message) { 288 setError(Message, Current); 289 } 290 291 /// @brief Returns true if an error occurred while parsing. 292 bool failed() { 293 return Failed; 294 } 295 296 private: 297 StringRef currentInput() { 298 return StringRef(Current, End - Current); 299 } 300 301 /// @brief Decode a UTF-8 minimal well-formed code unit subsequence starting 302 /// at \a Position. 303 /// 304 /// If the UTF-8 code units starting at Position do not form a well-formed 305 /// code unit subsequence, then the Unicode scalar value is 0, and the length 306 /// is 0. 307 UTF8Decoded decodeUTF8(StringRef::iterator Position) { 308 return ::decodeUTF8(StringRef(Position, End - Position)); 309 } 310 311 // The following functions are based on the gramar rules in the YAML spec. The 312 // style of the function names it meant to closely match how they are written 313 // in the spec. The number within the [] is the number of the grammar rule in 314 // the spec. 315 // 316 // See 4.2 [Production Naming Conventions] for the meaning of the prefixes. 317 // 318 // c- 319 // A production starting and ending with a special character. 320 // b- 321 // A production matching a single line break. 322 // nb- 323 // A production starting and ending with a non-break character. 324 // s- 325 // A production starting and ending with a white space character. 326 // ns- 327 // A production starting and ending with a non-space character. 328 // l- 329 // A production matching complete line(s). 330 331 /// @brief Skip a single nb-char[27] starting at Position. 332 /// 333 /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE] 334 /// | [0xFF00-0xFFFD] | [0x10000-0x10FFFF] 335 /// 336 /// @returns The code unit after the nb-char, or Position if it's not an 337 /// nb-char. 338 StringRef::iterator skip_nb_char(StringRef::iterator Position); 339 340 /// @brief Skip a single b-break[28] starting at Position. 341 /// 342 /// A b-break is 0xD 0xA | 0xD | 0xA 343 /// 344 /// @returns The code unit after the b-break, or Position if it's not a 345 /// b-break. 346 StringRef::iterator skip_b_break(StringRef::iterator Position); 347 348 /// @brief Skip a single s-white[33] starting at Position. 349 /// 350 /// A s-white is 0x20 | 0x9 351 /// 352 /// @returns The code unit after the s-white, or Position if it's not a 353 /// s-white. 354 StringRef::iterator skip_s_white(StringRef::iterator Position); 355 356 /// @brief Skip a single ns-char[34] starting at Position. 357 /// 358 /// A ns-char is nb-char - s-white 359 /// 360 /// @returns The code unit after the ns-char, or Position if it's not a 361 /// ns-char. 362 StringRef::iterator skip_ns_char(StringRef::iterator Position); 363 364 typedef StringRef::iterator (Scanner::*SkipWhileFunc)(StringRef::iterator); 365 /// @brief Skip minimal well-formed code unit subsequences until Func 366 /// returns its input. 367 /// 368 /// @returns The code unit after the last minimal well-formed code unit 369 /// subsequence that Func accepted. 370 StringRef::iterator skip_while( SkipWhileFunc Func 371 , StringRef::iterator Position); 372 373 /// @brief Scan ns-uri-char[39]s starting at Cur. 374 /// 375 /// This updates Cur and Column while scanning. 376 /// 377 /// @returns A StringRef starting at Cur which covers the longest contiguous 378 /// sequence of ns-uri-char. 379 StringRef scan_ns_uri_char(); 380 381 /// @brief Consume a minimal well-formed code unit subsequence starting at 382 /// \a Cur. Return false if it is not the same Unicode scalar value as 383 /// \a Expected. This updates \a Column. 384 bool consume(uint32_t Expected); 385 386 /// @brief Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column. 387 void skip(uint32_t Distance); 388 389 /// @brief Return true if the minimal well-formed code unit subsequence at 390 /// Pos is whitespace or a new line 391 bool isBlankOrBreak(StringRef::iterator Position); 392 393 /// @brief If IsSimpleKeyAllowed, create and push_back a new SimpleKey. 394 void saveSimpleKeyCandidate( TokenQueueT::iterator Tok 395 , unsigned AtColumn 396 , bool IsRequired); 397 398 /// @brief Remove simple keys that can no longer be valid simple keys. 399 /// 400 /// Invalid simple keys are not on the current line or are further than 1024 401 /// columns back. 402 void removeStaleSimpleKeyCandidates(); 403 404 /// @brief Remove all simple keys on FlowLevel \a Level. 405 void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level); 406 407 /// @brief Unroll indentation in \a Indents back to \a Col. Creates BlockEnd 408 /// tokens if needed. 409 bool unrollIndent(int ToColumn); 410 411 /// @brief Increase indent to \a Col. Creates \a Kind token at \a InsertPoint 412 /// if needed. 413 bool rollIndent( int ToColumn 414 , Token::TokenKind Kind 415 , TokenQueueT::iterator InsertPoint); 416 417 /// @brief Skip whitespace and comments until the start of the next token. 418 void scanToNextToken(); 419 420 /// @brief Must be the first token generated. 421 bool scanStreamStart(); 422 423 /// @brief Generate tokens needed to close out the stream. 424 bool scanStreamEnd(); 425 426 /// @brief Scan a %BLAH directive. 427 bool scanDirective(); 428 429 /// @brief Scan a ... or ---. 430 bool scanDocumentIndicator(bool IsStart); 431 432 /// @brief Scan a [ or { and generate the proper flow collection start token. 433 bool scanFlowCollectionStart(bool IsSequence); 434 435 /// @brief Scan a ] or } and generate the proper flow collection end token. 436 bool scanFlowCollectionEnd(bool IsSequence); 437 438 /// @brief Scan the , that separates entries in a flow collection. 439 bool scanFlowEntry(); 440 441 /// @brief Scan the - that starts block sequence entries. 442 bool scanBlockEntry(); 443 444 /// @brief Scan an explicit ? indicating a key. 445 bool scanKey(); 446 447 /// @brief Scan an explicit : indicating a value. 448 bool scanValue(); 449 450 /// @brief Scan a quoted scalar. 451 bool scanFlowScalar(bool IsDoubleQuoted); 452 453 /// @brief Scan an unquoted scalar. 454 bool scanPlainScalar(); 455 456 /// @brief Scan an Alias or Anchor starting with * or &. 457 bool scanAliasOrAnchor(bool IsAlias); 458 459 /// @brief Scan a block scalar starting with | or >. 460 bool scanBlockScalar(bool IsLiteral); 461 462 /// @brief Scan a tag of the form !stuff. 463 bool scanTag(); 464 465 /// @brief Dispatch to the next scanning function based on \a *Cur. 466 bool fetchMoreTokens(); 467 468 /// @brief The SourceMgr used for diagnostics and buffer management. 469 SourceMgr &SM; 470 471 /// @brief The original input. 472 MemoryBuffer *InputBuffer; 473 474 /// @brief The current position of the scanner. 475 StringRef::iterator Current; 476 477 /// @brief The end of the input (one past the last character). 478 StringRef::iterator End; 479 480 /// @brief Current YAML indentation level in spaces. 481 int Indent; 482 483 /// @brief Current column number in Unicode code points. 484 unsigned Column; 485 486 /// @brief Current line number. 487 unsigned Line; 488 489 /// @brief How deep we are in flow style containers. 0 Means at block level. 490 unsigned FlowLevel; 491 492 /// @brief Are we at the start of the stream? 493 bool IsStartOfStream; 494 495 /// @brief Can the next token be the start of a simple key? 496 bool IsSimpleKeyAllowed; 497 498 /// @brief True if an error has occurred. 499 bool Failed; 500 501 /// @brief Queue of tokens. This is required to queue up tokens while looking 502 /// for the end of a simple key. And for cases where a single character 503 /// can produce multiple tokens (e.g. BlockEnd). 504 TokenQueueT TokenQueue; 505 506 /// @brief Indentation levels. 507 SmallVector<int, 4> Indents; 508 509 /// @brief Potential simple keys. 510 SmallVector<SimpleKey, 4> SimpleKeys; 511 }; 512 513 } // end namespace yaml 514 } // end namespace llvm 515 516 /// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result. 517 static void encodeUTF8( uint32_t UnicodeScalarValue 518 , SmallVectorImpl<char> &Result) { 519 if (UnicodeScalarValue <= 0x7F) { 520 Result.push_back(UnicodeScalarValue & 0x7F); 521 } else if (UnicodeScalarValue <= 0x7FF) { 522 uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6); 523 uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F); 524 Result.push_back(FirstByte); 525 Result.push_back(SecondByte); 526 } else if (UnicodeScalarValue <= 0xFFFF) { 527 uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12); 528 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 529 uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F); 530 Result.push_back(FirstByte); 531 Result.push_back(SecondByte); 532 Result.push_back(ThirdByte); 533 } else if (UnicodeScalarValue <= 0x10FFFF) { 534 uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18); 535 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12); 536 uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 537 uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F); 538 Result.push_back(FirstByte); 539 Result.push_back(SecondByte); 540 Result.push_back(ThirdByte); 541 Result.push_back(FourthByte); 542 } 543 } 544 545 bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) { 546 SourceMgr SM; 547 Scanner scanner(Input, SM); 548 while (true) { 549 Token T = scanner.getNext(); 550 switch (T.Kind) { 551 case Token::TK_StreamStart: 552 OS << "Stream-Start: "; 553 break; 554 case Token::TK_StreamEnd: 555 OS << "Stream-End: "; 556 break; 557 case Token::TK_VersionDirective: 558 OS << "Version-Directive: "; 559 break; 560 case Token::TK_TagDirective: 561 OS << "Tag-Directive: "; 562 break; 563 case Token::TK_DocumentStart: 564 OS << "Document-Start: "; 565 break; 566 case Token::TK_DocumentEnd: 567 OS << "Document-End: "; 568 break; 569 case Token::TK_BlockEntry: 570 OS << "Block-Entry: "; 571 break; 572 case Token::TK_BlockEnd: 573 OS << "Block-End: "; 574 break; 575 case Token::TK_BlockSequenceStart: 576 OS << "Block-Sequence-Start: "; 577 break; 578 case Token::TK_BlockMappingStart: 579 OS << "Block-Mapping-Start: "; 580 break; 581 case Token::TK_FlowEntry: 582 OS << "Flow-Entry: "; 583 break; 584 case Token::TK_FlowSequenceStart: 585 OS << "Flow-Sequence-Start: "; 586 break; 587 case Token::TK_FlowSequenceEnd: 588 OS << "Flow-Sequence-End: "; 589 break; 590 case Token::TK_FlowMappingStart: 591 OS << "Flow-Mapping-Start: "; 592 break; 593 case Token::TK_FlowMappingEnd: 594 OS << "Flow-Mapping-End: "; 595 break; 596 case Token::TK_Key: 597 OS << "Key: "; 598 break; 599 case Token::TK_Value: 600 OS << "Value: "; 601 break; 602 case Token::TK_Scalar: 603 OS << "Scalar: "; 604 break; 605 case Token::TK_Alias: 606 OS << "Alias: "; 607 break; 608 case Token::TK_Anchor: 609 OS << "Anchor: "; 610 break; 611 case Token::TK_Tag: 612 OS << "Tag: "; 613 break; 614 case Token::TK_Error: 615 break; 616 } 617 OS << T.Range << "\n"; 618 if (T.Kind == Token::TK_StreamEnd) 619 break; 620 else if (T.Kind == Token::TK_Error) 621 return false; 622 } 623 return true; 624 } 625 626 bool yaml::scanTokens(StringRef Input) { 627 llvm::SourceMgr SM; 628 llvm::yaml::Scanner scanner(Input, SM); 629 for (;;) { 630 llvm::yaml::Token T = scanner.getNext(); 631 if (T.Kind == Token::TK_StreamEnd) 632 break; 633 else if (T.Kind == Token::TK_Error) 634 return false; 635 } 636 return true; 637 } 638 639 std::string yaml::escape(StringRef Input) { 640 std::string EscapedInput; 641 for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) { 642 if (*i == '\\') 643 EscapedInput += "\\\\"; 644 else if (*i == '"') 645 EscapedInput += "\\\""; 646 else if (*i == 0) 647 EscapedInput += "\\0"; 648 else if (*i == 0x07) 649 EscapedInput += "\\a"; 650 else if (*i == 0x08) 651 EscapedInput += "\\b"; 652 else if (*i == 0x09) 653 EscapedInput += "\\t"; 654 else if (*i == 0x0A) 655 EscapedInput += "\\n"; 656 else if (*i == 0x0B) 657 EscapedInput += "\\v"; 658 else if (*i == 0x0C) 659 EscapedInput += "\\f"; 660 else if (*i == 0x0D) 661 EscapedInput += "\\r"; 662 else if (*i == 0x1B) 663 EscapedInput += "\\e"; 664 else if ((unsigned char)*i < 0x20) { // Control characters not handled above. 665 std::string HexStr = utohexstr(*i); 666 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 667 } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence. 668 UTF8Decoded UnicodeScalarValue 669 = decodeUTF8(StringRef(i, Input.end() - i)); 670 if (UnicodeScalarValue.second == 0) { 671 // Found invalid char. 672 SmallString<4> Val; 673 encodeUTF8(0xFFFD, Val); 674 EscapedInput.insert(EscapedInput.end(), Val.begin(), Val.end()); 675 // FIXME: Error reporting. 676 return EscapedInput; 677 } 678 if (UnicodeScalarValue.first == 0x85) 679 EscapedInput += "\\N"; 680 else if (UnicodeScalarValue.first == 0xA0) 681 EscapedInput += "\\_"; 682 else if (UnicodeScalarValue.first == 0x2028) 683 EscapedInput += "\\L"; 684 else if (UnicodeScalarValue.first == 0x2029) 685 EscapedInput += "\\P"; 686 else { 687 std::string HexStr = utohexstr(UnicodeScalarValue.first); 688 if (HexStr.size() <= 2) 689 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 690 else if (HexStr.size() <= 4) 691 EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr; 692 else if (HexStr.size() <= 8) 693 EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr; 694 } 695 i += UnicodeScalarValue.second - 1; 696 } else 697 EscapedInput.push_back(*i); 698 } 699 return EscapedInput; 700 } 701 702 Scanner::Scanner(StringRef Input, SourceMgr &sm) 703 : SM(sm) 704 , Indent(-1) 705 , Column(0) 706 , Line(0) 707 , FlowLevel(0) 708 , IsStartOfStream(true) 709 , IsSimpleKeyAllowed(true) 710 , Failed(false) { 711 std::unique_ptr<MemoryBuffer> InputBufferOwner( 712 MemoryBuffer::getMemBuffer(Input, "YAML")); 713 InputBuffer = InputBufferOwner.get(); 714 SM.AddNewSourceBuffer(std::move(InputBufferOwner), SMLoc()); 715 Current = InputBuffer->getBufferStart(); 716 End = InputBuffer->getBufferEnd(); 717 } 718 719 Scanner::Scanner(std::unique_ptr<MemoryBuffer> Buffer, SourceMgr &SM_) 720 : SM(SM_), InputBuffer(Buffer.get()), 721 Current(InputBuffer->getBufferStart()), End(InputBuffer->getBufferEnd()), 722 Indent(-1), Column(0), Line(0), FlowLevel(0), IsStartOfStream(true), 723 IsSimpleKeyAllowed(true), Failed(false) { 724 SM.AddNewSourceBuffer(std::move(Buffer), SMLoc()); 725 } 726 727 Token &Scanner::peekNext() { 728 // If the current token is a possible simple key, keep parsing until we 729 // can confirm. 730 bool NeedMore = false; 731 while (true) { 732 if (TokenQueue.empty() || NeedMore) { 733 if (!fetchMoreTokens()) { 734 TokenQueue.clear(); 735 TokenQueue.push_back(Token()); 736 return TokenQueue.front(); 737 } 738 } 739 assert(!TokenQueue.empty() && 740 "fetchMoreTokens lied about getting tokens!"); 741 742 removeStaleSimpleKeyCandidates(); 743 SimpleKey SK; 744 SK.Tok = TokenQueue.front(); 745 if (std::find(SimpleKeys.begin(), SimpleKeys.end(), SK) 746 == SimpleKeys.end()) 747 break; 748 else 749 NeedMore = true; 750 } 751 return TokenQueue.front(); 752 } 753 754 Token Scanner::getNext() { 755 Token Ret = peekNext(); 756 // TokenQueue can be empty if there was an error getting the next token. 757 if (!TokenQueue.empty()) 758 TokenQueue.pop_front(); 759 760 // There cannot be any referenced Token's if the TokenQueue is empty. So do a 761 // quick deallocation of them all. 762 if (TokenQueue.empty()) { 763 TokenQueue.Alloc.Reset(); 764 } 765 766 return Ret; 767 } 768 769 StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) { 770 if (Position == End) 771 return Position; 772 // Check 7 bit c-printable - b-char. 773 if ( *Position == 0x09 774 || (*Position >= 0x20 && *Position <= 0x7E)) 775 return Position + 1; 776 777 // Check for valid UTF-8. 778 if (uint8_t(*Position) & 0x80) { 779 UTF8Decoded u8d = decodeUTF8(Position); 780 if ( u8d.second != 0 781 && u8d.first != 0xFEFF 782 && ( u8d.first == 0x85 783 || ( u8d.first >= 0xA0 784 && u8d.first <= 0xD7FF) 785 || ( u8d.first >= 0xE000 786 && u8d.first <= 0xFFFD) 787 || ( u8d.first >= 0x10000 788 && u8d.first <= 0x10FFFF))) 789 return Position + u8d.second; 790 } 791 return Position; 792 } 793 794 StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) { 795 if (Position == End) 796 return Position; 797 if (*Position == 0x0D) { 798 if (Position + 1 != End && *(Position + 1) == 0x0A) 799 return Position + 2; 800 return Position + 1; 801 } 802 803 if (*Position == 0x0A) 804 return Position + 1; 805 return Position; 806 } 807 808 809 StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) { 810 if (Position == End) 811 return Position; 812 if (*Position == ' ' || *Position == '\t') 813 return Position + 1; 814 return Position; 815 } 816 817 StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) { 818 if (Position == End) 819 return Position; 820 if (*Position == ' ' || *Position == '\t') 821 return Position; 822 return skip_nb_char(Position); 823 } 824 825 StringRef::iterator Scanner::skip_while( SkipWhileFunc Func 826 , StringRef::iterator Position) { 827 while (true) { 828 StringRef::iterator i = (this->*Func)(Position); 829 if (i == Position) 830 break; 831 Position = i; 832 } 833 return Position; 834 } 835 836 static bool is_ns_hex_digit(const char C) { 837 return (C >= '0' && C <= '9') 838 || (C >= 'a' && C <= 'z') 839 || (C >= 'A' && C <= 'Z'); 840 } 841 842 static bool is_ns_word_char(const char C) { 843 return C == '-' 844 || (C >= 'a' && C <= 'z') 845 || (C >= 'A' && C <= 'Z'); 846 } 847 848 StringRef Scanner::scan_ns_uri_char() { 849 StringRef::iterator Start = Current; 850 while (true) { 851 if (Current == End) 852 break; 853 if (( *Current == '%' 854 && Current + 2 < End 855 && is_ns_hex_digit(*(Current + 1)) 856 && is_ns_hex_digit(*(Current + 2))) 857 || is_ns_word_char(*Current) 858 || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]") 859 != StringRef::npos) { 860 ++Current; 861 ++Column; 862 } else 863 break; 864 } 865 return StringRef(Start, Current - Start); 866 } 867 868 bool Scanner::consume(uint32_t Expected) { 869 if (Expected >= 0x80) 870 report_fatal_error("Not dealing with this yet"); 871 if (Current == End) 872 return false; 873 if (uint8_t(*Current) >= 0x80) 874 report_fatal_error("Not dealing with this yet"); 875 if (uint8_t(*Current) == Expected) { 876 ++Current; 877 ++Column; 878 return true; 879 } 880 return false; 881 } 882 883 void Scanner::skip(uint32_t Distance) { 884 Current += Distance; 885 Column += Distance; 886 assert(Current <= End && "Skipped past the end"); 887 } 888 889 bool Scanner::isBlankOrBreak(StringRef::iterator Position) { 890 if (Position == End) 891 return false; 892 if ( *Position == ' ' || *Position == '\t' 893 || *Position == '\r' || *Position == '\n') 894 return true; 895 return false; 896 } 897 898 void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok 899 , unsigned AtColumn 900 , bool IsRequired) { 901 if (IsSimpleKeyAllowed) { 902 SimpleKey SK; 903 SK.Tok = Tok; 904 SK.Line = Line; 905 SK.Column = AtColumn; 906 SK.IsRequired = IsRequired; 907 SK.FlowLevel = FlowLevel; 908 SimpleKeys.push_back(SK); 909 } 910 } 911 912 void Scanner::removeStaleSimpleKeyCandidates() { 913 for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin(); 914 i != SimpleKeys.end();) { 915 if (i->Line != Line || i->Column + 1024 < Column) { 916 if (i->IsRequired) 917 setError( "Could not find expected : for simple key" 918 , i->Tok->Range.begin()); 919 i = SimpleKeys.erase(i); 920 } else 921 ++i; 922 } 923 } 924 925 void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) { 926 if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level) 927 SimpleKeys.pop_back(); 928 } 929 930 bool Scanner::unrollIndent(int ToColumn) { 931 Token T; 932 // Indentation is ignored in flow. 933 if (FlowLevel != 0) 934 return true; 935 936 while (Indent > ToColumn) { 937 T.Kind = Token::TK_BlockEnd; 938 T.Range = StringRef(Current, 1); 939 TokenQueue.push_back(T); 940 Indent = Indents.pop_back_val(); 941 } 942 943 return true; 944 } 945 946 bool Scanner::rollIndent( int ToColumn 947 , Token::TokenKind Kind 948 , TokenQueueT::iterator InsertPoint) { 949 if (FlowLevel) 950 return true; 951 if (Indent < ToColumn) { 952 Indents.push_back(Indent); 953 Indent = ToColumn; 954 955 Token T; 956 T.Kind = Kind; 957 T.Range = StringRef(Current, 0); 958 TokenQueue.insert(InsertPoint, T); 959 } 960 return true; 961 } 962 963 void Scanner::scanToNextToken() { 964 while (true) { 965 while (*Current == ' ' || *Current == '\t') { 966 skip(1); 967 } 968 969 // Skip comment. 970 if (*Current == '#') { 971 while (true) { 972 // This may skip more than one byte, thus Column is only incremented 973 // for code points. 974 StringRef::iterator i = skip_nb_char(Current); 975 if (i == Current) 976 break; 977 Current = i; 978 ++Column; 979 } 980 } 981 982 // Skip EOL. 983 StringRef::iterator i = skip_b_break(Current); 984 if (i == Current) 985 break; 986 Current = i; 987 ++Line; 988 Column = 0; 989 // New lines may start a simple key. 990 if (!FlowLevel) 991 IsSimpleKeyAllowed = true; 992 } 993 } 994 995 bool Scanner::scanStreamStart() { 996 IsStartOfStream = false; 997 998 EncodingInfo EI = getUnicodeEncoding(currentInput()); 999 1000 Token T; 1001 T.Kind = Token::TK_StreamStart; 1002 T.Range = StringRef(Current, EI.second); 1003 TokenQueue.push_back(T); 1004 Current += EI.second; 1005 return true; 1006 } 1007 1008 bool Scanner::scanStreamEnd() { 1009 // Force an ending new line if one isn't present. 1010 if (Column != 0) { 1011 Column = 0; 1012 ++Line; 1013 } 1014 1015 unrollIndent(-1); 1016 SimpleKeys.clear(); 1017 IsSimpleKeyAllowed = false; 1018 1019 Token T; 1020 T.Kind = Token::TK_StreamEnd; 1021 T.Range = StringRef(Current, 0); 1022 TokenQueue.push_back(T); 1023 return true; 1024 } 1025 1026 bool Scanner::scanDirective() { 1027 // Reset the indentation level. 1028 unrollIndent(-1); 1029 SimpleKeys.clear(); 1030 IsSimpleKeyAllowed = false; 1031 1032 StringRef::iterator Start = Current; 1033 consume('%'); 1034 StringRef::iterator NameStart = Current; 1035 Current = skip_while(&Scanner::skip_ns_char, Current); 1036 StringRef Name(NameStart, Current - NameStart); 1037 Current = skip_while(&Scanner::skip_s_white, Current); 1038 1039 Token T; 1040 if (Name == "YAML") { 1041 Current = skip_while(&Scanner::skip_ns_char, Current); 1042 T.Kind = Token::TK_VersionDirective; 1043 T.Range = StringRef(Start, Current - Start); 1044 TokenQueue.push_back(T); 1045 return true; 1046 } else if(Name == "TAG") { 1047 Current = skip_while(&Scanner::skip_ns_char, Current); 1048 Current = skip_while(&Scanner::skip_s_white, Current); 1049 Current = skip_while(&Scanner::skip_ns_char, Current); 1050 T.Kind = Token::TK_TagDirective; 1051 T.Range = StringRef(Start, Current - Start); 1052 TokenQueue.push_back(T); 1053 return true; 1054 } 1055 return false; 1056 } 1057 1058 bool Scanner::scanDocumentIndicator(bool IsStart) { 1059 unrollIndent(-1); 1060 SimpleKeys.clear(); 1061 IsSimpleKeyAllowed = false; 1062 1063 Token T; 1064 T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd; 1065 T.Range = StringRef(Current, 3); 1066 skip(3); 1067 TokenQueue.push_back(T); 1068 return true; 1069 } 1070 1071 bool Scanner::scanFlowCollectionStart(bool IsSequence) { 1072 Token T; 1073 T.Kind = IsSequence ? Token::TK_FlowSequenceStart 1074 : Token::TK_FlowMappingStart; 1075 T.Range = StringRef(Current, 1); 1076 skip(1); 1077 TokenQueue.push_back(T); 1078 1079 // [ and { may begin a simple key. 1080 saveSimpleKeyCandidate(TokenQueue.back(), Column - 1, false); 1081 1082 // And may also be followed by a simple key. 1083 IsSimpleKeyAllowed = true; 1084 ++FlowLevel; 1085 return true; 1086 } 1087 1088 bool Scanner::scanFlowCollectionEnd(bool IsSequence) { 1089 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1090 IsSimpleKeyAllowed = false; 1091 Token T; 1092 T.Kind = IsSequence ? Token::TK_FlowSequenceEnd 1093 : Token::TK_FlowMappingEnd; 1094 T.Range = StringRef(Current, 1); 1095 skip(1); 1096 TokenQueue.push_back(T); 1097 if (FlowLevel) 1098 --FlowLevel; 1099 return true; 1100 } 1101 1102 bool Scanner::scanFlowEntry() { 1103 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1104 IsSimpleKeyAllowed = true; 1105 Token T; 1106 T.Kind = Token::TK_FlowEntry; 1107 T.Range = StringRef(Current, 1); 1108 skip(1); 1109 TokenQueue.push_back(T); 1110 return true; 1111 } 1112 1113 bool Scanner::scanBlockEntry() { 1114 rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end()); 1115 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1116 IsSimpleKeyAllowed = true; 1117 Token T; 1118 T.Kind = Token::TK_BlockEntry; 1119 T.Range = StringRef(Current, 1); 1120 skip(1); 1121 TokenQueue.push_back(T); 1122 return true; 1123 } 1124 1125 bool Scanner::scanKey() { 1126 if (!FlowLevel) 1127 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1128 1129 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1130 IsSimpleKeyAllowed = !FlowLevel; 1131 1132 Token T; 1133 T.Kind = Token::TK_Key; 1134 T.Range = StringRef(Current, 1); 1135 skip(1); 1136 TokenQueue.push_back(T); 1137 return true; 1138 } 1139 1140 bool Scanner::scanValue() { 1141 // If the previous token could have been a simple key, insert the key token 1142 // into the token queue. 1143 if (!SimpleKeys.empty()) { 1144 SimpleKey SK = SimpleKeys.pop_back_val(); 1145 Token T; 1146 T.Kind = Token::TK_Key; 1147 T.Range = SK.Tok->Range; 1148 TokenQueueT::iterator i, e; 1149 for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) { 1150 if (i == SK.Tok) 1151 break; 1152 } 1153 assert(i != e && "SimpleKey not in token queue!"); 1154 i = TokenQueue.insert(i, T); 1155 1156 // We may also need to add a Block-Mapping-Start token. 1157 rollIndent(SK.Column, Token::TK_BlockMappingStart, i); 1158 1159 IsSimpleKeyAllowed = false; 1160 } else { 1161 if (!FlowLevel) 1162 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1163 IsSimpleKeyAllowed = !FlowLevel; 1164 } 1165 1166 Token T; 1167 T.Kind = Token::TK_Value; 1168 T.Range = StringRef(Current, 1); 1169 skip(1); 1170 TokenQueue.push_back(T); 1171 return true; 1172 } 1173 1174 // Forbidding inlining improves performance by roughly 20%. 1175 // FIXME: Remove once llvm optimizes this to the faster version without hints. 1176 LLVM_ATTRIBUTE_NOINLINE static bool 1177 wasEscaped(StringRef::iterator First, StringRef::iterator Position); 1178 1179 // Returns whether a character at 'Position' was escaped with a leading '\'. 1180 // 'First' specifies the position of the first character in the string. 1181 static bool wasEscaped(StringRef::iterator First, 1182 StringRef::iterator Position) { 1183 assert(Position - 1 >= First); 1184 StringRef::iterator I = Position - 1; 1185 // We calculate the number of consecutive '\'s before the current position 1186 // by iterating backwards through our string. 1187 while (I >= First && *I == '\\') --I; 1188 // (Position - 1 - I) now contains the number of '\'s before the current 1189 // position. If it is odd, the character at 'Position' was escaped. 1190 return (Position - 1 - I) % 2 == 1; 1191 } 1192 1193 bool Scanner::scanFlowScalar(bool IsDoubleQuoted) { 1194 StringRef::iterator Start = Current; 1195 unsigned ColStart = Column; 1196 if (IsDoubleQuoted) { 1197 do { 1198 ++Current; 1199 while (Current != End && *Current != '"') 1200 ++Current; 1201 // Repeat until the previous character was not a '\' or was an escaped 1202 // backslash. 1203 } while ( Current != End 1204 && *(Current - 1) == '\\' 1205 && wasEscaped(Start + 1, Current)); 1206 } else { 1207 skip(1); 1208 while (true) { 1209 // Skip a ' followed by another '. 1210 if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') { 1211 skip(2); 1212 continue; 1213 } else if (*Current == '\'') 1214 break; 1215 StringRef::iterator i = skip_nb_char(Current); 1216 if (i == Current) { 1217 i = skip_b_break(Current); 1218 if (i == Current) 1219 break; 1220 Current = i; 1221 Column = 0; 1222 ++Line; 1223 } else { 1224 if (i == End) 1225 break; 1226 Current = i; 1227 ++Column; 1228 } 1229 } 1230 } 1231 1232 if (Current == End) { 1233 setError("Expected quote at end of scalar", Current); 1234 return false; 1235 } 1236 1237 skip(1); // Skip ending quote. 1238 Token T; 1239 T.Kind = Token::TK_Scalar; 1240 T.Range = StringRef(Start, Current - Start); 1241 TokenQueue.push_back(T); 1242 1243 saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); 1244 1245 IsSimpleKeyAllowed = false; 1246 1247 return true; 1248 } 1249 1250 bool Scanner::scanPlainScalar() { 1251 StringRef::iterator Start = Current; 1252 unsigned ColStart = Column; 1253 unsigned LeadingBlanks = 0; 1254 assert(Indent >= -1 && "Indent must be >= -1 !"); 1255 unsigned indent = static_cast<unsigned>(Indent + 1); 1256 while (true) { 1257 if (*Current == '#') 1258 break; 1259 1260 while (!isBlankOrBreak(Current)) { 1261 if ( FlowLevel && *Current == ':' 1262 && !(isBlankOrBreak(Current + 1) || *(Current + 1) == ',')) { 1263 setError("Found unexpected ':' while scanning a plain scalar", Current); 1264 return false; 1265 } 1266 1267 // Check for the end of the plain scalar. 1268 if ( (*Current == ':' && isBlankOrBreak(Current + 1)) 1269 || ( FlowLevel 1270 && (StringRef(Current, 1).find_first_of(",:?[]{}") 1271 != StringRef::npos))) 1272 break; 1273 1274 StringRef::iterator i = skip_nb_char(Current); 1275 if (i == Current) 1276 break; 1277 Current = i; 1278 ++Column; 1279 } 1280 1281 // Are we at the end? 1282 if (!isBlankOrBreak(Current)) 1283 break; 1284 1285 // Eat blanks. 1286 StringRef::iterator Tmp = Current; 1287 while (isBlankOrBreak(Tmp)) { 1288 StringRef::iterator i = skip_s_white(Tmp); 1289 if (i != Tmp) { 1290 if (LeadingBlanks && (Column < indent) && *Tmp == '\t') { 1291 setError("Found invalid tab character in indentation", Tmp); 1292 return false; 1293 } 1294 Tmp = i; 1295 ++Column; 1296 } else { 1297 i = skip_b_break(Tmp); 1298 if (!LeadingBlanks) 1299 LeadingBlanks = 1; 1300 Tmp = i; 1301 Column = 0; 1302 ++Line; 1303 } 1304 } 1305 1306 if (!FlowLevel && Column < indent) 1307 break; 1308 1309 Current = Tmp; 1310 } 1311 if (Start == Current) { 1312 setError("Got empty plain scalar", Start); 1313 return false; 1314 } 1315 Token T; 1316 T.Kind = Token::TK_Scalar; 1317 T.Range = StringRef(Start, Current - Start); 1318 TokenQueue.push_back(T); 1319 1320 // Plain scalars can be simple keys. 1321 saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); 1322 1323 IsSimpleKeyAllowed = false; 1324 1325 return true; 1326 } 1327 1328 bool Scanner::scanAliasOrAnchor(bool IsAlias) { 1329 StringRef::iterator Start = Current; 1330 unsigned ColStart = Column; 1331 skip(1); 1332 while(true) { 1333 if ( *Current == '[' || *Current == ']' 1334 || *Current == '{' || *Current == '}' 1335 || *Current == ',' 1336 || *Current == ':') 1337 break; 1338 StringRef::iterator i = skip_ns_char(Current); 1339 if (i == Current) 1340 break; 1341 Current = i; 1342 ++Column; 1343 } 1344 1345 if (Start == Current) { 1346 setError("Got empty alias or anchor", Start); 1347 return false; 1348 } 1349 1350 Token T; 1351 T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor; 1352 T.Range = StringRef(Start, Current - Start); 1353 TokenQueue.push_back(T); 1354 1355 // Alias and anchors can be simple keys. 1356 saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); 1357 1358 IsSimpleKeyAllowed = false; 1359 1360 return true; 1361 } 1362 1363 bool Scanner::scanBlockScalar(bool IsLiteral) { 1364 StringRef::iterator Start = Current; 1365 skip(1); // Eat | or > 1366 while(true) { 1367 StringRef::iterator i = skip_nb_char(Current); 1368 if (i == Current) { 1369 if (Column == 0) 1370 break; 1371 i = skip_b_break(Current); 1372 if (i != Current) { 1373 // We got a line break. 1374 Column = 0; 1375 ++Line; 1376 Current = i; 1377 continue; 1378 } else { 1379 // There was an error, which should already have been printed out. 1380 return false; 1381 } 1382 } 1383 Current = i; 1384 ++Column; 1385 } 1386 1387 if (Start == Current) { 1388 setError("Got empty block scalar", Start); 1389 return false; 1390 } 1391 1392 Token T; 1393 T.Kind = Token::TK_Scalar; 1394 T.Range = StringRef(Start, Current - Start); 1395 TokenQueue.push_back(T); 1396 return true; 1397 } 1398 1399 bool Scanner::scanTag() { 1400 StringRef::iterator Start = Current; 1401 unsigned ColStart = Column; 1402 skip(1); // Eat !. 1403 if (Current == End || isBlankOrBreak(Current)); // An empty tag. 1404 else if (*Current == '<') { 1405 skip(1); 1406 scan_ns_uri_char(); 1407 if (!consume('>')) 1408 return false; 1409 } else { 1410 // FIXME: Actually parse the c-ns-shorthand-tag rule. 1411 Current = skip_while(&Scanner::skip_ns_char, Current); 1412 } 1413 1414 Token T; 1415 T.Kind = Token::TK_Tag; 1416 T.Range = StringRef(Start, Current - Start); 1417 TokenQueue.push_back(T); 1418 1419 // Tags can be simple keys. 1420 saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); 1421 1422 IsSimpleKeyAllowed = false; 1423 1424 return true; 1425 } 1426 1427 bool Scanner::fetchMoreTokens() { 1428 if (IsStartOfStream) 1429 return scanStreamStart(); 1430 1431 scanToNextToken(); 1432 1433 if (Current == End) 1434 return scanStreamEnd(); 1435 1436 removeStaleSimpleKeyCandidates(); 1437 1438 unrollIndent(Column); 1439 1440 if (Column == 0 && *Current == '%') 1441 return scanDirective(); 1442 1443 if (Column == 0 && Current + 4 <= End 1444 && *Current == '-' 1445 && *(Current + 1) == '-' 1446 && *(Current + 2) == '-' 1447 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1448 return scanDocumentIndicator(true); 1449 1450 if (Column == 0 && Current + 4 <= End 1451 && *Current == '.' 1452 && *(Current + 1) == '.' 1453 && *(Current + 2) == '.' 1454 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1455 return scanDocumentIndicator(false); 1456 1457 if (*Current == '[') 1458 return scanFlowCollectionStart(true); 1459 1460 if (*Current == '{') 1461 return scanFlowCollectionStart(false); 1462 1463 if (*Current == ']') 1464 return scanFlowCollectionEnd(true); 1465 1466 if (*Current == '}') 1467 return scanFlowCollectionEnd(false); 1468 1469 if (*Current == ',') 1470 return scanFlowEntry(); 1471 1472 if (*Current == '-' && isBlankOrBreak(Current + 1)) 1473 return scanBlockEntry(); 1474 1475 if (*Current == '?' && (FlowLevel || isBlankOrBreak(Current + 1))) 1476 return scanKey(); 1477 1478 if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1))) 1479 return scanValue(); 1480 1481 if (*Current == '*') 1482 return scanAliasOrAnchor(true); 1483 1484 if (*Current == '&') 1485 return scanAliasOrAnchor(false); 1486 1487 if (*Current == '!') 1488 return scanTag(); 1489 1490 if (*Current == '|' && !FlowLevel) 1491 return scanBlockScalar(true); 1492 1493 if (*Current == '>' && !FlowLevel) 1494 return scanBlockScalar(false); 1495 1496 if (*Current == '\'') 1497 return scanFlowScalar(false); 1498 1499 if (*Current == '"') 1500 return scanFlowScalar(true); 1501 1502 // Get a plain scalar. 1503 StringRef FirstChar(Current, 1); 1504 if (!(isBlankOrBreak(Current) 1505 || FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos) 1506 || (*Current == '-' && !isBlankOrBreak(Current + 1)) 1507 || (!FlowLevel && (*Current == '?' || *Current == ':') 1508 && isBlankOrBreak(Current + 1)) 1509 || (!FlowLevel && *Current == ':' 1510 && Current + 2 < End 1511 && *(Current + 1) == ':' 1512 && !isBlankOrBreak(Current + 2))) 1513 return scanPlainScalar(); 1514 1515 setError("Unrecognized character while tokenizing."); 1516 return false; 1517 } 1518 1519 Stream::Stream(StringRef Input, SourceMgr &SM) 1520 : scanner(new Scanner(Input, SM)), CurrentDoc() {} 1521 1522 Stream::Stream(std::unique_ptr<MemoryBuffer> InputBuffer, SourceMgr &SM) 1523 : scanner(new Scanner(std::move(InputBuffer), SM)), CurrentDoc() {} 1524 1525 Stream::~Stream() {} 1526 1527 bool Stream::failed() { return scanner->failed(); } 1528 1529 void Stream::printError(Node *N, const Twine &Msg) { 1530 SmallVector<SMRange, 1> Ranges; 1531 Ranges.push_back(N->getSourceRange()); 1532 scanner->printError( N->getSourceRange().Start 1533 , SourceMgr::DK_Error 1534 , Msg 1535 , Ranges); 1536 } 1537 1538 document_iterator Stream::begin() { 1539 if (CurrentDoc) 1540 report_fatal_error("Can only iterate over the stream once"); 1541 1542 // Skip Stream-Start. 1543 scanner->getNext(); 1544 1545 CurrentDoc.reset(new Document(*this)); 1546 return document_iterator(CurrentDoc); 1547 } 1548 1549 document_iterator Stream::end() { 1550 return document_iterator(); 1551 } 1552 1553 void Stream::skip() { 1554 for (document_iterator i = begin(), e = end(); i != e; ++i) 1555 i->skip(); 1556 } 1557 1558 Node::Node(unsigned int Type, std::unique_ptr<Document> &D, StringRef A, 1559 StringRef T) 1560 : Doc(D), TypeID(Type), Anchor(A), Tag(T) { 1561 SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin()); 1562 SourceRange = SMRange(Start, Start); 1563 } 1564 1565 std::string Node::getVerbatimTag() const { 1566 StringRef Raw = getRawTag(); 1567 if (!Raw.empty() && Raw != "!") { 1568 std::string Ret; 1569 if (Raw.find_last_of('!') == 0) { 1570 Ret = Doc->getTagMap().find("!")->second; 1571 Ret += Raw.substr(1); 1572 return std::move(Ret); 1573 } else if (Raw.startswith("!!")) { 1574 Ret = Doc->getTagMap().find("!!")->second; 1575 Ret += Raw.substr(2); 1576 return std::move(Ret); 1577 } else { 1578 StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1); 1579 std::map<StringRef, StringRef>::const_iterator It = 1580 Doc->getTagMap().find(TagHandle); 1581 if (It != Doc->getTagMap().end()) 1582 Ret = It->second; 1583 else { 1584 Token T; 1585 T.Kind = Token::TK_Tag; 1586 T.Range = TagHandle; 1587 setError(Twine("Unknown tag handle ") + TagHandle, T); 1588 } 1589 Ret += Raw.substr(Raw.find_last_of('!') + 1); 1590 return std::move(Ret); 1591 } 1592 } 1593 1594 switch (getType()) { 1595 case NK_Null: 1596 return "tag:yaml.org,2002:null"; 1597 case NK_Scalar: 1598 // TODO: Tag resolution. 1599 return "tag:yaml.org,2002:str"; 1600 case NK_Mapping: 1601 return "tag:yaml.org,2002:map"; 1602 case NK_Sequence: 1603 return "tag:yaml.org,2002:seq"; 1604 } 1605 1606 return ""; 1607 } 1608 1609 Token &Node::peekNext() { 1610 return Doc->peekNext(); 1611 } 1612 1613 Token Node::getNext() { 1614 return Doc->getNext(); 1615 } 1616 1617 Node *Node::parseBlockNode() { 1618 return Doc->parseBlockNode(); 1619 } 1620 1621 BumpPtrAllocator &Node::getAllocator() { 1622 return Doc->NodeAllocator; 1623 } 1624 1625 void Node::setError(const Twine &Msg, Token &Tok) const { 1626 Doc->setError(Msg, Tok); 1627 } 1628 1629 bool Node::failed() const { 1630 return Doc->failed(); 1631 } 1632 1633 1634 1635 StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const { 1636 // TODO: Handle newlines properly. We need to remove leading whitespace. 1637 if (Value[0] == '"') { // Double quoted. 1638 // Pull off the leading and trailing "s. 1639 StringRef UnquotedValue = Value.substr(1, Value.size() - 2); 1640 // Search for characters that would require unescaping the value. 1641 StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n"); 1642 if (i != StringRef::npos) 1643 return unescapeDoubleQuoted(UnquotedValue, i, Storage); 1644 return UnquotedValue; 1645 } else if (Value[0] == '\'') { // Single quoted. 1646 // Pull off the leading and trailing 's. 1647 StringRef UnquotedValue = Value.substr(1, Value.size() - 2); 1648 StringRef::size_type i = UnquotedValue.find('\''); 1649 if (i != StringRef::npos) { 1650 // We're going to need Storage. 1651 Storage.clear(); 1652 Storage.reserve(UnquotedValue.size()); 1653 for (; i != StringRef::npos; i = UnquotedValue.find('\'')) { 1654 StringRef Valid(UnquotedValue.begin(), i); 1655 Storage.insert(Storage.end(), Valid.begin(), Valid.end()); 1656 Storage.push_back('\''); 1657 UnquotedValue = UnquotedValue.substr(i + 2); 1658 } 1659 Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); 1660 return StringRef(Storage.begin(), Storage.size()); 1661 } 1662 return UnquotedValue; 1663 } 1664 // Plain or block. 1665 return Value.rtrim(" "); 1666 } 1667 1668 StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue 1669 , StringRef::size_type i 1670 , SmallVectorImpl<char> &Storage) 1671 const { 1672 // Use Storage to build proper value. 1673 Storage.clear(); 1674 Storage.reserve(UnquotedValue.size()); 1675 for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) { 1676 // Insert all previous chars into Storage. 1677 StringRef Valid(UnquotedValue.begin(), i); 1678 Storage.insert(Storage.end(), Valid.begin(), Valid.end()); 1679 // Chop off inserted chars. 1680 UnquotedValue = UnquotedValue.substr(i); 1681 1682 assert(!UnquotedValue.empty() && "Can't be empty!"); 1683 1684 // Parse escape or line break. 1685 switch (UnquotedValue[0]) { 1686 case '\r': 1687 case '\n': 1688 Storage.push_back('\n'); 1689 if ( UnquotedValue.size() > 1 1690 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 1691 UnquotedValue = UnquotedValue.substr(1); 1692 UnquotedValue = UnquotedValue.substr(1); 1693 break; 1694 default: 1695 if (UnquotedValue.size() == 1) 1696 // TODO: Report error. 1697 break; 1698 UnquotedValue = UnquotedValue.substr(1); 1699 switch (UnquotedValue[0]) { 1700 default: { 1701 Token T; 1702 T.Range = StringRef(UnquotedValue.begin(), 1); 1703 setError("Unrecognized escape code!", T); 1704 return ""; 1705 } 1706 case '\r': 1707 case '\n': 1708 // Remove the new line. 1709 if ( UnquotedValue.size() > 1 1710 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 1711 UnquotedValue = UnquotedValue.substr(1); 1712 // If this was just a single byte newline, it will get skipped 1713 // below. 1714 break; 1715 case '0': 1716 Storage.push_back(0x00); 1717 break; 1718 case 'a': 1719 Storage.push_back(0x07); 1720 break; 1721 case 'b': 1722 Storage.push_back(0x08); 1723 break; 1724 case 't': 1725 case 0x09: 1726 Storage.push_back(0x09); 1727 break; 1728 case 'n': 1729 Storage.push_back(0x0A); 1730 break; 1731 case 'v': 1732 Storage.push_back(0x0B); 1733 break; 1734 case 'f': 1735 Storage.push_back(0x0C); 1736 break; 1737 case 'r': 1738 Storage.push_back(0x0D); 1739 break; 1740 case 'e': 1741 Storage.push_back(0x1B); 1742 break; 1743 case ' ': 1744 Storage.push_back(0x20); 1745 break; 1746 case '"': 1747 Storage.push_back(0x22); 1748 break; 1749 case '/': 1750 Storage.push_back(0x2F); 1751 break; 1752 case '\\': 1753 Storage.push_back(0x5C); 1754 break; 1755 case 'N': 1756 encodeUTF8(0x85, Storage); 1757 break; 1758 case '_': 1759 encodeUTF8(0xA0, Storage); 1760 break; 1761 case 'L': 1762 encodeUTF8(0x2028, Storage); 1763 break; 1764 case 'P': 1765 encodeUTF8(0x2029, Storage); 1766 break; 1767 case 'x': { 1768 if (UnquotedValue.size() < 3) 1769 // TODO: Report error. 1770 break; 1771 unsigned int UnicodeScalarValue; 1772 if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue)) 1773 // TODO: Report error. 1774 UnicodeScalarValue = 0xFFFD; 1775 encodeUTF8(UnicodeScalarValue, Storage); 1776 UnquotedValue = UnquotedValue.substr(2); 1777 break; 1778 } 1779 case 'u': { 1780 if (UnquotedValue.size() < 5) 1781 // TODO: Report error. 1782 break; 1783 unsigned int UnicodeScalarValue; 1784 if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue)) 1785 // TODO: Report error. 1786 UnicodeScalarValue = 0xFFFD; 1787 encodeUTF8(UnicodeScalarValue, Storage); 1788 UnquotedValue = UnquotedValue.substr(4); 1789 break; 1790 } 1791 case 'U': { 1792 if (UnquotedValue.size() < 9) 1793 // TODO: Report error. 1794 break; 1795 unsigned int UnicodeScalarValue; 1796 if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue)) 1797 // TODO: Report error. 1798 UnicodeScalarValue = 0xFFFD; 1799 encodeUTF8(UnicodeScalarValue, Storage); 1800 UnquotedValue = UnquotedValue.substr(8); 1801 break; 1802 } 1803 } 1804 UnquotedValue = UnquotedValue.substr(1); 1805 } 1806 } 1807 Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); 1808 return StringRef(Storage.begin(), Storage.size()); 1809 } 1810 1811 Node *KeyValueNode::getKey() { 1812 if (Key) 1813 return Key; 1814 // Handle implicit null keys. 1815 { 1816 Token &t = peekNext(); 1817 if ( t.Kind == Token::TK_BlockEnd 1818 || t.Kind == Token::TK_Value 1819 || t.Kind == Token::TK_Error) { 1820 return Key = new (getAllocator()) NullNode(Doc); 1821 } 1822 if (t.Kind == Token::TK_Key) 1823 getNext(); // skip TK_Key. 1824 } 1825 1826 // Handle explicit null keys. 1827 Token &t = peekNext(); 1828 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) { 1829 return Key = new (getAllocator()) NullNode(Doc); 1830 } 1831 1832 // We've got a normal key. 1833 return Key = parseBlockNode(); 1834 } 1835 1836 Node *KeyValueNode::getValue() { 1837 if (Value) 1838 return Value; 1839 getKey()->skip(); 1840 if (failed()) 1841 return Value = new (getAllocator()) NullNode(Doc); 1842 1843 // Handle implicit null values. 1844 { 1845 Token &t = peekNext(); 1846 if ( t.Kind == Token::TK_BlockEnd 1847 || t.Kind == Token::TK_FlowMappingEnd 1848 || t.Kind == Token::TK_Key 1849 || t.Kind == Token::TK_FlowEntry 1850 || t.Kind == Token::TK_Error) { 1851 return Value = new (getAllocator()) NullNode(Doc); 1852 } 1853 1854 if (t.Kind != Token::TK_Value) { 1855 setError("Unexpected token in Key Value.", t); 1856 return Value = new (getAllocator()) NullNode(Doc); 1857 } 1858 getNext(); // skip TK_Value. 1859 } 1860 1861 // Handle explicit null values. 1862 Token &t = peekNext(); 1863 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) { 1864 return Value = new (getAllocator()) NullNode(Doc); 1865 } 1866 1867 // We got a normal value. 1868 return Value = parseBlockNode(); 1869 } 1870 1871 void MappingNode::increment() { 1872 if (failed()) { 1873 IsAtEnd = true; 1874 CurrentEntry = nullptr; 1875 return; 1876 } 1877 if (CurrentEntry) { 1878 CurrentEntry->skip(); 1879 if (Type == MT_Inline) { 1880 IsAtEnd = true; 1881 CurrentEntry = nullptr; 1882 return; 1883 } 1884 } 1885 Token T = peekNext(); 1886 if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) { 1887 // KeyValueNode eats the TK_Key. That way it can detect null keys. 1888 CurrentEntry = new (getAllocator()) KeyValueNode(Doc); 1889 } else if (Type == MT_Block) { 1890 switch (T.Kind) { 1891 case Token::TK_BlockEnd: 1892 getNext(); 1893 IsAtEnd = true; 1894 CurrentEntry = nullptr; 1895 break; 1896 default: 1897 setError("Unexpected token. Expected Key or Block End", T); 1898 case Token::TK_Error: 1899 IsAtEnd = true; 1900 CurrentEntry = nullptr; 1901 } 1902 } else { 1903 switch (T.Kind) { 1904 case Token::TK_FlowEntry: 1905 // Eat the flow entry and recurse. 1906 getNext(); 1907 return increment(); 1908 case Token::TK_FlowMappingEnd: 1909 getNext(); 1910 case Token::TK_Error: 1911 // Set this to end iterator. 1912 IsAtEnd = true; 1913 CurrentEntry = nullptr; 1914 break; 1915 default: 1916 setError( "Unexpected token. Expected Key, Flow Entry, or Flow " 1917 "Mapping End." 1918 , T); 1919 IsAtEnd = true; 1920 CurrentEntry = nullptr; 1921 } 1922 } 1923 } 1924 1925 void SequenceNode::increment() { 1926 if (failed()) { 1927 IsAtEnd = true; 1928 CurrentEntry = nullptr; 1929 return; 1930 } 1931 if (CurrentEntry) 1932 CurrentEntry->skip(); 1933 Token T = peekNext(); 1934 if (SeqType == ST_Block) { 1935 switch (T.Kind) { 1936 case Token::TK_BlockEntry: 1937 getNext(); 1938 CurrentEntry = parseBlockNode(); 1939 if (!CurrentEntry) { // An error occurred. 1940 IsAtEnd = true; 1941 CurrentEntry = nullptr; 1942 } 1943 break; 1944 case Token::TK_BlockEnd: 1945 getNext(); 1946 IsAtEnd = true; 1947 CurrentEntry = nullptr; 1948 break; 1949 default: 1950 setError( "Unexpected token. Expected Block Entry or Block End." 1951 , T); 1952 case Token::TK_Error: 1953 IsAtEnd = true; 1954 CurrentEntry = nullptr; 1955 } 1956 } else if (SeqType == ST_Indentless) { 1957 switch (T.Kind) { 1958 case Token::TK_BlockEntry: 1959 getNext(); 1960 CurrentEntry = parseBlockNode(); 1961 if (!CurrentEntry) { // An error occurred. 1962 IsAtEnd = true; 1963 CurrentEntry = nullptr; 1964 } 1965 break; 1966 default: 1967 case Token::TK_Error: 1968 IsAtEnd = true; 1969 CurrentEntry = nullptr; 1970 } 1971 } else if (SeqType == ST_Flow) { 1972 switch (T.Kind) { 1973 case Token::TK_FlowEntry: 1974 // Eat the flow entry and recurse. 1975 getNext(); 1976 WasPreviousTokenFlowEntry = true; 1977 return increment(); 1978 case Token::TK_FlowSequenceEnd: 1979 getNext(); 1980 case Token::TK_Error: 1981 // Set this to end iterator. 1982 IsAtEnd = true; 1983 CurrentEntry = nullptr; 1984 break; 1985 case Token::TK_StreamEnd: 1986 case Token::TK_DocumentEnd: 1987 case Token::TK_DocumentStart: 1988 setError("Could not find closing ]!", T); 1989 // Set this to end iterator. 1990 IsAtEnd = true; 1991 CurrentEntry = nullptr; 1992 break; 1993 default: 1994 if (!WasPreviousTokenFlowEntry) { 1995 setError("Expected , between entries!", T); 1996 IsAtEnd = true; 1997 CurrentEntry = nullptr; 1998 break; 1999 } 2000 // Otherwise it must be a flow entry. 2001 CurrentEntry = parseBlockNode(); 2002 if (!CurrentEntry) { 2003 IsAtEnd = true; 2004 } 2005 WasPreviousTokenFlowEntry = false; 2006 break; 2007 } 2008 } 2009 } 2010 2011 Document::Document(Stream &S) : stream(S), Root(nullptr) { 2012 // Tag maps starts with two default mappings. 2013 TagMap["!"] = "!"; 2014 TagMap["!!"] = "tag:yaml.org,2002:"; 2015 2016 if (parseDirectives()) 2017 expectToken(Token::TK_DocumentStart); 2018 Token &T = peekNext(); 2019 if (T.Kind == Token::TK_DocumentStart) 2020 getNext(); 2021 } 2022 2023 bool Document::skip() { 2024 if (stream.scanner->failed()) 2025 return false; 2026 if (!Root) 2027 getRoot(); 2028 Root->skip(); 2029 Token &T = peekNext(); 2030 if (T.Kind == Token::TK_StreamEnd) 2031 return false; 2032 if (T.Kind == Token::TK_DocumentEnd) { 2033 getNext(); 2034 return skip(); 2035 } 2036 return true; 2037 } 2038 2039 Token &Document::peekNext() { 2040 return stream.scanner->peekNext(); 2041 } 2042 2043 Token Document::getNext() { 2044 return stream.scanner->getNext(); 2045 } 2046 2047 void Document::setError(const Twine &Message, Token &Location) const { 2048 stream.scanner->setError(Message, Location.Range.begin()); 2049 } 2050 2051 bool Document::failed() const { 2052 return stream.scanner->failed(); 2053 } 2054 2055 Node *Document::parseBlockNode() { 2056 Token T = peekNext(); 2057 // Handle properties. 2058 Token AnchorInfo; 2059 Token TagInfo; 2060 parse_property: 2061 switch (T.Kind) { 2062 case Token::TK_Alias: 2063 getNext(); 2064 return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1)); 2065 case Token::TK_Anchor: 2066 if (AnchorInfo.Kind == Token::TK_Anchor) { 2067 setError("Already encountered an anchor for this node!", T); 2068 return nullptr; 2069 } 2070 AnchorInfo = getNext(); // Consume TK_Anchor. 2071 T = peekNext(); 2072 goto parse_property; 2073 case Token::TK_Tag: 2074 if (TagInfo.Kind == Token::TK_Tag) { 2075 setError("Already encountered a tag for this node!", T); 2076 return nullptr; 2077 } 2078 TagInfo = getNext(); // Consume TK_Tag. 2079 T = peekNext(); 2080 goto parse_property; 2081 default: 2082 break; 2083 } 2084 2085 switch (T.Kind) { 2086 case Token::TK_BlockEntry: 2087 // We got an unindented BlockEntry sequence. This is not terminated with 2088 // a BlockEnd. 2089 // Don't eat the TK_BlockEntry, SequenceNode needs it. 2090 return new (NodeAllocator) SequenceNode( stream.CurrentDoc 2091 , AnchorInfo.Range.substr(1) 2092 , TagInfo.Range 2093 , SequenceNode::ST_Indentless); 2094 case Token::TK_BlockSequenceStart: 2095 getNext(); 2096 return new (NodeAllocator) 2097 SequenceNode( stream.CurrentDoc 2098 , AnchorInfo.Range.substr(1) 2099 , TagInfo.Range 2100 , SequenceNode::ST_Block); 2101 case Token::TK_BlockMappingStart: 2102 getNext(); 2103 return new (NodeAllocator) 2104 MappingNode( stream.CurrentDoc 2105 , AnchorInfo.Range.substr(1) 2106 , TagInfo.Range 2107 , MappingNode::MT_Block); 2108 case Token::TK_FlowSequenceStart: 2109 getNext(); 2110 return new (NodeAllocator) 2111 SequenceNode( stream.CurrentDoc 2112 , AnchorInfo.Range.substr(1) 2113 , TagInfo.Range 2114 , SequenceNode::ST_Flow); 2115 case Token::TK_FlowMappingStart: 2116 getNext(); 2117 return new (NodeAllocator) 2118 MappingNode( stream.CurrentDoc 2119 , AnchorInfo.Range.substr(1) 2120 , TagInfo.Range 2121 , MappingNode::MT_Flow); 2122 case Token::TK_Scalar: 2123 getNext(); 2124 return new (NodeAllocator) 2125 ScalarNode( stream.CurrentDoc 2126 , AnchorInfo.Range.substr(1) 2127 , TagInfo.Range 2128 , T.Range); 2129 case Token::TK_Key: 2130 // Don't eat the TK_Key, KeyValueNode expects it. 2131 return new (NodeAllocator) 2132 MappingNode( stream.CurrentDoc 2133 , AnchorInfo.Range.substr(1) 2134 , TagInfo.Range 2135 , MappingNode::MT_Inline); 2136 case Token::TK_DocumentStart: 2137 case Token::TK_DocumentEnd: 2138 case Token::TK_StreamEnd: 2139 default: 2140 // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not 2141 // !!null null. 2142 return new (NodeAllocator) NullNode(stream.CurrentDoc); 2143 case Token::TK_Error: 2144 return nullptr; 2145 } 2146 llvm_unreachable("Control flow shouldn't reach here."); 2147 return nullptr; 2148 } 2149 2150 bool Document::parseDirectives() { 2151 bool isDirective = false; 2152 while (true) { 2153 Token T = peekNext(); 2154 if (T.Kind == Token::TK_TagDirective) { 2155 parseTAGDirective(); 2156 isDirective = true; 2157 } else if (T.Kind == Token::TK_VersionDirective) { 2158 parseYAMLDirective(); 2159 isDirective = true; 2160 } else 2161 break; 2162 } 2163 return isDirective; 2164 } 2165 2166 void Document::parseYAMLDirective() { 2167 getNext(); // Eat %YAML <version> 2168 } 2169 2170 void Document::parseTAGDirective() { 2171 Token Tag = getNext(); // %TAG <handle> <prefix> 2172 StringRef T = Tag.Range; 2173 // Strip %TAG 2174 T = T.substr(T.find_first_of(" \t")).ltrim(" \t"); 2175 std::size_t HandleEnd = T.find_first_of(" \t"); 2176 StringRef TagHandle = T.substr(0, HandleEnd); 2177 StringRef TagPrefix = T.substr(HandleEnd).ltrim(" \t"); 2178 TagMap[TagHandle] = TagPrefix; 2179 } 2180 2181 bool Document::expectToken(int TK) { 2182 Token T = getNext(); 2183 if (T.Kind != TK) { 2184 setError("Unexpected token", T); 2185 return false; 2186 } 2187 return true; 2188 } 2189