//===--- YAMLParser.cpp - Simple YAML parser ------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//  This file implements a YAML parser.
//
//===----------------------------------------------------------------------===//

#include "llvm/Support/YAMLParser.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/ilist.h"
#include "llvm/ADT/ilist_node.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;
using namespace yaml;

/// UnicodeEncodingForm - The Unicode encoding forms that can be reported for
///                       an input (see EncodingInfo below).
enum UnicodeEncodingForm {
  UEF_UTF32_LE, ///< UTF-32 Little Endian
  UEF_UTF32_BE, ///< UTF-32 Big Endian
  UEF_UTF16_LE, ///< UTF-16 Little Endian
  UEF_UTF16_BE, ///< UTF-16 Big Endian
  UEF_UTF8,     ///< UTF-8 or ascii.
  UEF_Unknown   ///< Not a valid Unicode encoding.
};

/// EncodingInfo - Holds the encoding type and length of the byte order mark if
///                it exists. Length is in {0, 2, 3, 4}.
typedef std::pair<UnicodeEncodingForm, unsigned> EncodingInfo;

/// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode
///                      encoding form of \a Input.
///
/// @param Input A string of length 0 or more.
/// @returns An EncodingInfo indicating the Unicode encoding form of the input
///          and how long the byte order mark is if one exists.
47 static EncodingInfo getUnicodeEncoding(StringRef Input) { 48 if (Input.size() == 0) 49 return std::make_pair(UEF_Unknown, 0); 50 51 switch (uint8_t(Input[0])) { 52 case 0x00: 53 if (Input.size() >= 4) { 54 if ( Input[1] == 0 55 && uint8_t(Input[2]) == 0xFE 56 && uint8_t(Input[3]) == 0xFF) 57 return std::make_pair(UEF_UTF32_BE, 4); 58 if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0) 59 return std::make_pair(UEF_UTF32_BE, 0); 60 } 61 62 if (Input.size() >= 2 && Input[1] != 0) 63 return std::make_pair(UEF_UTF16_BE, 0); 64 return std::make_pair(UEF_Unknown, 0); 65 case 0xFF: 66 if ( Input.size() >= 4 67 && uint8_t(Input[1]) == 0xFE 68 && Input[2] == 0 69 && Input[3] == 0) 70 return std::make_pair(UEF_UTF32_LE, 4); 71 72 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE) 73 return std::make_pair(UEF_UTF16_LE, 2); 74 return std::make_pair(UEF_Unknown, 0); 75 case 0xFE: 76 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF) 77 return std::make_pair(UEF_UTF16_BE, 2); 78 return std::make_pair(UEF_Unknown, 0); 79 case 0xEF: 80 if ( Input.size() >= 3 81 && uint8_t(Input[1]) == 0xBB 82 && uint8_t(Input[2]) == 0xBF) 83 return std::make_pair(UEF_UTF8, 3); 84 return std::make_pair(UEF_Unknown, 0); 85 } 86 87 // It could still be utf-32 or utf-16. 88 if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0) 89 return std::make_pair(UEF_UTF32_LE, 0); 90 91 if (Input.size() >= 2 && Input[1] == 0) 92 return std::make_pair(UEF_UTF16_LE, 0); 93 94 return std::make_pair(UEF_UTF8, 0); 95 } 96 97 namespace llvm { 98 namespace yaml { 99 /// Pin the vtables to this file. 100 void Node::anchor() {} 101 void NullNode::anchor() {} 102 void ScalarNode::anchor() {} 103 void KeyValueNode::anchor() {} 104 void MappingNode::anchor() {} 105 void SequenceNode::anchor() {} 106 void AliasNode::anchor() {} 107 108 /// Token - A single YAML token. 109 struct Token : ilist_node<Token> { 110 enum TokenKind { 111 TK_Error, // Uninitialized token. 
112 TK_StreamStart, 113 TK_StreamEnd, 114 TK_VersionDirective, 115 TK_TagDirective, 116 TK_DocumentStart, 117 TK_DocumentEnd, 118 TK_BlockEntry, 119 TK_BlockEnd, 120 TK_BlockSequenceStart, 121 TK_BlockMappingStart, 122 TK_FlowEntry, 123 TK_FlowSequenceStart, 124 TK_FlowSequenceEnd, 125 TK_FlowMappingStart, 126 TK_FlowMappingEnd, 127 TK_Key, 128 TK_Value, 129 TK_Scalar, 130 TK_Alias, 131 TK_Anchor, 132 TK_Tag 133 } Kind; 134 135 /// A string of length 0 or more whose begin() points to the logical location 136 /// of the token in the input. 137 StringRef Range; 138 139 Token() : Kind(TK_Error) {} 140 }; 141 } 142 } 143 144 namespace llvm { 145 template<> 146 struct ilist_sentinel_traits<Token> { 147 Token *createSentinel() const { 148 return &Sentinel; 149 } 150 static void destroySentinel(Token*) {} 151 152 Token *provideInitialHead() const { return createSentinel(); } 153 Token *ensureHead(Token*) const { return createSentinel(); } 154 static void noteHead(Token*, Token*) {} 155 156 private: 157 mutable Token Sentinel; 158 }; 159 160 template<> 161 struct ilist_node_traits<Token> { 162 Token *createNode(const Token &V) { 163 return new (Alloc.Allocate<Token>()) Token(V); 164 } 165 static void deleteNode(Token *V) {} 166 167 void addNodeToList(Token *) {} 168 void removeNodeFromList(Token *) {} 169 void transferNodesFromList(ilist_node_traits & /*SrcTraits*/, 170 ilist_iterator<Token> /*first*/, 171 ilist_iterator<Token> /*last*/) {} 172 173 BumpPtrAllocator Alloc; 174 }; 175 } 176 177 typedef ilist<Token> TokenQueueT; 178 179 namespace { 180 /// @brief This struct is used to track simple keys. 181 /// 182 /// Simple keys are handled by creating an entry in SimpleKeys for each Token 183 /// which could legally be the start of a simple key. 
When peekNext is called, 184 /// if the Token To be returned is referenced by a SimpleKey, we continue 185 /// tokenizing until that potential simple key has either been found to not be 186 /// a simple key (we moved on to the next line or went further than 1024 chars). 187 /// Or when we run into a Value, and then insert a Key token (and possibly 188 /// others) before the SimpleKey's Tok. 189 struct SimpleKey { 190 TokenQueueT::iterator Tok; 191 unsigned Column; 192 unsigned Line; 193 unsigned FlowLevel; 194 bool IsRequired; 195 196 bool operator ==(const SimpleKey &Other) { 197 return Tok == Other.Tok; 198 } 199 }; 200 } 201 202 /// @brief The Unicode scalar value of a UTF-8 minimal well-formed code unit 203 /// subsequence and the subsequence's length in code units (uint8_t). 204 /// A length of 0 represents an error. 205 typedef std::pair<uint32_t, unsigned> UTF8Decoded; 206 207 static UTF8Decoded decodeUTF8(StringRef Range) { 208 StringRef::iterator Position= Range.begin(); 209 StringRef::iterator End = Range.end(); 210 // 1 byte: [0x00, 0x7f] 211 // Bit pattern: 0xxxxxxx 212 if ((*Position & 0x80) == 0) { 213 return std::make_pair(*Position, 1); 214 } 215 // 2 bytes: [0x80, 0x7ff] 216 // Bit pattern: 110xxxxx 10xxxxxx 217 if (Position + 1 != End && 218 ((*Position & 0xE0) == 0xC0) && 219 ((*(Position + 1) & 0xC0) == 0x80)) { 220 uint32_t codepoint = ((*Position & 0x1F) << 6) | 221 (*(Position + 1) & 0x3F); 222 if (codepoint >= 0x80) 223 return std::make_pair(codepoint, 2); 224 } 225 // 3 bytes: [0x8000, 0xffff] 226 // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx 227 if (Position + 2 != End && 228 ((*Position & 0xF0) == 0xE0) && 229 ((*(Position + 1) & 0xC0) == 0x80) && 230 ((*(Position + 2) & 0xC0) == 0x80)) { 231 uint32_t codepoint = ((*Position & 0x0F) << 12) | 232 ((*(Position + 1) & 0x3F) << 6) | 233 (*(Position + 2) & 0x3F); 234 // Codepoints between 0xD800 and 0xDFFF are invalid, as 235 // they are high / low surrogate halves used by UTF-16. 
236 if (codepoint >= 0x800 && 237 (codepoint < 0xD800 || codepoint > 0xDFFF)) 238 return std::make_pair(codepoint, 3); 239 } 240 // 4 bytes: [0x10000, 0x10FFFF] 241 // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 242 if (Position + 3 != End && 243 ((*Position & 0xF8) == 0xF0) && 244 ((*(Position + 1) & 0xC0) == 0x80) && 245 ((*(Position + 2) & 0xC0) == 0x80) && 246 ((*(Position + 3) & 0xC0) == 0x80)) { 247 uint32_t codepoint = ((*Position & 0x07) << 18) | 248 ((*(Position + 1) & 0x3F) << 12) | 249 ((*(Position + 2) & 0x3F) << 6) | 250 (*(Position + 3) & 0x3F); 251 if (codepoint >= 0x10000 && codepoint <= 0x10FFFF) 252 return std::make_pair(codepoint, 4); 253 } 254 return std::make_pair(0, 0); 255 } 256 257 namespace llvm { 258 namespace yaml { 259 /// @brief Scans YAML tokens from a MemoryBuffer. 260 class Scanner { 261 public: 262 Scanner(StringRef Input, SourceMgr &SM); 263 Scanner(MemoryBufferRef Buffer, SourceMgr &SM_); 264 265 /// @brief Parse the next token and return it without popping it. 266 Token &peekNext(); 267 268 /// @brief Parse the next token and pop it from the queue. 269 Token getNext(); 270 271 void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message, 272 ArrayRef<SMRange> Ranges = None) { 273 SM.PrintMessage(Loc, Kind, Message, Ranges); 274 } 275 276 void setError(const Twine &Message, StringRef::iterator Position) { 277 if (Current >= End) 278 Current = End - 1; 279 280 // Don't print out more errors after the first one we encounter. The rest 281 // are just the result of the first, and have no meaning. 282 if (!Failed) 283 printError(SMLoc::getFromPointer(Current), SourceMgr::DK_Error, Message); 284 Failed = true; 285 } 286 287 void setError(const Twine &Message) { 288 setError(Message, Current); 289 } 290 291 /// @brief Returns true if an error occurred while parsing. 
292 bool failed() { 293 return Failed; 294 } 295 296 private: 297 void init(MemoryBufferRef Buffer); 298 299 StringRef currentInput() { 300 return StringRef(Current, End - Current); 301 } 302 303 /// @brief Decode a UTF-8 minimal well-formed code unit subsequence starting 304 /// at \a Position. 305 /// 306 /// If the UTF-8 code units starting at Position do not form a well-formed 307 /// code unit subsequence, then the Unicode scalar value is 0, and the length 308 /// is 0. 309 UTF8Decoded decodeUTF8(StringRef::iterator Position) { 310 return ::decodeUTF8(StringRef(Position, End - Position)); 311 } 312 313 // The following functions are based on the gramar rules in the YAML spec. The 314 // style of the function names it meant to closely match how they are written 315 // in the spec. The number within the [] is the number of the grammar rule in 316 // the spec. 317 // 318 // See 4.2 [Production Naming Conventions] for the meaning of the prefixes. 319 // 320 // c- 321 // A production starting and ending with a special character. 322 // b- 323 // A production matching a single line break. 324 // nb- 325 // A production starting and ending with a non-break character. 326 // s- 327 // A production starting and ending with a white space character. 328 // ns- 329 // A production starting and ending with a non-space character. 330 // l- 331 // A production matching complete line(s). 332 333 /// @brief Skip a single nb-char[27] starting at Position. 334 /// 335 /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE] 336 /// | [0xFF00-0xFFFD] | [0x10000-0x10FFFF] 337 /// 338 /// @returns The code unit after the nb-char, or Position if it's not an 339 /// nb-char. 340 StringRef::iterator skip_nb_char(StringRef::iterator Position); 341 342 /// @brief Skip a single b-break[28] starting at Position. 343 /// 344 /// A b-break is 0xD 0xA | 0xD | 0xA 345 /// 346 /// @returns The code unit after the b-break, or Position if it's not a 347 /// b-break. 
348 StringRef::iterator skip_b_break(StringRef::iterator Position); 349 350 /// @brief Skip a single s-white[33] starting at Position. 351 /// 352 /// A s-white is 0x20 | 0x9 353 /// 354 /// @returns The code unit after the s-white, or Position if it's not a 355 /// s-white. 356 StringRef::iterator skip_s_white(StringRef::iterator Position); 357 358 /// @brief Skip a single ns-char[34] starting at Position. 359 /// 360 /// A ns-char is nb-char - s-white 361 /// 362 /// @returns The code unit after the ns-char, or Position if it's not a 363 /// ns-char. 364 StringRef::iterator skip_ns_char(StringRef::iterator Position); 365 366 typedef StringRef::iterator (Scanner::*SkipWhileFunc)(StringRef::iterator); 367 /// @brief Skip minimal well-formed code unit subsequences until Func 368 /// returns its input. 369 /// 370 /// @returns The code unit after the last minimal well-formed code unit 371 /// subsequence that Func accepted. 372 StringRef::iterator skip_while( SkipWhileFunc Func 373 , StringRef::iterator Position); 374 375 /// @brief Scan ns-uri-char[39]s starting at Cur. 376 /// 377 /// This updates Cur and Column while scanning. 378 /// 379 /// @returns A StringRef starting at Cur which covers the longest contiguous 380 /// sequence of ns-uri-char. 381 StringRef scan_ns_uri_char(); 382 383 /// @brief Consume a minimal well-formed code unit subsequence starting at 384 /// \a Cur. Return false if it is not the same Unicode scalar value as 385 /// \a Expected. This updates \a Column. 386 bool consume(uint32_t Expected); 387 388 /// @brief Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column. 389 void skip(uint32_t Distance); 390 391 /// @brief Return true if the minimal well-formed code unit subsequence at 392 /// Pos is whitespace or a new line 393 bool isBlankOrBreak(StringRef::iterator Position); 394 395 /// @brief If IsSimpleKeyAllowed, create and push_back a new SimpleKey. 
396 void saveSimpleKeyCandidate( TokenQueueT::iterator Tok 397 , unsigned AtColumn 398 , bool IsRequired); 399 400 /// @brief Remove simple keys that can no longer be valid simple keys. 401 /// 402 /// Invalid simple keys are not on the current line or are further than 1024 403 /// columns back. 404 void removeStaleSimpleKeyCandidates(); 405 406 /// @brief Remove all simple keys on FlowLevel \a Level. 407 void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level); 408 409 /// @brief Unroll indentation in \a Indents back to \a Col. Creates BlockEnd 410 /// tokens if needed. 411 bool unrollIndent(int ToColumn); 412 413 /// @brief Increase indent to \a Col. Creates \a Kind token at \a InsertPoint 414 /// if needed. 415 bool rollIndent( int ToColumn 416 , Token::TokenKind Kind 417 , TokenQueueT::iterator InsertPoint); 418 419 /// @brief Skip whitespace and comments until the start of the next token. 420 void scanToNextToken(); 421 422 /// @brief Must be the first token generated. 423 bool scanStreamStart(); 424 425 /// @brief Generate tokens needed to close out the stream. 426 bool scanStreamEnd(); 427 428 /// @brief Scan a %BLAH directive. 429 bool scanDirective(); 430 431 /// @brief Scan a ... or ---. 432 bool scanDocumentIndicator(bool IsStart); 433 434 /// @brief Scan a [ or { and generate the proper flow collection start token. 435 bool scanFlowCollectionStart(bool IsSequence); 436 437 /// @brief Scan a ] or } and generate the proper flow collection end token. 438 bool scanFlowCollectionEnd(bool IsSequence); 439 440 /// @brief Scan the , that separates entries in a flow collection. 441 bool scanFlowEntry(); 442 443 /// @brief Scan the - that starts block sequence entries. 444 bool scanBlockEntry(); 445 446 /// @brief Scan an explicit ? indicating a key. 447 bool scanKey(); 448 449 /// @brief Scan an explicit : indicating a value. 450 bool scanValue(); 451 452 /// @brief Scan a quoted scalar. 
453 bool scanFlowScalar(bool IsDoubleQuoted); 454 455 /// @brief Scan an unquoted scalar. 456 bool scanPlainScalar(); 457 458 /// @brief Scan an Alias or Anchor starting with * or &. 459 bool scanAliasOrAnchor(bool IsAlias); 460 461 /// @brief Scan a block scalar starting with | or >. 462 bool scanBlockScalar(bool IsLiteral); 463 464 /// @brief Scan a tag of the form !stuff. 465 bool scanTag(); 466 467 /// @brief Dispatch to the next scanning function based on \a *Cur. 468 bool fetchMoreTokens(); 469 470 /// @brief The SourceMgr used for diagnostics and buffer management. 471 SourceMgr &SM; 472 473 /// @brief The original input. 474 MemoryBufferRef InputBuffer; 475 476 /// @brief The current position of the scanner. 477 StringRef::iterator Current; 478 479 /// @brief The end of the input (one past the last character). 480 StringRef::iterator End; 481 482 /// @brief Current YAML indentation level in spaces. 483 int Indent; 484 485 /// @brief Current column number in Unicode code points. 486 unsigned Column; 487 488 /// @brief Current line number. 489 unsigned Line; 490 491 /// @brief How deep we are in flow style containers. 0 Means at block level. 492 unsigned FlowLevel; 493 494 /// @brief Are we at the start of the stream? 495 bool IsStartOfStream; 496 497 /// @brief Can the next token be the start of a simple key? 498 bool IsSimpleKeyAllowed; 499 500 /// @brief True if an error has occurred. 501 bool Failed; 502 503 /// @brief Queue of tokens. This is required to queue up tokens while looking 504 /// for the end of a simple key. And for cases where a single character 505 /// can produce multiple tokens (e.g. BlockEnd). 506 TokenQueueT TokenQueue; 507 508 /// @brief Indentation levels. 509 SmallVector<int, 4> Indents; 510 511 /// @brief Potential simple keys. 512 SmallVector<SimpleKey, 4> SimpleKeys; 513 }; 514 515 } // end namespace yaml 516 } // end namespace llvm 517 518 /// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result. 
519 static void encodeUTF8( uint32_t UnicodeScalarValue 520 , SmallVectorImpl<char> &Result) { 521 if (UnicodeScalarValue <= 0x7F) { 522 Result.push_back(UnicodeScalarValue & 0x7F); 523 } else if (UnicodeScalarValue <= 0x7FF) { 524 uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6); 525 uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F); 526 Result.push_back(FirstByte); 527 Result.push_back(SecondByte); 528 } else if (UnicodeScalarValue <= 0xFFFF) { 529 uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12); 530 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 531 uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F); 532 Result.push_back(FirstByte); 533 Result.push_back(SecondByte); 534 Result.push_back(ThirdByte); 535 } else if (UnicodeScalarValue <= 0x10FFFF) { 536 uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18); 537 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12); 538 uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 539 uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F); 540 Result.push_back(FirstByte); 541 Result.push_back(SecondByte); 542 Result.push_back(ThirdByte); 543 Result.push_back(FourthByte); 544 } 545 } 546 547 bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) { 548 SourceMgr SM; 549 Scanner scanner(Input, SM); 550 while (true) { 551 Token T = scanner.getNext(); 552 switch (T.Kind) { 553 case Token::TK_StreamStart: 554 OS << "Stream-Start: "; 555 break; 556 case Token::TK_StreamEnd: 557 OS << "Stream-End: "; 558 break; 559 case Token::TK_VersionDirective: 560 OS << "Version-Directive: "; 561 break; 562 case Token::TK_TagDirective: 563 OS << "Tag-Directive: "; 564 break; 565 case Token::TK_DocumentStart: 566 OS << "Document-Start: "; 567 break; 568 case Token::TK_DocumentEnd: 569 OS << "Document-End: "; 570 break; 571 case Token::TK_BlockEntry: 572 OS << "Block-Entry: "; 573 break; 574 case Token::TK_BlockEnd: 575 OS << "Block-End: "; 576 break; 
577 case Token::TK_BlockSequenceStart: 578 OS << "Block-Sequence-Start: "; 579 break; 580 case Token::TK_BlockMappingStart: 581 OS << "Block-Mapping-Start: "; 582 break; 583 case Token::TK_FlowEntry: 584 OS << "Flow-Entry: "; 585 break; 586 case Token::TK_FlowSequenceStart: 587 OS << "Flow-Sequence-Start: "; 588 break; 589 case Token::TK_FlowSequenceEnd: 590 OS << "Flow-Sequence-End: "; 591 break; 592 case Token::TK_FlowMappingStart: 593 OS << "Flow-Mapping-Start: "; 594 break; 595 case Token::TK_FlowMappingEnd: 596 OS << "Flow-Mapping-End: "; 597 break; 598 case Token::TK_Key: 599 OS << "Key: "; 600 break; 601 case Token::TK_Value: 602 OS << "Value: "; 603 break; 604 case Token::TK_Scalar: 605 OS << "Scalar: "; 606 break; 607 case Token::TK_Alias: 608 OS << "Alias: "; 609 break; 610 case Token::TK_Anchor: 611 OS << "Anchor: "; 612 break; 613 case Token::TK_Tag: 614 OS << "Tag: "; 615 break; 616 case Token::TK_Error: 617 break; 618 } 619 OS << T.Range << "\n"; 620 if (T.Kind == Token::TK_StreamEnd) 621 break; 622 else if (T.Kind == Token::TK_Error) 623 return false; 624 } 625 return true; 626 } 627 628 bool yaml::scanTokens(StringRef Input) { 629 llvm::SourceMgr SM; 630 llvm::yaml::Scanner scanner(Input, SM); 631 for (;;) { 632 llvm::yaml::Token T = scanner.getNext(); 633 if (T.Kind == Token::TK_StreamEnd) 634 break; 635 else if (T.Kind == Token::TK_Error) 636 return false; 637 } 638 return true; 639 } 640 641 std::string yaml::escape(StringRef Input) { 642 std::string EscapedInput; 643 for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) { 644 if (*i == '\\') 645 EscapedInput += "\\\\"; 646 else if (*i == '"') 647 EscapedInput += "\\\""; 648 else if (*i == 0) 649 EscapedInput += "\\0"; 650 else if (*i == 0x07) 651 EscapedInput += "\\a"; 652 else if (*i == 0x08) 653 EscapedInput += "\\b"; 654 else if (*i == 0x09) 655 EscapedInput += "\\t"; 656 else if (*i == 0x0A) 657 EscapedInput += "\\n"; 658 else if (*i == 0x0B) 659 EscapedInput += "\\v"; 
660 else if (*i == 0x0C) 661 EscapedInput += "\\f"; 662 else if (*i == 0x0D) 663 EscapedInput += "\\r"; 664 else if (*i == 0x1B) 665 EscapedInput += "\\e"; 666 else if ((unsigned char)*i < 0x20) { // Control characters not handled above. 667 std::string HexStr = utohexstr(*i); 668 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 669 } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence. 670 UTF8Decoded UnicodeScalarValue 671 = decodeUTF8(StringRef(i, Input.end() - i)); 672 if (UnicodeScalarValue.second == 0) { 673 // Found invalid char. 674 SmallString<4> Val; 675 encodeUTF8(0xFFFD, Val); 676 EscapedInput.insert(EscapedInput.end(), Val.begin(), Val.end()); 677 // FIXME: Error reporting. 678 return EscapedInput; 679 } 680 if (UnicodeScalarValue.first == 0x85) 681 EscapedInput += "\\N"; 682 else if (UnicodeScalarValue.first == 0xA0) 683 EscapedInput += "\\_"; 684 else if (UnicodeScalarValue.first == 0x2028) 685 EscapedInput += "\\L"; 686 else if (UnicodeScalarValue.first == 0x2029) 687 EscapedInput += "\\P"; 688 else { 689 std::string HexStr = utohexstr(UnicodeScalarValue.first); 690 if (HexStr.size() <= 2) 691 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 692 else if (HexStr.size() <= 4) 693 EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr; 694 else if (HexStr.size() <= 8) 695 EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr; 696 } 697 i += UnicodeScalarValue.second - 1; 698 } else 699 EscapedInput.push_back(*i); 700 } 701 return EscapedInput; 702 } 703 704 Scanner::Scanner(StringRef Input, SourceMgr &sm) : SM(sm) { 705 init(MemoryBufferRef(Input, "YAML")); 706 } 707 708 Scanner::Scanner(MemoryBufferRef Buffer, SourceMgr &SM_) : SM(SM_) { 709 init(Buffer); 710 } 711 712 void Scanner::init(MemoryBufferRef Buffer) { 713 InputBuffer = Buffer; 714 Current = InputBuffer.getBufferStart(); 715 End = InputBuffer.getBufferEnd(); 716 Indent = -1; 717 Column = 0; 718 Line = 0; 719 
FlowLevel = 0; 720 IsStartOfStream = true; 721 IsSimpleKeyAllowed = true; 722 Failed = false; 723 std::unique_ptr<MemoryBuffer> InputBufferOwner = 724 MemoryBuffer::getMemBuffer(Buffer); 725 SM.AddNewSourceBuffer(std::move(InputBufferOwner), SMLoc()); 726 } 727 728 Token &Scanner::peekNext() { 729 // If the current token is a possible simple key, keep parsing until we 730 // can confirm. 731 bool NeedMore = false; 732 while (true) { 733 if (TokenQueue.empty() || NeedMore) { 734 if (!fetchMoreTokens()) { 735 TokenQueue.clear(); 736 TokenQueue.push_back(Token()); 737 return TokenQueue.front(); 738 } 739 } 740 assert(!TokenQueue.empty() && 741 "fetchMoreTokens lied about getting tokens!"); 742 743 removeStaleSimpleKeyCandidates(); 744 SimpleKey SK; 745 SK.Tok = TokenQueue.front(); 746 if (std::find(SimpleKeys.begin(), SimpleKeys.end(), SK) 747 == SimpleKeys.end()) 748 break; 749 else 750 NeedMore = true; 751 } 752 return TokenQueue.front(); 753 } 754 755 Token Scanner::getNext() { 756 Token Ret = peekNext(); 757 // TokenQueue can be empty if there was an error getting the next token. 758 if (!TokenQueue.empty()) 759 TokenQueue.pop_front(); 760 761 // There cannot be any referenced Token's if the TokenQueue is empty. So do a 762 // quick deallocation of them all. 763 if (TokenQueue.empty()) { 764 TokenQueue.Alloc.Reset(); 765 } 766 767 return Ret; 768 } 769 770 StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) { 771 if (Position == End) 772 return Position; 773 // Check 7 bit c-printable - b-char. 774 if ( *Position == 0x09 775 || (*Position >= 0x20 && *Position <= 0x7E)) 776 return Position + 1; 777 778 // Check for valid UTF-8. 
779 if (uint8_t(*Position) & 0x80) { 780 UTF8Decoded u8d = decodeUTF8(Position); 781 if ( u8d.second != 0 782 && u8d.first != 0xFEFF 783 && ( u8d.first == 0x85 784 || ( u8d.first >= 0xA0 785 && u8d.first <= 0xD7FF) 786 || ( u8d.first >= 0xE000 787 && u8d.first <= 0xFFFD) 788 || ( u8d.first >= 0x10000 789 && u8d.first <= 0x10FFFF))) 790 return Position + u8d.second; 791 } 792 return Position; 793 } 794 795 StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) { 796 if (Position == End) 797 return Position; 798 if (*Position == 0x0D) { 799 if (Position + 1 != End && *(Position + 1) == 0x0A) 800 return Position + 2; 801 return Position + 1; 802 } 803 804 if (*Position == 0x0A) 805 return Position + 1; 806 return Position; 807 } 808 809 810 StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) { 811 if (Position == End) 812 return Position; 813 if (*Position == ' ' || *Position == '\t') 814 return Position + 1; 815 return Position; 816 } 817 818 StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) { 819 if (Position == End) 820 return Position; 821 if (*Position == ' ' || *Position == '\t') 822 return Position; 823 return skip_nb_char(Position); 824 } 825 826 StringRef::iterator Scanner::skip_while( SkipWhileFunc Func 827 , StringRef::iterator Position) { 828 while (true) { 829 StringRef::iterator i = (this->*Func)(Position); 830 if (i == Position) 831 break; 832 Position = i; 833 } 834 return Position; 835 } 836 837 static bool is_ns_hex_digit(const char C) { 838 return (C >= '0' && C <= '9') 839 || (C >= 'a' && C <= 'z') 840 || (C >= 'A' && C <= 'Z'); 841 } 842 843 static bool is_ns_word_char(const char C) { 844 return C == '-' 845 || (C >= 'a' && C <= 'z') 846 || (C >= 'A' && C <= 'Z'); 847 } 848 849 StringRef Scanner::scan_ns_uri_char() { 850 StringRef::iterator Start = Current; 851 while (true) { 852 if (Current == End) 853 break; 854 if (( *Current == '%' 855 && Current + 2 < End 856 && 
is_ns_hex_digit(*(Current + 1)) 857 && is_ns_hex_digit(*(Current + 2))) 858 || is_ns_word_char(*Current) 859 || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]") 860 != StringRef::npos) { 861 ++Current; 862 ++Column; 863 } else 864 break; 865 } 866 return StringRef(Start, Current - Start); 867 } 868 869 bool Scanner::consume(uint32_t Expected) { 870 if (Expected >= 0x80) 871 report_fatal_error("Not dealing with this yet"); 872 if (Current == End) 873 return false; 874 if (uint8_t(*Current) >= 0x80) 875 report_fatal_error("Not dealing with this yet"); 876 if (uint8_t(*Current) == Expected) { 877 ++Current; 878 ++Column; 879 return true; 880 } 881 return false; 882 } 883 884 void Scanner::skip(uint32_t Distance) { 885 Current += Distance; 886 Column += Distance; 887 assert(Current <= End && "Skipped past the end"); 888 } 889 890 bool Scanner::isBlankOrBreak(StringRef::iterator Position) { 891 if (Position == End) 892 return false; 893 if ( *Position == ' ' || *Position == '\t' 894 || *Position == '\r' || *Position == '\n') 895 return true; 896 return false; 897 } 898 899 void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok 900 , unsigned AtColumn 901 , bool IsRequired) { 902 if (IsSimpleKeyAllowed) { 903 SimpleKey SK; 904 SK.Tok = Tok; 905 SK.Line = Line; 906 SK.Column = AtColumn; 907 SK.IsRequired = IsRequired; 908 SK.FlowLevel = FlowLevel; 909 SimpleKeys.push_back(SK); 910 } 911 } 912 913 void Scanner::removeStaleSimpleKeyCandidates() { 914 for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin(); 915 i != SimpleKeys.end();) { 916 if (i->Line != Line || i->Column + 1024 < Column) { 917 if (i->IsRequired) 918 setError( "Could not find expected : for simple key" 919 , i->Tok->Range.begin()); 920 i = SimpleKeys.erase(i); 921 } else 922 ++i; 923 } 924 } 925 926 void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) { 927 if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level) 928 SimpleKeys.pop_back(); 929 } 930 
931 bool Scanner::unrollIndent(int ToColumn) { 932 Token T; 933 // Indentation is ignored in flow. 934 if (FlowLevel != 0) 935 return true; 936 937 while (Indent > ToColumn) { 938 T.Kind = Token::TK_BlockEnd; 939 T.Range = StringRef(Current, 1); 940 TokenQueue.push_back(T); 941 Indent = Indents.pop_back_val(); 942 } 943 944 return true; 945 } 946 947 bool Scanner::rollIndent( int ToColumn 948 , Token::TokenKind Kind 949 , TokenQueueT::iterator InsertPoint) { 950 if (FlowLevel) 951 return true; 952 if (Indent < ToColumn) { 953 Indents.push_back(Indent); 954 Indent = ToColumn; 955 956 Token T; 957 T.Kind = Kind; 958 T.Range = StringRef(Current, 0); 959 TokenQueue.insert(InsertPoint, T); 960 } 961 return true; 962 } 963 964 void Scanner::scanToNextToken() { 965 while (true) { 966 while (*Current == ' ' || *Current == '\t') { 967 skip(1); 968 } 969 970 // Skip comment. 971 if (*Current == '#') { 972 while (true) { 973 // This may skip more than one byte, thus Column is only incremented 974 // for code points. 975 StringRef::iterator i = skip_nb_char(Current); 976 if (i == Current) 977 break; 978 Current = i; 979 ++Column; 980 } 981 } 982 983 // Skip EOL. 984 StringRef::iterator i = skip_b_break(Current); 985 if (i == Current) 986 break; 987 Current = i; 988 ++Line; 989 Column = 0; 990 // New lines may start a simple key. 991 if (!FlowLevel) 992 IsSimpleKeyAllowed = true; 993 } 994 } 995 996 bool Scanner::scanStreamStart() { 997 IsStartOfStream = false; 998 999 EncodingInfo EI = getUnicodeEncoding(currentInput()); 1000 1001 Token T; 1002 T.Kind = Token::TK_StreamStart; 1003 T.Range = StringRef(Current, EI.second); 1004 TokenQueue.push_back(T); 1005 Current += EI.second; 1006 return true; 1007 } 1008 1009 bool Scanner::scanStreamEnd() { 1010 // Force an ending new line if one isn't present. 
1011 if (Column != 0) { 1012 Column = 0; 1013 ++Line; 1014 } 1015 1016 unrollIndent(-1); 1017 SimpleKeys.clear(); 1018 IsSimpleKeyAllowed = false; 1019 1020 Token T; 1021 T.Kind = Token::TK_StreamEnd; 1022 T.Range = StringRef(Current, 0); 1023 TokenQueue.push_back(T); 1024 return true; 1025 } 1026 1027 bool Scanner::scanDirective() { 1028 // Reset the indentation level. 1029 unrollIndent(-1); 1030 SimpleKeys.clear(); 1031 IsSimpleKeyAllowed = false; 1032 1033 StringRef::iterator Start = Current; 1034 consume('%'); 1035 StringRef::iterator NameStart = Current; 1036 Current = skip_while(&Scanner::skip_ns_char, Current); 1037 StringRef Name(NameStart, Current - NameStart); 1038 Current = skip_while(&Scanner::skip_s_white, Current); 1039 1040 Token T; 1041 if (Name == "YAML") { 1042 Current = skip_while(&Scanner::skip_ns_char, Current); 1043 T.Kind = Token::TK_VersionDirective; 1044 T.Range = StringRef(Start, Current - Start); 1045 TokenQueue.push_back(T); 1046 return true; 1047 } else if(Name == "TAG") { 1048 Current = skip_while(&Scanner::skip_ns_char, Current); 1049 Current = skip_while(&Scanner::skip_s_white, Current); 1050 Current = skip_while(&Scanner::skip_ns_char, Current); 1051 T.Kind = Token::TK_TagDirective; 1052 T.Range = StringRef(Start, Current - Start); 1053 TokenQueue.push_back(T); 1054 return true; 1055 } 1056 return false; 1057 } 1058 1059 bool Scanner::scanDocumentIndicator(bool IsStart) { 1060 unrollIndent(-1); 1061 SimpleKeys.clear(); 1062 IsSimpleKeyAllowed = false; 1063 1064 Token T; 1065 T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd; 1066 T.Range = StringRef(Current, 3); 1067 skip(3); 1068 TokenQueue.push_back(T); 1069 return true; 1070 } 1071 1072 bool Scanner::scanFlowCollectionStart(bool IsSequence) { 1073 Token T; 1074 T.Kind = IsSequence ? 
Token::TK_FlowSequenceStart 1075 : Token::TK_FlowMappingStart; 1076 T.Range = StringRef(Current, 1); 1077 skip(1); 1078 TokenQueue.push_back(T); 1079 1080 // [ and { may begin a simple key. 1081 saveSimpleKeyCandidate(TokenQueue.back(), Column - 1, false); 1082 1083 // And may also be followed by a simple key. 1084 IsSimpleKeyAllowed = true; 1085 ++FlowLevel; 1086 return true; 1087 } 1088 1089 bool Scanner::scanFlowCollectionEnd(bool IsSequence) { 1090 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1091 IsSimpleKeyAllowed = false; 1092 Token T; 1093 T.Kind = IsSequence ? Token::TK_FlowSequenceEnd 1094 : Token::TK_FlowMappingEnd; 1095 T.Range = StringRef(Current, 1); 1096 skip(1); 1097 TokenQueue.push_back(T); 1098 if (FlowLevel) 1099 --FlowLevel; 1100 return true; 1101 } 1102 1103 bool Scanner::scanFlowEntry() { 1104 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1105 IsSimpleKeyAllowed = true; 1106 Token T; 1107 T.Kind = Token::TK_FlowEntry; 1108 T.Range = StringRef(Current, 1); 1109 skip(1); 1110 TokenQueue.push_back(T); 1111 return true; 1112 } 1113 1114 bool Scanner::scanBlockEntry() { 1115 rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end()); 1116 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1117 IsSimpleKeyAllowed = true; 1118 Token T; 1119 T.Kind = Token::TK_BlockEntry; 1120 T.Range = StringRef(Current, 1); 1121 skip(1); 1122 TokenQueue.push_back(T); 1123 return true; 1124 } 1125 1126 bool Scanner::scanKey() { 1127 if (!FlowLevel) 1128 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1129 1130 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1131 IsSimpleKeyAllowed = !FlowLevel; 1132 1133 Token T; 1134 T.Kind = Token::TK_Key; 1135 T.Range = StringRef(Current, 1); 1136 skip(1); 1137 TokenQueue.push_back(T); 1138 return true; 1139 } 1140 1141 bool Scanner::scanValue() { 1142 // If the previous token could have been a simple key, insert the key token 1143 // into the token queue. 
1144 if (!SimpleKeys.empty()) { 1145 SimpleKey SK = SimpleKeys.pop_back_val(); 1146 Token T; 1147 T.Kind = Token::TK_Key; 1148 T.Range = SK.Tok->Range; 1149 TokenQueueT::iterator i, e; 1150 for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) { 1151 if (i == SK.Tok) 1152 break; 1153 } 1154 assert(i != e && "SimpleKey not in token queue!"); 1155 i = TokenQueue.insert(i, T); 1156 1157 // We may also need to add a Block-Mapping-Start token. 1158 rollIndent(SK.Column, Token::TK_BlockMappingStart, i); 1159 1160 IsSimpleKeyAllowed = false; 1161 } else { 1162 if (!FlowLevel) 1163 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1164 IsSimpleKeyAllowed = !FlowLevel; 1165 } 1166 1167 Token T; 1168 T.Kind = Token::TK_Value; 1169 T.Range = StringRef(Current, 1); 1170 skip(1); 1171 TokenQueue.push_back(T); 1172 return true; 1173 } 1174 1175 // Forbidding inlining improves performance by roughly 20%. 1176 // FIXME: Remove once llvm optimizes this to the faster version without hints. 1177 LLVM_ATTRIBUTE_NOINLINE static bool 1178 wasEscaped(StringRef::iterator First, StringRef::iterator Position); 1179 1180 // Returns whether a character at 'Position' was escaped with a leading '\'. 1181 // 'First' specifies the position of the first character in the string. 1182 static bool wasEscaped(StringRef::iterator First, 1183 StringRef::iterator Position) { 1184 assert(Position - 1 >= First); 1185 StringRef::iterator I = Position - 1; 1186 // We calculate the number of consecutive '\'s before the current position 1187 // by iterating backwards through our string. 1188 while (I >= First && *I == '\\') --I; 1189 // (Position - 1 - I) now contains the number of '\'s before the current 1190 // position. If it is odd, the character at 'Position' was escaped. 
1191 return (Position - 1 - I) % 2 == 1; 1192 } 1193 1194 bool Scanner::scanFlowScalar(bool IsDoubleQuoted) { 1195 StringRef::iterator Start = Current; 1196 unsigned ColStart = Column; 1197 if (IsDoubleQuoted) { 1198 do { 1199 ++Current; 1200 while (Current != End && *Current != '"') 1201 ++Current; 1202 // Repeat until the previous character was not a '\' or was an escaped 1203 // backslash. 1204 } while ( Current != End 1205 && *(Current - 1) == '\\' 1206 && wasEscaped(Start + 1, Current)); 1207 } else { 1208 skip(1); 1209 while (true) { 1210 // Skip a ' followed by another '. 1211 if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') { 1212 skip(2); 1213 continue; 1214 } else if (*Current == '\'') 1215 break; 1216 StringRef::iterator i = skip_nb_char(Current); 1217 if (i == Current) { 1218 i = skip_b_break(Current); 1219 if (i == Current) 1220 break; 1221 Current = i; 1222 Column = 0; 1223 ++Line; 1224 } else { 1225 if (i == End) 1226 break; 1227 Current = i; 1228 ++Column; 1229 } 1230 } 1231 } 1232 1233 if (Current == End) { 1234 setError("Expected quote at end of scalar", Current); 1235 return false; 1236 } 1237 1238 skip(1); // Skip ending quote. 
1239 Token T; 1240 T.Kind = Token::TK_Scalar; 1241 T.Range = StringRef(Start, Current - Start); 1242 TokenQueue.push_back(T); 1243 1244 saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); 1245 1246 IsSimpleKeyAllowed = false; 1247 1248 return true; 1249 } 1250 1251 bool Scanner::scanPlainScalar() { 1252 StringRef::iterator Start = Current; 1253 unsigned ColStart = Column; 1254 unsigned LeadingBlanks = 0; 1255 assert(Indent >= -1 && "Indent must be >= -1 !"); 1256 unsigned indent = static_cast<unsigned>(Indent + 1); 1257 while (true) { 1258 if (*Current == '#') 1259 break; 1260 1261 while (!isBlankOrBreak(Current)) { 1262 if ( FlowLevel && *Current == ':' 1263 && !(isBlankOrBreak(Current + 1) || *(Current + 1) == ',')) { 1264 setError("Found unexpected ':' while scanning a plain scalar", Current); 1265 return false; 1266 } 1267 1268 // Check for the end of the plain scalar. 1269 if ( (*Current == ':' && isBlankOrBreak(Current + 1)) 1270 || ( FlowLevel 1271 && (StringRef(Current, 1).find_first_of(",:?[]{}") 1272 != StringRef::npos))) 1273 break; 1274 1275 StringRef::iterator i = skip_nb_char(Current); 1276 if (i == Current) 1277 break; 1278 Current = i; 1279 ++Column; 1280 } 1281 1282 // Are we at the end? 1283 if (!isBlankOrBreak(Current)) 1284 break; 1285 1286 // Eat blanks. 
1287 StringRef::iterator Tmp = Current; 1288 while (isBlankOrBreak(Tmp)) { 1289 StringRef::iterator i = skip_s_white(Tmp); 1290 if (i != Tmp) { 1291 if (LeadingBlanks && (Column < indent) && *Tmp == '\t') { 1292 setError("Found invalid tab character in indentation", Tmp); 1293 return false; 1294 } 1295 Tmp = i; 1296 ++Column; 1297 } else { 1298 i = skip_b_break(Tmp); 1299 if (!LeadingBlanks) 1300 LeadingBlanks = 1; 1301 Tmp = i; 1302 Column = 0; 1303 ++Line; 1304 } 1305 } 1306 1307 if (!FlowLevel && Column < indent) 1308 break; 1309 1310 Current = Tmp; 1311 } 1312 if (Start == Current) { 1313 setError("Got empty plain scalar", Start); 1314 return false; 1315 } 1316 Token T; 1317 T.Kind = Token::TK_Scalar; 1318 T.Range = StringRef(Start, Current - Start); 1319 TokenQueue.push_back(T); 1320 1321 // Plain scalars can be simple keys. 1322 saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); 1323 1324 IsSimpleKeyAllowed = false; 1325 1326 return true; 1327 } 1328 1329 bool Scanner::scanAliasOrAnchor(bool IsAlias) { 1330 StringRef::iterator Start = Current; 1331 unsigned ColStart = Column; 1332 skip(1); 1333 while(true) { 1334 if ( *Current == '[' || *Current == ']' 1335 || *Current == '{' || *Current == '}' 1336 || *Current == ',' 1337 || *Current == ':') 1338 break; 1339 StringRef::iterator i = skip_ns_char(Current); 1340 if (i == Current) 1341 break; 1342 Current = i; 1343 ++Column; 1344 } 1345 1346 if (Start == Current) { 1347 setError("Got empty alias or anchor", Start); 1348 return false; 1349 } 1350 1351 Token T; 1352 T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor; 1353 T.Range = StringRef(Start, Current - Start); 1354 TokenQueue.push_back(T); 1355 1356 // Alias and anchors can be simple keys. 
1357 saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); 1358 1359 IsSimpleKeyAllowed = false; 1360 1361 return true; 1362 } 1363 1364 bool Scanner::scanBlockScalar(bool IsLiteral) { 1365 StringRef::iterator Start = Current; 1366 skip(1); // Eat | or > 1367 while(true) { 1368 StringRef::iterator i = skip_nb_char(Current); 1369 if (i == Current) { 1370 if (Column == 0) 1371 break; 1372 i = skip_b_break(Current); 1373 if (i != Current) { 1374 // We got a line break. 1375 Column = 0; 1376 ++Line; 1377 Current = i; 1378 continue; 1379 } else { 1380 // There was an error, which should already have been printed out. 1381 return false; 1382 } 1383 } 1384 Current = i; 1385 ++Column; 1386 } 1387 1388 if (Start == Current) { 1389 setError("Got empty block scalar", Start); 1390 return false; 1391 } 1392 1393 Token T; 1394 T.Kind = Token::TK_Scalar; 1395 T.Range = StringRef(Start, Current - Start); 1396 TokenQueue.push_back(T); 1397 return true; 1398 } 1399 1400 bool Scanner::scanTag() { 1401 StringRef::iterator Start = Current; 1402 unsigned ColStart = Column; 1403 skip(1); // Eat !. 1404 if (Current == End || isBlankOrBreak(Current)); // An empty tag. 1405 else if (*Current == '<') { 1406 skip(1); 1407 scan_ns_uri_char(); 1408 if (!consume('>')) 1409 return false; 1410 } else { 1411 // FIXME: Actually parse the c-ns-shorthand-tag rule. 1412 Current = skip_while(&Scanner::skip_ns_char, Current); 1413 } 1414 1415 Token T; 1416 T.Kind = Token::TK_Tag; 1417 T.Range = StringRef(Start, Current - Start); 1418 TokenQueue.push_back(T); 1419 1420 // Tags can be simple keys. 
1421 saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); 1422 1423 IsSimpleKeyAllowed = false; 1424 1425 return true; 1426 } 1427 1428 bool Scanner::fetchMoreTokens() { 1429 if (IsStartOfStream) 1430 return scanStreamStart(); 1431 1432 scanToNextToken(); 1433 1434 if (Current == End) 1435 return scanStreamEnd(); 1436 1437 removeStaleSimpleKeyCandidates(); 1438 1439 unrollIndent(Column); 1440 1441 if (Column == 0 && *Current == '%') 1442 return scanDirective(); 1443 1444 if (Column == 0 && Current + 4 <= End 1445 && *Current == '-' 1446 && *(Current + 1) == '-' 1447 && *(Current + 2) == '-' 1448 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1449 return scanDocumentIndicator(true); 1450 1451 if (Column == 0 && Current + 4 <= End 1452 && *Current == '.' 1453 && *(Current + 1) == '.' 1454 && *(Current + 2) == '.' 1455 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1456 return scanDocumentIndicator(false); 1457 1458 if (*Current == '[') 1459 return scanFlowCollectionStart(true); 1460 1461 if (*Current == '{') 1462 return scanFlowCollectionStart(false); 1463 1464 if (*Current == ']') 1465 return scanFlowCollectionEnd(true); 1466 1467 if (*Current == '}') 1468 return scanFlowCollectionEnd(false); 1469 1470 if (*Current == ',') 1471 return scanFlowEntry(); 1472 1473 if (*Current == '-' && isBlankOrBreak(Current + 1)) 1474 return scanBlockEntry(); 1475 1476 if (*Current == '?' 
&& (FlowLevel || isBlankOrBreak(Current + 1))) 1477 return scanKey(); 1478 1479 if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1))) 1480 return scanValue(); 1481 1482 if (*Current == '*') 1483 return scanAliasOrAnchor(true); 1484 1485 if (*Current == '&') 1486 return scanAliasOrAnchor(false); 1487 1488 if (*Current == '!') 1489 return scanTag(); 1490 1491 if (*Current == '|' && !FlowLevel) 1492 return scanBlockScalar(true); 1493 1494 if (*Current == '>' && !FlowLevel) 1495 return scanBlockScalar(false); 1496 1497 if (*Current == '\'') 1498 return scanFlowScalar(false); 1499 1500 if (*Current == '"') 1501 return scanFlowScalar(true); 1502 1503 // Get a plain scalar. 1504 StringRef FirstChar(Current, 1); 1505 if (!(isBlankOrBreak(Current) 1506 || FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos) 1507 || (*Current == '-' && !isBlankOrBreak(Current + 1)) 1508 || (!FlowLevel && (*Current == '?' || *Current == ':') 1509 && isBlankOrBreak(Current + 1)) 1510 || (!FlowLevel && *Current == ':' 1511 && Current + 2 < End 1512 && *(Current + 1) == ':' 1513 && !isBlankOrBreak(Current + 2))) 1514 return scanPlainScalar(); 1515 1516 setError("Unrecognized character while tokenizing."); 1517 return false; 1518 } 1519 1520 Stream::Stream(StringRef Input, SourceMgr &SM) 1521 : scanner(new Scanner(Input, SM)), CurrentDoc() {} 1522 1523 Stream::Stream(MemoryBufferRef InputBuffer, SourceMgr &SM) 1524 : scanner(new Scanner(InputBuffer, SM)), CurrentDoc() {} 1525 1526 Stream::~Stream() {} 1527 1528 bool Stream::failed() { return scanner->failed(); } 1529 1530 void Stream::printError(Node *N, const Twine &Msg) { 1531 SmallVector<SMRange, 1> Ranges; 1532 Ranges.push_back(N->getSourceRange()); 1533 scanner->printError( N->getSourceRange().Start 1534 , SourceMgr::DK_Error 1535 , Msg 1536 , Ranges); 1537 } 1538 1539 document_iterator Stream::begin() { 1540 if (CurrentDoc) 1541 report_fatal_error("Can only iterate over the stream once"); 1542 1543 // Skip 
Stream-Start. 1544 scanner->getNext(); 1545 1546 CurrentDoc.reset(new Document(*this)); 1547 return document_iterator(CurrentDoc); 1548 } 1549 1550 document_iterator Stream::end() { 1551 return document_iterator(); 1552 } 1553 1554 void Stream::skip() { 1555 for (document_iterator i = begin(), e = end(); i != e; ++i) 1556 i->skip(); 1557 } 1558 1559 Node::Node(unsigned int Type, std::unique_ptr<Document> &D, StringRef A, 1560 StringRef T) 1561 : Doc(D), TypeID(Type), Anchor(A), Tag(T) { 1562 SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin()); 1563 SourceRange = SMRange(Start, Start); 1564 } 1565 1566 std::string Node::getVerbatimTag() const { 1567 StringRef Raw = getRawTag(); 1568 if (!Raw.empty() && Raw != "!") { 1569 std::string Ret; 1570 if (Raw.find_last_of('!') == 0) { 1571 Ret = Doc->getTagMap().find("!")->second; 1572 Ret += Raw.substr(1); 1573 return std::move(Ret); 1574 } else if (Raw.startswith("!!")) { 1575 Ret = Doc->getTagMap().find("!!")->second; 1576 Ret += Raw.substr(2); 1577 return std::move(Ret); 1578 } else { 1579 StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1); 1580 std::map<StringRef, StringRef>::const_iterator It = 1581 Doc->getTagMap().find(TagHandle); 1582 if (It != Doc->getTagMap().end()) 1583 Ret = It->second; 1584 else { 1585 Token T; 1586 T.Kind = Token::TK_Tag; 1587 T.Range = TagHandle; 1588 setError(Twine("Unknown tag handle ") + TagHandle, T); 1589 } 1590 Ret += Raw.substr(Raw.find_last_of('!') + 1); 1591 return std::move(Ret); 1592 } 1593 } 1594 1595 switch (getType()) { 1596 case NK_Null: 1597 return "tag:yaml.org,2002:null"; 1598 case NK_Scalar: 1599 // TODO: Tag resolution. 
1600 return "tag:yaml.org,2002:str"; 1601 case NK_Mapping: 1602 return "tag:yaml.org,2002:map"; 1603 case NK_Sequence: 1604 return "tag:yaml.org,2002:seq"; 1605 } 1606 1607 return ""; 1608 } 1609 1610 Token &Node::peekNext() { 1611 return Doc->peekNext(); 1612 } 1613 1614 Token Node::getNext() { 1615 return Doc->getNext(); 1616 } 1617 1618 Node *Node::parseBlockNode() { 1619 return Doc->parseBlockNode(); 1620 } 1621 1622 BumpPtrAllocator &Node::getAllocator() { 1623 return Doc->NodeAllocator; 1624 } 1625 1626 void Node::setError(const Twine &Msg, Token &Tok) const { 1627 Doc->setError(Msg, Tok); 1628 } 1629 1630 bool Node::failed() const { 1631 return Doc->failed(); 1632 } 1633 1634 1635 1636 StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const { 1637 // TODO: Handle newlines properly. We need to remove leading whitespace. 1638 if (Value[0] == '"') { // Double quoted. 1639 // Pull off the leading and trailing "s. 1640 StringRef UnquotedValue = Value.substr(1, Value.size() - 2); 1641 // Search for characters that would require unescaping the value. 1642 StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n"); 1643 if (i != StringRef::npos) 1644 return unescapeDoubleQuoted(UnquotedValue, i, Storage); 1645 return UnquotedValue; 1646 } else if (Value[0] == '\'') { // Single quoted. 1647 // Pull off the leading and trailing 's. 1648 StringRef UnquotedValue = Value.substr(1, Value.size() - 2); 1649 StringRef::size_type i = UnquotedValue.find('\''); 1650 if (i != StringRef::npos) { 1651 // We're going to need Storage. 
1652 Storage.clear(); 1653 Storage.reserve(UnquotedValue.size()); 1654 for (; i != StringRef::npos; i = UnquotedValue.find('\'')) { 1655 StringRef Valid(UnquotedValue.begin(), i); 1656 Storage.insert(Storage.end(), Valid.begin(), Valid.end()); 1657 Storage.push_back('\''); 1658 UnquotedValue = UnquotedValue.substr(i + 2); 1659 } 1660 Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); 1661 return StringRef(Storage.begin(), Storage.size()); 1662 } 1663 return UnquotedValue; 1664 } 1665 // Plain or block. 1666 return Value.rtrim(" "); 1667 } 1668 1669 StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue 1670 , StringRef::size_type i 1671 , SmallVectorImpl<char> &Storage) 1672 const { 1673 // Use Storage to build proper value. 1674 Storage.clear(); 1675 Storage.reserve(UnquotedValue.size()); 1676 for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) { 1677 // Insert all previous chars into Storage. 1678 StringRef Valid(UnquotedValue.begin(), i); 1679 Storage.insert(Storage.end(), Valid.begin(), Valid.end()); 1680 // Chop off inserted chars. 1681 UnquotedValue = UnquotedValue.substr(i); 1682 1683 assert(!UnquotedValue.empty() && "Can't be empty!"); 1684 1685 // Parse escape or line break. 1686 switch (UnquotedValue[0]) { 1687 case '\r': 1688 case '\n': 1689 Storage.push_back('\n'); 1690 if ( UnquotedValue.size() > 1 1691 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 1692 UnquotedValue = UnquotedValue.substr(1); 1693 UnquotedValue = UnquotedValue.substr(1); 1694 break; 1695 default: 1696 if (UnquotedValue.size() == 1) 1697 // TODO: Report error. 1698 break; 1699 UnquotedValue = UnquotedValue.substr(1); 1700 switch (UnquotedValue[0]) { 1701 default: { 1702 Token T; 1703 T.Range = StringRef(UnquotedValue.begin(), 1); 1704 setError("Unrecognized escape code!", T); 1705 return ""; 1706 } 1707 case '\r': 1708 case '\n': 1709 // Remove the new line. 
1710 if ( UnquotedValue.size() > 1 1711 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 1712 UnquotedValue = UnquotedValue.substr(1); 1713 // If this was just a single byte newline, it will get skipped 1714 // below. 1715 break; 1716 case '0': 1717 Storage.push_back(0x00); 1718 break; 1719 case 'a': 1720 Storage.push_back(0x07); 1721 break; 1722 case 'b': 1723 Storage.push_back(0x08); 1724 break; 1725 case 't': 1726 case 0x09: 1727 Storage.push_back(0x09); 1728 break; 1729 case 'n': 1730 Storage.push_back(0x0A); 1731 break; 1732 case 'v': 1733 Storage.push_back(0x0B); 1734 break; 1735 case 'f': 1736 Storage.push_back(0x0C); 1737 break; 1738 case 'r': 1739 Storage.push_back(0x0D); 1740 break; 1741 case 'e': 1742 Storage.push_back(0x1B); 1743 break; 1744 case ' ': 1745 Storage.push_back(0x20); 1746 break; 1747 case '"': 1748 Storage.push_back(0x22); 1749 break; 1750 case '/': 1751 Storage.push_back(0x2F); 1752 break; 1753 case '\\': 1754 Storage.push_back(0x5C); 1755 break; 1756 case 'N': 1757 encodeUTF8(0x85, Storage); 1758 break; 1759 case '_': 1760 encodeUTF8(0xA0, Storage); 1761 break; 1762 case 'L': 1763 encodeUTF8(0x2028, Storage); 1764 break; 1765 case 'P': 1766 encodeUTF8(0x2029, Storage); 1767 break; 1768 case 'x': { 1769 if (UnquotedValue.size() < 3) 1770 // TODO: Report error. 1771 break; 1772 unsigned int UnicodeScalarValue; 1773 if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue)) 1774 // TODO: Report error. 1775 UnicodeScalarValue = 0xFFFD; 1776 encodeUTF8(UnicodeScalarValue, Storage); 1777 UnquotedValue = UnquotedValue.substr(2); 1778 break; 1779 } 1780 case 'u': { 1781 if (UnquotedValue.size() < 5) 1782 // TODO: Report error. 1783 break; 1784 unsigned int UnicodeScalarValue; 1785 if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue)) 1786 // TODO: Report error. 
1787 UnicodeScalarValue = 0xFFFD; 1788 encodeUTF8(UnicodeScalarValue, Storage); 1789 UnquotedValue = UnquotedValue.substr(4); 1790 break; 1791 } 1792 case 'U': { 1793 if (UnquotedValue.size() < 9) 1794 // TODO: Report error. 1795 break; 1796 unsigned int UnicodeScalarValue; 1797 if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue)) 1798 // TODO: Report error. 1799 UnicodeScalarValue = 0xFFFD; 1800 encodeUTF8(UnicodeScalarValue, Storage); 1801 UnquotedValue = UnquotedValue.substr(8); 1802 break; 1803 } 1804 } 1805 UnquotedValue = UnquotedValue.substr(1); 1806 } 1807 } 1808 Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); 1809 return StringRef(Storage.begin(), Storage.size()); 1810 } 1811 1812 Node *KeyValueNode::getKey() { 1813 if (Key) 1814 return Key; 1815 // Handle implicit null keys. 1816 { 1817 Token &t = peekNext(); 1818 if ( t.Kind == Token::TK_BlockEnd 1819 || t.Kind == Token::TK_Value 1820 || t.Kind == Token::TK_Error) { 1821 return Key = new (getAllocator()) NullNode(Doc); 1822 } 1823 if (t.Kind == Token::TK_Key) 1824 getNext(); // skip TK_Key. 1825 } 1826 1827 // Handle explicit null keys. 1828 Token &t = peekNext(); 1829 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) { 1830 return Key = new (getAllocator()) NullNode(Doc); 1831 } 1832 1833 // We've got a normal key. 1834 return Key = parseBlockNode(); 1835 } 1836 1837 Node *KeyValueNode::getValue() { 1838 if (Value) 1839 return Value; 1840 getKey()->skip(); 1841 if (failed()) 1842 return Value = new (getAllocator()) NullNode(Doc); 1843 1844 // Handle implicit null values. 
1845 { 1846 Token &t = peekNext(); 1847 if ( t.Kind == Token::TK_BlockEnd 1848 || t.Kind == Token::TK_FlowMappingEnd 1849 || t.Kind == Token::TK_Key 1850 || t.Kind == Token::TK_FlowEntry 1851 || t.Kind == Token::TK_Error) { 1852 return Value = new (getAllocator()) NullNode(Doc); 1853 } 1854 1855 if (t.Kind != Token::TK_Value) { 1856 setError("Unexpected token in Key Value.", t); 1857 return Value = new (getAllocator()) NullNode(Doc); 1858 } 1859 getNext(); // skip TK_Value. 1860 } 1861 1862 // Handle explicit null values. 1863 Token &t = peekNext(); 1864 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) { 1865 return Value = new (getAllocator()) NullNode(Doc); 1866 } 1867 1868 // We got a normal value. 1869 return Value = parseBlockNode(); 1870 } 1871 1872 void MappingNode::increment() { 1873 if (failed()) { 1874 IsAtEnd = true; 1875 CurrentEntry = nullptr; 1876 return; 1877 } 1878 if (CurrentEntry) { 1879 CurrentEntry->skip(); 1880 if (Type == MT_Inline) { 1881 IsAtEnd = true; 1882 CurrentEntry = nullptr; 1883 return; 1884 } 1885 } 1886 Token T = peekNext(); 1887 if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) { 1888 // KeyValueNode eats the TK_Key. That way it can detect null keys. 1889 CurrentEntry = new (getAllocator()) KeyValueNode(Doc); 1890 } else if (Type == MT_Block) { 1891 switch (T.Kind) { 1892 case Token::TK_BlockEnd: 1893 getNext(); 1894 IsAtEnd = true; 1895 CurrentEntry = nullptr; 1896 break; 1897 default: 1898 setError("Unexpected token. Expected Key or Block End", T); 1899 case Token::TK_Error: 1900 IsAtEnd = true; 1901 CurrentEntry = nullptr; 1902 } 1903 } else { 1904 switch (T.Kind) { 1905 case Token::TK_FlowEntry: 1906 // Eat the flow entry and recurse. 1907 getNext(); 1908 return increment(); 1909 case Token::TK_FlowMappingEnd: 1910 getNext(); 1911 case Token::TK_Error: 1912 // Set this to end iterator. 1913 IsAtEnd = true; 1914 CurrentEntry = nullptr; 1915 break; 1916 default: 1917 setError( "Unexpected token. 
Expected Key, Flow Entry, or Flow " 1918 "Mapping End." 1919 , T); 1920 IsAtEnd = true; 1921 CurrentEntry = nullptr; 1922 } 1923 } 1924 } 1925 1926 void SequenceNode::increment() { 1927 if (failed()) { 1928 IsAtEnd = true; 1929 CurrentEntry = nullptr; 1930 return; 1931 } 1932 if (CurrentEntry) 1933 CurrentEntry->skip(); 1934 Token T = peekNext(); 1935 if (SeqType == ST_Block) { 1936 switch (T.Kind) { 1937 case Token::TK_BlockEntry: 1938 getNext(); 1939 CurrentEntry = parseBlockNode(); 1940 if (!CurrentEntry) { // An error occurred. 1941 IsAtEnd = true; 1942 CurrentEntry = nullptr; 1943 } 1944 break; 1945 case Token::TK_BlockEnd: 1946 getNext(); 1947 IsAtEnd = true; 1948 CurrentEntry = nullptr; 1949 break; 1950 default: 1951 setError( "Unexpected token. Expected Block Entry or Block End." 1952 , T); 1953 case Token::TK_Error: 1954 IsAtEnd = true; 1955 CurrentEntry = nullptr; 1956 } 1957 } else if (SeqType == ST_Indentless) { 1958 switch (T.Kind) { 1959 case Token::TK_BlockEntry: 1960 getNext(); 1961 CurrentEntry = parseBlockNode(); 1962 if (!CurrentEntry) { // An error occurred. 1963 IsAtEnd = true; 1964 CurrentEntry = nullptr; 1965 } 1966 break; 1967 default: 1968 case Token::TK_Error: 1969 IsAtEnd = true; 1970 CurrentEntry = nullptr; 1971 } 1972 } else if (SeqType == ST_Flow) { 1973 switch (T.Kind) { 1974 case Token::TK_FlowEntry: 1975 // Eat the flow entry and recurse. 1976 getNext(); 1977 WasPreviousTokenFlowEntry = true; 1978 return increment(); 1979 case Token::TK_FlowSequenceEnd: 1980 getNext(); 1981 case Token::TK_Error: 1982 // Set this to end iterator. 1983 IsAtEnd = true; 1984 CurrentEntry = nullptr; 1985 break; 1986 case Token::TK_StreamEnd: 1987 case Token::TK_DocumentEnd: 1988 case Token::TK_DocumentStart: 1989 setError("Could not find closing ]!", T); 1990 // Set this to end iterator. 
1991 IsAtEnd = true; 1992 CurrentEntry = nullptr; 1993 break; 1994 default: 1995 if (!WasPreviousTokenFlowEntry) { 1996 setError("Expected , between entries!", T); 1997 IsAtEnd = true; 1998 CurrentEntry = nullptr; 1999 break; 2000 } 2001 // Otherwise it must be a flow entry. 2002 CurrentEntry = parseBlockNode(); 2003 if (!CurrentEntry) { 2004 IsAtEnd = true; 2005 } 2006 WasPreviousTokenFlowEntry = false; 2007 break; 2008 } 2009 } 2010 } 2011 2012 Document::Document(Stream &S) : stream(S), Root(nullptr) { 2013 // Tag maps starts with two default mappings. 2014 TagMap["!"] = "!"; 2015 TagMap["!!"] = "tag:yaml.org,2002:"; 2016 2017 if (parseDirectives()) 2018 expectToken(Token::TK_DocumentStart); 2019 Token &T = peekNext(); 2020 if (T.Kind == Token::TK_DocumentStart) 2021 getNext(); 2022 } 2023 2024 bool Document::skip() { 2025 if (stream.scanner->failed()) 2026 return false; 2027 if (!Root) 2028 getRoot(); 2029 Root->skip(); 2030 Token &T = peekNext(); 2031 if (T.Kind == Token::TK_StreamEnd) 2032 return false; 2033 if (T.Kind == Token::TK_DocumentEnd) { 2034 getNext(); 2035 return skip(); 2036 } 2037 return true; 2038 } 2039 2040 Token &Document::peekNext() { 2041 return stream.scanner->peekNext(); 2042 } 2043 2044 Token Document::getNext() { 2045 return stream.scanner->getNext(); 2046 } 2047 2048 void Document::setError(const Twine &Message, Token &Location) const { 2049 stream.scanner->setError(Message, Location.Range.begin()); 2050 } 2051 2052 bool Document::failed() const { 2053 return stream.scanner->failed(); 2054 } 2055 2056 Node *Document::parseBlockNode() { 2057 Token T = peekNext(); 2058 // Handle properties. 
2059 Token AnchorInfo; 2060 Token TagInfo; 2061 parse_property: 2062 switch (T.Kind) { 2063 case Token::TK_Alias: 2064 getNext(); 2065 return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1)); 2066 case Token::TK_Anchor: 2067 if (AnchorInfo.Kind == Token::TK_Anchor) { 2068 setError("Already encountered an anchor for this node!", T); 2069 return nullptr; 2070 } 2071 AnchorInfo = getNext(); // Consume TK_Anchor. 2072 T = peekNext(); 2073 goto parse_property; 2074 case Token::TK_Tag: 2075 if (TagInfo.Kind == Token::TK_Tag) { 2076 setError("Already encountered a tag for this node!", T); 2077 return nullptr; 2078 } 2079 TagInfo = getNext(); // Consume TK_Tag. 2080 T = peekNext(); 2081 goto parse_property; 2082 default: 2083 break; 2084 } 2085 2086 switch (T.Kind) { 2087 case Token::TK_BlockEntry: 2088 // We got an unindented BlockEntry sequence. This is not terminated with 2089 // a BlockEnd. 2090 // Don't eat the TK_BlockEntry, SequenceNode needs it. 2091 return new (NodeAllocator) SequenceNode( stream.CurrentDoc 2092 , AnchorInfo.Range.substr(1) 2093 , TagInfo.Range 2094 , SequenceNode::ST_Indentless); 2095 case Token::TK_BlockSequenceStart: 2096 getNext(); 2097 return new (NodeAllocator) 2098 SequenceNode( stream.CurrentDoc 2099 , AnchorInfo.Range.substr(1) 2100 , TagInfo.Range 2101 , SequenceNode::ST_Block); 2102 case Token::TK_BlockMappingStart: 2103 getNext(); 2104 return new (NodeAllocator) 2105 MappingNode( stream.CurrentDoc 2106 , AnchorInfo.Range.substr(1) 2107 , TagInfo.Range 2108 , MappingNode::MT_Block); 2109 case Token::TK_FlowSequenceStart: 2110 getNext(); 2111 return new (NodeAllocator) 2112 SequenceNode( stream.CurrentDoc 2113 , AnchorInfo.Range.substr(1) 2114 , TagInfo.Range 2115 , SequenceNode::ST_Flow); 2116 case Token::TK_FlowMappingStart: 2117 getNext(); 2118 return new (NodeAllocator) 2119 MappingNode( stream.CurrentDoc 2120 , AnchorInfo.Range.substr(1) 2121 , TagInfo.Range 2122 , MappingNode::MT_Flow); 2123 case 
Token::TK_Scalar: 2124 getNext(); 2125 return new (NodeAllocator) 2126 ScalarNode( stream.CurrentDoc 2127 , AnchorInfo.Range.substr(1) 2128 , TagInfo.Range 2129 , T.Range); 2130 case Token::TK_Key: 2131 // Don't eat the TK_Key, KeyValueNode expects it. 2132 return new (NodeAllocator) 2133 MappingNode( stream.CurrentDoc 2134 , AnchorInfo.Range.substr(1) 2135 , TagInfo.Range 2136 , MappingNode::MT_Inline); 2137 case Token::TK_DocumentStart: 2138 case Token::TK_DocumentEnd: 2139 case Token::TK_StreamEnd: 2140 default: 2141 // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not 2142 // !!null null. 2143 return new (NodeAllocator) NullNode(stream.CurrentDoc); 2144 case Token::TK_Error: 2145 return nullptr; 2146 } 2147 llvm_unreachable("Control flow shouldn't reach here."); 2148 return nullptr; 2149 } 2150 2151 bool Document::parseDirectives() { 2152 bool isDirective = false; 2153 while (true) { 2154 Token T = peekNext(); 2155 if (T.Kind == Token::TK_TagDirective) { 2156 parseTAGDirective(); 2157 isDirective = true; 2158 } else if (T.Kind == Token::TK_VersionDirective) { 2159 parseYAMLDirective(); 2160 isDirective = true; 2161 } else 2162 break; 2163 } 2164 return isDirective; 2165 } 2166 2167 void Document::parseYAMLDirective() { 2168 getNext(); // Eat %YAML <version> 2169 } 2170 2171 void Document::parseTAGDirective() { 2172 Token Tag = getNext(); // %TAG <handle> <prefix> 2173 StringRef T = Tag.Range; 2174 // Strip %TAG 2175 T = T.substr(T.find_first_of(" \t")).ltrim(" \t"); 2176 std::size_t HandleEnd = T.find_first_of(" \t"); 2177 StringRef TagHandle = T.substr(0, HandleEnd); 2178 StringRef TagPrefix = T.substr(HandleEnd).ltrim(" \t"); 2179 TagMap[TagHandle] = TagPrefix; 2180 } 2181 2182 bool Document::expectToken(int TK) { 2183 Token T = getNext(); 2184 if (T.Kind != TK) { 2185 setError("Unexpected token", T); 2186 return false; 2187 } 2188 return true; 2189 } 2190