//===- YAMLParser.cpp - Simple YAML parser --------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//  This file implements a YAML parser.
//
//===----------------------------------------------------------------------===//

#include "llvm/Support/YAMLParser.h"
#include "llvm/ADT/AllocatorList.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/SMLoc.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/Unicode.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <map>
#include <memory>
#include <string>
#include <system_error>
#include <utility>

using namespace llvm;
using namespace yaml;

/// The Unicode encoding forms that getUnicodeEncoding can report for an input
/// buffer.
enum UnicodeEncodingForm {
  UEF_UTF32_LE, ///< UTF-32 Little Endian
  UEF_UTF32_BE, ///< UTF-32 Big Endian
  UEF_UTF16_LE, ///< UTF-16 Little Endian
  UEF_UTF16_BE, ///< UTF-16 Big Endian
  UEF_UTF8,     ///< UTF-8 or ascii.
  UEF_Unknown   ///< Not a valid Unicode encoding.
};

/// EncodingInfo - Holds the encoding type and length of the byte order mark if
///                it exists. Length is in {0, 2, 3, 4}.
using EncodingInfo = std::pair<UnicodeEncodingForm, unsigned>;

/// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode
///                      encoding form of \a Input.
59 /// 60 /// @param Input A string of length 0 or more. 61 /// @returns An EncodingInfo indicating the Unicode encoding form of the input 62 /// and how long the byte order mark is if one exists. 63 static EncodingInfo getUnicodeEncoding(StringRef Input) { 64 if (Input.empty()) 65 return std::make_pair(UEF_Unknown, 0); 66 67 switch (uint8_t(Input[0])) { 68 case 0x00: 69 if (Input.size() >= 4) { 70 if ( Input[1] == 0 71 && uint8_t(Input[2]) == 0xFE 72 && uint8_t(Input[3]) == 0xFF) 73 return std::make_pair(UEF_UTF32_BE, 4); 74 if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0) 75 return std::make_pair(UEF_UTF32_BE, 0); 76 } 77 78 if (Input.size() >= 2 && Input[1] != 0) 79 return std::make_pair(UEF_UTF16_BE, 0); 80 return std::make_pair(UEF_Unknown, 0); 81 case 0xFF: 82 if ( Input.size() >= 4 83 && uint8_t(Input[1]) == 0xFE 84 && Input[2] == 0 85 && Input[3] == 0) 86 return std::make_pair(UEF_UTF32_LE, 4); 87 88 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE) 89 return std::make_pair(UEF_UTF16_LE, 2); 90 return std::make_pair(UEF_Unknown, 0); 91 case 0xFE: 92 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF) 93 return std::make_pair(UEF_UTF16_BE, 2); 94 return std::make_pair(UEF_Unknown, 0); 95 case 0xEF: 96 if ( Input.size() >= 3 97 && uint8_t(Input[1]) == 0xBB 98 && uint8_t(Input[2]) == 0xBF) 99 return std::make_pair(UEF_UTF8, 3); 100 return std::make_pair(UEF_Unknown, 0); 101 } 102 103 // It could still be utf-32 or utf-16. 104 if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0) 105 return std::make_pair(UEF_UTF32_LE, 0); 106 107 if (Input.size() >= 2 && Input[1] == 0) 108 return std::make_pair(UEF_UTF16_LE, 0); 109 110 return std::make_pair(UEF_UTF8, 0); 111 } 112 113 /// Pin the vtables to this file. 
void Node::anchor() {}
void NullNode::anchor() {}
void ScalarNode::anchor() {}
void BlockScalarNode::anchor() {}
void KeyValueNode::anchor() {}
void MappingNode::anchor() {}
void SequenceNode::anchor() {}
void AliasNode::anchor() {}

namespace llvm {
namespace yaml {

/// Token - A single YAML token.
///
/// A default-constructed Token has kind TK_Error.
struct Token {
  enum TokenKind {
    TK_Error, // Uninitialized token.
    TK_StreamStart,
    TK_StreamEnd,
    TK_VersionDirective,
    TK_TagDirective,
    TK_DocumentStart,
    TK_DocumentEnd,
    TK_BlockEntry,
    TK_BlockEnd,
    TK_BlockSequenceStart,
    TK_BlockMappingStart,
    TK_FlowEntry,
    TK_FlowSequenceStart,
    TK_FlowSequenceEnd,
    TK_FlowMappingStart,
    TK_FlowMappingEnd,
    TK_Key,
    TK_Value,
    TK_Scalar,
    TK_BlockScalar,
    TK_Alias,
    TK_Anchor,
    TK_Tag
  } Kind = TK_Error;

  /// A string of length 0 or more whose begin() points to the logical location
  /// of the token in the input.
  StringRef Range;

  /// The value of a block scalar node.
  std::string Value;

  Token() = default;
};

} // end namespace yaml
} // end namespace llvm

/// The container type the Scanner uses for its queue of scanned tokens.
using TokenQueueT = BumpPtrList<Token>;

namespace {

/// This struct is used to track simple keys.
///
/// Simple keys are handled by creating an entry in SimpleKeys for each Token
/// which could legally be the start of a simple key. When peekNext is called,
/// if the Token to be returned is referenced by a SimpleKey, we continue
/// tokenizing until that potential simple key has either been found not to be
/// a simple key (we moved on to the next line or went further than 1024
/// chars), or until we run into a Value, at which point a Key token (and
/// possibly others) is inserted before the SimpleKey's Tok.
180 struct SimpleKey { 181 TokenQueueT::iterator Tok; 182 unsigned Column; 183 unsigned Line; 184 unsigned FlowLevel; 185 bool IsRequired; 186 187 bool operator ==(const SimpleKey &Other) { 188 return Tok == Other.Tok; 189 } 190 }; 191 192 } // end anonymous namespace 193 194 /// The Unicode scalar value of a UTF-8 minimal well-formed code unit 195 /// subsequence and the subsequence's length in code units (uint8_t). 196 /// A length of 0 represents an error. 197 using UTF8Decoded = std::pair<uint32_t, unsigned>; 198 199 static UTF8Decoded decodeUTF8(StringRef Range) { 200 StringRef::iterator Position= Range.begin(); 201 StringRef::iterator End = Range.end(); 202 // 1 byte: [0x00, 0x7f] 203 // Bit pattern: 0xxxxxxx 204 if ((*Position & 0x80) == 0) { 205 return std::make_pair(*Position, 1); 206 } 207 // 2 bytes: [0x80, 0x7ff] 208 // Bit pattern: 110xxxxx 10xxxxxx 209 if (Position + 1 != End && 210 ((*Position & 0xE0) == 0xC0) && 211 ((*(Position + 1) & 0xC0) == 0x80)) { 212 uint32_t codepoint = ((*Position & 0x1F) << 6) | 213 (*(Position + 1) & 0x3F); 214 if (codepoint >= 0x80) 215 return std::make_pair(codepoint, 2); 216 } 217 // 3 bytes: [0x8000, 0xffff] 218 // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx 219 if (Position + 2 != End && 220 ((*Position & 0xF0) == 0xE0) && 221 ((*(Position + 1) & 0xC0) == 0x80) && 222 ((*(Position + 2) & 0xC0) == 0x80)) { 223 uint32_t codepoint = ((*Position & 0x0F) << 12) | 224 ((*(Position + 1) & 0x3F) << 6) | 225 (*(Position + 2) & 0x3F); 226 // Codepoints between 0xD800 and 0xDFFF are invalid, as 227 // they are high / low surrogate halves used by UTF-16. 
228 if (codepoint >= 0x800 && 229 (codepoint < 0xD800 || codepoint > 0xDFFF)) 230 return std::make_pair(codepoint, 3); 231 } 232 // 4 bytes: [0x10000, 0x10FFFF] 233 // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 234 if (Position + 3 != End && 235 ((*Position & 0xF8) == 0xF0) && 236 ((*(Position + 1) & 0xC0) == 0x80) && 237 ((*(Position + 2) & 0xC0) == 0x80) && 238 ((*(Position + 3) & 0xC0) == 0x80)) { 239 uint32_t codepoint = ((*Position & 0x07) << 18) | 240 ((*(Position + 1) & 0x3F) << 12) | 241 ((*(Position + 2) & 0x3F) << 6) | 242 (*(Position + 3) & 0x3F); 243 if (codepoint >= 0x10000 && codepoint <= 0x10FFFF) 244 return std::make_pair(codepoint, 4); 245 } 246 return std::make_pair(0, 0); 247 } 248 249 namespace llvm { 250 namespace yaml { 251 252 /// Scans YAML tokens from a MemoryBuffer. 253 class Scanner { 254 public: 255 Scanner(StringRef Input, SourceMgr &SM, bool ShowColors = true, 256 std::error_code *EC = nullptr); 257 Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors = true, 258 std::error_code *EC = nullptr); 259 260 /// Parse the next token and return it without popping it. 261 Token &peekNext(); 262 263 /// Parse the next token and pop it from the queue. 264 Token getNext(); 265 266 void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message, 267 ArrayRef<SMRange> Ranges = None) { 268 SM.PrintMessage(Loc, Kind, Message, Ranges, /* FixIts= */ None, ShowColors); 269 } 270 271 void setError(const Twine &Message, StringRef::iterator Position) { 272 if (Current >= End) 273 Current = End - 1; 274 275 // propagate the error if possible 276 if (EC) 277 *EC = make_error_code(std::errc::invalid_argument); 278 279 // Don't print out more errors after the first one we encounter. The rest 280 // are just the result of the first, and have no meaning. 
281 if (!Failed) 282 printError(SMLoc::getFromPointer(Current), SourceMgr::DK_Error, Message); 283 Failed = true; 284 } 285 286 void setError(const Twine &Message) { 287 setError(Message, Current); 288 } 289 290 /// Returns true if an error occurred while parsing. 291 bool failed() { 292 return Failed; 293 } 294 295 private: 296 void init(MemoryBufferRef Buffer); 297 298 StringRef currentInput() { 299 return StringRef(Current, End - Current); 300 } 301 302 /// Decode a UTF-8 minimal well-formed code unit subsequence starting 303 /// at \a Position. 304 /// 305 /// If the UTF-8 code units starting at Position do not form a well-formed 306 /// code unit subsequence, then the Unicode scalar value is 0, and the length 307 /// is 0. 308 UTF8Decoded decodeUTF8(StringRef::iterator Position) { 309 return ::decodeUTF8(StringRef(Position, End - Position)); 310 } 311 312 // The following functions are based on the gramar rules in the YAML spec. The 313 // style of the function names it meant to closely match how they are written 314 // in the spec. The number within the [] is the number of the grammar rule in 315 // the spec. 316 // 317 // See 4.2 [Production Naming Conventions] for the meaning of the prefixes. 318 // 319 // c- 320 // A production starting and ending with a special character. 321 // b- 322 // A production matching a single line break. 323 // nb- 324 // A production starting and ending with a non-break character. 325 // s- 326 // A production starting and ending with a white space character. 327 // ns- 328 // A production starting and ending with a non-space character. 329 // l- 330 // A production matching complete line(s). 331 332 /// Skip a single nb-char[27] starting at Position. 333 /// 334 /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE] 335 /// | [0xFF00-0xFFFD] | [0x10000-0x10FFFF] 336 /// 337 /// @returns The code unit after the nb-char, or Position if it's not an 338 /// nb-char. 
339 StringRef::iterator skip_nb_char(StringRef::iterator Position); 340 341 /// Skip a single b-break[28] starting at Position. 342 /// 343 /// A b-break is 0xD 0xA | 0xD | 0xA 344 /// 345 /// @returns The code unit after the b-break, or Position if it's not a 346 /// b-break. 347 StringRef::iterator skip_b_break(StringRef::iterator Position); 348 349 /// Skip a single s-space[31] starting at Position. 350 /// 351 /// An s-space is 0x20 352 /// 353 /// @returns The code unit after the s-space, or Position if it's not a 354 /// s-space. 355 StringRef::iterator skip_s_space(StringRef::iterator Position); 356 357 /// Skip a single s-white[33] starting at Position. 358 /// 359 /// A s-white is 0x20 | 0x9 360 /// 361 /// @returns The code unit after the s-white, or Position if it's not a 362 /// s-white. 363 StringRef::iterator skip_s_white(StringRef::iterator Position); 364 365 /// Skip a single ns-char[34] starting at Position. 366 /// 367 /// A ns-char is nb-char - s-white 368 /// 369 /// @returns The code unit after the ns-char, or Position if it's not a 370 /// ns-char. 371 StringRef::iterator skip_ns_char(StringRef::iterator Position); 372 373 using SkipWhileFunc = StringRef::iterator (Scanner::*)(StringRef::iterator); 374 375 /// Skip minimal well-formed code unit subsequences until Func 376 /// returns its input. 377 /// 378 /// @returns The code unit after the last minimal well-formed code unit 379 /// subsequence that Func accepted. 380 StringRef::iterator skip_while( SkipWhileFunc Func 381 , StringRef::iterator Position); 382 383 /// Skip minimal well-formed code unit subsequences until Func returns its 384 /// input. 385 void advanceWhile(SkipWhileFunc Func); 386 387 /// Scan ns-uri-char[39]s starting at Cur. 388 /// 389 /// This updates Cur and Column while scanning. 390 void scan_ns_uri_char(); 391 392 /// Consume a minimal well-formed code unit subsequence starting at 393 /// \a Cur. 
Return false if it is not the same Unicode scalar value as 394 /// \a Expected. This updates \a Column. 395 bool consume(uint32_t Expected); 396 397 /// Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column. 398 void skip(uint32_t Distance); 399 400 /// Return true if the minimal well-formed code unit subsequence at 401 /// Pos is whitespace or a new line 402 bool isBlankOrBreak(StringRef::iterator Position); 403 404 /// Consume a single b-break[28] if it's present at the current position. 405 /// 406 /// Return false if the code unit at the current position isn't a line break. 407 bool consumeLineBreakIfPresent(); 408 409 /// If IsSimpleKeyAllowed, create and push_back a new SimpleKey. 410 void saveSimpleKeyCandidate( TokenQueueT::iterator Tok 411 , unsigned AtColumn 412 , bool IsRequired); 413 414 /// Remove simple keys that can no longer be valid simple keys. 415 /// 416 /// Invalid simple keys are not on the current line or are further than 1024 417 /// columns back. 418 void removeStaleSimpleKeyCandidates(); 419 420 /// Remove all simple keys on FlowLevel \a Level. 421 void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level); 422 423 /// Unroll indentation in \a Indents back to \a Col. Creates BlockEnd 424 /// tokens if needed. 425 bool unrollIndent(int ToColumn); 426 427 /// Increase indent to \a Col. Creates \a Kind token at \a InsertPoint 428 /// if needed. 429 bool rollIndent( int ToColumn 430 , Token::TokenKind Kind 431 , TokenQueueT::iterator InsertPoint); 432 433 /// Skip a single-line comment when the comment starts at the current 434 /// position of the scanner. 435 void skipComment(); 436 437 /// Skip whitespace and comments until the start of the next token. 438 void scanToNextToken(); 439 440 /// Must be the first token generated. 441 bool scanStreamStart(); 442 443 /// Generate tokens needed to close out the stream. 444 bool scanStreamEnd(); 445 446 /// Scan a %BLAH directive. 447 bool scanDirective(); 448 449 /// Scan a ... or ---. 
450 bool scanDocumentIndicator(bool IsStart); 451 452 /// Scan a [ or { and generate the proper flow collection start token. 453 bool scanFlowCollectionStart(bool IsSequence); 454 455 /// Scan a ] or } and generate the proper flow collection end token. 456 bool scanFlowCollectionEnd(bool IsSequence); 457 458 /// Scan the , that separates entries in a flow collection. 459 bool scanFlowEntry(); 460 461 /// Scan the - that starts block sequence entries. 462 bool scanBlockEntry(); 463 464 /// Scan an explicit ? indicating a key. 465 bool scanKey(); 466 467 /// Scan an explicit : indicating a value. 468 bool scanValue(); 469 470 /// Scan a quoted scalar. 471 bool scanFlowScalar(bool IsDoubleQuoted); 472 473 /// Scan an unquoted scalar. 474 bool scanPlainScalar(); 475 476 /// Scan an Alias or Anchor starting with * or &. 477 bool scanAliasOrAnchor(bool IsAlias); 478 479 /// Scan a block scalar starting with | or >. 480 bool scanBlockScalar(bool IsLiteral); 481 482 /// Scan a chomping indicator in a block scalar header. 483 char scanBlockChompingIndicator(); 484 485 /// Scan an indentation indicator in a block scalar header. 486 unsigned scanBlockIndentationIndicator(); 487 488 /// Scan a block scalar header. 489 /// 490 /// Return false if an error occurred. 491 bool scanBlockScalarHeader(char &ChompingIndicator, unsigned &IndentIndicator, 492 bool &IsDone); 493 494 /// Look for the indentation level of a block scalar. 495 /// 496 /// Return false if an error occurred. 497 bool findBlockScalarIndent(unsigned &BlockIndent, unsigned BlockExitIndent, 498 unsigned &LineBreaks, bool &IsDone); 499 500 /// Scan the indentation of a text line in a block scalar. 501 /// 502 /// Return false if an error occurred. 503 bool scanBlockScalarIndent(unsigned BlockIndent, unsigned BlockExitIndent, 504 bool &IsDone); 505 506 /// Scan a tag of the form !stuff. 507 bool scanTag(); 508 509 /// Dispatch to the next scanning function based on \a *Cur. 
510 bool fetchMoreTokens(); 511 512 /// The SourceMgr used for diagnostics and buffer management. 513 SourceMgr &SM; 514 515 /// The original input. 516 MemoryBufferRef InputBuffer; 517 518 /// The current position of the scanner. 519 StringRef::iterator Current; 520 521 /// The end of the input (one past the last character). 522 StringRef::iterator End; 523 524 /// Current YAML indentation level in spaces. 525 int Indent; 526 527 /// Current column number in Unicode code points. 528 unsigned Column; 529 530 /// Current line number. 531 unsigned Line; 532 533 /// How deep we are in flow style containers. 0 Means at block level. 534 unsigned FlowLevel; 535 536 /// Are we at the start of the stream? 537 bool IsStartOfStream; 538 539 /// Can the next token be the start of a simple key? 540 bool IsSimpleKeyAllowed; 541 542 /// True if an error has occurred. 543 bool Failed; 544 545 /// Should colors be used when printing out the diagnostic messages? 546 bool ShowColors; 547 548 /// Queue of tokens. This is required to queue up tokens while looking 549 /// for the end of a simple key. And for cases where a single character 550 /// can produce multiple tokens (e.g. BlockEnd). 551 TokenQueueT TokenQueue; 552 553 /// Indentation levels. 554 SmallVector<int, 4> Indents; 555 556 /// Potential simple keys. 557 SmallVector<SimpleKey, 4> SimpleKeys; 558 559 std::error_code *EC; 560 }; 561 562 } // end namespace yaml 563 } // end namespace llvm 564 565 /// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result. 
566 static void encodeUTF8( uint32_t UnicodeScalarValue 567 , SmallVectorImpl<char> &Result) { 568 if (UnicodeScalarValue <= 0x7F) { 569 Result.push_back(UnicodeScalarValue & 0x7F); 570 } else if (UnicodeScalarValue <= 0x7FF) { 571 uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6); 572 uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F); 573 Result.push_back(FirstByte); 574 Result.push_back(SecondByte); 575 } else if (UnicodeScalarValue <= 0xFFFF) { 576 uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12); 577 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 578 uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F); 579 Result.push_back(FirstByte); 580 Result.push_back(SecondByte); 581 Result.push_back(ThirdByte); 582 } else if (UnicodeScalarValue <= 0x10FFFF) { 583 uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18); 584 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12); 585 uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 586 uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F); 587 Result.push_back(FirstByte); 588 Result.push_back(SecondByte); 589 Result.push_back(ThirdByte); 590 Result.push_back(FourthByte); 591 } 592 } 593 594 bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) { 595 SourceMgr SM; 596 Scanner scanner(Input, SM); 597 while (true) { 598 Token T = scanner.getNext(); 599 switch (T.Kind) { 600 case Token::TK_StreamStart: 601 OS << "Stream-Start: "; 602 break; 603 case Token::TK_StreamEnd: 604 OS << "Stream-End: "; 605 break; 606 case Token::TK_VersionDirective: 607 OS << "Version-Directive: "; 608 break; 609 case Token::TK_TagDirective: 610 OS << "Tag-Directive: "; 611 break; 612 case Token::TK_DocumentStart: 613 OS << "Document-Start: "; 614 break; 615 case Token::TK_DocumentEnd: 616 OS << "Document-End: "; 617 break; 618 case Token::TK_BlockEntry: 619 OS << "Block-Entry: "; 620 break; 621 case Token::TK_BlockEnd: 622 OS << "Block-End: "; 623 break; 
624 case Token::TK_BlockSequenceStart: 625 OS << "Block-Sequence-Start: "; 626 break; 627 case Token::TK_BlockMappingStart: 628 OS << "Block-Mapping-Start: "; 629 break; 630 case Token::TK_FlowEntry: 631 OS << "Flow-Entry: "; 632 break; 633 case Token::TK_FlowSequenceStart: 634 OS << "Flow-Sequence-Start: "; 635 break; 636 case Token::TK_FlowSequenceEnd: 637 OS << "Flow-Sequence-End: "; 638 break; 639 case Token::TK_FlowMappingStart: 640 OS << "Flow-Mapping-Start: "; 641 break; 642 case Token::TK_FlowMappingEnd: 643 OS << "Flow-Mapping-End: "; 644 break; 645 case Token::TK_Key: 646 OS << "Key: "; 647 break; 648 case Token::TK_Value: 649 OS << "Value: "; 650 break; 651 case Token::TK_Scalar: 652 OS << "Scalar: "; 653 break; 654 case Token::TK_BlockScalar: 655 OS << "Block Scalar: "; 656 break; 657 case Token::TK_Alias: 658 OS << "Alias: "; 659 break; 660 case Token::TK_Anchor: 661 OS << "Anchor: "; 662 break; 663 case Token::TK_Tag: 664 OS << "Tag: "; 665 break; 666 case Token::TK_Error: 667 break; 668 } 669 OS << T.Range << "\n"; 670 if (T.Kind == Token::TK_StreamEnd) 671 break; 672 else if (T.Kind == Token::TK_Error) 673 return false; 674 } 675 return true; 676 } 677 678 bool yaml::scanTokens(StringRef Input) { 679 SourceMgr SM; 680 Scanner scanner(Input, SM); 681 while (true) { 682 Token T = scanner.getNext(); 683 if (T.Kind == Token::TK_StreamEnd) 684 break; 685 else if (T.Kind == Token::TK_Error) 686 return false; 687 } 688 return true; 689 } 690 691 std::string yaml::escape(StringRef Input, bool EscapePrintable) { 692 std::string EscapedInput; 693 for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) { 694 if (*i == '\\') 695 EscapedInput += "\\\\"; 696 else if (*i == '"') 697 EscapedInput += "\\\""; 698 else if (*i == 0) 699 EscapedInput += "\\0"; 700 else if (*i == 0x07) 701 EscapedInput += "\\a"; 702 else if (*i == 0x08) 703 EscapedInput += "\\b"; 704 else if (*i == 0x09) 705 EscapedInput += "\\t"; 706 else if (*i == 0x0A) 707 
EscapedInput += "\\n"; 708 else if (*i == 0x0B) 709 EscapedInput += "\\v"; 710 else if (*i == 0x0C) 711 EscapedInput += "\\f"; 712 else if (*i == 0x0D) 713 EscapedInput += "\\r"; 714 else if (*i == 0x1B) 715 EscapedInput += "\\e"; 716 else if ((unsigned char)*i < 0x20) { // Control characters not handled above. 717 std::string HexStr = utohexstr(*i); 718 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 719 } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence. 720 UTF8Decoded UnicodeScalarValue 721 = decodeUTF8(StringRef(i, Input.end() - i)); 722 if (UnicodeScalarValue.second == 0) { 723 // Found invalid char. 724 SmallString<4> Val; 725 encodeUTF8(0xFFFD, Val); 726 EscapedInput.insert(EscapedInput.end(), Val.begin(), Val.end()); 727 // FIXME: Error reporting. 728 return EscapedInput; 729 } 730 if (UnicodeScalarValue.first == 0x85) 731 EscapedInput += "\\N"; 732 else if (UnicodeScalarValue.first == 0xA0) 733 EscapedInput += "\\_"; 734 else if (UnicodeScalarValue.first == 0x2028) 735 EscapedInput += "\\L"; 736 else if (UnicodeScalarValue.first == 0x2029) 737 EscapedInput += "\\P"; 738 else if (!EscapePrintable && 739 sys::unicode::isPrintable(UnicodeScalarValue.first)) 740 EscapedInput += StringRef(i, UnicodeScalarValue.second); 741 else { 742 std::string HexStr = utohexstr(UnicodeScalarValue.first); 743 if (HexStr.size() <= 2) 744 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 745 else if (HexStr.size() <= 4) 746 EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr; 747 else if (HexStr.size() <= 8) 748 EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr; 749 } 750 i += UnicodeScalarValue.second - 1; 751 } else 752 EscapedInput.push_back(*i); 753 } 754 return EscapedInput; 755 } 756 757 Scanner::Scanner(StringRef Input, SourceMgr &sm, bool ShowColors, 758 std::error_code *EC) 759 : SM(sm), ShowColors(ShowColors), EC(EC) { 760 init(MemoryBufferRef(Input, "YAML")); 761 } 762 763 
Scanner::Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors, 764 std::error_code *EC) 765 : SM(SM_), ShowColors(ShowColors), EC(EC) { 766 init(Buffer); 767 } 768 769 void Scanner::init(MemoryBufferRef Buffer) { 770 InputBuffer = Buffer; 771 Current = InputBuffer.getBufferStart(); 772 End = InputBuffer.getBufferEnd(); 773 Indent = -1; 774 Column = 0; 775 Line = 0; 776 FlowLevel = 0; 777 IsStartOfStream = true; 778 IsSimpleKeyAllowed = true; 779 Failed = false; 780 std::unique_ptr<MemoryBuffer> InputBufferOwner = 781 MemoryBuffer::getMemBuffer(Buffer); 782 SM.AddNewSourceBuffer(std::move(InputBufferOwner), SMLoc()); 783 } 784 785 Token &Scanner::peekNext() { 786 // If the current token is a possible simple key, keep parsing until we 787 // can confirm. 788 bool NeedMore = false; 789 while (true) { 790 if (TokenQueue.empty() || NeedMore) { 791 if (!fetchMoreTokens()) { 792 TokenQueue.clear(); 793 TokenQueue.push_back(Token()); 794 return TokenQueue.front(); 795 } 796 } 797 assert(!TokenQueue.empty() && 798 "fetchMoreTokens lied about getting tokens!"); 799 800 removeStaleSimpleKeyCandidates(); 801 SimpleKey SK; 802 SK.Tok = TokenQueue.begin(); 803 if (!is_contained(SimpleKeys, SK)) 804 break; 805 else 806 NeedMore = true; 807 } 808 return TokenQueue.front(); 809 } 810 811 Token Scanner::getNext() { 812 Token Ret = peekNext(); 813 // TokenQueue can be empty if there was an error getting the next token. 814 if (!TokenQueue.empty()) 815 TokenQueue.pop_front(); 816 817 // There cannot be any referenced Token's if the TokenQueue is empty. So do a 818 // quick deallocation of them all. 819 if (TokenQueue.empty()) 820 TokenQueue.resetAlloc(); 821 822 return Ret; 823 } 824 825 StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) { 826 if (Position == End) 827 return Position; 828 // Check 7 bit c-printable - b-char. 
829 if ( *Position == 0x09 830 || (*Position >= 0x20 && *Position <= 0x7E)) 831 return Position + 1; 832 833 // Check for valid UTF-8. 834 if (uint8_t(*Position) & 0x80) { 835 UTF8Decoded u8d = decodeUTF8(Position); 836 if ( u8d.second != 0 837 && u8d.first != 0xFEFF 838 && ( u8d.first == 0x85 839 || ( u8d.first >= 0xA0 840 && u8d.first <= 0xD7FF) 841 || ( u8d.first >= 0xE000 842 && u8d.first <= 0xFFFD) 843 || ( u8d.first >= 0x10000 844 && u8d.first <= 0x10FFFF))) 845 return Position + u8d.second; 846 } 847 return Position; 848 } 849 850 StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) { 851 if (Position == End) 852 return Position; 853 if (*Position == 0x0D) { 854 if (Position + 1 != End && *(Position + 1) == 0x0A) 855 return Position + 2; 856 return Position + 1; 857 } 858 859 if (*Position == 0x0A) 860 return Position + 1; 861 return Position; 862 } 863 864 StringRef::iterator Scanner::skip_s_space(StringRef::iterator Position) { 865 if (Position == End) 866 return Position; 867 if (*Position == ' ') 868 return Position + 1; 869 return Position; 870 } 871 872 StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) { 873 if (Position == End) 874 return Position; 875 if (*Position == ' ' || *Position == '\t') 876 return Position + 1; 877 return Position; 878 } 879 880 StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) { 881 if (Position == End) 882 return Position; 883 if (*Position == ' ' || *Position == '\t') 884 return Position; 885 return skip_nb_char(Position); 886 } 887 888 StringRef::iterator Scanner::skip_while( SkipWhileFunc Func 889 , StringRef::iterator Position) { 890 while (true) { 891 StringRef::iterator i = (this->*Func)(Position); 892 if (i == Position) 893 break; 894 Position = i; 895 } 896 return Position; 897 } 898 899 void Scanner::advanceWhile(SkipWhileFunc Func) { 900 auto Final = skip_while(Func, Current); 901 Column += Final - Current; 902 Current = Final; 903 } 904 905 static bool 
is_ns_hex_digit(const char C) { 906 return (C >= '0' && C <= '9') 907 || (C >= 'a' && C <= 'z') 908 || (C >= 'A' && C <= 'Z'); 909 } 910 911 static bool is_ns_word_char(const char C) { 912 return C == '-' 913 || (C >= 'a' && C <= 'z') 914 || (C >= 'A' && C <= 'Z'); 915 } 916 917 void Scanner::scan_ns_uri_char() { 918 while (true) { 919 if (Current == End) 920 break; 921 if (( *Current == '%' 922 && Current + 2 < End 923 && is_ns_hex_digit(*(Current + 1)) 924 && is_ns_hex_digit(*(Current + 2))) 925 || is_ns_word_char(*Current) 926 || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]") 927 != StringRef::npos) { 928 ++Current; 929 ++Column; 930 } else 931 break; 932 } 933 } 934 935 bool Scanner::consume(uint32_t Expected) { 936 if (Expected >= 0x80) 937 report_fatal_error("Not dealing with this yet"); 938 if (Current == End) 939 return false; 940 if (uint8_t(*Current) >= 0x80) 941 report_fatal_error("Not dealing with this yet"); 942 if (uint8_t(*Current) == Expected) { 943 ++Current; 944 ++Column; 945 return true; 946 } 947 return false; 948 } 949 950 void Scanner::skip(uint32_t Distance) { 951 Current += Distance; 952 Column += Distance; 953 assert(Current <= End && "Skipped past the end"); 954 } 955 956 bool Scanner::isBlankOrBreak(StringRef::iterator Position) { 957 if (Position == End) 958 return false; 959 return *Position == ' ' || *Position == '\t' || *Position == '\r' || 960 *Position == '\n'; 961 } 962 963 bool Scanner::consumeLineBreakIfPresent() { 964 auto Next = skip_b_break(Current); 965 if (Next == Current) 966 return false; 967 Column = 0; 968 ++Line; 969 Current = Next; 970 return true; 971 } 972 973 void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok 974 , unsigned AtColumn 975 , bool IsRequired) { 976 if (IsSimpleKeyAllowed) { 977 SimpleKey SK; 978 SK.Tok = Tok; 979 SK.Line = Line; 980 SK.Column = AtColumn; 981 SK.IsRequired = IsRequired; 982 SK.FlowLevel = FlowLevel; 983 SimpleKeys.push_back(SK); 984 } 985 } 986 987 void 
Scanner::removeStaleSimpleKeyCandidates() { 988 for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin(); 989 i != SimpleKeys.end();) { 990 if (i->Line != Line || i->Column + 1024 < Column) { 991 if (i->IsRequired) 992 setError( "Could not find expected : for simple key" 993 , i->Tok->Range.begin()); 994 i = SimpleKeys.erase(i); 995 } else 996 ++i; 997 } 998 } 999 1000 void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) { 1001 if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level) 1002 SimpleKeys.pop_back(); 1003 } 1004 1005 bool Scanner::unrollIndent(int ToColumn) { 1006 Token T; 1007 // Indentation is ignored in flow. 1008 if (FlowLevel != 0) 1009 return true; 1010 1011 while (Indent > ToColumn) { 1012 T.Kind = Token::TK_BlockEnd; 1013 T.Range = StringRef(Current, 1); 1014 TokenQueue.push_back(T); 1015 Indent = Indents.pop_back_val(); 1016 } 1017 1018 return true; 1019 } 1020 1021 bool Scanner::rollIndent( int ToColumn 1022 , Token::TokenKind Kind 1023 , TokenQueueT::iterator InsertPoint) { 1024 if (FlowLevel) 1025 return true; 1026 if (Indent < ToColumn) { 1027 Indents.push_back(Indent); 1028 Indent = ToColumn; 1029 1030 Token T; 1031 T.Kind = Kind; 1032 T.Range = StringRef(Current, 0); 1033 TokenQueue.insert(InsertPoint, T); 1034 } 1035 return true; 1036 } 1037 1038 void Scanner::skipComment() { 1039 if (*Current != '#') 1040 return; 1041 while (true) { 1042 // This may skip more than one byte, thus Column is only incremented 1043 // for code points. 1044 StringRef::iterator I = skip_nb_char(Current); 1045 if (I == Current) 1046 break; 1047 Current = I; 1048 ++Column; 1049 } 1050 } 1051 1052 void Scanner::scanToNextToken() { 1053 while (true) { 1054 while (*Current == ' ' || *Current == '\t') { 1055 skip(1); 1056 } 1057 1058 skipComment(); 1059 1060 // Skip EOL. 
1061 StringRef::iterator i = skip_b_break(Current); 1062 if (i == Current) 1063 break; 1064 Current = i; 1065 ++Line; 1066 Column = 0; 1067 // New lines may start a simple key. 1068 if (!FlowLevel) 1069 IsSimpleKeyAllowed = true; 1070 } 1071 } 1072 1073 bool Scanner::scanStreamStart() { 1074 IsStartOfStream = false; 1075 1076 EncodingInfo EI = getUnicodeEncoding(currentInput()); 1077 1078 Token T; 1079 T.Kind = Token::TK_StreamStart; 1080 T.Range = StringRef(Current, EI.second); 1081 TokenQueue.push_back(T); 1082 Current += EI.second; 1083 return true; 1084 } 1085 1086 bool Scanner::scanStreamEnd() { 1087 // Force an ending new line if one isn't present. 1088 if (Column != 0) { 1089 Column = 0; 1090 ++Line; 1091 } 1092 1093 unrollIndent(-1); 1094 SimpleKeys.clear(); 1095 IsSimpleKeyAllowed = false; 1096 1097 Token T; 1098 T.Kind = Token::TK_StreamEnd; 1099 T.Range = StringRef(Current, 0); 1100 TokenQueue.push_back(T); 1101 return true; 1102 } 1103 1104 bool Scanner::scanDirective() { 1105 // Reset the indentation level. 
1106 unrollIndent(-1); 1107 SimpleKeys.clear(); 1108 IsSimpleKeyAllowed = false; 1109 1110 StringRef::iterator Start = Current; 1111 consume('%'); 1112 StringRef::iterator NameStart = Current; 1113 Current = skip_while(&Scanner::skip_ns_char, Current); 1114 StringRef Name(NameStart, Current - NameStart); 1115 Current = skip_while(&Scanner::skip_s_white, Current); 1116 1117 Token T; 1118 if (Name == "YAML") { 1119 Current = skip_while(&Scanner::skip_ns_char, Current); 1120 T.Kind = Token::TK_VersionDirective; 1121 T.Range = StringRef(Start, Current - Start); 1122 TokenQueue.push_back(T); 1123 return true; 1124 } else if(Name == "TAG") { 1125 Current = skip_while(&Scanner::skip_ns_char, Current); 1126 Current = skip_while(&Scanner::skip_s_white, Current); 1127 Current = skip_while(&Scanner::skip_ns_char, Current); 1128 T.Kind = Token::TK_TagDirective; 1129 T.Range = StringRef(Start, Current - Start); 1130 TokenQueue.push_back(T); 1131 return true; 1132 } 1133 return false; 1134 } 1135 1136 bool Scanner::scanDocumentIndicator(bool IsStart) { 1137 unrollIndent(-1); 1138 SimpleKeys.clear(); 1139 IsSimpleKeyAllowed = false; 1140 1141 Token T; 1142 T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd; 1143 T.Range = StringRef(Current, 3); 1144 skip(3); 1145 TokenQueue.push_back(T); 1146 return true; 1147 } 1148 1149 bool Scanner::scanFlowCollectionStart(bool IsSequence) { 1150 Token T; 1151 T.Kind = IsSequence ? Token::TK_FlowSequenceStart 1152 : Token::TK_FlowMappingStart; 1153 T.Range = StringRef(Current, 1); 1154 skip(1); 1155 TokenQueue.push_back(T); 1156 1157 // [ and { may begin a simple key. 1158 saveSimpleKeyCandidate(--TokenQueue.end(), Column - 1, false); 1159 1160 // And may also be followed by a simple key. 
1161 IsSimpleKeyAllowed = true; 1162 ++FlowLevel; 1163 return true; 1164 } 1165 1166 bool Scanner::scanFlowCollectionEnd(bool IsSequence) { 1167 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1168 IsSimpleKeyAllowed = false; 1169 Token T; 1170 T.Kind = IsSequence ? Token::TK_FlowSequenceEnd 1171 : Token::TK_FlowMappingEnd; 1172 T.Range = StringRef(Current, 1); 1173 skip(1); 1174 TokenQueue.push_back(T); 1175 if (FlowLevel) 1176 --FlowLevel; 1177 return true; 1178 } 1179 1180 bool Scanner::scanFlowEntry() { 1181 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1182 IsSimpleKeyAllowed = true; 1183 Token T; 1184 T.Kind = Token::TK_FlowEntry; 1185 T.Range = StringRef(Current, 1); 1186 skip(1); 1187 TokenQueue.push_back(T); 1188 return true; 1189 } 1190 1191 bool Scanner::scanBlockEntry() { 1192 rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end()); 1193 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1194 IsSimpleKeyAllowed = true; 1195 Token T; 1196 T.Kind = Token::TK_BlockEntry; 1197 T.Range = StringRef(Current, 1); 1198 skip(1); 1199 TokenQueue.push_back(T); 1200 return true; 1201 } 1202 1203 bool Scanner::scanKey() { 1204 if (!FlowLevel) 1205 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1206 1207 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1208 IsSimpleKeyAllowed = !FlowLevel; 1209 1210 Token T; 1211 T.Kind = Token::TK_Key; 1212 T.Range = StringRef(Current, 1); 1213 skip(1); 1214 TokenQueue.push_back(T); 1215 return true; 1216 } 1217 1218 bool Scanner::scanValue() { 1219 // If the previous token could have been a simple key, insert the key token 1220 // into the token queue. 
  if (!SimpleKeys.empty()) {
    // Take the most recently saved candidate; it becomes the key of this
    // value.
    SimpleKey SK = SimpleKeys.pop_back_val();
    Token T;
    T.Kind = Token::TK_Key;
    T.Range = SK.Tok->Range;
    // Walk the queue to confirm that the candidate's token is still in it;
    // the TK_Key token is inserted directly in front of that token.
    TokenQueueT::iterator i, e;
    for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) {
      if (i == SK.Tok)
        break;
    }
    assert(i != e && "SimpleKey not in token queue!");
    i = TokenQueue.insert(i, T);

    // We may also need to add a Block-Mapping-Start token.
    rollIndent(SK.Column, Token::TK_BlockMappingStart, i);

    IsSimpleKeyAllowed = false;
  } else {
    // No pending simple key: outside of flow collections the value indicator
    // itself may open a block mapping.
    if (!FlowLevel)
      rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end());
    IsSimpleKeyAllowed = !FlowLevel;
  }

  // Queue the TK_Value token for the single character at Current.
  Token T;
  T.Kind = Token::TK_Value;
  T.Range = StringRef(Current, 1);
  skip(1);
  TokenQueue.push_back(T);
  return true;
}

// Forbidding inlining improves performance by roughly 20%.
// FIXME: Remove once llvm optimizes this to the faster version without hints.
LLVM_ATTRIBUTE_NOINLINE static bool
wasEscaped(StringRef::iterator First, StringRef::iterator Position);

// Returns whether a character at 'Position' was escaped with a leading '\'.
// 'First' specifies the position of the first character in the string.
// 'Position' must lie after 'First' (asserted below).
static bool wasEscaped(StringRef::iterator First,
                       StringRef::iterator Position) {
  assert(Position - 1 >= First);
  StringRef::iterator I = Position - 1;
  // We calculate the number of consecutive '\'s before the current position
  // by iterating backwards through our string.
  while (I >= First && *I == '\\') --I;
  // (Position - 1 - I) now contains the number of '\'s before the current
  // position. If it is odd, the character at 'Position' was escaped.
1268 return (Position - 1 - I) % 2 == 1; 1269 } 1270 1271 bool Scanner::scanFlowScalar(bool IsDoubleQuoted) { 1272 StringRef::iterator Start = Current; 1273 unsigned ColStart = Column; 1274 if (IsDoubleQuoted) { 1275 do { 1276 ++Current; 1277 while (Current != End && *Current != '"') 1278 ++Current; 1279 // Repeat until the previous character was not a '\' or was an escaped 1280 // backslash. 1281 } while ( Current != End 1282 && *(Current - 1) == '\\' 1283 && wasEscaped(Start + 1, Current)); 1284 } else { 1285 skip(1); 1286 while (true) { 1287 // Skip a ' followed by another '. 1288 if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') { 1289 skip(2); 1290 continue; 1291 } else if (*Current == '\'') 1292 break; 1293 StringRef::iterator i = skip_nb_char(Current); 1294 if (i == Current) { 1295 i = skip_b_break(Current); 1296 if (i == Current) 1297 break; 1298 Current = i; 1299 Column = 0; 1300 ++Line; 1301 } else { 1302 if (i == End) 1303 break; 1304 Current = i; 1305 ++Column; 1306 } 1307 } 1308 } 1309 1310 if (Current == End) { 1311 setError("Expected quote at end of scalar", Current); 1312 return false; 1313 } 1314 1315 skip(1); // Skip ending quote. 
1316 Token T; 1317 T.Kind = Token::TK_Scalar; 1318 T.Range = StringRef(Start, Current - Start); 1319 TokenQueue.push_back(T); 1320 1321 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1322 1323 IsSimpleKeyAllowed = false; 1324 1325 return true; 1326 } 1327 1328 bool Scanner::scanPlainScalar() { 1329 StringRef::iterator Start = Current; 1330 unsigned ColStart = Column; 1331 unsigned LeadingBlanks = 0; 1332 assert(Indent >= -1 && "Indent must be >= -1 !"); 1333 unsigned indent = static_cast<unsigned>(Indent + 1); 1334 while (true) { 1335 if (*Current == '#') 1336 break; 1337 1338 while (!isBlankOrBreak(Current)) { 1339 if ( FlowLevel && *Current == ':' 1340 && !(isBlankOrBreak(Current + 1) || *(Current + 1) == ',')) { 1341 setError("Found unexpected ':' while scanning a plain scalar", Current); 1342 return false; 1343 } 1344 1345 // Check for the end of the plain scalar. 1346 if ( (*Current == ':' && isBlankOrBreak(Current + 1)) 1347 || ( FlowLevel 1348 && (StringRef(Current, 1).find_first_of(",:?[]{}") 1349 != StringRef::npos))) 1350 break; 1351 1352 StringRef::iterator i = skip_nb_char(Current); 1353 if (i == Current) 1354 break; 1355 Current = i; 1356 ++Column; 1357 } 1358 1359 // Are we at the end? 1360 if (!isBlankOrBreak(Current)) 1361 break; 1362 1363 // Eat blanks. 
1364 StringRef::iterator Tmp = Current; 1365 while (isBlankOrBreak(Tmp)) { 1366 StringRef::iterator i = skip_s_white(Tmp); 1367 if (i != Tmp) { 1368 if (LeadingBlanks && (Column < indent) && *Tmp == '\t') { 1369 setError("Found invalid tab character in indentation", Tmp); 1370 return false; 1371 } 1372 Tmp = i; 1373 ++Column; 1374 } else { 1375 i = skip_b_break(Tmp); 1376 if (!LeadingBlanks) 1377 LeadingBlanks = 1; 1378 Tmp = i; 1379 Column = 0; 1380 ++Line; 1381 } 1382 } 1383 1384 if (!FlowLevel && Column < indent) 1385 break; 1386 1387 Current = Tmp; 1388 } 1389 if (Start == Current) { 1390 setError("Got empty plain scalar", Start); 1391 return false; 1392 } 1393 Token T; 1394 T.Kind = Token::TK_Scalar; 1395 T.Range = StringRef(Start, Current - Start); 1396 TokenQueue.push_back(T); 1397 1398 // Plain scalars can be simple keys. 1399 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1400 1401 IsSimpleKeyAllowed = false; 1402 1403 return true; 1404 } 1405 1406 bool Scanner::scanAliasOrAnchor(bool IsAlias) { 1407 StringRef::iterator Start = Current; 1408 unsigned ColStart = Column; 1409 skip(1); 1410 while(true) { 1411 if ( *Current == '[' || *Current == ']' 1412 || *Current == '{' || *Current == '}' 1413 || *Current == ',' 1414 || *Current == ':') 1415 break; 1416 StringRef::iterator i = skip_ns_char(Current); 1417 if (i == Current) 1418 break; 1419 Current = i; 1420 ++Column; 1421 } 1422 1423 if (Start == Current) { 1424 setError("Got empty alias or anchor", Start); 1425 return false; 1426 } 1427 1428 Token T; 1429 T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor; 1430 T.Range = StringRef(Start, Current - Start); 1431 TokenQueue.push_back(T); 1432 1433 // Alias and anchors can be simple keys. 
1434 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1435 1436 IsSimpleKeyAllowed = false; 1437 1438 return true; 1439 } 1440 1441 char Scanner::scanBlockChompingIndicator() { 1442 char Indicator = ' '; 1443 if (Current != End && (*Current == '+' || *Current == '-')) { 1444 Indicator = *Current; 1445 skip(1); 1446 } 1447 return Indicator; 1448 } 1449 1450 /// Get the number of line breaks after chomping. 1451 /// 1452 /// Return the number of trailing line breaks to emit, depending on 1453 /// \p ChompingIndicator. 1454 static unsigned getChompedLineBreaks(char ChompingIndicator, 1455 unsigned LineBreaks, StringRef Str) { 1456 if (ChompingIndicator == '-') // Strip all line breaks. 1457 return 0; 1458 if (ChompingIndicator == '+') // Keep all line breaks. 1459 return LineBreaks; 1460 // Clip trailing lines. 1461 return Str.empty() ? 0 : 1; 1462 } 1463 1464 unsigned Scanner::scanBlockIndentationIndicator() { 1465 unsigned Indent = 0; 1466 if (Current != End && (*Current >= '1' && *Current <= '9')) { 1467 Indent = unsigned(*Current - '0'); 1468 skip(1); 1469 } 1470 return Indent; 1471 } 1472 1473 bool Scanner::scanBlockScalarHeader(char &ChompingIndicator, 1474 unsigned &IndentIndicator, bool &IsDone) { 1475 auto Start = Current; 1476 1477 ChompingIndicator = scanBlockChompingIndicator(); 1478 IndentIndicator = scanBlockIndentationIndicator(); 1479 // Check for the chomping indicator once again. 1480 if (ChompingIndicator == ' ') 1481 ChompingIndicator = scanBlockChompingIndicator(); 1482 Current = skip_while(&Scanner::skip_s_white, Current); 1483 skipComment(); 1484 1485 if (Current == End) { // EOF, we have an empty scalar. 
    // The token covers only the header; no Value is set on it.
    Token T;
    T.Kind = Token::TK_BlockScalar;
    T.Range = StringRef(Start, Current - Start);
    TokenQueue.push_back(T);
    IsDone = true;
    return true;
  }

  // Anything other than a line break after the header is an error.
  if (!consumeLineBreakIfPresent()) {
    setError("Expected a line break after block scalar header", Current);
    return false;
  }
  return true;
}

/// Determines the indentation of a block scalar that has no explicit
/// indentation indicator by looking for its first non-empty line.
///
/// \param BlockIndent Set to the column of the first non-empty line when one
///        is found that is indented deeper than \p BlockExitIndent.
/// \param BlockExitIndent A non-empty line at or below this column ends the
///        block scalar.
/// \param LineBreaks Incremented for every line break consumed on the way.
/// \param IsDone Set to true when the block scalar ends before any content
///        line is found (less indented line, end of input, or no line break
///        to consume).
/// \returns false, after reporting an error, if an all-space line preceding
///          the first content line is longer than the discovered indentation;
///          true otherwise.
bool Scanner::findBlockScalarIndent(unsigned &BlockIndent,
                                    unsigned BlockExitIndent,
                                    unsigned &LineBreaks, bool &IsDone) {
  unsigned MaxAllSpaceLineCharacters = 0;
  // Only read after it has been assigned below, i.e. when
  // MaxAllSpaceLineCharacters is non-zero.
  StringRef::iterator LongestAllSpaceLine;

  while (true) {
    // NOTE(review): Column is compared right after this call, so advanceWhile
    // presumably keeps Column in step with Current - confirm against its
    // definition.
    advanceWhile(&Scanner::skip_s_space);
    if (skip_nb_char(Current) != Current) {
      // This line isn't empty, so try and find the indentation.
      if (Column <= BlockExitIndent) { // End of the block literal.
        IsDone = true;
        return true;
      }
      // We found the block's indentation.
      BlockIndent = Column;
      if (MaxAllSpaceLineCharacters > BlockIndent) {
        setError(
            "Leading all-spaces line must be smaller than the block indent",
            LongestAllSpaceLine);
        return false;
      }
      return true;
    }
    if (skip_b_break(Current) != Current &&
        Column > MaxAllSpaceLineCharacters) {
      // Record the longest all-space line in case it's longer than the
      // discovered block indent.
      MaxAllSpaceLineCharacters = Column;
      LongestAllSpaceLine = Current;
    }

    // Check for EOF.
    if (Current == End) {
      IsDone = true;
      return true;
    }

    if (!consumeLineBreakIfPresent()) {
      IsDone = true;
      return true;
    }
    ++LineBreaks;
  }
  // Not reached: the loop above only exits through a return.
  return true;
}

bool Scanner::scanBlockScalarIndent(unsigned BlockIndent,
                                    unsigned BlockExitIndent, bool &IsDone) {
  // Skip the indentation.
  // Consume spaces until the block's indentation column is reached or a
  // non-space character is found.
  while (Column < BlockIndent) {
    auto I = skip_s_space(Current);
    if (I == Current)
      break;
    Current = I;
    ++Column;
  }

  // No non-break character here: nothing more to decide for this line.
  if (skip_nb_char(Current) == Current)
    return true;

  if (Column <= BlockExitIndent) { // End of the block literal.
    IsDone = true;
    return true;
  }

  // Content between the exit column and the block's indentation is only
  // accepted when it starts a comment.
  if (Column < BlockIndent) {
    if (Current != End && *Current == '#') { // Trailing comment.
      IsDone = true;
      return true;
    }
    setError("A text line is less indented than the block scalar", Current);
    return false;
  }
  return true; // A normal text line.
}

/// Scans a block scalar starting at the '|' or '>' indicator at Current.
///
/// NOTE(review): \p IsLiteral is not read in the part of this function shown
/// here - confirm whether the remainder distinguishes literal from folded
/// scalars.
bool Scanner::scanBlockScalar(bool IsLiteral) {
  // Eat '|' or '>'
  assert(*Current == '|' || *Current == '>');
  skip(1);

  char ChompingIndicator;
  // Receives the explicit indentation indicator from the header; 0 means
  // that none was given.
  unsigned BlockIndent;
  bool IsDone = false;
  if (!scanBlockScalarHeader(ChompingIndicator, BlockIndent, IsDone))
    return false;
  if (IsDone)
    return true;

  auto Start = Current;
  // A line indented no deeper than this ends the block scalar; a negative
  // Indent is treated as column 0.
  unsigned BlockExitIndent = Indent < 0 ? 0 : (unsigned)Indent;
  unsigned LineBreaks = 0;
  if (BlockIndent == 0) {
    if (!findBlockScalarIndent(BlockIndent, BlockExitIndent, LineBreaks,
                               IsDone))
      return false;
  }

  // Scan the block's scalars body.
  SmallString<256> Str;
  while (!IsDone) {
    if (!scanBlockScalarIndent(BlockIndent, BlockExitIndent, IsDone))
      return false;
    if (IsDone)
      break;

    // Parse the current line.
    auto LineStart = Current;
    advanceWhile(&Scanner::skip_nb_char);
    if (LineStart != Current) {
      // Line breaks collected since the previous non-empty line are emitted
      // only now, in front of this line's text.
      Str.append(LineBreaks, '\n');
      Str.append(StringRef(LineStart, Current - LineStart));
      LineBreaks = 0;
    }

    // Check for EOF.
1618 if (Current == End) 1619 break; 1620 1621 if (!consumeLineBreakIfPresent()) 1622 break; 1623 ++LineBreaks; 1624 } 1625 1626 if (Current == End && !LineBreaks) 1627 // Ensure that there is at least one line break before the end of file. 1628 LineBreaks = 1; 1629 Str.append(getChompedLineBreaks(ChompingIndicator, LineBreaks, Str), '\n'); 1630 1631 // New lines may start a simple key. 1632 if (!FlowLevel) 1633 IsSimpleKeyAllowed = true; 1634 1635 Token T; 1636 T.Kind = Token::TK_BlockScalar; 1637 T.Range = StringRef(Start, Current - Start); 1638 T.Value = Str.str().str(); 1639 TokenQueue.push_back(T); 1640 return true; 1641 } 1642 1643 bool Scanner::scanTag() { 1644 StringRef::iterator Start = Current; 1645 unsigned ColStart = Column; 1646 skip(1); // Eat !. 1647 if (Current == End || isBlankOrBreak(Current)); // An empty tag. 1648 else if (*Current == '<') { 1649 skip(1); 1650 scan_ns_uri_char(); 1651 if (!consume('>')) 1652 return false; 1653 } else { 1654 // FIXME: Actually parse the c-ns-shorthand-tag rule. 1655 Current = skip_while(&Scanner::skip_ns_char, Current); 1656 } 1657 1658 Token T; 1659 T.Kind = Token::TK_Tag; 1660 T.Range = StringRef(Start, Current - Start); 1661 TokenQueue.push_back(T); 1662 1663 // Tags can be simple keys. 
1664 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1665 1666 IsSimpleKeyAllowed = false; 1667 1668 return true; 1669 } 1670 1671 bool Scanner::fetchMoreTokens() { 1672 if (IsStartOfStream) 1673 return scanStreamStart(); 1674 1675 scanToNextToken(); 1676 1677 if (Current == End) 1678 return scanStreamEnd(); 1679 1680 removeStaleSimpleKeyCandidates(); 1681 1682 unrollIndent(Column); 1683 1684 if (Column == 0 && *Current == '%') 1685 return scanDirective(); 1686 1687 if (Column == 0 && Current + 4 <= End 1688 && *Current == '-' 1689 && *(Current + 1) == '-' 1690 && *(Current + 2) == '-' 1691 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1692 return scanDocumentIndicator(true); 1693 1694 if (Column == 0 && Current + 4 <= End 1695 && *Current == '.' 1696 && *(Current + 1) == '.' 1697 && *(Current + 2) == '.' 1698 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1699 return scanDocumentIndicator(false); 1700 1701 if (*Current == '[') 1702 return scanFlowCollectionStart(true); 1703 1704 if (*Current == '{') 1705 return scanFlowCollectionStart(false); 1706 1707 if (*Current == ']') 1708 return scanFlowCollectionEnd(true); 1709 1710 if (*Current == '}') 1711 return scanFlowCollectionEnd(false); 1712 1713 if (*Current == ',') 1714 return scanFlowEntry(); 1715 1716 if (*Current == '-' && isBlankOrBreak(Current + 1)) 1717 return scanBlockEntry(); 1718 1719 if (*Current == '?' 
&& (FlowLevel || isBlankOrBreak(Current + 1))) 1720 return scanKey(); 1721 1722 if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1))) 1723 return scanValue(); 1724 1725 if (*Current == '*') 1726 return scanAliasOrAnchor(true); 1727 1728 if (*Current == '&') 1729 return scanAliasOrAnchor(false); 1730 1731 if (*Current == '!') 1732 return scanTag(); 1733 1734 if (*Current == '|' && !FlowLevel) 1735 return scanBlockScalar(true); 1736 1737 if (*Current == '>' && !FlowLevel) 1738 return scanBlockScalar(false); 1739 1740 if (*Current == '\'') 1741 return scanFlowScalar(false); 1742 1743 if (*Current == '"') 1744 return scanFlowScalar(true); 1745 1746 // Get a plain scalar. 1747 StringRef FirstChar(Current, 1); 1748 if (!(isBlankOrBreak(Current) 1749 || FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos) 1750 || (*Current == '-' && !isBlankOrBreak(Current + 1)) 1751 || (!FlowLevel && (*Current == '?' || *Current == ':') 1752 && isBlankOrBreak(Current + 1)) 1753 || (!FlowLevel && *Current == ':' 1754 && Current + 2 < End 1755 && *(Current + 1) == ':' 1756 && !isBlankOrBreak(Current + 2))) 1757 return scanPlainScalar(); 1758 1759 setError("Unrecognized character while tokenizing."); 1760 return false; 1761 } 1762 1763 Stream::Stream(StringRef Input, SourceMgr &SM, bool ShowColors, 1764 std::error_code *EC) 1765 : scanner(new Scanner(Input, SM, ShowColors, EC)), CurrentDoc() {} 1766 1767 Stream::Stream(MemoryBufferRef InputBuffer, SourceMgr &SM, bool ShowColors, 1768 std::error_code *EC) 1769 : scanner(new Scanner(InputBuffer, SM, ShowColors, EC)), CurrentDoc() {} 1770 1771 Stream::~Stream() = default; 1772 1773 bool Stream::failed() { return scanner->failed(); } 1774 1775 void Stream::printError(Node *N, const Twine &Msg) { 1776 scanner->printError( N->getSourceRange().Start 1777 , SourceMgr::DK_Error 1778 , Msg 1779 , N->getSourceRange()); 1780 } 1781 1782 document_iterator Stream::begin() { 1783 if (CurrentDoc) 1784 report_fatal_error("Can 
only iterate over the stream once"); 1785 1786 // Skip Stream-Start. 1787 scanner->getNext(); 1788 1789 CurrentDoc.reset(new Document(*this)); 1790 return document_iterator(CurrentDoc); 1791 } 1792 1793 document_iterator Stream::end() { 1794 return document_iterator(); 1795 } 1796 1797 void Stream::skip() { 1798 for (document_iterator i = begin(), e = end(); i != e; ++i) 1799 i->skip(); 1800 } 1801 1802 Node::Node(unsigned int Type, std::unique_ptr<Document> &D, StringRef A, 1803 StringRef T) 1804 : Doc(D), TypeID(Type), Anchor(A), Tag(T) { 1805 SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin()); 1806 SourceRange = SMRange(Start, Start); 1807 } 1808 1809 std::string Node::getVerbatimTag() const { 1810 StringRef Raw = getRawTag(); 1811 if (!Raw.empty() && Raw != "!") { 1812 std::string Ret; 1813 if (Raw.find_last_of('!') == 0) { 1814 Ret = Doc->getTagMap().find("!")->second; 1815 Ret += Raw.substr(1); 1816 return Ret; 1817 } else if (Raw.startswith("!!")) { 1818 Ret = Doc->getTagMap().find("!!")->second; 1819 Ret += Raw.substr(2); 1820 return Ret; 1821 } else { 1822 StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1); 1823 std::map<StringRef, StringRef>::const_iterator It = 1824 Doc->getTagMap().find(TagHandle); 1825 if (It != Doc->getTagMap().end()) 1826 Ret = It->second; 1827 else { 1828 Token T; 1829 T.Kind = Token::TK_Tag; 1830 T.Range = TagHandle; 1831 setError(Twine("Unknown tag handle ") + TagHandle, T); 1832 } 1833 Ret += Raw.substr(Raw.find_last_of('!') + 1); 1834 return Ret; 1835 } 1836 } 1837 1838 switch (getType()) { 1839 case NK_Null: 1840 return "tag:yaml.org,2002:null"; 1841 case NK_Scalar: 1842 case NK_BlockScalar: 1843 // TODO: Tag resolution. 
    return "tag:yaml.org,2002:str";
  case NK_Mapping:
    return "tag:yaml.org,2002:map";
  case NK_Sequence:
    return "tag:yaml.org,2002:seq";
  }

  // None of the node kinds above matched.
  return "";
}

/// Forwards to the owning document's peekNext().
Token &Node::peekNext() {
  return Doc->peekNext();
}

/// Forwards to the owning document's getNext().
Token Node::getNext() {
  return Doc->getNext();
}

/// Forwards to the owning document's parseBlockNode().
Node *Node::parseBlockNode() {
  return Doc->parseBlockNode();
}

/// Returns the owning document's node allocator.
BumpPtrAllocator &Node::getAllocator() {
  return Doc->NodeAllocator;
}

/// Forwards to the owning document's setError().
void Node::setError(const Twine &Msg, Token &Tok) const {
  Doc->setError(Msg, Tok);
}

/// Forwards to the owning document's failed().
bool Node::failed() const {
  return Doc->failed();
}

/// Returns the value of the scalar with quotes removed and escapes resolved.
///
/// \param Storage Used to build the result when the value differs from the
///        source text; otherwise the result refers to the source text.
StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const {
  // TODO: Handle newlines properly. We need to remove leading whitespace.
  if (Value[0] == '"') { // Double quoted.
    // Pull off the leading and trailing "s.
    StringRef UnquotedValue = Value.substr(1, Value.size() - 2);
    // Search for characters that would require unescaping the value.
    StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n");
    if (i != StringRef::npos)
      return unescapeDoubleQuoted(UnquotedValue, i, Storage);
    return UnquotedValue;
  } else if (Value[0] == '\'') { // Single quoted.
    // Pull off the leading and trailing 's.
    StringRef UnquotedValue = Value.substr(1, Value.size() - 2);
    // A ' inside the quotes can only be the first half of a doubled ''.
    StringRef::size_type i = UnquotedValue.find('\'');
    if (i != StringRef::npos) {
      // We're going to need Storage.
1894 Storage.clear(); 1895 Storage.reserve(UnquotedValue.size()); 1896 for (; i != StringRef::npos; i = UnquotedValue.find('\'')) { 1897 StringRef Valid(UnquotedValue.begin(), i); 1898 Storage.insert(Storage.end(), Valid.begin(), Valid.end()); 1899 Storage.push_back('\''); 1900 UnquotedValue = UnquotedValue.substr(i + 2); 1901 } 1902 Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); 1903 return StringRef(Storage.begin(), Storage.size()); 1904 } 1905 return UnquotedValue; 1906 } 1907 // Plain or block. 1908 return Value.rtrim(' '); 1909 } 1910 1911 StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue 1912 , StringRef::size_type i 1913 , SmallVectorImpl<char> &Storage) 1914 const { 1915 // Use Storage to build proper value. 1916 Storage.clear(); 1917 Storage.reserve(UnquotedValue.size()); 1918 for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) { 1919 // Insert all previous chars into Storage. 1920 StringRef Valid(UnquotedValue.begin(), i); 1921 Storage.insert(Storage.end(), Valid.begin(), Valid.end()); 1922 // Chop off inserted chars. 1923 UnquotedValue = UnquotedValue.substr(i); 1924 1925 assert(!UnquotedValue.empty() && "Can't be empty!"); 1926 1927 // Parse escape or line break. 1928 switch (UnquotedValue[0]) { 1929 case '\r': 1930 case '\n': 1931 Storage.push_back('\n'); 1932 if ( UnquotedValue.size() > 1 1933 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 1934 UnquotedValue = UnquotedValue.substr(1); 1935 UnquotedValue = UnquotedValue.substr(1); 1936 break; 1937 default: 1938 if (UnquotedValue.size() == 1) 1939 // TODO: Report error. 1940 break; 1941 UnquotedValue = UnquotedValue.substr(1); 1942 switch (UnquotedValue[0]) { 1943 default: { 1944 Token T; 1945 T.Range = StringRef(UnquotedValue.begin(), 1); 1946 setError("Unrecognized escape code!", T); 1947 return ""; 1948 } 1949 case '\r': 1950 case '\n': 1951 // Remove the new line. 
1952 if ( UnquotedValue.size() > 1 1953 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 1954 UnquotedValue = UnquotedValue.substr(1); 1955 // If this was just a single byte newline, it will get skipped 1956 // below. 1957 break; 1958 case '0': 1959 Storage.push_back(0x00); 1960 break; 1961 case 'a': 1962 Storage.push_back(0x07); 1963 break; 1964 case 'b': 1965 Storage.push_back(0x08); 1966 break; 1967 case 't': 1968 case 0x09: 1969 Storage.push_back(0x09); 1970 break; 1971 case 'n': 1972 Storage.push_back(0x0A); 1973 break; 1974 case 'v': 1975 Storage.push_back(0x0B); 1976 break; 1977 case 'f': 1978 Storage.push_back(0x0C); 1979 break; 1980 case 'r': 1981 Storage.push_back(0x0D); 1982 break; 1983 case 'e': 1984 Storage.push_back(0x1B); 1985 break; 1986 case ' ': 1987 Storage.push_back(0x20); 1988 break; 1989 case '"': 1990 Storage.push_back(0x22); 1991 break; 1992 case '/': 1993 Storage.push_back(0x2F); 1994 break; 1995 case '\\': 1996 Storage.push_back(0x5C); 1997 break; 1998 case 'N': 1999 encodeUTF8(0x85, Storage); 2000 break; 2001 case '_': 2002 encodeUTF8(0xA0, Storage); 2003 break; 2004 case 'L': 2005 encodeUTF8(0x2028, Storage); 2006 break; 2007 case 'P': 2008 encodeUTF8(0x2029, Storage); 2009 break; 2010 case 'x': { 2011 if (UnquotedValue.size() < 3) 2012 // TODO: Report error. 2013 break; 2014 unsigned int UnicodeScalarValue; 2015 if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue)) 2016 // TODO: Report error. 2017 UnicodeScalarValue = 0xFFFD; 2018 encodeUTF8(UnicodeScalarValue, Storage); 2019 UnquotedValue = UnquotedValue.substr(2); 2020 break; 2021 } 2022 case 'u': { 2023 if (UnquotedValue.size() < 5) 2024 // TODO: Report error. 2025 break; 2026 unsigned int UnicodeScalarValue; 2027 if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue)) 2028 // TODO: Report error. 
2029 UnicodeScalarValue = 0xFFFD; 2030 encodeUTF8(UnicodeScalarValue, Storage); 2031 UnquotedValue = UnquotedValue.substr(4); 2032 break; 2033 } 2034 case 'U': { 2035 if (UnquotedValue.size() < 9) 2036 // TODO: Report error. 2037 break; 2038 unsigned int UnicodeScalarValue; 2039 if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue)) 2040 // TODO: Report error. 2041 UnicodeScalarValue = 0xFFFD; 2042 encodeUTF8(UnicodeScalarValue, Storage); 2043 UnquotedValue = UnquotedValue.substr(8); 2044 break; 2045 } 2046 } 2047 UnquotedValue = UnquotedValue.substr(1); 2048 } 2049 } 2050 Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); 2051 return StringRef(Storage.begin(), Storage.size()); 2052 } 2053 2054 Node *KeyValueNode::getKey() { 2055 if (Key) 2056 return Key; 2057 // Handle implicit null keys. 2058 { 2059 Token &t = peekNext(); 2060 if ( t.Kind == Token::TK_BlockEnd 2061 || t.Kind == Token::TK_Value 2062 || t.Kind == Token::TK_Error) { 2063 return Key = new (getAllocator()) NullNode(Doc); 2064 } 2065 if (t.Kind == Token::TK_Key) 2066 getNext(); // skip TK_Key. 2067 } 2068 2069 // Handle explicit null keys. 2070 Token &t = peekNext(); 2071 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) { 2072 return Key = new (getAllocator()) NullNode(Doc); 2073 } 2074 2075 // We've got a normal key. 2076 return Key = parseBlockNode(); 2077 } 2078 2079 Node *KeyValueNode::getValue() { 2080 if (Value) 2081 return Value; 2082 getKey()->skip(); 2083 if (failed()) 2084 return Value = new (getAllocator()) NullNode(Doc); 2085 2086 // Handle implicit null values. 
  {
    Token &t = peekNext();
    // Any of these tokens directly after the key means there is no value.
    if (   t.Kind == Token::TK_BlockEnd
        || t.Kind == Token::TK_FlowMappingEnd
        || t.Kind == Token::TK_Key
        || t.Kind == Token::TK_FlowEntry
        || t.Kind == Token::TK_Error) {
      return Value = new (getAllocator()) NullNode(Doc);
    }

    if (t.Kind != Token::TK_Value) {
      setError("Unexpected token in Key Value.", t);
      return Value = new (getAllocator()) NullNode(Doc);
    }
    getNext(); // skip TK_Value.
  }

  // Handle explicit null values.
  Token &t = peekNext();
  if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) {
    return Value = new (getAllocator()) NullNode(Doc);
  }

  // We got a normal value.
  return Value = parseBlockNode();
}

/// Advances to the next entry of the mapping, or marks the end (IsAtEnd set,
/// CurrentEntry null) when there is none or parsing has failed.
void MappingNode::increment() {
  if (failed()) {
    IsAtEnd = true;
    CurrentEntry = nullptr;
    return;
  }
  if (CurrentEntry) {
    // Skip the current entry before looking at the next token.
    CurrentEntry->skip();
    // An MT_Inline mapping ends after its first entry.
    if (Type == MT_Inline) {
      IsAtEnd = true;
      CurrentEntry = nullptr;
      return;
    }
  }
  Token T = peekNext();
  if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) {
    // KeyValueNode eats the TK_Key. That way it can detect null keys.
    CurrentEntry = new (getAllocator()) KeyValueNode(Doc);
  } else if (Type == MT_Block) {
    switch (T.Kind) {
    case Token::TK_BlockEnd:
      getNext();
      IsAtEnd = true;
      CurrentEntry = nullptr;
      break;
    default:
      setError("Unexpected token. Expected Key or Block End", T);
      LLVM_FALLTHROUGH;
    case Token::TK_Error:
      IsAtEnd = true;
      CurrentEntry = nullptr;
    }
  } else {
    switch (T.Kind) {
    case Token::TK_FlowEntry:
      // Eat the flow entry and recurse.
      getNext();
      return increment();
    case Token::TK_FlowMappingEnd:
      getNext();
      LLVM_FALLTHROUGH;
    case Token::TK_Error:
      // Set this to end iterator.
      IsAtEnd = true;
      CurrentEntry = nullptr;
      break;
    default:
      setError( "Unexpected token. Expected Key, Flow Entry, or Flow "
                "Mapping End."
              , T);
      IsAtEnd = true;
      CurrentEntry = nullptr;
    }
  }
}

/// Advances to the next entry of the sequence, or marks the end (IsAtEnd set,
/// CurrentEntry null) when there is none or parsing has failed. How the next
/// entry is found depends on SeqType.
void SequenceNode::increment() {
  if (failed()) {
    IsAtEnd = true;
    CurrentEntry = nullptr;
    return;
  }
  // Skip the current entry before looking at the next token.
  if (CurrentEntry)
    CurrentEntry->skip();
  Token T = peekNext();
  if (SeqType == ST_Block) {
    switch (T.Kind) {
    case Token::TK_BlockEntry:
      getNext();
      CurrentEntry = parseBlockNode();
      if (!CurrentEntry) { // An error occurred.
        IsAtEnd = true;
        CurrentEntry = nullptr;
      }
      break;
    case Token::TK_BlockEnd:
      getNext();
      IsAtEnd = true;
      CurrentEntry = nullptr;
      break;
    default:
      setError( "Unexpected token. Expected Block Entry or Block End."
              , T);
      LLVM_FALLTHROUGH;
    case Token::TK_Error:
      IsAtEnd = true;
      CurrentEntry = nullptr;
    }
  } else if (SeqType == ST_Indentless) {
    switch (T.Kind) {
    case Token::TK_BlockEntry:
      getNext();
      CurrentEntry = parseBlockNode();
      if (!CurrentEntry) { // An error occurred.
        IsAtEnd = true;
        CurrentEntry = nullptr;
      }
      break;
    // Any token other than a block entry ends the sequence; the token is
    // left in the queue and no error is reported.
    default:
    case Token::TK_Error:
      IsAtEnd = true;
      CurrentEntry = nullptr;
    }
  } else if (SeqType == ST_Flow) {
    switch (T.Kind) {
    case Token::TK_FlowEntry:
      // Eat the flow entry and recurse.
      getNext();
      WasPreviousTokenFlowEntry = true;
      return increment();
    case Token::TK_FlowSequenceEnd:
      getNext();
      LLVM_FALLTHROUGH;
    case Token::TK_Error:
      // Set this to end iterator.
      IsAtEnd = true;
      CurrentEntry = nullptr;
      break;
    case Token::TK_StreamEnd:
    case Token::TK_DocumentEnd:
    case Token::TK_DocumentStart:
      setError("Could not find closing ]!", T);
      // Set this to end iterator.
2237 IsAtEnd = true; 2238 CurrentEntry = nullptr; 2239 break; 2240 default: 2241 if (!WasPreviousTokenFlowEntry) { 2242 setError("Expected , between entries!", T); 2243 IsAtEnd = true; 2244 CurrentEntry = nullptr; 2245 break; 2246 } 2247 // Otherwise it must be a flow entry. 2248 CurrentEntry = parseBlockNode(); 2249 if (!CurrentEntry) { 2250 IsAtEnd = true; 2251 } 2252 WasPreviousTokenFlowEntry = false; 2253 break; 2254 } 2255 } 2256 } 2257 2258 Document::Document(Stream &S) : stream(S), Root(nullptr) { 2259 // Tag maps starts with two default mappings. 2260 TagMap["!"] = "!"; 2261 TagMap["!!"] = "tag:yaml.org,2002:"; 2262 2263 if (parseDirectives()) 2264 expectToken(Token::TK_DocumentStart); 2265 Token &T = peekNext(); 2266 if (T.Kind == Token::TK_DocumentStart) 2267 getNext(); 2268 } 2269 2270 bool Document::skip() { 2271 if (stream.scanner->failed()) 2272 return false; 2273 if (!Root) 2274 getRoot(); 2275 Root->skip(); 2276 Token &T = peekNext(); 2277 if (T.Kind == Token::TK_StreamEnd) 2278 return false; 2279 if (T.Kind == Token::TK_DocumentEnd) { 2280 getNext(); 2281 return skip(); 2282 } 2283 return true; 2284 } 2285 2286 Token &Document::peekNext() { 2287 return stream.scanner->peekNext(); 2288 } 2289 2290 Token Document::getNext() { 2291 return stream.scanner->getNext(); 2292 } 2293 2294 void Document::setError(const Twine &Message, Token &Location) const { 2295 stream.scanner->setError(Message, Location.Range.begin()); 2296 } 2297 2298 bool Document::failed() const { 2299 return stream.scanner->failed(); 2300 } 2301 2302 Node *Document::parseBlockNode() { 2303 Token T = peekNext(); 2304 // Handle properties. 
2305 Token AnchorInfo; 2306 Token TagInfo; 2307 parse_property: 2308 switch (T.Kind) { 2309 case Token::TK_Alias: 2310 getNext(); 2311 return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1)); 2312 case Token::TK_Anchor: 2313 if (AnchorInfo.Kind == Token::TK_Anchor) { 2314 setError("Already encountered an anchor for this node!", T); 2315 return nullptr; 2316 } 2317 AnchorInfo = getNext(); // Consume TK_Anchor. 2318 T = peekNext(); 2319 goto parse_property; 2320 case Token::TK_Tag: 2321 if (TagInfo.Kind == Token::TK_Tag) { 2322 setError("Already encountered a tag for this node!", T); 2323 return nullptr; 2324 } 2325 TagInfo = getNext(); // Consume TK_Tag. 2326 T = peekNext(); 2327 goto parse_property; 2328 default: 2329 break; 2330 } 2331 2332 switch (T.Kind) { 2333 case Token::TK_BlockEntry: 2334 // We got an unindented BlockEntry sequence. This is not terminated with 2335 // a BlockEnd. 2336 // Don't eat the TK_BlockEntry, SequenceNode needs it. 2337 return new (NodeAllocator) SequenceNode( stream.CurrentDoc 2338 , AnchorInfo.Range.substr(1) 2339 , TagInfo.Range 2340 , SequenceNode::ST_Indentless); 2341 case Token::TK_BlockSequenceStart: 2342 getNext(); 2343 return new (NodeAllocator) 2344 SequenceNode( stream.CurrentDoc 2345 , AnchorInfo.Range.substr(1) 2346 , TagInfo.Range 2347 , SequenceNode::ST_Block); 2348 case Token::TK_BlockMappingStart: 2349 getNext(); 2350 return new (NodeAllocator) 2351 MappingNode( stream.CurrentDoc 2352 , AnchorInfo.Range.substr(1) 2353 , TagInfo.Range 2354 , MappingNode::MT_Block); 2355 case Token::TK_FlowSequenceStart: 2356 getNext(); 2357 return new (NodeAllocator) 2358 SequenceNode( stream.CurrentDoc 2359 , AnchorInfo.Range.substr(1) 2360 , TagInfo.Range 2361 , SequenceNode::ST_Flow); 2362 case Token::TK_FlowMappingStart: 2363 getNext(); 2364 return new (NodeAllocator) 2365 MappingNode( stream.CurrentDoc 2366 , AnchorInfo.Range.substr(1) 2367 , TagInfo.Range 2368 , MappingNode::MT_Flow); 2369 case 
Token::TK_Scalar: 2370 getNext(); 2371 return new (NodeAllocator) 2372 ScalarNode( stream.CurrentDoc 2373 , AnchorInfo.Range.substr(1) 2374 , TagInfo.Range 2375 , T.Range); 2376 case Token::TK_BlockScalar: { 2377 getNext(); 2378 StringRef NullTerminatedStr(T.Value.c_str(), T.Value.length() + 1); 2379 StringRef StrCopy = NullTerminatedStr.copy(NodeAllocator).drop_back(); 2380 return new (NodeAllocator) 2381 BlockScalarNode(stream.CurrentDoc, AnchorInfo.Range.substr(1), 2382 TagInfo.Range, StrCopy, T.Range); 2383 } 2384 case Token::TK_Key: 2385 // Don't eat the TK_Key, KeyValueNode expects it. 2386 return new (NodeAllocator) 2387 MappingNode( stream.CurrentDoc 2388 , AnchorInfo.Range.substr(1) 2389 , TagInfo.Range 2390 , MappingNode::MT_Inline); 2391 case Token::TK_DocumentStart: 2392 case Token::TK_DocumentEnd: 2393 case Token::TK_StreamEnd: 2394 default: 2395 // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not 2396 // !!null null. 2397 return new (NodeAllocator) NullNode(stream.CurrentDoc); 2398 case Token::TK_Error: 2399 return nullptr; 2400 } 2401 llvm_unreachable("Control flow shouldn't reach here."); 2402 return nullptr; 2403 } 2404 2405 bool Document::parseDirectives() { 2406 bool isDirective = false; 2407 while (true) { 2408 Token T = peekNext(); 2409 if (T.Kind == Token::TK_TagDirective) { 2410 parseTAGDirective(); 2411 isDirective = true; 2412 } else if (T.Kind == Token::TK_VersionDirective) { 2413 parseYAMLDirective(); 2414 isDirective = true; 2415 } else 2416 break; 2417 } 2418 return isDirective; 2419 } 2420 2421 void Document::parseYAMLDirective() { 2422 getNext(); // Eat %YAML <version> 2423 } 2424 2425 void Document::parseTAGDirective() { 2426 Token Tag = getNext(); // %TAG <handle> <prefix> 2427 StringRef T = Tag.Range; 2428 // Strip %TAG 2429 T = T.substr(T.find_first_of(" \t")).ltrim(" \t"); 2430 std::size_t HandleEnd = T.find_first_of(" \t"); 2431 StringRef TagHandle = T.substr(0, HandleEnd); 2432 StringRef TagPrefix = 
T.substr(HandleEnd).ltrim(" \t"); 2433 TagMap[TagHandle] = TagPrefix; 2434 } 2435 2436 bool Document::expectToken(int TK) { 2437 Token T = getNext(); 2438 if (T.Kind != TK) { 2439 setError("Unexpected token", T); 2440 return false; 2441 } 2442 return true; 2443 } 2444