//===- YAMLParser.cpp - Simple YAML parser --------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//  This file implements a YAML parser.
//
//===----------------------------------------------------------------------===//

#include "llvm/Support/YAMLParser.h"
#include "llvm/ADT/AllocatorList.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/SMLoc.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <map>
#include <memory>
#include <string>
#include <system_error>
#include <utility>

using namespace llvm;
using namespace yaml;

/// The Unicode encoding forms that the encoding detection in this file can
/// report for an input buffer.
enum UnicodeEncodingForm {
  UEF_UTF32_LE, ///< UTF-32 Little Endian
  UEF_UTF32_BE, ///< UTF-32 Big Endian
  UEF_UTF16_LE, ///< UTF-16 Little Endian
  UEF_UTF16_BE, ///< UTF-16 Big Endian
  UEF_UTF8,     ///< UTF-8 or ascii.
  UEF_Unknown   ///< Not a valid Unicode encoding.
};

/// EncodingInfo - Holds the encoding type and length of the byte order mark if
///                it exists. Length is in {0, 2, 3, 4}.
using EncodingInfo = std::pair<UnicodeEncodingForm, unsigned>;

/// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode
///                      encoding form of \a Input.
///
/// @param Input A string of length 0 or more.
60 /// @returns An EncodingInfo indicating the Unicode encoding form of the input 61 /// and how long the byte order mark is if one exists. 62 static EncodingInfo getUnicodeEncoding(StringRef Input) { 63 if (Input.empty()) 64 return std::make_pair(UEF_Unknown, 0); 65 66 switch (uint8_t(Input[0])) { 67 case 0x00: 68 if (Input.size() >= 4) { 69 if ( Input[1] == 0 70 && uint8_t(Input[2]) == 0xFE 71 && uint8_t(Input[3]) == 0xFF) 72 return std::make_pair(UEF_UTF32_BE, 4); 73 if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0) 74 return std::make_pair(UEF_UTF32_BE, 0); 75 } 76 77 if (Input.size() >= 2 && Input[1] != 0) 78 return std::make_pair(UEF_UTF16_BE, 0); 79 return std::make_pair(UEF_Unknown, 0); 80 case 0xFF: 81 if ( Input.size() >= 4 82 && uint8_t(Input[1]) == 0xFE 83 && Input[2] == 0 84 && Input[3] == 0) 85 return std::make_pair(UEF_UTF32_LE, 4); 86 87 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE) 88 return std::make_pair(UEF_UTF16_LE, 2); 89 return std::make_pair(UEF_Unknown, 0); 90 case 0xFE: 91 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF) 92 return std::make_pair(UEF_UTF16_BE, 2); 93 return std::make_pair(UEF_Unknown, 0); 94 case 0xEF: 95 if ( Input.size() >= 3 96 && uint8_t(Input[1]) == 0xBB 97 && uint8_t(Input[2]) == 0xBF) 98 return std::make_pair(UEF_UTF8, 3); 99 return std::make_pair(UEF_Unknown, 0); 100 } 101 102 // It could still be utf-32 or utf-16. 103 if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0) 104 return std::make_pair(UEF_UTF32_LE, 0); 105 106 if (Input.size() >= 2 && Input[1] == 0) 107 return std::make_pair(UEF_UTF16_LE, 0); 108 109 return std::make_pair(UEF_UTF8, 0); 110 } 111 112 /// Pin the vtables to this file. 
// Out-of-line definitions of the anchor() member of each node class.
void Node::anchor() {}
void NullNode::anchor() {}
void ScalarNode::anchor() {}
void BlockScalarNode::anchor() {}
void KeyValueNode::anchor() {}
void MappingNode::anchor() {}
void SequenceNode::anchor() {}
void AliasNode::anchor() {}

namespace llvm {
namespace yaml {

/// Token - A single YAML token.
struct Token {
  /// The kind of this token. A default-constructed Token has kind TK_Error.
  enum TokenKind {
    TK_Error, // Uninitialized token.
    TK_StreamStart,
    TK_StreamEnd,
    TK_VersionDirective,
    TK_TagDirective,
    TK_DocumentStart,
    TK_DocumentEnd,
    TK_BlockEntry,
    TK_BlockEnd,
    TK_BlockSequenceStart,
    TK_BlockMappingStart,
    TK_FlowEntry,
    TK_FlowSequenceStart,
    TK_FlowSequenceEnd,
    TK_FlowMappingStart,
    TK_FlowMappingEnd,
    TK_Key,
    TK_Value,
    TK_Scalar,
    TK_BlockScalar,
    TK_Alias,
    TK_Anchor,
    TK_Tag
  } Kind = TK_Error;

  /// A string of length 0 or more whose begin() points to the logical location
  /// of the token in the input.
  StringRef Range;

  /// The value of a block scalar node.
  std::string Value;

  Token() = default;
};

} // end namespace yaml
} // end namespace llvm

/// The queue type used to hold scanned tokens.
using TokenQueueT = BumpPtrList<Token>;

namespace {

/// @brief This struct is used to track simple keys.
///
/// Simple keys are handled by creating an entry in SimpleKeys for each Token
/// which could legally be the start of a simple key. When peekNext is called,
/// if the Token to be returned is referenced by a SimpleKey, we continue
/// tokenizing until that potential simple key has either been found to not be
/// a simple key (we moved on to the next line or went further than 1024 chars),
/// or until we run into a Value, and then insert a Key token (and possibly
/// others) before the SimpleKey's Tok.
179 struct SimpleKey { 180 TokenQueueT::iterator Tok; 181 unsigned Column; 182 unsigned Line; 183 unsigned FlowLevel; 184 bool IsRequired; 185 186 bool operator ==(const SimpleKey &Other) { 187 return Tok == Other.Tok; 188 } 189 }; 190 191 } // end anonymous namespace 192 193 /// @brief The Unicode scalar value of a UTF-8 minimal well-formed code unit 194 /// subsequence and the subsequence's length in code units (uint8_t). 195 /// A length of 0 represents an error. 196 using UTF8Decoded = std::pair<uint32_t, unsigned>; 197 198 static UTF8Decoded decodeUTF8(StringRef Range) { 199 StringRef::iterator Position= Range.begin(); 200 StringRef::iterator End = Range.end(); 201 // 1 byte: [0x00, 0x7f] 202 // Bit pattern: 0xxxxxxx 203 if ((*Position & 0x80) == 0) { 204 return std::make_pair(*Position, 1); 205 } 206 // 2 bytes: [0x80, 0x7ff] 207 // Bit pattern: 110xxxxx 10xxxxxx 208 if (Position + 1 != End && 209 ((*Position & 0xE0) == 0xC0) && 210 ((*(Position + 1) & 0xC0) == 0x80)) { 211 uint32_t codepoint = ((*Position & 0x1F) << 6) | 212 (*(Position + 1) & 0x3F); 213 if (codepoint >= 0x80) 214 return std::make_pair(codepoint, 2); 215 } 216 // 3 bytes: [0x8000, 0xffff] 217 // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx 218 if (Position + 2 != End && 219 ((*Position & 0xF0) == 0xE0) && 220 ((*(Position + 1) & 0xC0) == 0x80) && 221 ((*(Position + 2) & 0xC0) == 0x80)) { 222 uint32_t codepoint = ((*Position & 0x0F) << 12) | 223 ((*(Position + 1) & 0x3F) << 6) | 224 (*(Position + 2) & 0x3F); 225 // Codepoints between 0xD800 and 0xDFFF are invalid, as 226 // they are high / low surrogate halves used by UTF-16. 
227 if (codepoint >= 0x800 && 228 (codepoint < 0xD800 || codepoint > 0xDFFF)) 229 return std::make_pair(codepoint, 3); 230 } 231 // 4 bytes: [0x10000, 0x10FFFF] 232 // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 233 if (Position + 3 != End && 234 ((*Position & 0xF8) == 0xF0) && 235 ((*(Position + 1) & 0xC0) == 0x80) && 236 ((*(Position + 2) & 0xC0) == 0x80) && 237 ((*(Position + 3) & 0xC0) == 0x80)) { 238 uint32_t codepoint = ((*Position & 0x07) << 18) | 239 ((*(Position + 1) & 0x3F) << 12) | 240 ((*(Position + 2) & 0x3F) << 6) | 241 (*(Position + 3) & 0x3F); 242 if (codepoint >= 0x10000 && codepoint <= 0x10FFFF) 243 return std::make_pair(codepoint, 4); 244 } 245 return std::make_pair(0, 0); 246 } 247 248 namespace llvm { 249 namespace yaml { 250 251 /// @brief Scans YAML tokens from a MemoryBuffer. 252 class Scanner { 253 public: 254 Scanner(StringRef Input, SourceMgr &SM, bool ShowColors = true, 255 std::error_code *EC = nullptr); 256 Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors = true, 257 std::error_code *EC = nullptr); 258 259 /// @brief Parse the next token and return it without popping it. 260 Token &peekNext(); 261 262 /// @brief Parse the next token and pop it from the queue. 263 Token getNext(); 264 265 void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message, 266 ArrayRef<SMRange> Ranges = None) { 267 SM.PrintMessage(Loc, Kind, Message, Ranges, /* FixIts= */ None, ShowColors); 268 } 269 270 void setError(const Twine &Message, StringRef::iterator Position) { 271 if (Current >= End) 272 Current = End - 1; 273 274 // propagate the error if possible 275 if (EC) 276 *EC = make_error_code(std::errc::invalid_argument); 277 278 // Don't print out more errors after the first one we encounter. The rest 279 // are just the result of the first, and have no meaning. 
280 if (!Failed) 281 printError(SMLoc::getFromPointer(Current), SourceMgr::DK_Error, Message); 282 Failed = true; 283 } 284 285 void setError(const Twine &Message) { 286 setError(Message, Current); 287 } 288 289 /// @brief Returns true if an error occurred while parsing. 290 bool failed() { 291 return Failed; 292 } 293 294 private: 295 void init(MemoryBufferRef Buffer); 296 297 StringRef currentInput() { 298 return StringRef(Current, End - Current); 299 } 300 301 /// @brief Decode a UTF-8 minimal well-formed code unit subsequence starting 302 /// at \a Position. 303 /// 304 /// If the UTF-8 code units starting at Position do not form a well-formed 305 /// code unit subsequence, then the Unicode scalar value is 0, and the length 306 /// is 0. 307 UTF8Decoded decodeUTF8(StringRef::iterator Position) { 308 return ::decodeUTF8(StringRef(Position, End - Position)); 309 } 310 311 // The following functions are based on the gramar rules in the YAML spec. The 312 // style of the function names it meant to closely match how they are written 313 // in the spec. The number within the [] is the number of the grammar rule in 314 // the spec. 315 // 316 // See 4.2 [Production Naming Conventions] for the meaning of the prefixes. 317 // 318 // c- 319 // A production starting and ending with a special character. 320 // b- 321 // A production matching a single line break. 322 // nb- 323 // A production starting and ending with a non-break character. 324 // s- 325 // A production starting and ending with a white space character. 326 // ns- 327 // A production starting and ending with a non-space character. 328 // l- 329 // A production matching complete line(s). 330 331 /// @brief Skip a single nb-char[27] starting at Position. 332 /// 333 /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE] 334 /// | [0xFF00-0xFFFD] | [0x10000-0x10FFFF] 335 /// 336 /// @returns The code unit after the nb-char, or Position if it's not an 337 /// nb-char. 
338 StringRef::iterator skip_nb_char(StringRef::iterator Position); 339 340 /// @brief Skip a single b-break[28] starting at Position. 341 /// 342 /// A b-break is 0xD 0xA | 0xD | 0xA 343 /// 344 /// @returns The code unit after the b-break, or Position if it's not a 345 /// b-break. 346 StringRef::iterator skip_b_break(StringRef::iterator Position); 347 348 /// Skip a single s-space[31] starting at Position. 349 /// 350 /// An s-space is 0x20 351 /// 352 /// @returns The code unit after the s-space, or Position if it's not a 353 /// s-space. 354 StringRef::iterator skip_s_space(StringRef::iterator Position); 355 356 /// @brief Skip a single s-white[33] starting at Position. 357 /// 358 /// A s-white is 0x20 | 0x9 359 /// 360 /// @returns The code unit after the s-white, or Position if it's not a 361 /// s-white. 362 StringRef::iterator skip_s_white(StringRef::iterator Position); 363 364 /// @brief Skip a single ns-char[34] starting at Position. 365 /// 366 /// A ns-char is nb-char - s-white 367 /// 368 /// @returns The code unit after the ns-char, or Position if it's not a 369 /// ns-char. 370 StringRef::iterator skip_ns_char(StringRef::iterator Position); 371 372 using SkipWhileFunc = StringRef::iterator (Scanner::*)(StringRef::iterator); 373 374 /// @brief Skip minimal well-formed code unit subsequences until Func 375 /// returns its input. 376 /// 377 /// @returns The code unit after the last minimal well-formed code unit 378 /// subsequence that Func accepted. 379 StringRef::iterator skip_while( SkipWhileFunc Func 380 , StringRef::iterator Position); 381 382 /// Skip minimal well-formed code unit subsequences until Func returns its 383 /// input. 384 void advanceWhile(SkipWhileFunc Func); 385 386 /// @brief Scan ns-uri-char[39]s starting at Cur. 387 /// 388 /// This updates Cur and Column while scanning. 389 void scan_ns_uri_char(); 390 391 /// @brief Consume a minimal well-formed code unit subsequence starting at 392 /// \a Cur. 
Return false if it is not the same Unicode scalar value as 393 /// \a Expected. This updates \a Column. 394 bool consume(uint32_t Expected); 395 396 /// @brief Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column. 397 void skip(uint32_t Distance); 398 399 /// @brief Return true if the minimal well-formed code unit subsequence at 400 /// Pos is whitespace or a new line 401 bool isBlankOrBreak(StringRef::iterator Position); 402 403 /// Consume a single b-break[28] if it's present at the current position. 404 /// 405 /// Return false if the code unit at the current position isn't a line break. 406 bool consumeLineBreakIfPresent(); 407 408 /// @brief If IsSimpleKeyAllowed, create and push_back a new SimpleKey. 409 void saveSimpleKeyCandidate( TokenQueueT::iterator Tok 410 , unsigned AtColumn 411 , bool IsRequired); 412 413 /// @brief Remove simple keys that can no longer be valid simple keys. 414 /// 415 /// Invalid simple keys are not on the current line or are further than 1024 416 /// columns back. 417 void removeStaleSimpleKeyCandidates(); 418 419 /// @brief Remove all simple keys on FlowLevel \a Level. 420 void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level); 421 422 /// @brief Unroll indentation in \a Indents back to \a Col. Creates BlockEnd 423 /// tokens if needed. 424 bool unrollIndent(int ToColumn); 425 426 /// @brief Increase indent to \a Col. Creates \a Kind token at \a InsertPoint 427 /// if needed. 428 bool rollIndent( int ToColumn 429 , Token::TokenKind Kind 430 , TokenQueueT::iterator InsertPoint); 431 432 /// @brief Skip a single-line comment when the comment starts at the current 433 /// position of the scanner. 434 void skipComment(); 435 436 /// @brief Skip whitespace and comments until the start of the next token. 437 void scanToNextToken(); 438 439 /// @brief Must be the first token generated. 440 bool scanStreamStart(); 441 442 /// @brief Generate tokens needed to close out the stream. 
443 bool scanStreamEnd(); 444 445 /// @brief Scan a %BLAH directive. 446 bool scanDirective(); 447 448 /// @brief Scan a ... or ---. 449 bool scanDocumentIndicator(bool IsStart); 450 451 /// @brief Scan a [ or { and generate the proper flow collection start token. 452 bool scanFlowCollectionStart(bool IsSequence); 453 454 /// @brief Scan a ] or } and generate the proper flow collection end token. 455 bool scanFlowCollectionEnd(bool IsSequence); 456 457 /// @brief Scan the , that separates entries in a flow collection. 458 bool scanFlowEntry(); 459 460 /// @brief Scan the - that starts block sequence entries. 461 bool scanBlockEntry(); 462 463 /// @brief Scan an explicit ? indicating a key. 464 bool scanKey(); 465 466 /// @brief Scan an explicit : indicating a value. 467 bool scanValue(); 468 469 /// @brief Scan a quoted scalar. 470 bool scanFlowScalar(bool IsDoubleQuoted); 471 472 /// @brief Scan an unquoted scalar. 473 bool scanPlainScalar(); 474 475 /// @brief Scan an Alias or Anchor starting with * or &. 476 bool scanAliasOrAnchor(bool IsAlias); 477 478 /// @brief Scan a block scalar starting with | or >. 479 bool scanBlockScalar(bool IsLiteral); 480 481 /// Scan a chomping indicator in a block scalar header. 482 char scanBlockChompingIndicator(); 483 484 /// Scan an indentation indicator in a block scalar header. 485 unsigned scanBlockIndentationIndicator(); 486 487 /// Scan a block scalar header. 488 /// 489 /// Return false if an error occurred. 490 bool scanBlockScalarHeader(char &ChompingIndicator, unsigned &IndentIndicator, 491 bool &IsDone); 492 493 /// Look for the indentation level of a block scalar. 494 /// 495 /// Return false if an error occurred. 496 bool findBlockScalarIndent(unsigned &BlockIndent, unsigned BlockExitIndent, 497 unsigned &LineBreaks, bool &IsDone); 498 499 /// Scan the indentation of a text line in a block scalar. 500 /// 501 /// Return false if an error occurred. 
502 bool scanBlockScalarIndent(unsigned BlockIndent, unsigned BlockExitIndent, 503 bool &IsDone); 504 505 /// @brief Scan a tag of the form !stuff. 506 bool scanTag(); 507 508 /// @brief Dispatch to the next scanning function based on \a *Cur. 509 bool fetchMoreTokens(); 510 511 /// @brief The SourceMgr used for diagnostics and buffer management. 512 SourceMgr &SM; 513 514 /// @brief The original input. 515 MemoryBufferRef InputBuffer; 516 517 /// @brief The current position of the scanner. 518 StringRef::iterator Current; 519 520 /// @brief The end of the input (one past the last character). 521 StringRef::iterator End; 522 523 /// @brief Current YAML indentation level in spaces. 524 int Indent; 525 526 /// @brief Current column number in Unicode code points. 527 unsigned Column; 528 529 /// @brief Current line number. 530 unsigned Line; 531 532 /// @brief How deep we are in flow style containers. 0 Means at block level. 533 unsigned FlowLevel; 534 535 /// @brief Are we at the start of the stream? 536 bool IsStartOfStream; 537 538 /// @brief Can the next token be the start of a simple key? 539 bool IsSimpleKeyAllowed; 540 541 /// @brief True if an error has occurred. 542 bool Failed; 543 544 /// @brief Should colors be used when printing out the diagnostic messages? 545 bool ShowColors; 546 547 /// @brief Queue of tokens. This is required to queue up tokens while looking 548 /// for the end of a simple key. And for cases where a single character 549 /// can produce multiple tokens (e.g. BlockEnd). 550 TokenQueueT TokenQueue; 551 552 /// @brief Indentation levels. 553 SmallVector<int, 4> Indents; 554 555 /// @brief Potential simple keys. 556 SmallVector<SimpleKey, 4> SimpleKeys; 557 558 std::error_code *EC; 559 }; 560 561 } // end namespace yaml 562 } // end namespace llvm 563 564 /// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result. 
565 static void encodeUTF8( uint32_t UnicodeScalarValue 566 , SmallVectorImpl<char> &Result) { 567 if (UnicodeScalarValue <= 0x7F) { 568 Result.push_back(UnicodeScalarValue & 0x7F); 569 } else if (UnicodeScalarValue <= 0x7FF) { 570 uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6); 571 uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F); 572 Result.push_back(FirstByte); 573 Result.push_back(SecondByte); 574 } else if (UnicodeScalarValue <= 0xFFFF) { 575 uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12); 576 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 577 uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F); 578 Result.push_back(FirstByte); 579 Result.push_back(SecondByte); 580 Result.push_back(ThirdByte); 581 } else if (UnicodeScalarValue <= 0x10FFFF) { 582 uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18); 583 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12); 584 uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 585 uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F); 586 Result.push_back(FirstByte); 587 Result.push_back(SecondByte); 588 Result.push_back(ThirdByte); 589 Result.push_back(FourthByte); 590 } 591 } 592 593 bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) { 594 SourceMgr SM; 595 Scanner scanner(Input, SM); 596 while (true) { 597 Token T = scanner.getNext(); 598 switch (T.Kind) { 599 case Token::TK_StreamStart: 600 OS << "Stream-Start: "; 601 break; 602 case Token::TK_StreamEnd: 603 OS << "Stream-End: "; 604 break; 605 case Token::TK_VersionDirective: 606 OS << "Version-Directive: "; 607 break; 608 case Token::TK_TagDirective: 609 OS << "Tag-Directive: "; 610 break; 611 case Token::TK_DocumentStart: 612 OS << "Document-Start: "; 613 break; 614 case Token::TK_DocumentEnd: 615 OS << "Document-End: "; 616 break; 617 case Token::TK_BlockEntry: 618 OS << "Block-Entry: "; 619 break; 620 case Token::TK_BlockEnd: 621 OS << "Block-End: "; 622 break; 
623 case Token::TK_BlockSequenceStart: 624 OS << "Block-Sequence-Start: "; 625 break; 626 case Token::TK_BlockMappingStart: 627 OS << "Block-Mapping-Start: "; 628 break; 629 case Token::TK_FlowEntry: 630 OS << "Flow-Entry: "; 631 break; 632 case Token::TK_FlowSequenceStart: 633 OS << "Flow-Sequence-Start: "; 634 break; 635 case Token::TK_FlowSequenceEnd: 636 OS << "Flow-Sequence-End: "; 637 break; 638 case Token::TK_FlowMappingStart: 639 OS << "Flow-Mapping-Start: "; 640 break; 641 case Token::TK_FlowMappingEnd: 642 OS << "Flow-Mapping-End: "; 643 break; 644 case Token::TK_Key: 645 OS << "Key: "; 646 break; 647 case Token::TK_Value: 648 OS << "Value: "; 649 break; 650 case Token::TK_Scalar: 651 OS << "Scalar: "; 652 break; 653 case Token::TK_BlockScalar: 654 OS << "Block Scalar: "; 655 break; 656 case Token::TK_Alias: 657 OS << "Alias: "; 658 break; 659 case Token::TK_Anchor: 660 OS << "Anchor: "; 661 break; 662 case Token::TK_Tag: 663 OS << "Tag: "; 664 break; 665 case Token::TK_Error: 666 break; 667 } 668 OS << T.Range << "\n"; 669 if (T.Kind == Token::TK_StreamEnd) 670 break; 671 else if (T.Kind == Token::TK_Error) 672 return false; 673 } 674 return true; 675 } 676 677 bool yaml::scanTokens(StringRef Input) { 678 SourceMgr SM; 679 Scanner scanner(Input, SM); 680 while (true) { 681 Token T = scanner.getNext(); 682 if (T.Kind == Token::TK_StreamEnd) 683 break; 684 else if (T.Kind == Token::TK_Error) 685 return false; 686 } 687 return true; 688 } 689 690 std::string yaml::escape(StringRef Input) { 691 std::string EscapedInput; 692 for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) { 693 if (*i == '\\') 694 EscapedInput += "\\\\"; 695 else if (*i == '"') 696 EscapedInput += "\\\""; 697 else if (*i == 0) 698 EscapedInput += "\\0"; 699 else if (*i == 0x07) 700 EscapedInput += "\\a"; 701 else if (*i == 0x08) 702 EscapedInput += "\\b"; 703 else if (*i == 0x09) 704 EscapedInput += "\\t"; 705 else if (*i == 0x0A) 706 EscapedInput += "\\n"; 707 else 
if (*i == 0x0B) 708 EscapedInput += "\\v"; 709 else if (*i == 0x0C) 710 EscapedInput += "\\f"; 711 else if (*i == 0x0D) 712 EscapedInput += "\\r"; 713 else if (*i == 0x1B) 714 EscapedInput += "\\e"; 715 else if ((unsigned char)*i < 0x20) { // Control characters not handled above. 716 std::string HexStr = utohexstr(*i); 717 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 718 } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence. 719 UTF8Decoded UnicodeScalarValue 720 = decodeUTF8(StringRef(i, Input.end() - i)); 721 if (UnicodeScalarValue.second == 0) { 722 // Found invalid char. 723 SmallString<4> Val; 724 encodeUTF8(0xFFFD, Val); 725 EscapedInput.insert(EscapedInput.end(), Val.begin(), Val.end()); 726 // FIXME: Error reporting. 727 return EscapedInput; 728 } 729 if (UnicodeScalarValue.first == 0x85) 730 EscapedInput += "\\N"; 731 else if (UnicodeScalarValue.first == 0xA0) 732 EscapedInput += "\\_"; 733 else if (UnicodeScalarValue.first == 0x2028) 734 EscapedInput += "\\L"; 735 else if (UnicodeScalarValue.first == 0x2029) 736 EscapedInput += "\\P"; 737 else { 738 std::string HexStr = utohexstr(UnicodeScalarValue.first); 739 if (HexStr.size() <= 2) 740 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 741 else if (HexStr.size() <= 4) 742 EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr; 743 else if (HexStr.size() <= 8) 744 EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr; 745 } 746 i += UnicodeScalarValue.second - 1; 747 } else 748 EscapedInput.push_back(*i); 749 } 750 return EscapedInput; 751 } 752 753 Scanner::Scanner(StringRef Input, SourceMgr &sm, bool ShowColors, 754 std::error_code *EC) 755 : SM(sm), ShowColors(ShowColors), EC(EC) { 756 init(MemoryBufferRef(Input, "YAML")); 757 } 758 759 Scanner::Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors, 760 std::error_code *EC) 761 : SM(SM_), ShowColors(ShowColors), EC(EC) { 762 init(Buffer); 763 } 764 765 void 
Scanner::init(MemoryBufferRef Buffer) { 766 InputBuffer = Buffer; 767 Current = InputBuffer.getBufferStart(); 768 End = InputBuffer.getBufferEnd(); 769 Indent = -1; 770 Column = 0; 771 Line = 0; 772 FlowLevel = 0; 773 IsStartOfStream = true; 774 IsSimpleKeyAllowed = true; 775 Failed = false; 776 std::unique_ptr<MemoryBuffer> InputBufferOwner = 777 MemoryBuffer::getMemBuffer(Buffer); 778 SM.AddNewSourceBuffer(std::move(InputBufferOwner), SMLoc()); 779 } 780 781 Token &Scanner::peekNext() { 782 // If the current token is a possible simple key, keep parsing until we 783 // can confirm. 784 bool NeedMore = false; 785 while (true) { 786 if (TokenQueue.empty() || NeedMore) { 787 if (!fetchMoreTokens()) { 788 TokenQueue.clear(); 789 TokenQueue.push_back(Token()); 790 return TokenQueue.front(); 791 } 792 } 793 assert(!TokenQueue.empty() && 794 "fetchMoreTokens lied about getting tokens!"); 795 796 removeStaleSimpleKeyCandidates(); 797 SimpleKey SK; 798 SK.Tok = TokenQueue.begin(); 799 if (!is_contained(SimpleKeys, SK)) 800 break; 801 else 802 NeedMore = true; 803 } 804 return TokenQueue.front(); 805 } 806 807 Token Scanner::getNext() { 808 Token Ret = peekNext(); 809 // TokenQueue can be empty if there was an error getting the next token. 810 if (!TokenQueue.empty()) 811 TokenQueue.pop_front(); 812 813 // There cannot be any referenced Token's if the TokenQueue is empty. So do a 814 // quick deallocation of them all. 815 if (TokenQueue.empty()) 816 TokenQueue.resetAlloc(); 817 818 return Ret; 819 } 820 821 StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) { 822 if (Position == End) 823 return Position; 824 // Check 7 bit c-printable - b-char. 825 if ( *Position == 0x09 826 || (*Position >= 0x20 && *Position <= 0x7E)) 827 return Position + 1; 828 829 // Check for valid UTF-8. 
830 if (uint8_t(*Position) & 0x80) { 831 UTF8Decoded u8d = decodeUTF8(Position); 832 if ( u8d.second != 0 833 && u8d.first != 0xFEFF 834 && ( u8d.first == 0x85 835 || ( u8d.first >= 0xA0 836 && u8d.first <= 0xD7FF) 837 || ( u8d.first >= 0xE000 838 && u8d.first <= 0xFFFD) 839 || ( u8d.first >= 0x10000 840 && u8d.first <= 0x10FFFF))) 841 return Position + u8d.second; 842 } 843 return Position; 844 } 845 846 StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) { 847 if (Position == End) 848 return Position; 849 if (*Position == 0x0D) { 850 if (Position + 1 != End && *(Position + 1) == 0x0A) 851 return Position + 2; 852 return Position + 1; 853 } 854 855 if (*Position == 0x0A) 856 return Position + 1; 857 return Position; 858 } 859 860 StringRef::iterator Scanner::skip_s_space(StringRef::iterator Position) { 861 if (Position == End) 862 return Position; 863 if (*Position == ' ') 864 return Position + 1; 865 return Position; 866 } 867 868 StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) { 869 if (Position == End) 870 return Position; 871 if (*Position == ' ' || *Position == '\t') 872 return Position + 1; 873 return Position; 874 } 875 876 StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) { 877 if (Position == End) 878 return Position; 879 if (*Position == ' ' || *Position == '\t') 880 return Position; 881 return skip_nb_char(Position); 882 } 883 884 StringRef::iterator Scanner::skip_while( SkipWhileFunc Func 885 , StringRef::iterator Position) { 886 while (true) { 887 StringRef::iterator i = (this->*Func)(Position); 888 if (i == Position) 889 break; 890 Position = i; 891 } 892 return Position; 893 } 894 895 void Scanner::advanceWhile(SkipWhileFunc Func) { 896 auto Final = skip_while(Func, Current); 897 Column += Final - Current; 898 Current = Final; 899 } 900 901 static bool is_ns_hex_digit(const char C) { 902 return (C >= '0' && C <= '9') 903 || (C >= 'a' && C <= 'z') 904 || (C >= 'A' && C <= 'Z'); 905 } 
906 907 static bool is_ns_word_char(const char C) { 908 return C == '-' 909 || (C >= 'a' && C <= 'z') 910 || (C >= 'A' && C <= 'Z'); 911 } 912 913 void Scanner::scan_ns_uri_char() { 914 while (true) { 915 if (Current == End) 916 break; 917 if (( *Current == '%' 918 && Current + 2 < End 919 && is_ns_hex_digit(*(Current + 1)) 920 && is_ns_hex_digit(*(Current + 2))) 921 || is_ns_word_char(*Current) 922 || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]") 923 != StringRef::npos) { 924 ++Current; 925 ++Column; 926 } else 927 break; 928 } 929 } 930 931 bool Scanner::consume(uint32_t Expected) { 932 if (Expected >= 0x80) 933 report_fatal_error("Not dealing with this yet"); 934 if (Current == End) 935 return false; 936 if (uint8_t(*Current) >= 0x80) 937 report_fatal_error("Not dealing with this yet"); 938 if (uint8_t(*Current) == Expected) { 939 ++Current; 940 ++Column; 941 return true; 942 } 943 return false; 944 } 945 946 void Scanner::skip(uint32_t Distance) { 947 Current += Distance; 948 Column += Distance; 949 assert(Current <= End && "Skipped past the end"); 950 } 951 952 bool Scanner::isBlankOrBreak(StringRef::iterator Position) { 953 if (Position == End) 954 return false; 955 return *Position == ' ' || *Position == '\t' || *Position == '\r' || 956 *Position == '\n'; 957 } 958 959 bool Scanner::consumeLineBreakIfPresent() { 960 auto Next = skip_b_break(Current); 961 if (Next == Current) 962 return false; 963 Column = 0; 964 ++Line; 965 Current = Next; 966 return true; 967 } 968 969 void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok 970 , unsigned AtColumn 971 , bool IsRequired) { 972 if (IsSimpleKeyAllowed) { 973 SimpleKey SK; 974 SK.Tok = Tok; 975 SK.Line = Line; 976 SK.Column = AtColumn; 977 SK.IsRequired = IsRequired; 978 SK.FlowLevel = FlowLevel; 979 SimpleKeys.push_back(SK); 980 } 981 } 982 983 void Scanner::removeStaleSimpleKeyCandidates() { 984 for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin(); 985 i != 
SimpleKeys.end();) { 986 if (i->Line != Line || i->Column + 1024 < Column) { 987 if (i->IsRequired) 988 setError( "Could not find expected : for simple key" 989 , i->Tok->Range.begin()); 990 i = SimpleKeys.erase(i); 991 } else 992 ++i; 993 } 994 } 995 996 void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) { 997 if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level) 998 SimpleKeys.pop_back(); 999 } 1000 1001 bool Scanner::unrollIndent(int ToColumn) { 1002 Token T; 1003 // Indentation is ignored in flow. 1004 if (FlowLevel != 0) 1005 return true; 1006 1007 while (Indent > ToColumn) { 1008 T.Kind = Token::TK_BlockEnd; 1009 T.Range = StringRef(Current, 1); 1010 TokenQueue.push_back(T); 1011 Indent = Indents.pop_back_val(); 1012 } 1013 1014 return true; 1015 } 1016 1017 bool Scanner::rollIndent( int ToColumn 1018 , Token::TokenKind Kind 1019 , TokenQueueT::iterator InsertPoint) { 1020 if (FlowLevel) 1021 return true; 1022 if (Indent < ToColumn) { 1023 Indents.push_back(Indent); 1024 Indent = ToColumn; 1025 1026 Token T; 1027 T.Kind = Kind; 1028 T.Range = StringRef(Current, 0); 1029 TokenQueue.insert(InsertPoint, T); 1030 } 1031 return true; 1032 } 1033 1034 void Scanner::skipComment() { 1035 if (*Current != '#') 1036 return; 1037 while (true) { 1038 // This may skip more than one byte, thus Column is only incremented 1039 // for code points. 1040 StringRef::iterator I = skip_nb_char(Current); 1041 if (I == Current) 1042 break; 1043 Current = I; 1044 ++Column; 1045 } 1046 } 1047 1048 void Scanner::scanToNextToken() { 1049 while (true) { 1050 while (*Current == ' ' || *Current == '\t') { 1051 skip(1); 1052 } 1053 1054 skipComment(); 1055 1056 // Skip EOL. 1057 StringRef::iterator i = skip_b_break(Current); 1058 if (i == Current) 1059 break; 1060 Current = i; 1061 ++Line; 1062 Column = 0; 1063 // New lines may start a simple key. 
1064 if (!FlowLevel) 1065 IsSimpleKeyAllowed = true; 1066 } 1067 } 1068 1069 bool Scanner::scanStreamStart() { 1070 IsStartOfStream = false; 1071 1072 EncodingInfo EI = getUnicodeEncoding(currentInput()); 1073 1074 Token T; 1075 T.Kind = Token::TK_StreamStart; 1076 T.Range = StringRef(Current, EI.second); 1077 TokenQueue.push_back(T); 1078 Current += EI.second; 1079 return true; 1080 } 1081 1082 bool Scanner::scanStreamEnd() { 1083 // Force an ending new line if one isn't present. 1084 if (Column != 0) { 1085 Column = 0; 1086 ++Line; 1087 } 1088 1089 unrollIndent(-1); 1090 SimpleKeys.clear(); 1091 IsSimpleKeyAllowed = false; 1092 1093 Token T; 1094 T.Kind = Token::TK_StreamEnd; 1095 T.Range = StringRef(Current, 0); 1096 TokenQueue.push_back(T); 1097 return true; 1098 } 1099 1100 bool Scanner::scanDirective() { 1101 // Reset the indentation level. 1102 unrollIndent(-1); 1103 SimpleKeys.clear(); 1104 IsSimpleKeyAllowed = false; 1105 1106 StringRef::iterator Start = Current; 1107 consume('%'); 1108 StringRef::iterator NameStart = Current; 1109 Current = skip_while(&Scanner::skip_ns_char, Current); 1110 StringRef Name(NameStart, Current - NameStart); 1111 Current = skip_while(&Scanner::skip_s_white, Current); 1112 1113 Token T; 1114 if (Name == "YAML") { 1115 Current = skip_while(&Scanner::skip_ns_char, Current); 1116 T.Kind = Token::TK_VersionDirective; 1117 T.Range = StringRef(Start, Current - Start); 1118 TokenQueue.push_back(T); 1119 return true; 1120 } else if(Name == "TAG") { 1121 Current = skip_while(&Scanner::skip_ns_char, Current); 1122 Current = skip_while(&Scanner::skip_s_white, Current); 1123 Current = skip_while(&Scanner::skip_ns_char, Current); 1124 T.Kind = Token::TK_TagDirective; 1125 T.Range = StringRef(Start, Current - Start); 1126 TokenQueue.push_back(T); 1127 return true; 1128 } 1129 return false; 1130 } 1131 1132 bool Scanner::scanDocumentIndicator(bool IsStart) { 1133 unrollIndent(-1); 1134 SimpleKeys.clear(); 1135 IsSimpleKeyAllowed = false; 1136 
1137 Token T; 1138 T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd; 1139 T.Range = StringRef(Current, 3); 1140 skip(3); 1141 TokenQueue.push_back(T); 1142 return true; 1143 } 1144 1145 bool Scanner::scanFlowCollectionStart(bool IsSequence) { 1146 Token T; 1147 T.Kind = IsSequence ? Token::TK_FlowSequenceStart 1148 : Token::TK_FlowMappingStart; 1149 T.Range = StringRef(Current, 1); 1150 skip(1); 1151 TokenQueue.push_back(T); 1152 1153 // [ and { may begin a simple key. 1154 saveSimpleKeyCandidate(--TokenQueue.end(), Column - 1, false); 1155 1156 // And may also be followed by a simple key. 1157 IsSimpleKeyAllowed = true; 1158 ++FlowLevel; 1159 return true; 1160 } 1161 1162 bool Scanner::scanFlowCollectionEnd(bool IsSequence) { 1163 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1164 IsSimpleKeyAllowed = false; 1165 Token T; 1166 T.Kind = IsSequence ? Token::TK_FlowSequenceEnd 1167 : Token::TK_FlowMappingEnd; 1168 T.Range = StringRef(Current, 1); 1169 skip(1); 1170 TokenQueue.push_back(T); 1171 if (FlowLevel) 1172 --FlowLevel; 1173 return true; 1174 } 1175 1176 bool Scanner::scanFlowEntry() { 1177 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1178 IsSimpleKeyAllowed = true; 1179 Token T; 1180 T.Kind = Token::TK_FlowEntry; 1181 T.Range = StringRef(Current, 1); 1182 skip(1); 1183 TokenQueue.push_back(T); 1184 return true; 1185 } 1186 1187 bool Scanner::scanBlockEntry() { 1188 rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end()); 1189 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1190 IsSimpleKeyAllowed = true; 1191 Token T; 1192 T.Kind = Token::TK_BlockEntry; 1193 T.Range = StringRef(Current, 1); 1194 skip(1); 1195 TokenQueue.push_back(T); 1196 return true; 1197 } 1198 1199 bool Scanner::scanKey() { 1200 if (!FlowLevel) 1201 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1202 1203 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1204 IsSimpleKeyAllowed = !FlowLevel; 1205 1206 Token T; 1207 T.Kind = 
Token::TK_Key; 1208 T.Range = StringRef(Current, 1); 1209 skip(1); 1210 TokenQueue.push_back(T); 1211 return true; 1212 } 1213 1214 bool Scanner::scanValue() { 1215 // If the previous token could have been a simple key, insert the key token 1216 // into the token queue. 1217 if (!SimpleKeys.empty()) { 1218 SimpleKey SK = SimpleKeys.pop_back_val(); 1219 Token T; 1220 T.Kind = Token::TK_Key; 1221 T.Range = SK.Tok->Range; 1222 TokenQueueT::iterator i, e; 1223 for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) { 1224 if (i == SK.Tok) 1225 break; 1226 } 1227 assert(i != e && "SimpleKey not in token queue!"); 1228 i = TokenQueue.insert(i, T); 1229 1230 // We may also need to add a Block-Mapping-Start token. 1231 rollIndent(SK.Column, Token::TK_BlockMappingStart, i); 1232 1233 IsSimpleKeyAllowed = false; 1234 } else { 1235 if (!FlowLevel) 1236 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1237 IsSimpleKeyAllowed = !FlowLevel; 1238 } 1239 1240 Token T; 1241 T.Kind = Token::TK_Value; 1242 T.Range = StringRef(Current, 1); 1243 skip(1); 1244 TokenQueue.push_back(T); 1245 return true; 1246 } 1247 1248 // Forbidding inlining improves performance by roughly 20%. 1249 // FIXME: Remove once llvm optimizes this to the faster version without hints. 1250 LLVM_ATTRIBUTE_NOINLINE static bool 1251 wasEscaped(StringRef::iterator First, StringRef::iterator Position); 1252 1253 // Returns whether a character at 'Position' was escaped with a leading '\'. 1254 // 'First' specifies the position of the first character in the string. 1255 static bool wasEscaped(StringRef::iterator First, 1256 StringRef::iterator Position) { 1257 assert(Position - 1 >= First); 1258 StringRef::iterator I = Position - 1; 1259 // We calculate the number of consecutive '\'s before the current position 1260 // by iterating backwards through our string. 
1261 while (I >= First && *I == '\\') --I; 1262 // (Position - 1 - I) now contains the number of '\'s before the current 1263 // position. If it is odd, the character at 'Position' was escaped. 1264 return (Position - 1 - I) % 2 == 1; 1265 } 1266 1267 bool Scanner::scanFlowScalar(bool IsDoubleQuoted) { 1268 StringRef::iterator Start = Current; 1269 unsigned ColStart = Column; 1270 if (IsDoubleQuoted) { 1271 do { 1272 ++Current; 1273 while (Current != End && *Current != '"') 1274 ++Current; 1275 // Repeat until the previous character was not a '\' or was an escaped 1276 // backslash. 1277 } while ( Current != End 1278 && *(Current - 1) == '\\' 1279 && wasEscaped(Start + 1, Current)); 1280 } else { 1281 skip(1); 1282 while (true) { 1283 // Skip a ' followed by another '. 1284 if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') { 1285 skip(2); 1286 continue; 1287 } else if (*Current == '\'') 1288 break; 1289 StringRef::iterator i = skip_nb_char(Current); 1290 if (i == Current) { 1291 i = skip_b_break(Current); 1292 if (i == Current) 1293 break; 1294 Current = i; 1295 Column = 0; 1296 ++Line; 1297 } else { 1298 if (i == End) 1299 break; 1300 Current = i; 1301 ++Column; 1302 } 1303 } 1304 } 1305 1306 if (Current == End) { 1307 setError("Expected quote at end of scalar", Current); 1308 return false; 1309 } 1310 1311 skip(1); // Skip ending quote. 
1312 Token T; 1313 T.Kind = Token::TK_Scalar; 1314 T.Range = StringRef(Start, Current - Start); 1315 TokenQueue.push_back(T); 1316 1317 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1318 1319 IsSimpleKeyAllowed = false; 1320 1321 return true; 1322 } 1323 1324 bool Scanner::scanPlainScalar() { 1325 StringRef::iterator Start = Current; 1326 unsigned ColStart = Column; 1327 unsigned LeadingBlanks = 0; 1328 assert(Indent >= -1 && "Indent must be >= -1 !"); 1329 unsigned indent = static_cast<unsigned>(Indent + 1); 1330 while (true) { 1331 if (*Current == '#') 1332 break; 1333 1334 while (!isBlankOrBreak(Current)) { 1335 if ( FlowLevel && *Current == ':' 1336 && !(isBlankOrBreak(Current + 1) || *(Current + 1) == ',')) { 1337 setError("Found unexpected ':' while scanning a plain scalar", Current); 1338 return false; 1339 } 1340 1341 // Check for the end of the plain scalar. 1342 if ( (*Current == ':' && isBlankOrBreak(Current + 1)) 1343 || ( FlowLevel 1344 && (StringRef(Current, 1).find_first_of(",:?[]{}") 1345 != StringRef::npos))) 1346 break; 1347 1348 StringRef::iterator i = skip_nb_char(Current); 1349 if (i == Current) 1350 break; 1351 Current = i; 1352 ++Column; 1353 } 1354 1355 // Are we at the end? 1356 if (!isBlankOrBreak(Current)) 1357 break; 1358 1359 // Eat blanks. 
1360 StringRef::iterator Tmp = Current; 1361 while (isBlankOrBreak(Tmp)) { 1362 StringRef::iterator i = skip_s_white(Tmp); 1363 if (i != Tmp) { 1364 if (LeadingBlanks && (Column < indent) && *Tmp == '\t') { 1365 setError("Found invalid tab character in indentation", Tmp); 1366 return false; 1367 } 1368 Tmp = i; 1369 ++Column; 1370 } else { 1371 i = skip_b_break(Tmp); 1372 if (!LeadingBlanks) 1373 LeadingBlanks = 1; 1374 Tmp = i; 1375 Column = 0; 1376 ++Line; 1377 } 1378 } 1379 1380 if (!FlowLevel && Column < indent) 1381 break; 1382 1383 Current = Tmp; 1384 } 1385 if (Start == Current) { 1386 setError("Got empty plain scalar", Start); 1387 return false; 1388 } 1389 Token T; 1390 T.Kind = Token::TK_Scalar; 1391 T.Range = StringRef(Start, Current - Start); 1392 TokenQueue.push_back(T); 1393 1394 // Plain scalars can be simple keys. 1395 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1396 1397 IsSimpleKeyAllowed = false; 1398 1399 return true; 1400 } 1401 1402 bool Scanner::scanAliasOrAnchor(bool IsAlias) { 1403 StringRef::iterator Start = Current; 1404 unsigned ColStart = Column; 1405 skip(1); 1406 while(true) { 1407 if ( *Current == '[' || *Current == ']' 1408 || *Current == '{' || *Current == '}' 1409 || *Current == ',' 1410 || *Current == ':') 1411 break; 1412 StringRef::iterator i = skip_ns_char(Current); 1413 if (i == Current) 1414 break; 1415 Current = i; 1416 ++Column; 1417 } 1418 1419 if (Start == Current) { 1420 setError("Got empty alias or anchor", Start); 1421 return false; 1422 } 1423 1424 Token T; 1425 T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor; 1426 T.Range = StringRef(Start, Current - Start); 1427 TokenQueue.push_back(T); 1428 1429 // Alias and anchors can be simple keys. 
1430 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1431 1432 IsSimpleKeyAllowed = false; 1433 1434 return true; 1435 } 1436 1437 char Scanner::scanBlockChompingIndicator() { 1438 char Indicator = ' '; 1439 if (Current != End && (*Current == '+' || *Current == '-')) { 1440 Indicator = *Current; 1441 skip(1); 1442 } 1443 return Indicator; 1444 } 1445 1446 /// Get the number of line breaks after chomping. 1447 /// 1448 /// Return the number of trailing line breaks to emit, depending on 1449 /// \p ChompingIndicator. 1450 static unsigned getChompedLineBreaks(char ChompingIndicator, 1451 unsigned LineBreaks, StringRef Str) { 1452 if (ChompingIndicator == '-') // Strip all line breaks. 1453 return 0; 1454 if (ChompingIndicator == '+') // Keep all line breaks. 1455 return LineBreaks; 1456 // Clip trailing lines. 1457 return Str.empty() ? 0 : 1; 1458 } 1459 1460 unsigned Scanner::scanBlockIndentationIndicator() { 1461 unsigned Indent = 0; 1462 if (Current != End && (*Current >= '1' && *Current <= '9')) { 1463 Indent = unsigned(*Current - '0'); 1464 skip(1); 1465 } 1466 return Indent; 1467 } 1468 1469 bool Scanner::scanBlockScalarHeader(char &ChompingIndicator, 1470 unsigned &IndentIndicator, bool &IsDone) { 1471 auto Start = Current; 1472 1473 ChompingIndicator = scanBlockChompingIndicator(); 1474 IndentIndicator = scanBlockIndentationIndicator(); 1475 // Check for the chomping indicator once again. 1476 if (ChompingIndicator == ' ') 1477 ChompingIndicator = scanBlockChompingIndicator(); 1478 Current = skip_while(&Scanner::skip_s_white, Current); 1479 skipComment(); 1480 1481 if (Current == End) { // EOF, we have an empty scalar. 
Token T;
    T.Kind = Token::TK_BlockScalar;
    T.Range = StringRef(Start, Current - Start);
    TokenQueue.push_back(T);
    IsDone = true;
    return true;
  }

  if (!consumeLineBreakIfPresent()) {
    setError("Expected a line break after block scalar header", Current);
    return false;
  }
  return true;
}

/// Determine the indentation of a block scalar that has no explicit
/// indentation indicator by looking for its first non-empty line.
///
/// \param[out] BlockIndent set to the column of the first non-empty line,
///             when one is found that is indented deeper than
///             \p BlockExitIndent.
/// \param BlockExitIndent a non-empty line at or below this column ends the
///        block scalar.
/// \param[in,out] LineBreaks incremented once per line break consumed while
///                skipping leading empty lines.
/// \param[out] IsDone set to true when the scalar ends before any content
///             line is found (exit indentation reached, end of input, or no
///             line break to consume).
/// \returns false, after reporting an error, if a leading all-space line is
///          longer than the detected indentation; true otherwise.
bool Scanner::findBlockScalarIndent(unsigned &BlockIndent,
                                    unsigned BlockExitIndent,
                                    unsigned &LineBreaks, bool &IsDone) {
  unsigned MaxAllSpaceLineCharacters = 0;
  // Only read when MaxAllSpaceLineCharacters is non-zero, i.e. after it has
  // been assigned below.
  StringRef::iterator LongestAllSpaceLine;

  while (true) {
    // Skip the leading spaces of this line.
    advanceWhile(&Scanner::skip_s_space);
    if (skip_nb_char(Current) != Current) {
      // This line isn't empty, so try and find the indentation.
      if (Column <= BlockExitIndent) { // End of the block literal.
        IsDone = true;
        return true;
      }
      // We found the block's indentation.
      BlockIndent = Column;
      if (MaxAllSpaceLineCharacters > BlockIndent) {
        setError(
            "Leading all-spaces line must be smaller than the block indent",
            LongestAllSpaceLine);
        return false;
      }
      return true;
    }
    if (skip_b_break(Current) != Current &&
        Column > MaxAllSpaceLineCharacters) {
      // Record the longest all-space line in case it's longer than the
      // discovered block indent.
      MaxAllSpaceLineCharacters = Column;
      LongestAllSpaceLine = Current;
    }

    // Check for EOF.
    if (Current == End) {
      IsDone = true;
      return true;
    }

    if (!consumeLineBreakIfPresent()) {
      IsDone = true;
      return true;
    }
    ++LineBreaks;
  }
  // Not reached: the loop above only exits through a return.
  return true;
}

/// Skip the indentation of the next line of a block scalar and classify the
/// line.
///
/// \param BlockIndent the indentation of the block scalar's content.
/// \param BlockExitIndent a non-empty line at or below this column ends the
///        block scalar.
/// \param[out] IsDone set to true when the line ends the block scalar.
/// \returns false, after reporting an error, if a text line is indented less
///          than \p BlockIndent; true otherwise.
bool Scanner::scanBlockScalarIndent(unsigned BlockIndent,
                                    unsigned BlockExitIndent, bool &IsDone) {
  // Skip the indentation.
while (Column < BlockIndent) {
    auto I = skip_s_space(Current);
    if (I == Current)
      break;
    Current = I;
    ++Column;
  }

  // Nothing printable follows the indentation; the caller handles the line
  // break or the end of input.
  if (skip_nb_char(Current) == Current)
    return true;

  if (Column <= BlockExitIndent) { // End of the block literal.
    IsDone = true;
    return true;
  }

  if (Column < BlockIndent) {
    if (Current != End && *Current == '#') { // Trailing comment.
      IsDone = true;
      return true;
    }
    setError("A text line is less indented than the block scalar", Current);
    return false;
  }
  return true; // A normal text line.
}

/// Scan a block scalar introduced by '|' or '>' and queue it as a
/// TK_BlockScalar token. The token's Value holds the scalar's lines with
/// indentation removed and trailing line breaks chomped.
///
/// NOTE: \p IsLiteral is not read by this function; '|' and '>' scalars are
/// scanned identically.
///
/// \returns false if the header or the body is malformed (an error has been
///          reported by the helper that detected it).
bool Scanner::scanBlockScalar(bool IsLiteral) {
  // Eat '|' or '>'
  assert(*Current == '|' || *Current == '>');
  skip(1);

  char ChompingIndicator;
  unsigned BlockIndent;
  bool IsDone = false;
  if (!scanBlockScalarHeader(ChompingIndicator, BlockIndent, IsDone))
    return false;
  // The input ended right after the header; the header scan has already
  // queued an empty block scalar token.
  if (IsDone)
    return true;

  auto Start = Current;
  // Indent is -1 when no block is open; treat that as column 0.
  unsigned BlockExitIndent = Indent < 0 ? 0 : (unsigned)Indent;
  unsigned LineBreaks = 0;
  // No explicit indentation indicator: detect it from the first content line.
  if (BlockIndent == 0) {
    if (!findBlockScalarIndent(BlockIndent, BlockExitIndent, LineBreaks,
                               IsDone))
      return false;
  }

  // Scan the block's scalars body.
  SmallString<256> Str;
  while (!IsDone) {
    if (!scanBlockScalarIndent(BlockIndent, BlockExitIndent, IsDone))
      return false;
    if (IsDone)
      break;

    // Parse the current line.
    auto LineStart = Current;
    advanceWhile(&Scanner::skip_nb_char);
    if (LineStart != Current) {
      // Line breaks seen since the previous content line are emitted only
      // now, in front of the next content line.
      Str.append(LineBreaks, '\n');
      Str.append(StringRef(LineStart, Current - LineStart));
      LineBreaks = 0;
    }

    // Check for EOF.
1614 if (Current == End) 1615 break; 1616 1617 if (!consumeLineBreakIfPresent()) 1618 break; 1619 ++LineBreaks; 1620 } 1621 1622 if (Current == End && !LineBreaks) 1623 // Ensure that there is at least one line break before the end of file. 1624 LineBreaks = 1; 1625 Str.append(getChompedLineBreaks(ChompingIndicator, LineBreaks, Str), '\n'); 1626 1627 // New lines may start a simple key. 1628 if (!FlowLevel) 1629 IsSimpleKeyAllowed = true; 1630 1631 Token T; 1632 T.Kind = Token::TK_BlockScalar; 1633 T.Range = StringRef(Start, Current - Start); 1634 T.Value = Str.str().str(); 1635 TokenQueue.push_back(T); 1636 return true; 1637 } 1638 1639 bool Scanner::scanTag() { 1640 StringRef::iterator Start = Current; 1641 unsigned ColStart = Column; 1642 skip(1); // Eat !. 1643 if (Current == End || isBlankOrBreak(Current)); // An empty tag. 1644 else if (*Current == '<') { 1645 skip(1); 1646 scan_ns_uri_char(); 1647 if (!consume('>')) 1648 return false; 1649 } else { 1650 // FIXME: Actually parse the c-ns-shorthand-tag rule. 1651 Current = skip_while(&Scanner::skip_ns_char, Current); 1652 } 1653 1654 Token T; 1655 T.Kind = Token::TK_Tag; 1656 T.Range = StringRef(Start, Current - Start); 1657 TokenQueue.push_back(T); 1658 1659 // Tags can be simple keys. 
saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);

  IsSimpleKeyAllowed = false;

  return true;
}

/// Scan the next token(s) from the input into the token queue.
///
/// After the stream-start token has been produced, each call skips to the
/// next token, drops stale simple-key candidates, unrolls indentation to the
/// current column and then dispatches on the character at Current. The order
/// of the checks below matters: earlier checks take precedence.
///
/// \returns the result of the scan function that was dispatched to, or false
///          after reporting an error if no check matches.
bool Scanner::fetchMoreTokens() {
  if (IsStartOfStream)
    return scanStreamStart();

  scanToNextToken();

  if (Current == End)
    return scanStreamEnd();

  removeStaleSimpleKeyCandidates();

  unrollIndent(Column);

  // Directives are only recognized at the start of a line.
  if (Column == 0 && *Current == '%')
    return scanDirective();

  // NOTE(review): Current + 4 <= End is required first, so the
  // Current + 3 == End alternative below can never be true. A "---" (or
  // "..." in the next check) that is the very last thing in the input is
  // therefore not matched here.
  if (Column == 0 && Current + 4 <= End
      && *Current == '-'
      && *(Current + 1) == '-'
      && *(Current + 2) == '-'
      && (Current + 3 == End || isBlankOrBreak(Current + 3)))
    return scanDocumentIndicator(true);

  if (Column == 0 && Current + 4 <= End
      && *Current == '.'
      && *(Current + 1) == '.'
      && *(Current + 2) == '.'
      && (Current + 3 == End || isBlankOrBreak(Current + 3)))
    return scanDocumentIndicator(false);

  if (*Current == '[')
    return scanFlowCollectionStart(true);

  if (*Current == '{')
    return scanFlowCollectionStart(false);

  if (*Current == ']')
    return scanFlowCollectionEnd(true);

  if (*Current == '}')
    return scanFlowCollectionEnd(false);

  if (*Current == ',')
    return scanFlowEntry();

  // A '-' is a block entry only when followed by a blank or a line break.
  if (*Current == '-' && isBlankOrBreak(Current + 1))
    return scanBlockEntry();

  if (*Current == '?'
&& (FlowLevel || isBlankOrBreak(Current + 1))) 1716 return scanKey(); 1717 1718 if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1))) 1719 return scanValue(); 1720 1721 if (*Current == '*') 1722 return scanAliasOrAnchor(true); 1723 1724 if (*Current == '&') 1725 return scanAliasOrAnchor(false); 1726 1727 if (*Current == '!') 1728 return scanTag(); 1729 1730 if (*Current == '|' && !FlowLevel) 1731 return scanBlockScalar(true); 1732 1733 if (*Current == '>' && !FlowLevel) 1734 return scanBlockScalar(false); 1735 1736 if (*Current == '\'') 1737 return scanFlowScalar(false); 1738 1739 if (*Current == '"') 1740 return scanFlowScalar(true); 1741 1742 // Get a plain scalar. 1743 StringRef FirstChar(Current, 1); 1744 if (!(isBlankOrBreak(Current) 1745 || FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos) 1746 || (*Current == '-' && !isBlankOrBreak(Current + 1)) 1747 || (!FlowLevel && (*Current == '?' || *Current == ':') 1748 && isBlankOrBreak(Current + 1)) 1749 || (!FlowLevel && *Current == ':' 1750 && Current + 2 < End 1751 && *(Current + 1) == ':' 1752 && !isBlankOrBreak(Current + 2))) 1753 return scanPlainScalar(); 1754 1755 setError("Unrecognized character while tokenizing."); 1756 return false; 1757 } 1758 1759 Stream::Stream(StringRef Input, SourceMgr &SM, bool ShowColors, 1760 std::error_code *EC) 1761 : scanner(new Scanner(Input, SM, ShowColors, EC)), CurrentDoc() {} 1762 1763 Stream::Stream(MemoryBufferRef InputBuffer, SourceMgr &SM, bool ShowColors, 1764 std::error_code *EC) 1765 : scanner(new Scanner(InputBuffer, SM, ShowColors, EC)), CurrentDoc() {} 1766 1767 Stream::~Stream() = default; 1768 1769 bool Stream::failed() { return scanner->failed(); } 1770 1771 void Stream::printError(Node *N, const Twine &Msg) { 1772 scanner->printError( N->getSourceRange().Start 1773 , SourceMgr::DK_Error 1774 , Msg 1775 , N->getSourceRange()); 1776 } 1777 1778 document_iterator Stream::begin() { 1779 if (CurrentDoc) 1780 report_fatal_error("Can 
only iterate over the stream once"); 1781 1782 // Skip Stream-Start. 1783 scanner->getNext(); 1784 1785 CurrentDoc.reset(new Document(*this)); 1786 return document_iterator(CurrentDoc); 1787 } 1788 1789 document_iterator Stream::end() { 1790 return document_iterator(); 1791 } 1792 1793 void Stream::skip() { 1794 for (document_iterator i = begin(), e = end(); i != e; ++i) 1795 i->skip(); 1796 } 1797 1798 Node::Node(unsigned int Type, std::unique_ptr<Document> &D, StringRef A, 1799 StringRef T) 1800 : Doc(D), TypeID(Type), Anchor(A), Tag(T) { 1801 SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin()); 1802 SourceRange = SMRange(Start, Start); 1803 } 1804 1805 std::string Node::getVerbatimTag() const { 1806 StringRef Raw = getRawTag(); 1807 if (!Raw.empty() && Raw != "!") { 1808 std::string Ret; 1809 if (Raw.find_last_of('!') == 0) { 1810 Ret = Doc->getTagMap().find("!")->second; 1811 Ret += Raw.substr(1); 1812 return Ret; 1813 } else if (Raw.startswith("!!")) { 1814 Ret = Doc->getTagMap().find("!!")->second; 1815 Ret += Raw.substr(2); 1816 return Ret; 1817 } else { 1818 StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1); 1819 std::map<StringRef, StringRef>::const_iterator It = 1820 Doc->getTagMap().find(TagHandle); 1821 if (It != Doc->getTagMap().end()) 1822 Ret = It->second; 1823 else { 1824 Token T; 1825 T.Kind = Token::TK_Tag; 1826 T.Range = TagHandle; 1827 setError(Twine("Unknown tag handle ") + TagHandle, T); 1828 } 1829 Ret += Raw.substr(Raw.find_last_of('!') + 1); 1830 return Ret; 1831 } 1832 } 1833 1834 switch (getType()) { 1835 case NK_Null: 1836 return "tag:yaml.org,2002:null"; 1837 case NK_Scalar: 1838 case NK_BlockScalar: 1839 // TODO: Tag resolution. 
return "tag:yaml.org,2002:str";
  case NK_Mapping:
    return "tag:yaml.org,2002:map";
  case NK_Sequence:
    return "tag:yaml.org,2002:seq";
  }

  // No default tag for any other node kind.
  return "";
}

/// Forward to the owning document: look at the next token without consuming
/// it.
Token &Node::peekNext() {
  return Doc->peekNext();
}

/// Forward to the owning document: consume and return the next token.
Token Node::getNext() {
  return Doc->getNext();
}

/// Forward to the owning document: parse the node that starts at the next
/// token.
Node *Node::parseBlockNode() {
  return Doc->parseBlockNode();
}

/// Return the owning document's node allocator.
BumpPtrAllocator &Node::getAllocator() {
  return Doc->NodeAllocator;
}

/// Forward to the owning document: report error \p Msg at token \p Tok.
void Node::setError(const Twine &Msg, Token &Tok) const {
  Doc->setError(Msg, Tok);
}

/// Forward to the owning document: return whether it has failed.
bool Node::failed() const {
  return Doc->failed();
}

/// Return the scalar's value with quotes removed and escapes resolved.
///
/// \param Storage scratch buffer used only when the value has to be rewritten
///        (escape sequences, line breaks or doubled single quotes). The
///        result then points into \p Storage; otherwise it points into the
///        node's raw Value.
StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const {
  // TODO: Handle newlines properly. We need to remove leading whitespace.
  if (Value[0] == '"') { // Double quoted.
    // Pull off the leading and trailing "s.
    StringRef UnquotedValue = Value.substr(1, Value.size() - 2);
    // Search for characters that would require unescaping the value.
    StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n");
    if (i != StringRef::npos)
      return unescapeDoubleQuoted(UnquotedValue, i, Storage);
    return UnquotedValue;
  } else if (Value[0] == '\'') { // Single quoted.
    // Pull off the leading and trailing 's.
    StringRef UnquotedValue = Value.substr(1, Value.size() - 2);
    // The only escape in a single-quoted scalar is '' standing for one '.
    StringRef::size_type i = UnquotedValue.find('\'');
    if (i != StringRef::npos) {
      // We're going to need Storage.
1890 Storage.clear(); 1891 Storage.reserve(UnquotedValue.size()); 1892 for (; i != StringRef::npos; i = UnquotedValue.find('\'')) { 1893 StringRef Valid(UnquotedValue.begin(), i); 1894 Storage.insert(Storage.end(), Valid.begin(), Valid.end()); 1895 Storage.push_back('\''); 1896 UnquotedValue = UnquotedValue.substr(i + 2); 1897 } 1898 Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); 1899 return StringRef(Storage.begin(), Storage.size()); 1900 } 1901 return UnquotedValue; 1902 } 1903 // Plain or block. 1904 return Value.rtrim(' '); 1905 } 1906 1907 StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue 1908 , StringRef::size_type i 1909 , SmallVectorImpl<char> &Storage) 1910 const { 1911 // Use Storage to build proper value. 1912 Storage.clear(); 1913 Storage.reserve(UnquotedValue.size()); 1914 for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) { 1915 // Insert all previous chars into Storage. 1916 StringRef Valid(UnquotedValue.begin(), i); 1917 Storage.insert(Storage.end(), Valid.begin(), Valid.end()); 1918 // Chop off inserted chars. 1919 UnquotedValue = UnquotedValue.substr(i); 1920 1921 assert(!UnquotedValue.empty() && "Can't be empty!"); 1922 1923 // Parse escape or line break. 1924 switch (UnquotedValue[0]) { 1925 case '\r': 1926 case '\n': 1927 Storage.push_back('\n'); 1928 if ( UnquotedValue.size() > 1 1929 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 1930 UnquotedValue = UnquotedValue.substr(1); 1931 UnquotedValue = UnquotedValue.substr(1); 1932 break; 1933 default: 1934 if (UnquotedValue.size() == 1) 1935 // TODO: Report error. 1936 break; 1937 UnquotedValue = UnquotedValue.substr(1); 1938 switch (UnquotedValue[0]) { 1939 default: { 1940 Token T; 1941 T.Range = StringRef(UnquotedValue.begin(), 1); 1942 setError("Unrecognized escape code!", T); 1943 return ""; 1944 } 1945 case '\r': 1946 case '\n': 1947 // Remove the new line. 
if (   UnquotedValue.size() > 1
            && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n'))
          UnquotedValue = UnquotedValue.substr(1);
        // If this was just a single byte newline, it will get skipped
        // below.
        break;
      case '0': // \0: NUL
        Storage.push_back(0x00);
        break;
      case 'a': // \a: bell
        Storage.push_back(0x07);
        break;
      case 'b': // \b: backspace
        Storage.push_back(0x08);
        break;
      case 't': // \t, or a backslash followed by a literal tab: tab
      case 0x09:
        Storage.push_back(0x09);
        break;
      case 'n': // \n: line feed
        Storage.push_back(0x0A);
        break;
      case 'v': // \v: vertical tab
        Storage.push_back(0x0B);
        break;
      case 'f': // \f: form feed
        Storage.push_back(0x0C);
        break;
      case 'r': // \r: carriage return
        Storage.push_back(0x0D);
        break;
      case 'e': // \e: escape
        Storage.push_back(0x1B);
        break;
      case ' ': // backslash-space: space
        Storage.push_back(0x20);
        break;
      case '"': // \": double quote
        Storage.push_back(0x22);
        break;
      case '/': // \/: slash
        Storage.push_back(0x2F);
        break;
      case '\\': // \\: backslash
        Storage.push_back(0x5C);
        break;
      case 'N': // \N: U+0085 (next line)
        encodeUTF8(0x85, Storage);
        break;
      case '_': // \_: U+00A0 (no-break space)
        encodeUTF8(0xA0, Storage);
        break;
      case 'L': // \L: U+2028 (line separator)
        encodeUTF8(0x2028, Storage);
        break;
      case 'P': // \P: U+2029 (paragraph separator)
        encodeUTF8(0x2029, Storage);
        break;
      // \xXX: the code point given by the next two characters read as
      // hexadecimal. If they do not parse, U+FFFD is emitted instead. If
      // fewer than two characters remain, nothing is emitted and only the
      // 'x' is consumed.
      case 'x': {
          if (UnquotedValue.size() < 3)
            // TODO: Report error.
            break;
          unsigned int UnicodeScalarValue;
          if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue))
            // TODO: Report error.
            UnicodeScalarValue = 0xFFFD;
          encodeUTF8(UnicodeScalarValue, Storage);
          // Step over the two digits; the 'x' itself is consumed after the
          // switch.
          UnquotedValue = UnquotedValue.substr(2);
          break;
        }
      // \uXXXX: as \x, but with four hexadecimal digits.
      case 'u': {
          if (UnquotedValue.size() < 5)
            // TODO: Report error.
            break;
          unsigned int UnicodeScalarValue;
          if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue))
            // TODO: Report error.
2025 UnicodeScalarValue = 0xFFFD; 2026 encodeUTF8(UnicodeScalarValue, Storage); 2027 UnquotedValue = UnquotedValue.substr(4); 2028 break; 2029 } 2030 case 'U': { 2031 if (UnquotedValue.size() < 9) 2032 // TODO: Report error. 2033 break; 2034 unsigned int UnicodeScalarValue; 2035 if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue)) 2036 // TODO: Report error. 2037 UnicodeScalarValue = 0xFFFD; 2038 encodeUTF8(UnicodeScalarValue, Storage); 2039 UnquotedValue = UnquotedValue.substr(8); 2040 break; 2041 } 2042 } 2043 UnquotedValue = UnquotedValue.substr(1); 2044 } 2045 } 2046 Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); 2047 return StringRef(Storage.begin(), Storage.size()); 2048 } 2049 2050 Node *KeyValueNode::getKey() { 2051 if (Key) 2052 return Key; 2053 // Handle implicit null keys. 2054 { 2055 Token &t = peekNext(); 2056 if ( t.Kind == Token::TK_BlockEnd 2057 || t.Kind == Token::TK_Value 2058 || t.Kind == Token::TK_Error) { 2059 return Key = new (getAllocator()) NullNode(Doc); 2060 } 2061 if (t.Kind == Token::TK_Key) 2062 getNext(); // skip TK_Key. 2063 } 2064 2065 // Handle explicit null keys. 2066 Token &t = peekNext(); 2067 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) { 2068 return Key = new (getAllocator()) NullNode(Doc); 2069 } 2070 2071 // We've got a normal key. 2072 return Key = parseBlockNode(); 2073 } 2074 2075 Node *KeyValueNode::getValue() { 2076 if (Value) 2077 return Value; 2078 getKey()->skip(); 2079 if (failed()) 2080 return Value = new (getAllocator()) NullNode(Doc); 2081 2082 // Handle implicit null values. 
{
    Token &t = peekNext();
    // Any of these tokens means the entry ends without a value.
    if (   t.Kind == Token::TK_BlockEnd
        || t.Kind == Token::TK_FlowMappingEnd
        || t.Kind == Token::TK_Key
        || t.Kind == Token::TK_FlowEntry
        || t.Kind == Token::TK_Error) {
      return Value = new (getAllocator()) NullNode(Doc);
    }

    if (t.Kind != Token::TK_Value) {
      setError("Unexpected token in Key Value.", t);
      return Value = new (getAllocator()) NullNode(Doc);
    }
    getNext(); // skip TK_Value.
  }

  // Handle explicit null values.
  // peekNext() is called again: getNext() above consumed the token that the
  // previous reference referred to.
  Token &t = peekNext();
  if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) {
    return Value = new (getAllocator()) NullNode(Doc);
  }

  // We got a normal value.
  return Value = parseBlockNode();
}

/// Advance the mapping to its next key/value entry.
///
/// The current entry, if any, is skipped first. At the end of the mapping, or
/// on any error, IsAtEnd is set and CurrentEntry is cleared.
void MappingNode::increment() {
  if (failed()) {
    IsAtEnd = true;
    CurrentEntry = nullptr;
    return;
  }
  if (CurrentEntry) {
    CurrentEntry->skip();
    // An inline mapping holds exactly one entry.
    if (Type == MT_Inline) {
      IsAtEnd = true;
      CurrentEntry = nullptr;
      return;
    }
  }
  Token T = peekNext();
  if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) {
    // KeyValueNode eats the TK_Key. That way it can detect null keys.
    CurrentEntry = new (getAllocator()) KeyValueNode(Doc);
  } else if (Type == MT_Block) {
    switch (T.Kind) {
    case Token::TK_BlockEnd:
      getNext();
      IsAtEnd = true;
      CurrentEntry = nullptr;
      break;
    default:
      setError("Unexpected token. Expected Key or Block End", T);
      LLVM_FALLTHROUGH;
    case Token::TK_Error:
      IsAtEnd = true;
      CurrentEntry = nullptr;
    }
  } else {
    switch (T.Kind) {
    case Token::TK_FlowEntry:
      // Eat the flow entry and recurse.
      getNext();
      return increment();
    case Token::TK_FlowMappingEnd:
      getNext();
      LLVM_FALLTHROUGH;
    case Token::TK_Error:
      // Set this to end iterator.
IsAtEnd = true;
      CurrentEntry = nullptr;
      break;
    default:
      setError( "Unexpected token. Expected Key, Flow Entry, or Flow "
                "Mapping End."
              , T);
      IsAtEnd = true;
      CurrentEntry = nullptr;
    }
  }
}

/// Advance the sequence to its next entry.
///
/// The current entry, if any, is skipped first. How the next entry is found
/// depends on the sequence kind (block, indentless or flow). At the end of
/// the sequence, or on any error, IsAtEnd is set and CurrentEntry is cleared.
void SequenceNode::increment() {
  if (failed()) {
    IsAtEnd = true;
    CurrentEntry = nullptr;
    return;
  }
  if (CurrentEntry)
    CurrentEntry->skip();
  Token T = peekNext();
  if (SeqType == ST_Block) {
    // Block sequence: entries start with TK_BlockEntry and the sequence is
    // closed by TK_BlockEnd.
    switch (T.Kind) {
    case Token::TK_BlockEntry:
      getNext();
      CurrentEntry = parseBlockNode();
      if (!CurrentEntry) { // An error occurred.
        IsAtEnd = true;
        CurrentEntry = nullptr;
      }
      break;
    case Token::TK_BlockEnd:
      getNext();
      IsAtEnd = true;
      CurrentEntry = nullptr;
      break;
    default:
      setError( "Unexpected token. Expected Block Entry or Block End."
              , T);
      LLVM_FALLTHROUGH;
    case Token::TK_Error:
      IsAtEnd = true;
      CurrentEntry = nullptr;
    }
  } else if (SeqType == ST_Indentless) {
    // Indentless sequence: there is no closing token, so any token other
    // than TK_BlockEntry ends it (without being consumed and without an
    // error).
    switch (T.Kind) {
    case Token::TK_BlockEntry:
      getNext();
      CurrentEntry = parseBlockNode();
      if (!CurrentEntry) { // An error occurred.
        IsAtEnd = true;
        CurrentEntry = nullptr;
      }
      break;
    default:
    case Token::TK_Error:
      IsAtEnd = true;
      CurrentEntry = nullptr;
    }
  } else if (SeqType == ST_Flow) {
    switch (T.Kind) {
    case Token::TK_FlowEntry:
      // Eat the flow entry and recurse.
      getNext();
      WasPreviousTokenFlowEntry = true;
      return increment();
    case Token::TK_FlowSequenceEnd:
      getNext();
      LLVM_FALLTHROUGH;
    case Token::TK_Error:
      // Set this to end iterator.
      IsAtEnd = true;
      CurrentEntry = nullptr;
      break;
    case Token::TK_StreamEnd:
    case Token::TK_DocumentEnd:
    case Token::TK_DocumentStart:
      setError("Could not find closing ]!", T);
      // Set this to end iterator.
2233 IsAtEnd = true; 2234 CurrentEntry = nullptr; 2235 break; 2236 default: 2237 if (!WasPreviousTokenFlowEntry) { 2238 setError("Expected , between entries!", T); 2239 IsAtEnd = true; 2240 CurrentEntry = nullptr; 2241 break; 2242 } 2243 // Otherwise it must be a flow entry. 2244 CurrentEntry = parseBlockNode(); 2245 if (!CurrentEntry) { 2246 IsAtEnd = true; 2247 } 2248 WasPreviousTokenFlowEntry = false; 2249 break; 2250 } 2251 } 2252 } 2253 2254 Document::Document(Stream &S) : stream(S), Root(nullptr) { 2255 // Tag maps starts with two default mappings. 2256 TagMap["!"] = "!"; 2257 TagMap["!!"] = "tag:yaml.org,2002:"; 2258 2259 if (parseDirectives()) 2260 expectToken(Token::TK_DocumentStart); 2261 Token &T = peekNext(); 2262 if (T.Kind == Token::TK_DocumentStart) 2263 getNext(); 2264 } 2265 2266 bool Document::skip() { 2267 if (stream.scanner->failed()) 2268 return false; 2269 if (!Root) 2270 getRoot(); 2271 Root->skip(); 2272 Token &T = peekNext(); 2273 if (T.Kind == Token::TK_StreamEnd) 2274 return false; 2275 if (T.Kind == Token::TK_DocumentEnd) { 2276 getNext(); 2277 return skip(); 2278 } 2279 return true; 2280 } 2281 2282 Token &Document::peekNext() { 2283 return stream.scanner->peekNext(); 2284 } 2285 2286 Token Document::getNext() { 2287 return stream.scanner->getNext(); 2288 } 2289 2290 void Document::setError(const Twine &Message, Token &Location) const { 2291 stream.scanner->setError(Message, Location.Range.begin()); 2292 } 2293 2294 bool Document::failed() const { 2295 return stream.scanner->failed(); 2296 } 2297 2298 Node *Document::parseBlockNode() { 2299 Token T = peekNext(); 2300 // Handle properties. 
2301 Token AnchorInfo; 2302 Token TagInfo; 2303 parse_property: 2304 switch (T.Kind) { 2305 case Token::TK_Alias: 2306 getNext(); 2307 return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1)); 2308 case Token::TK_Anchor: 2309 if (AnchorInfo.Kind == Token::TK_Anchor) { 2310 setError("Already encountered an anchor for this node!", T); 2311 return nullptr; 2312 } 2313 AnchorInfo = getNext(); // Consume TK_Anchor. 2314 T = peekNext(); 2315 goto parse_property; 2316 case Token::TK_Tag: 2317 if (TagInfo.Kind == Token::TK_Tag) { 2318 setError("Already encountered a tag for this node!", T); 2319 return nullptr; 2320 } 2321 TagInfo = getNext(); // Consume TK_Tag. 2322 T = peekNext(); 2323 goto parse_property; 2324 default: 2325 break; 2326 } 2327 2328 switch (T.Kind) { 2329 case Token::TK_BlockEntry: 2330 // We got an unindented BlockEntry sequence. This is not terminated with 2331 // a BlockEnd. 2332 // Don't eat the TK_BlockEntry, SequenceNode needs it. 2333 return new (NodeAllocator) SequenceNode( stream.CurrentDoc 2334 , AnchorInfo.Range.substr(1) 2335 , TagInfo.Range 2336 , SequenceNode::ST_Indentless); 2337 case Token::TK_BlockSequenceStart: 2338 getNext(); 2339 return new (NodeAllocator) 2340 SequenceNode( stream.CurrentDoc 2341 , AnchorInfo.Range.substr(1) 2342 , TagInfo.Range 2343 , SequenceNode::ST_Block); 2344 case Token::TK_BlockMappingStart: 2345 getNext(); 2346 return new (NodeAllocator) 2347 MappingNode( stream.CurrentDoc 2348 , AnchorInfo.Range.substr(1) 2349 , TagInfo.Range 2350 , MappingNode::MT_Block); 2351 case Token::TK_FlowSequenceStart: 2352 getNext(); 2353 return new (NodeAllocator) 2354 SequenceNode( stream.CurrentDoc 2355 , AnchorInfo.Range.substr(1) 2356 , TagInfo.Range 2357 , SequenceNode::ST_Flow); 2358 case Token::TK_FlowMappingStart: 2359 getNext(); 2360 return new (NodeAllocator) 2361 MappingNode( stream.CurrentDoc 2362 , AnchorInfo.Range.substr(1) 2363 , TagInfo.Range 2364 , MappingNode::MT_Flow); 2365 case 
Token::TK_Scalar: 2366 getNext(); 2367 return new (NodeAllocator) 2368 ScalarNode( stream.CurrentDoc 2369 , AnchorInfo.Range.substr(1) 2370 , TagInfo.Range 2371 , T.Range); 2372 case Token::TK_BlockScalar: { 2373 getNext(); 2374 StringRef NullTerminatedStr(T.Value.c_str(), T.Value.length() + 1); 2375 StringRef StrCopy = NullTerminatedStr.copy(NodeAllocator).drop_back(); 2376 return new (NodeAllocator) 2377 BlockScalarNode(stream.CurrentDoc, AnchorInfo.Range.substr(1), 2378 TagInfo.Range, StrCopy, T.Range); 2379 } 2380 case Token::TK_Key: 2381 // Don't eat the TK_Key, KeyValueNode expects it. 2382 return new (NodeAllocator) 2383 MappingNode( stream.CurrentDoc 2384 , AnchorInfo.Range.substr(1) 2385 , TagInfo.Range 2386 , MappingNode::MT_Inline); 2387 case Token::TK_DocumentStart: 2388 case Token::TK_DocumentEnd: 2389 case Token::TK_StreamEnd: 2390 default: 2391 // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not 2392 // !!null null. 2393 return new (NodeAllocator) NullNode(stream.CurrentDoc); 2394 case Token::TK_Error: 2395 return nullptr; 2396 } 2397 llvm_unreachable("Control flow shouldn't reach here."); 2398 return nullptr; 2399 } 2400 2401 bool Document::parseDirectives() { 2402 bool isDirective = false; 2403 while (true) { 2404 Token T = peekNext(); 2405 if (T.Kind == Token::TK_TagDirective) { 2406 parseTAGDirective(); 2407 isDirective = true; 2408 } else if (T.Kind == Token::TK_VersionDirective) { 2409 parseYAMLDirective(); 2410 isDirective = true; 2411 } else 2412 break; 2413 } 2414 return isDirective; 2415 } 2416 2417 void Document::parseYAMLDirective() { 2418 getNext(); // Eat %YAML <version> 2419 } 2420 2421 void Document::parseTAGDirective() { 2422 Token Tag = getNext(); // %TAG <handle> <prefix> 2423 StringRef T = Tag.Range; 2424 // Strip %TAG 2425 T = T.substr(T.find_first_of(" \t")).ltrim(" \t"); 2426 std::size_t HandleEnd = T.find_first_of(" \t"); 2427 StringRef TagHandle = T.substr(0, HandleEnd); 2428 StringRef TagPrefix = 
T.substr(HandleEnd).ltrim(" \t"); 2429 TagMap[TagHandle] = TagPrefix; 2430 } 2431 2432 bool Document::expectToken(int TK) { 2433 Token T = getNext(); 2434 if (T.Kind != TK) { 2435 setError("Unexpected token", T); 2436 return false; 2437 } 2438 return true; 2439 } 2440