1 //===--- YAMLParser.cpp - Simple YAML parser ------------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file implements a YAML parser. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "llvm/Support/YAMLParser.h" 15 #include "llvm/ADT/STLExtras.h" 16 #include "llvm/ADT/SmallString.h" 17 #include "llvm/ADT/SmallVector.h" 18 #include "llvm/ADT/StringExtras.h" 19 #include "llvm/ADT/Twine.h" 20 #include "llvm/ADT/AllocatorList.h" 21 #include "llvm/Support/ErrorHandling.h" 22 #include "llvm/Support/MemoryBuffer.h" 23 #include "llvm/Support/SourceMgr.h" 24 #include "llvm/Support/raw_ostream.h" 25 26 using namespace llvm; 27 using namespace yaml; 28 29 enum UnicodeEncodingForm { 30 UEF_UTF32_LE, ///< UTF-32 Little Endian 31 UEF_UTF32_BE, ///< UTF-32 Big Endian 32 UEF_UTF16_LE, ///< UTF-16 Little Endian 33 UEF_UTF16_BE, ///< UTF-16 Big Endian 34 UEF_UTF8, ///< UTF-8 or ascii. 35 UEF_Unknown ///< Not a valid Unicode encoding. 36 }; 37 38 /// EncodingInfo - Holds the encoding type and length of the byte order mark if 39 /// it exists. Length is in {0, 2, 3, 4}. 40 typedef std::pair<UnicodeEncodingForm, unsigned> EncodingInfo; 41 42 /// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode 43 /// encoding form of \a Input. 44 /// 45 /// @param Input A string of length 0 or more. 46 /// @returns An EncodingInfo indicating the Unicode encoding form of the input 47 /// and how long the byte order mark is if one exists. 48 static EncodingInfo getUnicodeEncoding(StringRef Input) { 49 if (Input.size() == 0) 50 return std::make_pair(UEF_Unknown, 0); 51 52 switch (uint8_t(Input[0])) { 53 case 0x00: 54 if (Input.size() >= 4) { 55 if ( Input[1] == 0 56 && uint8_t(Input[2]) == 0xFE 57 && uint8_t(Input[3]) == 0xFF) 58 return std::make_pair(UEF_UTF32_BE, 4); 59 if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0) 60 return std::make_pair(UEF_UTF32_BE, 0); 61 } 62 63 if (Input.size() >= 2 && Input[1] != 0) 64 return std::make_pair(UEF_UTF16_BE, 0); 65 return std::make_pair(UEF_Unknown, 0); 66 case 0xFF: 67 if ( Input.size() >= 4 68 && uint8_t(Input[1]) == 0xFE 69 && Input[2] == 0 70 && Input[3] == 0) 71 return std::make_pair(UEF_UTF32_LE, 4); 72 73 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE) 74 return std::make_pair(UEF_UTF16_LE, 2); 75 return std::make_pair(UEF_Unknown, 0); 76 case 0xFE: 77 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF) 78 return std::make_pair(UEF_UTF16_BE, 2); 79 return std::make_pair(UEF_Unknown, 0); 80 case 0xEF: 81 if ( Input.size() >= 3 82 && uint8_t(Input[1]) == 0xBB 83 && uint8_t(Input[2]) == 0xBF) 84 return std::make_pair(UEF_UTF8, 3); 85 return std::make_pair(UEF_Unknown, 0); 86 } 87 88 // It could still be utf-32 or utf-16. 89 if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0) 90 return std::make_pair(UEF_UTF32_LE, 0); 91 92 if (Input.size() >= 2 && Input[1] == 0) 93 return std::make_pair(UEF_UTF16_LE, 0); 94 95 return std::make_pair(UEF_UTF8, 0); 96 } 97 98 namespace llvm { 99 namespace yaml { 100 /// Pin the vtables to this file. 101 void Node::anchor() {} 102 void NullNode::anchor() {} 103 void ScalarNode::anchor() {} 104 void BlockScalarNode::anchor() {} 105 void KeyValueNode::anchor() {} 106 void MappingNode::anchor() {} 107 void SequenceNode::anchor() {} 108 void AliasNode::anchor() {} 109 110 /// Token - A single YAML token. 111 struct Token { 112 enum TokenKind { 113 TK_Error, // Uninitialized token. 114 TK_StreamStart, 115 TK_StreamEnd, 116 TK_VersionDirective, 117 TK_TagDirective, 118 TK_DocumentStart, 119 TK_DocumentEnd, 120 TK_BlockEntry, 121 TK_BlockEnd, 122 TK_BlockSequenceStart, 123 TK_BlockMappingStart, 124 TK_FlowEntry, 125 TK_FlowSequenceStart, 126 TK_FlowSequenceEnd, 127 TK_FlowMappingStart, 128 TK_FlowMappingEnd, 129 TK_Key, 130 TK_Value, 131 TK_Scalar, 132 TK_BlockScalar, 133 TK_Alias, 134 TK_Anchor, 135 TK_Tag 136 } Kind; 137 138 /// A string of length 0 or more whose begin() points to the logical location 139 /// of the token in the input. 140 StringRef Range; 141 142 /// The value of a block scalar node. 143 std::string Value; 144 145 Token() : Kind(TK_Error) {} 146 }; 147 } 148 } 149 150 typedef llvm::BumpPtrList<Token> TokenQueueT; 151 152 namespace { 153 /// @brief This struct is used to track simple keys. 154 /// 155 /// Simple keys are handled by creating an entry in SimpleKeys for each Token 156 /// which could legally be the start of a simple key. When peekNext is called, 157 /// if the Token To be returned is referenced by a SimpleKey, we continue 158 /// tokenizing until that potential simple key has either been found to not be 159 /// a simple key (we moved on to the next line or went further than 1024 chars). 160 /// Or when we run into a Value, and then insert a Key token (and possibly 161 /// others) before the SimpleKey's Tok. 162 struct SimpleKey { 163 TokenQueueT::iterator Tok; 164 unsigned Column; 165 unsigned Line; 166 unsigned FlowLevel; 167 bool IsRequired; 168 169 bool operator ==(const SimpleKey &Other) { 170 return Tok == Other.Tok; 171 } 172 }; 173 } 174 175 /// @brief The Unicode scalar value of a UTF-8 minimal well-formed code unit 176 /// subsequence and the subsequence's length in code units (uint8_t). 177 /// A length of 0 represents an error. 178 typedef std::pair<uint32_t, unsigned> UTF8Decoded; 179 180 static UTF8Decoded decodeUTF8(StringRef Range) { 181 StringRef::iterator Position= Range.begin(); 182 StringRef::iterator End = Range.end(); 183 // 1 byte: [0x00, 0x7f] 184 // Bit pattern: 0xxxxxxx 185 if ((*Position & 0x80) == 0) { 186 return std::make_pair(*Position, 1); 187 } 188 // 2 bytes: [0x80, 0x7ff] 189 // Bit pattern: 110xxxxx 10xxxxxx 190 if (Position + 1 != End && 191 ((*Position & 0xE0) == 0xC0) && 192 ((*(Position + 1) & 0xC0) == 0x80)) { 193 uint32_t codepoint = ((*Position & 0x1F) << 6) | 194 (*(Position + 1) & 0x3F); 195 if (codepoint >= 0x80) 196 return std::make_pair(codepoint, 2); 197 } 198 // 3 bytes: [0x8000, 0xffff] 199 // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx 200 if (Position + 2 != End && 201 ((*Position & 0xF0) == 0xE0) && 202 ((*(Position + 1) & 0xC0) == 0x80) && 203 ((*(Position + 2) & 0xC0) == 0x80)) { 204 uint32_t codepoint = ((*Position & 0x0F) << 12) | 205 ((*(Position + 1) & 0x3F) << 6) | 206 (*(Position + 2) & 0x3F); 207 // Codepoints between 0xD800 and 0xDFFF are invalid, as 208 // they are high / low surrogate halves used by UTF-16. 209 if (codepoint >= 0x800 && 210 (codepoint < 0xD800 || codepoint > 0xDFFF)) 211 return std::make_pair(codepoint, 3); 212 } 213 // 4 bytes: [0x10000, 0x10FFFF] 214 // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 215 if (Position + 3 != End && 216 ((*Position & 0xF8) == 0xF0) && 217 ((*(Position + 1) & 0xC0) == 0x80) && 218 ((*(Position + 2) & 0xC0) == 0x80) && 219 ((*(Position + 3) & 0xC0) == 0x80)) { 220 uint32_t codepoint = ((*Position & 0x07) << 18) | 221 ((*(Position + 1) & 0x3F) << 12) | 222 ((*(Position + 2) & 0x3F) << 6) | 223 (*(Position + 3) & 0x3F); 224 if (codepoint >= 0x10000 && codepoint <= 0x10FFFF) 225 return std::make_pair(codepoint, 4); 226 } 227 return std::make_pair(0, 0); 228 } 229 230 namespace llvm { 231 namespace yaml { 232 /// @brief Scans YAML tokens from a MemoryBuffer. 233 class Scanner { 234 public: 235 Scanner(StringRef Input, SourceMgr &SM, bool ShowColors = true, 236 std::error_code *EC = nullptr); 237 Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors = true, 238 std::error_code *EC = nullptr); 239 240 /// @brief Parse the next token and return it without popping it. 241 Token &peekNext(); 242 243 /// @brief Parse the next token and pop it from the queue. 244 Token getNext(); 245 246 void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message, 247 ArrayRef<SMRange> Ranges = None) { 248 SM.PrintMessage(Loc, Kind, Message, Ranges, /* FixIts= */ None, ShowColors); 249 } 250 251 void setError(const Twine &Message, StringRef::iterator Position) { 252 if (Current >= End) 253 Current = End - 1; 254 255 // propagate the error if possible 256 if (EC) 257 *EC = make_error_code(std::errc::invalid_argument); 258 259 // Don't print out more errors after the first one we encounter. The rest 260 // are just the result of the first, and have no meaning. 261 if (!Failed) 262 printError(SMLoc::getFromPointer(Current), SourceMgr::DK_Error, Message); 263 Failed = true; 264 } 265 266 void setError(const Twine &Message) { 267 setError(Message, Current); 268 } 269 270 /// @brief Returns true if an error occurred while parsing. 271 bool failed() { 272 return Failed; 273 } 274 275 private: 276 void init(MemoryBufferRef Buffer); 277 278 StringRef currentInput() { 279 return StringRef(Current, End - Current); 280 } 281 282 /// @brief Decode a UTF-8 minimal well-formed code unit subsequence starting 283 /// at \a Position. 284 /// 285 /// If the UTF-8 code units starting at Position do not form a well-formed 286 /// code unit subsequence, then the Unicode scalar value is 0, and the length 287 /// is 0. 288 UTF8Decoded decodeUTF8(StringRef::iterator Position) { 289 return ::decodeUTF8(StringRef(Position, End - Position)); 290 } 291 292 // The following functions are based on the gramar rules in the YAML spec. The 293 // style of the function names it meant to closely match how they are written 294 // in the spec. The number within the [] is the number of the grammar rule in 295 // the spec. 296 // 297 // See 4.2 [Production Naming Conventions] for the meaning of the prefixes. 298 // 299 // c- 300 // A production starting and ending with a special character. 301 // b- 302 // A production matching a single line break. 303 // nb- 304 // A production starting and ending with a non-break character. 305 // s- 306 // A production starting and ending with a white space character. 307 // ns- 308 // A production starting and ending with a non-space character. 309 // l- 310 // A production matching complete line(s). 311 312 /// @brief Skip a single nb-char[27] starting at Position. 313 /// 314 /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE] 315 /// | [0xFF00-0xFFFD] | [0x10000-0x10FFFF] 316 /// 317 /// @returns The code unit after the nb-char, or Position if it's not an 318 /// nb-char. 319 StringRef::iterator skip_nb_char(StringRef::iterator Position); 320 321 /// @brief Skip a single b-break[28] starting at Position. 322 /// 323 /// A b-break is 0xD 0xA | 0xD | 0xA 324 /// 325 /// @returns The code unit after the b-break, or Position if it's not a 326 /// b-break. 327 StringRef::iterator skip_b_break(StringRef::iterator Position); 328 329 /// Skip a single s-space[31] starting at Position. 330 /// 331 /// An s-space is 0x20 332 /// 333 /// @returns The code unit after the s-space, or Position if it's not a 334 /// s-space. 335 StringRef::iterator skip_s_space(StringRef::iterator Position); 336 337 /// @brief Skip a single s-white[33] starting at Position. 338 /// 339 /// A s-white is 0x20 | 0x9 340 /// 341 /// @returns The code unit after the s-white, or Position if it's not a 342 /// s-white. 343 StringRef::iterator skip_s_white(StringRef::iterator Position); 344 345 /// @brief Skip a single ns-char[34] starting at Position. 346 /// 347 /// A ns-char is nb-char - s-white 348 /// 349 /// @returns The code unit after the ns-char, or Position if it's not a 350 /// ns-char. 351 StringRef::iterator skip_ns_char(StringRef::iterator Position); 352 353 typedef StringRef::iterator (Scanner::*SkipWhileFunc)(StringRef::iterator); 354 /// @brief Skip minimal well-formed code unit subsequences until Func 355 /// returns its input. 356 /// 357 /// @returns The code unit after the last minimal well-formed code unit 358 /// subsequence that Func accepted. 359 StringRef::iterator skip_while( SkipWhileFunc Func 360 , StringRef::iterator Position); 361 362 /// Skip minimal well-formed code unit subsequences until Func returns its 363 /// input. 364 void advanceWhile(SkipWhileFunc Func); 365 366 /// @brief Scan ns-uri-char[39]s starting at Cur. 367 /// 368 /// This updates Cur and Column while scanning. 369 void scan_ns_uri_char(); 370 371 /// @brief Consume a minimal well-formed code unit subsequence starting at 372 /// \a Cur. Return false if it is not the same Unicode scalar value as 373 /// \a Expected. This updates \a Column. 374 bool consume(uint32_t Expected); 375 376 /// @brief Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column. 377 void skip(uint32_t Distance); 378 379 /// @brief Return true if the minimal well-formed code unit subsequence at 380 /// Pos is whitespace or a new line 381 bool isBlankOrBreak(StringRef::iterator Position); 382 383 /// Consume a single b-break[28] if it's present at the current position. 384 /// 385 /// Return false if the code unit at the current position isn't a line break. 386 bool consumeLineBreakIfPresent(); 387 388 /// @brief If IsSimpleKeyAllowed, create and push_back a new SimpleKey. 389 void saveSimpleKeyCandidate( TokenQueueT::iterator Tok 390 , unsigned AtColumn 391 , bool IsRequired); 392 393 /// @brief Remove simple keys that can no longer be valid simple keys. 394 /// 395 /// Invalid simple keys are not on the current line or are further than 1024 396 /// columns back. 397 void removeStaleSimpleKeyCandidates(); 398 399 /// @brief Remove all simple keys on FlowLevel \a Level. 400 void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level); 401 402 /// @brief Unroll indentation in \a Indents back to \a Col. Creates BlockEnd 403 /// tokens if needed. 404 bool unrollIndent(int ToColumn); 405 406 /// @brief Increase indent to \a Col. Creates \a Kind token at \a InsertPoint 407 /// if needed. 408 bool rollIndent( int ToColumn 409 , Token::TokenKind Kind 410 , TokenQueueT::iterator InsertPoint); 411 412 /// @brief Skip a single-line comment when the comment starts at the current 413 /// position of the scanner. 414 void skipComment(); 415 416 /// @brief Skip whitespace and comments until the start of the next token. 417 void scanToNextToken(); 418 419 /// @brief Must be the first token generated. 420 bool scanStreamStart(); 421 422 /// @brief Generate tokens needed to close out the stream. 423 bool scanStreamEnd(); 424 425 /// @brief Scan a %BLAH directive. 426 bool scanDirective(); 427 428 /// @brief Scan a ... or ---. 429 bool scanDocumentIndicator(bool IsStart); 430 431 /// @brief Scan a [ or { and generate the proper flow collection start token. 432 bool scanFlowCollectionStart(bool IsSequence); 433 434 /// @brief Scan a ] or } and generate the proper flow collection end token. 435 bool scanFlowCollectionEnd(bool IsSequence); 436 437 /// @brief Scan the , that separates entries in a flow collection. 438 bool scanFlowEntry(); 439 440 /// @brief Scan the - that starts block sequence entries. 441 bool scanBlockEntry(); 442 443 /// @brief Scan an explicit ? indicating a key. 444 bool scanKey(); 445 446 /// @brief Scan an explicit : indicating a value. 447 bool scanValue(); 448 449 /// @brief Scan a quoted scalar. 450 bool scanFlowScalar(bool IsDoubleQuoted); 451 452 /// @brief Scan an unquoted scalar. 453 bool scanPlainScalar(); 454 455 /// @brief Scan an Alias or Anchor starting with * or &. 456 bool scanAliasOrAnchor(bool IsAlias); 457 458 /// @brief Scan a block scalar starting with | or >. 459 bool scanBlockScalar(bool IsLiteral); 460 461 /// Scan a chomping indicator in a block scalar header. 462 char scanBlockChompingIndicator(); 463 464 /// Scan an indentation indicator in a block scalar header. 465 unsigned scanBlockIndentationIndicator(); 466 467 /// Scan a block scalar header. 468 /// 469 /// Return false if an error occurred. 470 bool scanBlockScalarHeader(char &ChompingIndicator, unsigned &IndentIndicator, 471 bool &IsDone); 472 473 /// Look for the indentation level of a block scalar. 474 /// 475 /// Return false if an error occurred. 476 bool findBlockScalarIndent(unsigned &BlockIndent, unsigned BlockExitIndent, 477 unsigned &LineBreaks, bool &IsDone); 478 479 /// Scan the indentation of a text line in a block scalar. 480 /// 481 /// Return false if an error occurred. 482 bool scanBlockScalarIndent(unsigned BlockIndent, unsigned BlockExitIndent, 483 bool &IsDone); 484 485 /// @brief Scan a tag of the form !stuff. 486 bool scanTag(); 487 488 /// @brief Dispatch to the next scanning function based on \a *Cur. 489 bool fetchMoreTokens(); 490 491 /// @brief The SourceMgr used for diagnostics and buffer management. 492 SourceMgr &SM; 493 494 /// @brief The original input. 495 MemoryBufferRef InputBuffer; 496 497 /// @brief The current position of the scanner. 498 StringRef::iterator Current; 499 500 /// @brief The end of the input (one past the last character). 501 StringRef::iterator End; 502 503 /// @brief Current YAML indentation level in spaces. 504 int Indent; 505 506 /// @brief Current column number in Unicode code points. 507 unsigned Column; 508 509 /// @brief Current line number. 510 unsigned Line; 511 512 /// @brief How deep we are in flow style containers. 0 Means at block level. 513 unsigned FlowLevel; 514 515 /// @brief Are we at the start of the stream? 516 bool IsStartOfStream; 517 518 /// @brief Can the next token be the start of a simple key? 519 bool IsSimpleKeyAllowed; 520 521 /// @brief True if an error has occurred. 522 bool Failed; 523 524 /// @brief Should colors be used when printing out the diagnostic messages? 525 bool ShowColors; 526 527 /// @brief Queue of tokens. This is required to queue up tokens while looking 528 /// for the end of a simple key. And for cases where a single character 529 /// can produce multiple tokens (e.g. BlockEnd). 530 TokenQueueT TokenQueue; 531 532 /// @brief Indentation levels. 533 SmallVector<int, 4> Indents; 534 535 /// @brief Potential simple keys. 536 SmallVector<SimpleKey, 4> SimpleKeys; 537 538 std::error_code *EC; 539 }; 540 541 } // end namespace yaml 542 } // end namespace llvm 543 544 /// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result. 545 static void encodeUTF8( uint32_t UnicodeScalarValue 546 , SmallVectorImpl<char> &Result) { 547 if (UnicodeScalarValue <= 0x7F) { 548 Result.push_back(UnicodeScalarValue & 0x7F); 549 } else if (UnicodeScalarValue <= 0x7FF) { 550 uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6); 551 uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F); 552 Result.push_back(FirstByte); 553 Result.push_back(SecondByte); 554 } else if (UnicodeScalarValue <= 0xFFFF) { 555 uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12); 556 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 557 uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F); 558 Result.push_back(FirstByte); 559 Result.push_back(SecondByte); 560 Result.push_back(ThirdByte); 561 } else if (UnicodeScalarValue <= 0x10FFFF) { 562 uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18); 563 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12); 564 uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 565 uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F); 566 Result.push_back(FirstByte); 567 Result.push_back(SecondByte); 568 Result.push_back(ThirdByte); 569 Result.push_back(FourthByte); 570 } 571 } 572 573 bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) { 574 SourceMgr SM; 575 Scanner scanner(Input, SM); 576 while (true) { 577 Token T = scanner.getNext(); 578 switch (T.Kind) { 579 case Token::TK_StreamStart: 580 OS << "Stream-Start: "; 581 break; 582 case Token::TK_StreamEnd: 583 OS << "Stream-End: "; 584 break; 585 case Token::TK_VersionDirective: 586 OS << "Version-Directive: "; 587 break; 588 case Token::TK_TagDirective: 589 OS << "Tag-Directive: "; 590 break; 591 case Token::TK_DocumentStart: 592 OS << "Document-Start: "; 593 break; 594 case Token::TK_DocumentEnd: 595 OS << "Document-End: "; 596 break; 597 case Token::TK_BlockEntry: 598 OS << "Block-Entry: "; 599 break; 600 case Token::TK_BlockEnd: 601 OS << "Block-End: "; 602 break; 603 case Token::TK_BlockSequenceStart: 604 OS << "Block-Sequence-Start: "; 605 break; 606 case Token::TK_BlockMappingStart: 607 OS << "Block-Mapping-Start: "; 608 break; 609 case Token::TK_FlowEntry: 610 OS << "Flow-Entry: "; 611 break; 612 case Token::TK_FlowSequenceStart: 613 OS << "Flow-Sequence-Start: "; 614 break; 615 case Token::TK_FlowSequenceEnd: 616 OS << "Flow-Sequence-End: "; 617 break; 618 case Token::TK_FlowMappingStart: 619 OS << "Flow-Mapping-Start: "; 620 break; 621 case Token::TK_FlowMappingEnd: 622 OS << "Flow-Mapping-End: "; 623 break; 624 case Token::TK_Key: 625 OS << "Key: "; 626 break; 627 case Token::TK_Value: 628 OS << "Value: "; 629 break; 630 case Token::TK_Scalar: 631 OS << "Scalar: "; 632 break; 633 case Token::TK_BlockScalar: 634 OS << "Block Scalar: "; 635 break; 636 case Token::TK_Alias: 637 OS << "Alias: "; 638 break; 639 case Token::TK_Anchor: 640 OS << "Anchor: "; 641 break; 642 case Token::TK_Tag: 643 OS << "Tag: "; 644 break; 645 case Token::TK_Error: 646 break; 647 } 648 OS << T.Range << "\n"; 649 if (T.Kind == Token::TK_StreamEnd) 650 break; 651 else if (T.Kind == Token::TK_Error) 652 return false; 653 } 654 return true; 655 } 656 657 bool yaml::scanTokens(StringRef Input) { 658 llvm::SourceMgr SM; 659 llvm::yaml::Scanner scanner(Input, SM); 660 for (;;) { 661 llvm::yaml::Token T = scanner.getNext(); 662 if (T.Kind == Token::TK_StreamEnd) 663 break; 664 else if (T.Kind == Token::TK_Error) 665 return false; 666 } 667 return true; 668 } 669 670 std::string yaml::escape(StringRef Input) { 671 std::string EscapedInput; 672 for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) { 673 if (*i == '\\') 674 EscapedInput += "\\\\"; 675 else if (*i == '"') 676 EscapedInput += "\\\""; 677 else if (*i == 0) 678 EscapedInput += "\\0"; 679 else if (*i == 0x07) 680 EscapedInput += "\\a"; 681 else if (*i == 0x08) 682 EscapedInput += "\\b"; 683 else if (*i == 0x09) 684 EscapedInput += "\\t"; 685 else if (*i == 0x0A) 686 EscapedInput += "\\n"; 687 else if (*i == 0x0B) 688 EscapedInput += "\\v"; 689 else if (*i == 0x0C) 690 EscapedInput += "\\f"; 691 else if (*i == 0x0D) 692 EscapedInput += "\\r"; 693 else if (*i == 0x1B) 694 EscapedInput += "\\e"; 695 else if ((unsigned char)*i < 0x20) { // Control characters not handled above. 696 std::string HexStr = utohexstr(*i); 697 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 698 } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence. 699 UTF8Decoded UnicodeScalarValue 700 = decodeUTF8(StringRef(i, Input.end() - i)); 701 if (UnicodeScalarValue.second == 0) { 702 // Found invalid char. 703 SmallString<4> Val; 704 encodeUTF8(0xFFFD, Val); 705 EscapedInput.insert(EscapedInput.end(), Val.begin(), Val.end()); 706 // FIXME: Error reporting. 707 return EscapedInput; 708 } 709 if (UnicodeScalarValue.first == 0x85) 710 EscapedInput += "\\N"; 711 else if (UnicodeScalarValue.first == 0xA0) 712 EscapedInput += "\\_"; 713 else if (UnicodeScalarValue.first == 0x2028) 714 EscapedInput += "\\L"; 715 else if (UnicodeScalarValue.first == 0x2029) 716 EscapedInput += "\\P"; 717 else { 718 std::string HexStr = utohexstr(UnicodeScalarValue.first); 719 if (HexStr.size() <= 2) 720 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 721 else if (HexStr.size() <= 4) 722 EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr; 723 else if (HexStr.size() <= 8) 724 EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr; 725 } 726 i += UnicodeScalarValue.second - 1; 727 } else 728 EscapedInput.push_back(*i); 729 } 730 return EscapedInput; 731 } 732 733 Scanner::Scanner(StringRef Input, SourceMgr &sm, bool ShowColors, 734 std::error_code *EC) 735 : SM(sm), ShowColors(ShowColors), EC(EC) { 736 init(MemoryBufferRef(Input, "YAML")); 737 } 738 739 Scanner::Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors, 740 std::error_code *EC) 741 : SM(SM_), ShowColors(ShowColors), EC(EC) { 742 init(Buffer); 743 } 744 745 void Scanner::init(MemoryBufferRef Buffer) { 746 InputBuffer = Buffer; 747 Current = InputBuffer.getBufferStart(); 748 End = InputBuffer.getBufferEnd(); 749 Indent = -1; 750 Column = 0; 751 Line = 0; 752 FlowLevel = 0; 753 IsStartOfStream = true; 754 IsSimpleKeyAllowed = true; 755 Failed = false; 756 std::unique_ptr<MemoryBuffer> InputBufferOwner = 757 MemoryBuffer::getMemBuffer(Buffer); 758 SM.AddNewSourceBuffer(std::move(InputBufferOwner), SMLoc()); 759 } 760 761 Token &Scanner::peekNext() { 762 // If the current token is a possible simple key, keep parsing until we 763 // can confirm. 764 bool NeedMore = false; 765 while (true) { 766 if (TokenQueue.empty() || NeedMore) { 767 if (!fetchMoreTokens()) { 768 TokenQueue.clear(); 769 TokenQueue.push_back(Token()); 770 return TokenQueue.front(); 771 } 772 } 773 assert(!TokenQueue.empty() && 774 "fetchMoreTokens lied about getting tokens!"); 775 776 removeStaleSimpleKeyCandidates(); 777 SimpleKey SK; 778 SK.Tok = TokenQueue.begin(); 779 if (!is_contained(SimpleKeys, SK)) 780 break; 781 else 782 NeedMore = true; 783 } 784 return TokenQueue.front(); 785 } 786 787 Token Scanner::getNext() { 788 Token Ret = peekNext(); 789 // TokenQueue can be empty if there was an error getting the next token. 790 if (!TokenQueue.empty()) 791 TokenQueue.pop_front(); 792 793 // There cannot be any referenced Token's if the TokenQueue is empty. So do a 794 // quick deallocation of them all. 795 if (TokenQueue.empty()) 796 TokenQueue.resetAlloc(); 797 798 return Ret; 799 } 800 801 StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) { 802 if (Position == End) 803 return Position; 804 // Check 7 bit c-printable - b-char. 805 if ( *Position == 0x09 806 || (*Position >= 0x20 && *Position <= 0x7E)) 807 return Position + 1; 808 809 // Check for valid UTF-8. 810 if (uint8_t(*Position) & 0x80) { 811 UTF8Decoded u8d = decodeUTF8(Position); 812 if ( u8d.second != 0 813 && u8d.first != 0xFEFF 814 && ( u8d.first == 0x85 815 || ( u8d.first >= 0xA0 816 && u8d.first <= 0xD7FF) 817 || ( u8d.first >= 0xE000 818 && u8d.first <= 0xFFFD) 819 || ( u8d.first >= 0x10000 820 && u8d.first <= 0x10FFFF))) 821 return Position + u8d.second; 822 } 823 return Position; 824 } 825 826 StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) { 827 if (Position == End) 828 return Position; 829 if (*Position == 0x0D) { 830 if (Position + 1 != End && *(Position + 1) == 0x0A) 831 return Position + 2; 832 return Position + 1; 833 } 834 835 if (*Position == 0x0A) 836 return Position + 1; 837 return Position; 838 } 839 840 StringRef::iterator Scanner::skip_s_space(StringRef::iterator Position) { 841 if (Position == End) 842 return Position; 843 if (*Position == ' ') 844 return Position + 1; 845 return Position; 846 } 847 848 StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) { 849 if (Position == End) 850 return Position; 851 if (*Position == ' ' || *Position == '\t') 852 return Position + 1; 853 return Position; 854 } 855 856 StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) { 857 if (Position == End) 858 return Position; 859 if (*Position == ' ' || *Position == '\t') 860 return Position; 861 return skip_nb_char(Position); 862 } 863 864 StringRef::iterator Scanner::skip_while( SkipWhileFunc Func 865 , StringRef::iterator Position) { 866 while (true) { 867 StringRef::iterator i = (this->*Func)(Position); 868 if (i == Position) 869 break; 870 Position = i; 871 } 872 return Position; 873 } 874 875 void Scanner::advanceWhile(SkipWhileFunc Func) { 876 auto Final = skip_while(Func, Current); 877 Column += Final - Current; 878 Current = Final; 879 } 880 881 static bool is_ns_hex_digit(const char C) { 882 return (C >= '0' && C <= '9') 883 || (C >= 'a' && C <= 'z') 884 || (C >= 'A' && C <= 'Z'); 885 } 886 887 static bool is_ns_word_char(const char C) { 888 return C == '-' 889 || (C >= 'a' && C <= 'z') 890 || (C >= 'A' && C <= 'Z'); 891 } 892 893 void Scanner::scan_ns_uri_char() { 894 while (true) { 895 if (Current == End) 896 break; 897 if (( *Current == '%' 898 && Current + 2 < End 899 && is_ns_hex_digit(*(Current + 1)) 900 && is_ns_hex_digit(*(Current + 2))) 901 || is_ns_word_char(*Current) 902 || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]") 903 != StringRef::npos) { 904 ++Current; 905 ++Column; 906 } else 907 break; 908 } 909 } 910 911 bool Scanner::consume(uint32_t Expected) { 912 if (Expected >= 0x80) 913 report_fatal_error("Not dealing with this yet"); 914 if (Current == End) 915 return false; 916 if (uint8_t(*Current) >= 0x80) 917 report_fatal_error("Not dealing with this yet"); 918 if (uint8_t(*Current) == Expected) { 919 ++Current; 920 ++Column; 921 return true; 922 } 923 return false; 924 } 925 926 void Scanner::skip(uint32_t Distance) { 927 Current += Distance; 928 Column += Distance; 929 assert(Current <= End && "Skipped past the end"); 930 } 931 932 bool Scanner::isBlankOrBreak(StringRef::iterator Position) { 933 if (Position == End) 934 return false; 935 return *Position == ' ' || *Position == '\t' || *Position == '\r' || 936 *Position == '\n'; 937 } 938 939 bool Scanner::consumeLineBreakIfPresent() { 940 auto Next = skip_b_break(Current); 941 if (Next == Current) 942 return false; 943 Column = 0; 944 ++Line; 945 Current = Next; 946 return true; 947 } 948 949 void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok 950 , unsigned AtColumn 951 , bool IsRequired) { 952 if (IsSimpleKeyAllowed) { 953 SimpleKey SK; 954 SK.Tok = Tok; 955 SK.Line = Line; 956 SK.Column = AtColumn; 957 SK.IsRequired = IsRequired; 958 SK.FlowLevel = FlowLevel; 959 SimpleKeys.push_back(SK); 960 } 961 } 962 963 void Scanner::removeStaleSimpleKeyCandidates() { 964 for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin(); 965 i != SimpleKeys.end();) { 966 if (i->Line != Line || i->Column + 1024 < Column) { 967 if (i->IsRequired) 968 setError( "Could not find expected : for simple key" 969 , i->Tok->Range.begin()); 970 i = SimpleKeys.erase(i); 971 } else 972 ++i; 973 } 974 } 975 976 void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) { 977 if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level) 978 SimpleKeys.pop_back(); 979 } 980 981 bool Scanner::unrollIndent(int ToColumn) { 982 Token T; 983 // Indentation is ignored in flow. 984 if (FlowLevel != 0) 985 return true; 986 987 while (Indent > ToColumn) { 988 T.Kind = Token::TK_BlockEnd; 989 T.Range = StringRef(Current, 1); 990 TokenQueue.push_back(T); 991 Indent = Indents.pop_back_val(); 992 } 993 994 return true; 995 } 996 997 bool Scanner::rollIndent( int ToColumn 998 , Token::TokenKind Kind 999 , TokenQueueT::iterator InsertPoint) { 1000 if (FlowLevel) 1001 return true; 1002 if (Indent < ToColumn) { 1003 Indents.push_back(Indent); 1004 Indent = ToColumn; 1005 1006 Token T; 1007 T.Kind = Kind; 1008 T.Range = StringRef(Current, 0); 1009 TokenQueue.insert(InsertPoint, T); 1010 } 1011 return true; 1012 } 1013 1014 void Scanner::skipComment() { 1015 if (*Current != '#') 1016 return; 1017 while (true) { 1018 // This may skip more than one byte, thus Column is only incremented 1019 // for code points. 1020 StringRef::iterator I = skip_nb_char(Current); 1021 if (I == Current) 1022 break; 1023 Current = I; 1024 ++Column; 1025 } 1026 } 1027 1028 void Scanner::scanToNextToken() { 1029 while (true) { 1030 while (*Current == ' ' || *Current == '\t') { 1031 skip(1); 1032 } 1033 1034 skipComment(); 1035 1036 // Skip EOL. 1037 StringRef::iterator i = skip_b_break(Current); 1038 if (i == Current) 1039 break; 1040 Current = i; 1041 ++Line; 1042 Column = 0; 1043 // New lines may start a simple key. 1044 if (!FlowLevel) 1045 IsSimpleKeyAllowed = true; 1046 } 1047 } 1048 1049 bool Scanner::scanStreamStart() { 1050 IsStartOfStream = false; 1051 1052 EncodingInfo EI = getUnicodeEncoding(currentInput()); 1053 1054 Token T; 1055 T.Kind = Token::TK_StreamStart; 1056 T.Range = StringRef(Current, EI.second); 1057 TokenQueue.push_back(T); 1058 Current += EI.second; 1059 return true; 1060 } 1061 1062 bool Scanner::scanStreamEnd() { 1063 // Force an ending new line if one isn't present. 1064 if (Column != 0) { 1065 Column = 0; 1066 ++Line; 1067 } 1068 1069 unrollIndent(-1); 1070 SimpleKeys.clear(); 1071 IsSimpleKeyAllowed = false; 1072 1073 Token T; 1074 T.Kind = Token::TK_StreamEnd; 1075 T.Range = StringRef(Current, 0); 1076 TokenQueue.push_back(T); 1077 return true; 1078 } 1079 1080 bool Scanner::scanDirective() { 1081 // Reset the indentation level. 1082 unrollIndent(-1); 1083 SimpleKeys.clear(); 1084 IsSimpleKeyAllowed = false; 1085 1086 StringRef::iterator Start = Current; 1087 consume('%'); 1088 StringRef::iterator NameStart = Current; 1089 Current = skip_while(&Scanner::skip_ns_char, Current); 1090 StringRef Name(NameStart, Current - NameStart); 1091 Current = skip_while(&Scanner::skip_s_white, Current); 1092 1093 Token T; 1094 if (Name == "YAML") { 1095 Current = skip_while(&Scanner::skip_ns_char, Current); 1096 T.Kind = Token::TK_VersionDirective; 1097 T.Range = StringRef(Start, Current - Start); 1098 TokenQueue.push_back(T); 1099 return true; 1100 } else if(Name == "TAG") { 1101 Current = skip_while(&Scanner::skip_ns_char, Current); 1102 Current = skip_while(&Scanner::skip_s_white, Current); 1103 Current = skip_while(&Scanner::skip_ns_char, Current); 1104 T.Kind = Token::TK_TagDirective; 1105 T.Range = StringRef(Start, Current - Start); 1106 TokenQueue.push_back(T); 1107 return true; 1108 } 1109 return false; 1110 } 1111 1112 bool Scanner::scanDocumentIndicator(bool IsStart) { 1113 unrollIndent(-1); 1114 SimpleKeys.clear(); 1115 IsSimpleKeyAllowed = false; 1116 1117 Token T; 1118 T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd; 1119 T.Range = StringRef(Current, 3); 1120 skip(3); 1121 TokenQueue.push_back(T); 1122 return true; 1123 } 1124 1125 bool Scanner::scanFlowCollectionStart(bool IsSequence) { 1126 Token T; 1127 T.Kind = IsSequence ? Token::TK_FlowSequenceStart 1128 : Token::TK_FlowMappingStart; 1129 T.Range = StringRef(Current, 1); 1130 skip(1); 1131 TokenQueue.push_back(T); 1132 1133 // [ and { may begin a simple key. 1134 saveSimpleKeyCandidate(--TokenQueue.end(), Column - 1, false); 1135 1136 // And may also be followed by a simple key. 1137 IsSimpleKeyAllowed = true; 1138 ++FlowLevel; 1139 return true; 1140 } 1141 1142 bool Scanner::scanFlowCollectionEnd(bool IsSequence) { 1143 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1144 IsSimpleKeyAllowed = false; 1145 Token T; 1146 T.Kind = IsSequence ? Token::TK_FlowSequenceEnd 1147 : Token::TK_FlowMappingEnd; 1148 T.Range = StringRef(Current, 1); 1149 skip(1); 1150 TokenQueue.push_back(T); 1151 if (FlowLevel) 1152 --FlowLevel; 1153 return true; 1154 } 1155 1156 bool Scanner::scanFlowEntry() { 1157 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1158 IsSimpleKeyAllowed = true; 1159 Token T; 1160 T.Kind = Token::TK_FlowEntry; 1161 T.Range = StringRef(Current, 1); 1162 skip(1); 1163 TokenQueue.push_back(T); 1164 return true; 1165 } 1166 1167 bool Scanner::scanBlockEntry() { 1168 rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end()); 1169 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1170 IsSimpleKeyAllowed = true; 1171 Token T; 1172 T.Kind = Token::TK_BlockEntry; 1173 T.Range = StringRef(Current, 1); 1174 skip(1); 1175 TokenQueue.push_back(T); 1176 return true; 1177 } 1178 1179 bool Scanner::scanKey() { 1180 if (!FlowLevel) 1181 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1182 1183 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1184 IsSimpleKeyAllowed = !FlowLevel; 1185 1186 Token T; 1187 T.Kind = Token::TK_Key; 1188 T.Range = StringRef(Current, 1); 1189 skip(1); 1190 TokenQueue.push_back(T); 1191 return true; 1192 } 1193 1194 bool Scanner::scanValue() { 1195 // If the previous token could have been a simple key, insert the key token 1196 // into the token queue. 1197 if (!SimpleKeys.empty()) { 1198 SimpleKey SK = SimpleKeys.pop_back_val(); 1199 Token T; 1200 T.Kind = Token::TK_Key; 1201 T.Range = SK.Tok->Range; 1202 TokenQueueT::iterator i, e; 1203 for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) { 1204 if (i == SK.Tok) 1205 break; 1206 } 1207 assert(i != e && "SimpleKey not in token queue!"); 1208 i = TokenQueue.insert(i, T); 1209 1210 // We may also need to add a Block-Mapping-Start token. 1211 rollIndent(SK.Column, Token::TK_BlockMappingStart, i); 1212 1213 IsSimpleKeyAllowed = false; 1214 } else { 1215 if (!FlowLevel) 1216 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1217 IsSimpleKeyAllowed = !FlowLevel; 1218 } 1219 1220 Token T; 1221 T.Kind = Token::TK_Value; 1222 T.Range = StringRef(Current, 1); 1223 skip(1); 1224 TokenQueue.push_back(T); 1225 return true; 1226 } 1227 1228 // Forbidding inlining improves performance by roughly 20%. 1229 // FIXME: Remove once llvm optimizes this to the faster version without hints. 1230 LLVM_ATTRIBUTE_NOINLINE static bool 1231 wasEscaped(StringRef::iterator First, StringRef::iterator Position); 1232 1233 // Returns whether a character at 'Position' was escaped with a leading '\'. 1234 // 'First' specifies the position of the first character in the string. 1235 static bool wasEscaped(StringRef::iterator First, 1236 StringRef::iterator Position) { 1237 assert(Position - 1 >= First); 1238 StringRef::iterator I = Position - 1; 1239 // We calculate the number of consecutive '\'s before the current position 1240 // by iterating backwards through our string. 1241 while (I >= First && *I == '\\') --I; 1242 // (Position - 1 - I) now contains the number of '\'s before the current 1243 // position. If it is odd, the character at 'Position' was escaped. 1244 return (Position - 1 - I) % 2 == 1; 1245 } 1246 1247 bool Scanner::scanFlowScalar(bool IsDoubleQuoted) { 1248 StringRef::iterator Start = Current; 1249 unsigned ColStart = Column; 1250 if (IsDoubleQuoted) { 1251 do { 1252 ++Current; 1253 while (Current != End && *Current != '"') 1254 ++Current; 1255 // Repeat until the previous character was not a '\' or was an escaped 1256 // backslash. 1257 } while ( Current != End 1258 && *(Current - 1) == '\\' 1259 && wasEscaped(Start + 1, Current)); 1260 } else { 1261 skip(1); 1262 while (true) { 1263 // Skip a ' followed by another '. 1264 if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') { 1265 skip(2); 1266 continue; 1267 } else if (*Current == '\'') 1268 break; 1269 StringRef::iterator i = skip_nb_char(Current); 1270 if (i == Current) { 1271 i = skip_b_break(Current); 1272 if (i == Current) 1273 break; 1274 Current = i; 1275 Column = 0; 1276 ++Line; 1277 } else { 1278 if (i == End) 1279 break; 1280 Current = i; 1281 ++Column; 1282 } 1283 } 1284 } 1285 1286 if (Current == End) { 1287 setError("Expected quote at end of scalar", Current); 1288 return false; 1289 } 1290 1291 skip(1); // Skip ending quote. 1292 Token T; 1293 T.Kind = Token::TK_Scalar; 1294 T.Range = StringRef(Start, Current - Start); 1295 TokenQueue.push_back(T); 1296 1297 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1298 1299 IsSimpleKeyAllowed = false; 1300 1301 return true; 1302 } 1303 1304 bool Scanner::scanPlainScalar() { 1305 StringRef::iterator Start = Current; 1306 unsigned ColStart = Column; 1307 unsigned LeadingBlanks = 0; 1308 assert(Indent >= -1 && "Indent must be >= -1 !"); 1309 unsigned indent = static_cast<unsigned>(Indent + 1); 1310 while (true) { 1311 if (*Current == '#') 1312 break; 1313 1314 while (!isBlankOrBreak(Current)) { 1315 if ( FlowLevel && *Current == ':' 1316 && !(isBlankOrBreak(Current + 1) || *(Current + 1) == ',')) { 1317 setError("Found unexpected ':' while scanning a plain scalar", Current); 1318 return false; 1319 } 1320 1321 // Check for the end of the plain scalar. 1322 if ( (*Current == ':' && isBlankOrBreak(Current + 1)) 1323 || ( FlowLevel 1324 && (StringRef(Current, 1).find_first_of(",:?[]{}") 1325 != StringRef::npos))) 1326 break; 1327 1328 StringRef::iterator i = skip_nb_char(Current); 1329 if (i == Current) 1330 break; 1331 Current = i; 1332 ++Column; 1333 } 1334 1335 // Are we at the end? 1336 if (!isBlankOrBreak(Current)) 1337 break; 1338 1339 // Eat blanks. 1340 StringRef::iterator Tmp = Current; 1341 while (isBlankOrBreak(Tmp)) { 1342 StringRef::iterator i = skip_s_white(Tmp); 1343 if (i != Tmp) { 1344 if (LeadingBlanks && (Column < indent) && *Tmp == '\t') { 1345 setError("Found invalid tab character in indentation", Tmp); 1346 return false; 1347 } 1348 Tmp = i; 1349 ++Column; 1350 } else { 1351 i = skip_b_break(Tmp); 1352 if (!LeadingBlanks) 1353 LeadingBlanks = 1; 1354 Tmp = i; 1355 Column = 0; 1356 ++Line; 1357 } 1358 } 1359 1360 if (!FlowLevel && Column < indent) 1361 break; 1362 1363 Current = Tmp; 1364 } 1365 if (Start == Current) { 1366 setError("Got empty plain scalar", Start); 1367 return false; 1368 } 1369 Token T; 1370 T.Kind = Token::TK_Scalar; 1371 T.Range = StringRef(Start, Current - Start); 1372 TokenQueue.push_back(T); 1373 1374 // Plain scalars can be simple keys. 1375 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1376 1377 IsSimpleKeyAllowed = false; 1378 1379 return true; 1380 } 1381 1382 bool Scanner::scanAliasOrAnchor(bool IsAlias) { 1383 StringRef::iterator Start = Current; 1384 unsigned ColStart = Column; 1385 skip(1); 1386 while(true) { 1387 if ( *Current == '[' || *Current == ']' 1388 || *Current == '{' || *Current == '}' 1389 || *Current == ',' 1390 || *Current == ':') 1391 break; 1392 StringRef::iterator i = skip_ns_char(Current); 1393 if (i == Current) 1394 break; 1395 Current = i; 1396 ++Column; 1397 } 1398 1399 if (Start == Current) { 1400 setError("Got empty alias or anchor", Start); 1401 return false; 1402 } 1403 1404 Token T; 1405 T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor; 1406 T.Range = StringRef(Start, Current - Start); 1407 TokenQueue.push_back(T); 1408 1409 // Alias and anchors can be simple keys. 1410 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1411 1412 IsSimpleKeyAllowed = false; 1413 1414 return true; 1415 } 1416 1417 char Scanner::scanBlockChompingIndicator() { 1418 char Indicator = ' '; 1419 if (Current != End && (*Current == '+' || *Current == '-')) { 1420 Indicator = *Current; 1421 skip(1); 1422 } 1423 return Indicator; 1424 } 1425 1426 /// Get the number of line breaks after chomping. 1427 /// 1428 /// Return the number of trailing line breaks to emit, depending on 1429 /// \p ChompingIndicator. 1430 static unsigned getChompedLineBreaks(char ChompingIndicator, 1431 unsigned LineBreaks, StringRef Str) { 1432 if (ChompingIndicator == '-') // Strip all line breaks. 1433 return 0; 1434 if (ChompingIndicator == '+') // Keep all line breaks. 1435 return LineBreaks; 1436 // Clip trailing lines. 1437 return Str.empty() ? 0 : 1; 1438 } 1439 1440 unsigned Scanner::scanBlockIndentationIndicator() { 1441 unsigned Indent = 0; 1442 if (Current != End && (*Current >= '1' && *Current <= '9')) { 1443 Indent = unsigned(*Current - '0'); 1444 skip(1); 1445 } 1446 return Indent; 1447 } 1448 1449 bool Scanner::scanBlockScalarHeader(char &ChompingIndicator, 1450 unsigned &IndentIndicator, bool &IsDone) { 1451 auto Start = Current; 1452 1453 ChompingIndicator = scanBlockChompingIndicator(); 1454 IndentIndicator = scanBlockIndentationIndicator(); 1455 // Check for the chomping indicator once again. 1456 if (ChompingIndicator == ' ') 1457 ChompingIndicator = scanBlockChompingIndicator(); 1458 Current = skip_while(&Scanner::skip_s_white, Current); 1459 skipComment(); 1460 1461 if (Current == End) { // EOF, we have an empty scalar. 1462 Token T; 1463 T.Kind = Token::TK_BlockScalar; 1464 T.Range = StringRef(Start, Current - Start); 1465 TokenQueue.push_back(T); 1466 IsDone = true; 1467 return true; 1468 } 1469 1470 if (!consumeLineBreakIfPresent()) { 1471 setError("Expected a line break after block scalar header", Current); 1472 return false; 1473 } 1474 return true; 1475 } 1476 1477 bool Scanner::findBlockScalarIndent(unsigned &BlockIndent, 1478 unsigned BlockExitIndent, 1479 unsigned &LineBreaks, bool &IsDone) { 1480 unsigned MaxAllSpaceLineCharacters = 0; 1481 StringRef::iterator LongestAllSpaceLine; 1482 1483 while (true) { 1484 advanceWhile(&Scanner::skip_s_space); 1485 if (skip_nb_char(Current) != Current) { 1486 // This line isn't empty, so try and find the indentation. 1487 if (Column <= BlockExitIndent) { // End of the block literal. 1488 IsDone = true; 1489 return true; 1490 } 1491 // We found the block's indentation. 1492 BlockIndent = Column; 1493 if (MaxAllSpaceLineCharacters > BlockIndent) { 1494 setError( 1495 "Leading all-spaces line must be smaller than the block indent", 1496 LongestAllSpaceLine); 1497 return false; 1498 } 1499 return true; 1500 } 1501 if (skip_b_break(Current) != Current && 1502 Column > MaxAllSpaceLineCharacters) { 1503 // Record the longest all-space line in case it's longer than the 1504 // discovered block indent. 1505 MaxAllSpaceLineCharacters = Column; 1506 LongestAllSpaceLine = Current; 1507 } 1508 1509 // Check for EOF. 1510 if (Current == End) { 1511 IsDone = true; 1512 return true; 1513 } 1514 1515 if (!consumeLineBreakIfPresent()) { 1516 IsDone = true; 1517 return true; 1518 } 1519 ++LineBreaks; 1520 } 1521 return true; 1522 } 1523 1524 bool Scanner::scanBlockScalarIndent(unsigned BlockIndent, 1525 unsigned BlockExitIndent, bool &IsDone) { 1526 // Skip the indentation. 1527 while (Column < BlockIndent) { 1528 auto I = skip_s_space(Current); 1529 if (I == Current) 1530 break; 1531 Current = I; 1532 ++Column; 1533 } 1534 1535 if (skip_nb_char(Current) == Current) 1536 return true; 1537 1538 if (Column <= BlockExitIndent) { // End of the block literal. 1539 IsDone = true; 1540 return true; 1541 } 1542 1543 if (Column < BlockIndent) { 1544 if (Current != End && *Current == '#') { // Trailing comment. 1545 IsDone = true; 1546 return true; 1547 } 1548 setError("A text line is less indented than the block scalar", Current); 1549 return false; 1550 } 1551 return true; // A normal text line. 1552 } 1553 1554 bool Scanner::scanBlockScalar(bool IsLiteral) { 1555 // Eat '|' or '>' 1556 assert(*Current == '|' || *Current == '>'); 1557 skip(1); 1558 1559 char ChompingIndicator; 1560 unsigned BlockIndent; 1561 bool IsDone = false; 1562 if (!scanBlockScalarHeader(ChompingIndicator, BlockIndent, IsDone)) 1563 return false; 1564 if (IsDone) 1565 return true; 1566 1567 auto Start = Current; 1568 unsigned BlockExitIndent = Indent < 0 ? 0 : (unsigned)Indent; 1569 unsigned LineBreaks = 0; 1570 if (BlockIndent == 0) { 1571 if (!findBlockScalarIndent(BlockIndent, BlockExitIndent, LineBreaks, 1572 IsDone)) 1573 return false; 1574 } 1575 1576 // Scan the block's scalars body. 1577 SmallString<256> Str; 1578 while (!IsDone) { 1579 if (!scanBlockScalarIndent(BlockIndent, BlockExitIndent, IsDone)) 1580 return false; 1581 if (IsDone) 1582 break; 1583 1584 // Parse the current line. 1585 auto LineStart = Current; 1586 advanceWhile(&Scanner::skip_nb_char); 1587 if (LineStart != Current) { 1588 Str.append(LineBreaks, '\n'); 1589 Str.append(StringRef(LineStart, Current - LineStart)); 1590 LineBreaks = 0; 1591 } 1592 1593 // Check for EOF. 1594 if (Current == End) 1595 break; 1596 1597 if (!consumeLineBreakIfPresent()) 1598 break; 1599 ++LineBreaks; 1600 } 1601 1602 if (Current == End && !LineBreaks) 1603 // Ensure that there is at least one line break before the end of file. 1604 LineBreaks = 1; 1605 Str.append(getChompedLineBreaks(ChompingIndicator, LineBreaks, Str), '\n'); 1606 1607 // New lines may start a simple key. 1608 if (!FlowLevel) 1609 IsSimpleKeyAllowed = true; 1610 1611 Token T; 1612 T.Kind = Token::TK_BlockScalar; 1613 T.Range = StringRef(Start, Current - Start); 1614 T.Value = Str.str().str(); 1615 TokenQueue.push_back(T); 1616 return true; 1617 } 1618 1619 bool Scanner::scanTag() { 1620 StringRef::iterator Start = Current; 1621 unsigned ColStart = Column; 1622 skip(1); // Eat !. 1623 if (Current == End || isBlankOrBreak(Current)); // An empty tag. 1624 else if (*Current == '<') { 1625 skip(1); 1626 scan_ns_uri_char(); 1627 if (!consume('>')) 1628 return false; 1629 } else { 1630 // FIXME: Actually parse the c-ns-shorthand-tag rule. 1631 Current = skip_while(&Scanner::skip_ns_char, Current); 1632 } 1633 1634 Token T; 1635 T.Kind = Token::TK_Tag; 1636 T.Range = StringRef(Start, Current - Start); 1637 TokenQueue.push_back(T); 1638 1639 // Tags can be simple keys. 1640 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1641 1642 IsSimpleKeyAllowed = false; 1643 1644 return true; 1645 } 1646 1647 bool Scanner::fetchMoreTokens() { 1648 if (IsStartOfStream) 1649 return scanStreamStart(); 1650 1651 scanToNextToken(); 1652 1653 if (Current == End) 1654 return scanStreamEnd(); 1655 1656 removeStaleSimpleKeyCandidates(); 1657 1658 unrollIndent(Column); 1659 1660 if (Column == 0 && *Current == '%') 1661 return scanDirective(); 1662 1663 if (Column == 0 && Current + 4 <= End 1664 && *Current == '-' 1665 && *(Current + 1) == '-' 1666 && *(Current + 2) == '-' 1667 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1668 return scanDocumentIndicator(true); 1669 1670 if (Column == 0 && Current + 4 <= End 1671 && *Current == '.' 1672 && *(Current + 1) == '.' 1673 && *(Current + 2) == '.' 1674 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1675 return scanDocumentIndicator(false); 1676 1677 if (*Current == '[') 1678 return scanFlowCollectionStart(true); 1679 1680 if (*Current == '{') 1681 return scanFlowCollectionStart(false); 1682 1683 if (*Current == ']') 1684 return scanFlowCollectionEnd(true); 1685 1686 if (*Current == '}') 1687 return scanFlowCollectionEnd(false); 1688 1689 if (*Current == ',') 1690 return scanFlowEntry(); 1691 1692 if (*Current == '-' && isBlankOrBreak(Current + 1)) 1693 return scanBlockEntry(); 1694 1695 if (*Current == '?' && (FlowLevel || isBlankOrBreak(Current + 1))) 1696 return scanKey(); 1697 1698 if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1))) 1699 return scanValue(); 1700 1701 if (*Current == '*') 1702 return scanAliasOrAnchor(true); 1703 1704 if (*Current == '&') 1705 return scanAliasOrAnchor(false); 1706 1707 if (*Current == '!') 1708 return scanTag(); 1709 1710 if (*Current == '|' && !FlowLevel) 1711 return scanBlockScalar(true); 1712 1713 if (*Current == '>' && !FlowLevel) 1714 return scanBlockScalar(false); 1715 1716 if (*Current == '\'') 1717 return scanFlowScalar(false); 1718 1719 if (*Current == '"') 1720 return scanFlowScalar(true); 1721 1722 // Get a plain scalar. 1723 StringRef FirstChar(Current, 1); 1724 if (!(isBlankOrBreak(Current) 1725 || FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos) 1726 || (*Current == '-' && !isBlankOrBreak(Current + 1)) 1727 || (!FlowLevel && (*Current == '?' || *Current == ':') 1728 && isBlankOrBreak(Current + 1)) 1729 || (!FlowLevel && *Current == ':' 1730 && Current + 2 < End 1731 && *(Current + 1) == ':' 1732 && !isBlankOrBreak(Current + 2))) 1733 return scanPlainScalar(); 1734 1735 setError("Unrecognized character while tokenizing."); 1736 return false; 1737 } 1738 1739 Stream::Stream(StringRef Input, SourceMgr &SM, bool ShowColors, 1740 std::error_code *EC) 1741 : scanner(new Scanner(Input, SM, ShowColors, EC)), CurrentDoc() {} 1742 1743 Stream::Stream(MemoryBufferRef InputBuffer, SourceMgr &SM, bool ShowColors, 1744 std::error_code *EC) 1745 : scanner(new Scanner(InputBuffer, SM, ShowColors, EC)), CurrentDoc() {} 1746 1747 Stream::~Stream() {} 1748 1749 bool Stream::failed() { return scanner->failed(); } 1750 1751 void Stream::printError(Node *N, const Twine &Msg) { 1752 scanner->printError( N->getSourceRange().Start 1753 , SourceMgr::DK_Error 1754 , Msg 1755 , N->getSourceRange()); 1756 } 1757 1758 document_iterator Stream::begin() { 1759 if (CurrentDoc) 1760 report_fatal_error("Can only iterate over the stream once"); 1761 1762 // Skip Stream-Start. 1763 scanner->getNext(); 1764 1765 CurrentDoc.reset(new Document(*this)); 1766 return document_iterator(CurrentDoc); 1767 } 1768 1769 document_iterator Stream::end() { 1770 return document_iterator(); 1771 } 1772 1773 void Stream::skip() { 1774 for (document_iterator i = begin(), e = end(); i != e; ++i) 1775 i->skip(); 1776 } 1777 1778 Node::Node(unsigned int Type, std::unique_ptr<Document> &D, StringRef A, 1779 StringRef T) 1780 : Doc(D), TypeID(Type), Anchor(A), Tag(T) { 1781 SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin()); 1782 SourceRange = SMRange(Start, Start); 1783 } 1784 1785 std::string Node::getVerbatimTag() const { 1786 StringRef Raw = getRawTag(); 1787 if (!Raw.empty() && Raw != "!") { 1788 std::string Ret; 1789 if (Raw.find_last_of('!') == 0) { 1790 Ret = Doc->getTagMap().find("!")->second; 1791 Ret += Raw.substr(1); 1792 return Ret; 1793 } else if (Raw.startswith("!!")) { 1794 Ret = Doc->getTagMap().find("!!")->second; 1795 Ret += Raw.substr(2); 1796 return Ret; 1797 } else { 1798 StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1); 1799 std::map<StringRef, StringRef>::const_iterator It = 1800 Doc->getTagMap().find(TagHandle); 1801 if (It != Doc->getTagMap().end()) 1802 Ret = It->second; 1803 else { 1804 Token T; 1805 T.Kind = Token::TK_Tag; 1806 T.Range = TagHandle; 1807 setError(Twine("Unknown tag handle ") + TagHandle, T); 1808 } 1809 Ret += Raw.substr(Raw.find_last_of('!') + 1); 1810 return Ret; 1811 } 1812 } 1813 1814 switch (getType()) { 1815 case NK_Null: 1816 return "tag:yaml.org,2002:null"; 1817 case NK_Scalar: 1818 case NK_BlockScalar: 1819 // TODO: Tag resolution. 1820 return "tag:yaml.org,2002:str"; 1821 case NK_Mapping: 1822 return "tag:yaml.org,2002:map"; 1823 case NK_Sequence: 1824 return "tag:yaml.org,2002:seq"; 1825 } 1826 1827 return ""; 1828 } 1829 1830 Token &Node::peekNext() { 1831 return Doc->peekNext(); 1832 } 1833 1834 Token Node::getNext() { 1835 return Doc->getNext(); 1836 } 1837 1838 Node *Node::parseBlockNode() { 1839 return Doc->parseBlockNode(); 1840 } 1841 1842 BumpPtrAllocator &Node::getAllocator() { 1843 return Doc->NodeAllocator; 1844 } 1845 1846 void Node::setError(const Twine &Msg, Token &Tok) const { 1847 Doc->setError(Msg, Tok); 1848 } 1849 1850 bool Node::failed() const { 1851 return Doc->failed(); 1852 } 1853 1854 1855 1856 StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const { 1857 // TODO: Handle newlines properly. We need to remove leading whitespace. 1858 if (Value[0] == '"') { // Double quoted. 1859 // Pull off the leading and trailing "s. 1860 StringRef UnquotedValue = Value.substr(1, Value.size() - 2); 1861 // Search for characters that would require unescaping the value. 1862 StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n"); 1863 if (i != StringRef::npos) 1864 return unescapeDoubleQuoted(UnquotedValue, i, Storage); 1865 return UnquotedValue; 1866 } else if (Value[0] == '\'') { // Single quoted. 1867 // Pull off the leading and trailing 's. 1868 StringRef UnquotedValue = Value.substr(1, Value.size() - 2); 1869 StringRef::size_type i = UnquotedValue.find('\''); 1870 if (i != StringRef::npos) { 1871 // We're going to need Storage. 1872 Storage.clear(); 1873 Storage.reserve(UnquotedValue.size()); 1874 for (; i != StringRef::npos; i = UnquotedValue.find('\'')) { 1875 StringRef Valid(UnquotedValue.begin(), i); 1876 Storage.insert(Storage.end(), Valid.begin(), Valid.end()); 1877 Storage.push_back('\''); 1878 UnquotedValue = UnquotedValue.substr(i + 2); 1879 } 1880 Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); 1881 return StringRef(Storage.begin(), Storage.size()); 1882 } 1883 return UnquotedValue; 1884 } 1885 // Plain or block. 1886 return Value.rtrim(' '); 1887 } 1888 1889 StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue 1890 , StringRef::size_type i 1891 , SmallVectorImpl<char> &Storage) 1892 const { 1893 // Use Storage to build proper value. 1894 Storage.clear(); 1895 Storage.reserve(UnquotedValue.size()); 1896 for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) { 1897 // Insert all previous chars into Storage. 1898 StringRef Valid(UnquotedValue.begin(), i); 1899 Storage.insert(Storage.end(), Valid.begin(), Valid.end()); 1900 // Chop off inserted chars. 1901 UnquotedValue = UnquotedValue.substr(i); 1902 1903 assert(!UnquotedValue.empty() && "Can't be empty!"); 1904 1905 // Parse escape or line break. 1906 switch (UnquotedValue[0]) { 1907 case '\r': 1908 case '\n': 1909 Storage.push_back('\n'); 1910 if ( UnquotedValue.size() > 1 1911 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 1912 UnquotedValue = UnquotedValue.substr(1); 1913 UnquotedValue = UnquotedValue.substr(1); 1914 break; 1915 default: 1916 if (UnquotedValue.size() == 1) 1917 // TODO: Report error. 1918 break; 1919 UnquotedValue = UnquotedValue.substr(1); 1920 switch (UnquotedValue[0]) { 1921 default: { 1922 Token T; 1923 T.Range = StringRef(UnquotedValue.begin(), 1); 1924 setError("Unrecognized escape code!", T); 1925 return ""; 1926 } 1927 case '\r': 1928 case '\n': 1929 // Remove the new line. 1930 if ( UnquotedValue.size() > 1 1931 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 1932 UnquotedValue = UnquotedValue.substr(1); 1933 // If this was just a single byte newline, it will get skipped 1934 // below. 1935 break; 1936 case '0': 1937 Storage.push_back(0x00); 1938 break; 1939 case 'a': 1940 Storage.push_back(0x07); 1941 break; 1942 case 'b': 1943 Storage.push_back(0x08); 1944 break; 1945 case 't': 1946 case 0x09: 1947 Storage.push_back(0x09); 1948 break; 1949 case 'n': 1950 Storage.push_back(0x0A); 1951 break; 1952 case 'v': 1953 Storage.push_back(0x0B); 1954 break; 1955 case 'f': 1956 Storage.push_back(0x0C); 1957 break; 1958 case 'r': 1959 Storage.push_back(0x0D); 1960 break; 1961 case 'e': 1962 Storage.push_back(0x1B); 1963 break; 1964 case ' ': 1965 Storage.push_back(0x20); 1966 break; 1967 case '"': 1968 Storage.push_back(0x22); 1969 break; 1970 case '/': 1971 Storage.push_back(0x2F); 1972 break; 1973 case '\\': 1974 Storage.push_back(0x5C); 1975 break; 1976 case 'N': 1977 encodeUTF8(0x85, Storage); 1978 break; 1979 case '_': 1980 encodeUTF8(0xA0, Storage); 1981 break; 1982 case 'L': 1983 encodeUTF8(0x2028, Storage); 1984 break; 1985 case 'P': 1986 encodeUTF8(0x2029, Storage); 1987 break; 1988 case 'x': { 1989 if (UnquotedValue.size() < 3) 1990 // TODO: Report error. 1991 break; 1992 unsigned int UnicodeScalarValue; 1993 if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue)) 1994 // TODO: Report error. 1995 UnicodeScalarValue = 0xFFFD; 1996 encodeUTF8(UnicodeScalarValue, Storage); 1997 UnquotedValue = UnquotedValue.substr(2); 1998 break; 1999 } 2000 case 'u': { 2001 if (UnquotedValue.size() < 5) 2002 // TODO: Report error. 2003 break; 2004 unsigned int UnicodeScalarValue; 2005 if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue)) 2006 // TODO: Report error. 2007 UnicodeScalarValue = 0xFFFD; 2008 encodeUTF8(UnicodeScalarValue, Storage); 2009 UnquotedValue = UnquotedValue.substr(4); 2010 break; 2011 } 2012 case 'U': { 2013 if (UnquotedValue.size() < 9) 2014 // TODO: Report error. 2015 break; 2016 unsigned int UnicodeScalarValue; 2017 if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue)) 2018 // TODO: Report error. 2019 UnicodeScalarValue = 0xFFFD; 2020 encodeUTF8(UnicodeScalarValue, Storage); 2021 UnquotedValue = UnquotedValue.substr(8); 2022 break; 2023 } 2024 } 2025 UnquotedValue = UnquotedValue.substr(1); 2026 } 2027 } 2028 Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); 2029 return StringRef(Storage.begin(), Storage.size()); 2030 } 2031 2032 Node *KeyValueNode::getKey() { 2033 if (Key) 2034 return Key; 2035 // Handle implicit null keys. 2036 { 2037 Token &t = peekNext(); 2038 if ( t.Kind == Token::TK_BlockEnd 2039 || t.Kind == Token::TK_Value 2040 || t.Kind == Token::TK_Error) { 2041 return Key = new (getAllocator()) NullNode(Doc); 2042 } 2043 if (t.Kind == Token::TK_Key) 2044 getNext(); // skip TK_Key. 2045 } 2046 2047 // Handle explicit null keys. 2048 Token &t = peekNext(); 2049 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) { 2050 return Key = new (getAllocator()) NullNode(Doc); 2051 } 2052 2053 // We've got a normal key. 2054 return Key = parseBlockNode(); 2055 } 2056 2057 Node *KeyValueNode::getValue() { 2058 if (Value) 2059 return Value; 2060 getKey()->skip(); 2061 if (failed()) 2062 return Value = new (getAllocator()) NullNode(Doc); 2063 2064 // Handle implicit null values. 2065 { 2066 Token &t = peekNext(); 2067 if ( t.Kind == Token::TK_BlockEnd 2068 || t.Kind == Token::TK_FlowMappingEnd 2069 || t.Kind == Token::TK_Key 2070 || t.Kind == Token::TK_FlowEntry 2071 || t.Kind == Token::TK_Error) { 2072 return Value = new (getAllocator()) NullNode(Doc); 2073 } 2074 2075 if (t.Kind != Token::TK_Value) { 2076 setError("Unexpected token in Key Value.", t); 2077 return Value = new (getAllocator()) NullNode(Doc); 2078 } 2079 getNext(); // skip TK_Value. 2080 } 2081 2082 // Handle explicit null values. 2083 Token &t = peekNext(); 2084 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) { 2085 return Value = new (getAllocator()) NullNode(Doc); 2086 } 2087 2088 // We got a normal value. 2089 return Value = parseBlockNode(); 2090 } 2091 2092 void MappingNode::increment() { 2093 if (failed()) { 2094 IsAtEnd = true; 2095 CurrentEntry = nullptr; 2096 return; 2097 } 2098 if (CurrentEntry) { 2099 CurrentEntry->skip(); 2100 if (Type == MT_Inline) { 2101 IsAtEnd = true; 2102 CurrentEntry = nullptr; 2103 return; 2104 } 2105 } 2106 Token T = peekNext(); 2107 if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) { 2108 // KeyValueNode eats the TK_Key. That way it can detect null keys. 2109 CurrentEntry = new (getAllocator()) KeyValueNode(Doc); 2110 } else if (Type == MT_Block) { 2111 switch (T.Kind) { 2112 case Token::TK_BlockEnd: 2113 getNext(); 2114 IsAtEnd = true; 2115 CurrentEntry = nullptr; 2116 break; 2117 default: 2118 setError("Unexpected token. Expected Key or Block End", T); 2119 case Token::TK_Error: 2120 IsAtEnd = true; 2121 CurrentEntry = nullptr; 2122 } 2123 } else { 2124 switch (T.Kind) { 2125 case Token::TK_FlowEntry: 2126 // Eat the flow entry and recurse. 2127 getNext(); 2128 return increment(); 2129 case Token::TK_FlowMappingEnd: 2130 getNext(); 2131 case Token::TK_Error: 2132 // Set this to end iterator. 2133 IsAtEnd = true; 2134 CurrentEntry = nullptr; 2135 break; 2136 default: 2137 setError( "Unexpected token. Expected Key, Flow Entry, or Flow " 2138 "Mapping End." 2139 , T); 2140 IsAtEnd = true; 2141 CurrentEntry = nullptr; 2142 } 2143 } 2144 } 2145 2146 void SequenceNode::increment() { 2147 if (failed()) { 2148 IsAtEnd = true; 2149 CurrentEntry = nullptr; 2150 return; 2151 } 2152 if (CurrentEntry) 2153 CurrentEntry->skip(); 2154 Token T = peekNext(); 2155 if (SeqType == ST_Block) { 2156 switch (T.Kind) { 2157 case Token::TK_BlockEntry: 2158 getNext(); 2159 CurrentEntry = parseBlockNode(); 2160 if (!CurrentEntry) { // An error occurred. 2161 IsAtEnd = true; 2162 CurrentEntry = nullptr; 2163 } 2164 break; 2165 case Token::TK_BlockEnd: 2166 getNext(); 2167 IsAtEnd = true; 2168 CurrentEntry = nullptr; 2169 break; 2170 default: 2171 setError( "Unexpected token. Expected Block Entry or Block End." 2172 , T); 2173 case Token::TK_Error: 2174 IsAtEnd = true; 2175 CurrentEntry = nullptr; 2176 } 2177 } else if (SeqType == ST_Indentless) { 2178 switch (T.Kind) { 2179 case Token::TK_BlockEntry: 2180 getNext(); 2181 CurrentEntry = parseBlockNode(); 2182 if (!CurrentEntry) { // An error occurred. 2183 IsAtEnd = true; 2184 CurrentEntry = nullptr; 2185 } 2186 break; 2187 default: 2188 case Token::TK_Error: 2189 IsAtEnd = true; 2190 CurrentEntry = nullptr; 2191 } 2192 } else if (SeqType == ST_Flow) { 2193 switch (T.Kind) { 2194 case Token::TK_FlowEntry: 2195 // Eat the flow entry and recurse. 2196 getNext(); 2197 WasPreviousTokenFlowEntry = true; 2198 return increment(); 2199 case Token::TK_FlowSequenceEnd: 2200 getNext(); 2201 case Token::TK_Error: 2202 // Set this to end iterator. 2203 IsAtEnd = true; 2204 CurrentEntry = nullptr; 2205 break; 2206 case Token::TK_StreamEnd: 2207 case Token::TK_DocumentEnd: 2208 case Token::TK_DocumentStart: 2209 setError("Could not find closing ]!", T); 2210 // Set this to end iterator. 2211 IsAtEnd = true; 2212 CurrentEntry = nullptr; 2213 break; 2214 default: 2215 if (!WasPreviousTokenFlowEntry) { 2216 setError("Expected , between entries!", T); 2217 IsAtEnd = true; 2218 CurrentEntry = nullptr; 2219 break; 2220 } 2221 // Otherwise it must be a flow entry. 2222 CurrentEntry = parseBlockNode(); 2223 if (!CurrentEntry) { 2224 IsAtEnd = true; 2225 } 2226 WasPreviousTokenFlowEntry = false; 2227 break; 2228 } 2229 } 2230 } 2231 2232 Document::Document(Stream &S) : stream(S), Root(nullptr) { 2233 // Tag maps starts with two default mappings. 2234 TagMap["!"] = "!"; 2235 TagMap["!!"] = "tag:yaml.org,2002:"; 2236 2237 if (parseDirectives()) 2238 expectToken(Token::TK_DocumentStart); 2239 Token &T = peekNext(); 2240 if (T.Kind == Token::TK_DocumentStart) 2241 getNext(); 2242 } 2243 2244 bool Document::skip() { 2245 if (stream.scanner->failed()) 2246 return false; 2247 if (!Root) 2248 getRoot(); 2249 Root->skip(); 2250 Token &T = peekNext(); 2251 if (T.Kind == Token::TK_StreamEnd) 2252 return false; 2253 if (T.Kind == Token::TK_DocumentEnd) { 2254 getNext(); 2255 return skip(); 2256 } 2257 return true; 2258 } 2259 2260 Token &Document::peekNext() { 2261 return stream.scanner->peekNext(); 2262 } 2263 2264 Token Document::getNext() { 2265 return stream.scanner->getNext(); 2266 } 2267 2268 void Document::setError(const Twine &Message, Token &Location) const { 2269 stream.scanner->setError(Message, Location.Range.begin()); 2270 } 2271 2272 bool Document::failed() const { 2273 return stream.scanner->failed(); 2274 } 2275 2276 Node *Document::parseBlockNode() { 2277 Token T = peekNext(); 2278 // Handle properties. 2279 Token AnchorInfo; 2280 Token TagInfo; 2281 parse_property: 2282 switch (T.Kind) { 2283 case Token::TK_Alias: 2284 getNext(); 2285 return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1)); 2286 case Token::TK_Anchor: 2287 if (AnchorInfo.Kind == Token::TK_Anchor) { 2288 setError("Already encountered an anchor for this node!", T); 2289 return nullptr; 2290 } 2291 AnchorInfo = getNext(); // Consume TK_Anchor. 2292 T = peekNext(); 2293 goto parse_property; 2294 case Token::TK_Tag: 2295 if (TagInfo.Kind == Token::TK_Tag) { 2296 setError("Already encountered a tag for this node!", T); 2297 return nullptr; 2298 } 2299 TagInfo = getNext(); // Consume TK_Tag. 2300 T = peekNext(); 2301 goto parse_property; 2302 default: 2303 break; 2304 } 2305 2306 switch (T.Kind) { 2307 case Token::TK_BlockEntry: 2308 // We got an unindented BlockEntry sequence. This is not terminated with 2309 // a BlockEnd. 2310 // Don't eat the TK_BlockEntry, SequenceNode needs it. 2311 return new (NodeAllocator) SequenceNode( stream.CurrentDoc 2312 , AnchorInfo.Range.substr(1) 2313 , TagInfo.Range 2314 , SequenceNode::ST_Indentless); 2315 case Token::TK_BlockSequenceStart: 2316 getNext(); 2317 return new (NodeAllocator) 2318 SequenceNode( stream.CurrentDoc 2319 , AnchorInfo.Range.substr(1) 2320 , TagInfo.Range 2321 , SequenceNode::ST_Block); 2322 case Token::TK_BlockMappingStart: 2323 getNext(); 2324 return new (NodeAllocator) 2325 MappingNode( stream.CurrentDoc 2326 , AnchorInfo.Range.substr(1) 2327 , TagInfo.Range 2328 , MappingNode::MT_Block); 2329 case Token::TK_FlowSequenceStart: 2330 getNext(); 2331 return new (NodeAllocator) 2332 SequenceNode( stream.CurrentDoc 2333 , AnchorInfo.Range.substr(1) 2334 , TagInfo.Range 2335 , SequenceNode::ST_Flow); 2336 case Token::TK_FlowMappingStart: 2337 getNext(); 2338 return new (NodeAllocator) 2339 MappingNode( stream.CurrentDoc 2340 , AnchorInfo.Range.substr(1) 2341 , TagInfo.Range 2342 , MappingNode::MT_Flow); 2343 case Token::TK_Scalar: 2344 getNext(); 2345 return new (NodeAllocator) 2346 ScalarNode( stream.CurrentDoc 2347 , AnchorInfo.Range.substr(1) 2348 , TagInfo.Range 2349 , T.Range); 2350 case Token::TK_BlockScalar: { 2351 getNext(); 2352 StringRef NullTerminatedStr(T.Value.c_str(), T.Value.length() + 1); 2353 StringRef StrCopy = NullTerminatedStr.copy(NodeAllocator).drop_back(); 2354 return new (NodeAllocator) 2355 BlockScalarNode(stream.CurrentDoc, AnchorInfo.Range.substr(1), 2356 TagInfo.Range, StrCopy, T.Range); 2357 } 2358 case Token::TK_Key: 2359 // Don't eat the TK_Key, KeyValueNode expects it. 2360 return new (NodeAllocator) 2361 MappingNode( stream.CurrentDoc 2362 , AnchorInfo.Range.substr(1) 2363 , TagInfo.Range 2364 , MappingNode::MT_Inline); 2365 case Token::TK_DocumentStart: 2366 case Token::TK_DocumentEnd: 2367 case Token::TK_StreamEnd: 2368 default: 2369 // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not 2370 // !!null null. 2371 return new (NodeAllocator) NullNode(stream.CurrentDoc); 2372 case Token::TK_Error: 2373 return nullptr; 2374 } 2375 llvm_unreachable("Control flow shouldn't reach here."); 2376 return nullptr; 2377 } 2378 2379 bool Document::parseDirectives() { 2380 bool isDirective = false; 2381 while (true) { 2382 Token T = peekNext(); 2383 if (T.Kind == Token::TK_TagDirective) { 2384 parseTAGDirective(); 2385 isDirective = true; 2386 } else if (T.Kind == Token::TK_VersionDirective) { 2387 parseYAMLDirective(); 2388 isDirective = true; 2389 } else 2390 break; 2391 } 2392 return isDirective; 2393 } 2394 2395 void Document::parseYAMLDirective() { 2396 getNext(); // Eat %YAML <version> 2397 } 2398 2399 void Document::parseTAGDirective() { 2400 Token Tag = getNext(); // %TAG <handle> <prefix> 2401 StringRef T = Tag.Range; 2402 // Strip %TAG 2403 T = T.substr(T.find_first_of(" \t")).ltrim(" \t"); 2404 std::size_t HandleEnd = T.find_first_of(" \t"); 2405 StringRef TagHandle = T.substr(0, HandleEnd); 2406 StringRef TagPrefix = T.substr(HandleEnd).ltrim(" \t"); 2407 TagMap[TagHandle] = TagPrefix; 2408 } 2409 2410 bool Document::expectToken(int TK) { 2411 Token T = getNext(); 2412 if (T.Kind != TK) { 2413 setError("Unexpected token", T); 2414 return false; 2415 } 2416 return true; 2417 } 2418