1 //===--- YAMLParser.cpp - Simple YAML parser ------------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file implements a YAML parser. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "llvm/Support/YAMLParser.h" 15 #include "llvm/ADT/STLExtras.h" 16 #include "llvm/ADT/SmallString.h" 17 #include "llvm/ADT/SmallVector.h" 18 #include "llvm/ADT/StringExtras.h" 19 #include "llvm/ADT/Twine.h" 20 #include "llvm/ADT/ilist.h" 21 #include "llvm/ADT/ilist_node.h" 22 #include "llvm/Support/ErrorHandling.h" 23 #include "llvm/Support/MemoryBuffer.h" 24 #include "llvm/Support/SourceMgr.h" 25 #include "llvm/Support/raw_ostream.h" 26 27 using namespace llvm; 28 using namespace yaml; 29 30 enum UnicodeEncodingForm { 31 UEF_UTF32_LE, ///< UTF-32 Little Endian 32 UEF_UTF32_BE, ///< UTF-32 Big Endian 33 UEF_UTF16_LE, ///< UTF-16 Little Endian 34 UEF_UTF16_BE, ///< UTF-16 Big Endian 35 UEF_UTF8, ///< UTF-8 or ascii. 36 UEF_Unknown ///< Not a valid Unicode encoding. 37 }; 38 39 /// EncodingInfo - Holds the encoding type and length of the byte order mark if 40 /// it exists. Length is in {0, 2, 3, 4}. 41 typedef std::pair<UnicodeEncodingForm, unsigned> EncodingInfo; 42 43 /// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode 44 /// encoding form of \a Input. 45 /// 46 /// @param Input A string of length 0 or more. 47 /// @returns An EncodingInfo indicating the Unicode encoding form of the input 48 /// and how long the byte order mark is if one exists. 49 static EncodingInfo getUnicodeEncoding(StringRef Input) { 50 if (Input.size() == 0) 51 return std::make_pair(UEF_Unknown, 0); 52 53 switch (uint8_t(Input[0])) { 54 case 0x00: 55 if (Input.size() >= 4) { 56 if ( Input[1] == 0 57 && uint8_t(Input[2]) == 0xFE 58 && uint8_t(Input[3]) == 0xFF) 59 return std::make_pair(UEF_UTF32_BE, 4); 60 if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0) 61 return std::make_pair(UEF_UTF32_BE, 0); 62 } 63 64 if (Input.size() >= 2 && Input[1] != 0) 65 return std::make_pair(UEF_UTF16_BE, 0); 66 return std::make_pair(UEF_Unknown, 0); 67 case 0xFF: 68 if ( Input.size() >= 4 69 && uint8_t(Input[1]) == 0xFE 70 && Input[2] == 0 71 && Input[3] == 0) 72 return std::make_pair(UEF_UTF32_LE, 4); 73 74 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE) 75 return std::make_pair(UEF_UTF16_LE, 2); 76 return std::make_pair(UEF_Unknown, 0); 77 case 0xFE: 78 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF) 79 return std::make_pair(UEF_UTF16_BE, 2); 80 return std::make_pair(UEF_Unknown, 0); 81 case 0xEF: 82 if ( Input.size() >= 3 83 && uint8_t(Input[1]) == 0xBB 84 && uint8_t(Input[2]) == 0xBF) 85 return std::make_pair(UEF_UTF8, 3); 86 return std::make_pair(UEF_Unknown, 0); 87 } 88 89 // It could still be utf-32 or utf-16. 90 if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0) 91 return std::make_pair(UEF_UTF32_LE, 0); 92 93 if (Input.size() >= 2 && Input[1] == 0) 94 return std::make_pair(UEF_UTF16_LE, 0); 95 96 return std::make_pair(UEF_UTF8, 0); 97 } 98 99 namespace llvm { 100 namespace yaml { 101 /// Pin the vtables to this file. 102 void Node::anchor() {} 103 void NullNode::anchor() {} 104 void ScalarNode::anchor() {} 105 void BlockScalarNode::anchor() {} 106 void KeyValueNode::anchor() {} 107 void MappingNode::anchor() {} 108 void SequenceNode::anchor() {} 109 void AliasNode::anchor() {} 110 111 /// Token - A single YAML token. 112 struct Token : ilist_node<Token> { 113 enum TokenKind { 114 TK_Error, // Uninitialized token. 115 TK_StreamStart, 116 TK_StreamEnd, 117 TK_VersionDirective, 118 TK_TagDirective, 119 TK_DocumentStart, 120 TK_DocumentEnd, 121 TK_BlockEntry, 122 TK_BlockEnd, 123 TK_BlockSequenceStart, 124 TK_BlockMappingStart, 125 TK_FlowEntry, 126 TK_FlowSequenceStart, 127 TK_FlowSequenceEnd, 128 TK_FlowMappingStart, 129 TK_FlowMappingEnd, 130 TK_Key, 131 TK_Value, 132 TK_Scalar, 133 TK_BlockScalar, 134 TK_Alias, 135 TK_Anchor, 136 TK_Tag 137 } Kind; 138 139 /// A string of length 0 or more whose begin() points to the logical location 140 /// of the token in the input. 141 StringRef Range; 142 143 /// The value of a block scalar node. 144 std::string Value; 145 146 Token() : Kind(TK_Error) {} 147 }; 148 } 149 } 150 151 namespace llvm { 152 template <> struct ilist_alloc_traits<Token> { 153 Token *createNode(const Token &V) { 154 return new (Alloc.Allocate<Token>()) Token(V); 155 } 156 static void deleteNode(Token *V) { V->~Token(); } 157 158 BumpPtrAllocator Alloc; 159 }; 160 } // end namespace llvm 161 162 typedef ilist<Token> TokenQueueT; 163 164 namespace { 165 /// @brief This struct is used to track simple keys. 166 /// 167 /// Simple keys are handled by creating an entry in SimpleKeys for each Token 168 /// which could legally be the start of a simple key. When peekNext is called, 169 /// if the Token To be returned is referenced by a SimpleKey, we continue 170 /// tokenizing until that potential simple key has either been found to not be 171 /// a simple key (we moved on to the next line or went further than 1024 chars). 172 /// Or when we run into a Value, and then insert a Key token (and possibly 173 /// others) before the SimpleKey's Tok. 174 struct SimpleKey { 175 TokenQueueT::iterator Tok; 176 unsigned Column; 177 unsigned Line; 178 unsigned FlowLevel; 179 bool IsRequired; 180 181 bool operator ==(const SimpleKey &Other) { 182 return Tok == Other.Tok; 183 } 184 }; 185 } 186 187 /// @brief The Unicode scalar value of a UTF-8 minimal well-formed code unit 188 /// subsequence and the subsequence's length in code units (uint8_t). 189 /// A length of 0 represents an error. 190 typedef std::pair<uint32_t, unsigned> UTF8Decoded; 191 192 static UTF8Decoded decodeUTF8(StringRef Range) { 193 StringRef::iterator Position= Range.begin(); 194 StringRef::iterator End = Range.end(); 195 // 1 byte: [0x00, 0x7f] 196 // Bit pattern: 0xxxxxxx 197 if ((*Position & 0x80) == 0) { 198 return std::make_pair(*Position, 1); 199 } 200 // 2 bytes: [0x80, 0x7ff] 201 // Bit pattern: 110xxxxx 10xxxxxx 202 if (Position + 1 != End && 203 ((*Position & 0xE0) == 0xC0) && 204 ((*(Position + 1) & 0xC0) == 0x80)) { 205 uint32_t codepoint = ((*Position & 0x1F) << 6) | 206 (*(Position + 1) & 0x3F); 207 if (codepoint >= 0x80) 208 return std::make_pair(codepoint, 2); 209 } 210 // 3 bytes: [0x8000, 0xffff] 211 // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx 212 if (Position + 2 != End && 213 ((*Position & 0xF0) == 0xE0) && 214 ((*(Position + 1) & 0xC0) == 0x80) && 215 ((*(Position + 2) & 0xC0) == 0x80)) { 216 uint32_t codepoint = ((*Position & 0x0F) << 12) | 217 ((*(Position + 1) & 0x3F) << 6) | 218 (*(Position + 2) & 0x3F); 219 // Codepoints between 0xD800 and 0xDFFF are invalid, as 220 // they are high / low surrogate halves used by UTF-16. 221 if (codepoint >= 0x800 && 222 (codepoint < 0xD800 || codepoint > 0xDFFF)) 223 return std::make_pair(codepoint, 3); 224 } 225 // 4 bytes: [0x10000, 0x10FFFF] 226 // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 227 if (Position + 3 != End && 228 ((*Position & 0xF8) == 0xF0) && 229 ((*(Position + 1) & 0xC0) == 0x80) && 230 ((*(Position + 2) & 0xC0) == 0x80) && 231 ((*(Position + 3) & 0xC0) == 0x80)) { 232 uint32_t codepoint = ((*Position & 0x07) << 18) | 233 ((*(Position + 1) & 0x3F) << 12) | 234 ((*(Position + 2) & 0x3F) << 6) | 235 (*(Position + 3) & 0x3F); 236 if (codepoint >= 0x10000 && codepoint <= 0x10FFFF) 237 return std::make_pair(codepoint, 4); 238 } 239 return std::make_pair(0, 0); 240 } 241 242 namespace llvm { 243 namespace yaml { 244 /// @brief Scans YAML tokens from a MemoryBuffer. 245 class Scanner { 246 public: 247 Scanner(StringRef Input, SourceMgr &SM, bool ShowColors = true); 248 Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors = true); 249 250 /// @brief Parse the next token and return it without popping it. 251 Token &peekNext(); 252 253 /// @brief Parse the next token and pop it from the queue. 254 Token getNext(); 255 256 void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message, 257 ArrayRef<SMRange> Ranges = None) { 258 SM.PrintMessage(Loc, Kind, Message, Ranges, /* FixIts= */ None, ShowColors); 259 } 260 261 void setError(const Twine &Message, StringRef::iterator Position) { 262 if (Current >= End) 263 Current = End - 1; 264 265 // Don't print out more errors after the first one we encounter. The rest 266 // are just the result of the first, and have no meaning. 267 if (!Failed) 268 printError(SMLoc::getFromPointer(Current), SourceMgr::DK_Error, Message); 269 Failed = true; 270 } 271 272 void setError(const Twine &Message) { 273 setError(Message, Current); 274 } 275 276 /// @brief Returns true if an error occurred while parsing. 277 bool failed() { 278 return Failed; 279 } 280 281 private: 282 void init(MemoryBufferRef Buffer); 283 284 StringRef currentInput() { 285 return StringRef(Current, End - Current); 286 } 287 288 /// @brief Decode a UTF-8 minimal well-formed code unit subsequence starting 289 /// at \a Position. 290 /// 291 /// If the UTF-8 code units starting at Position do not form a well-formed 292 /// code unit subsequence, then the Unicode scalar value is 0, and the length 293 /// is 0. 294 UTF8Decoded decodeUTF8(StringRef::iterator Position) { 295 return ::decodeUTF8(StringRef(Position, End - Position)); 296 } 297 298 // The following functions are based on the gramar rules in the YAML spec. The 299 // style of the function names it meant to closely match how they are written 300 // in the spec. The number within the [] is the number of the grammar rule in 301 // the spec. 302 // 303 // See 4.2 [Production Naming Conventions] for the meaning of the prefixes. 304 // 305 // c- 306 // A production starting and ending with a special character. 307 // b- 308 // A production matching a single line break. 309 // nb- 310 // A production starting and ending with a non-break character. 311 // s- 312 // A production starting and ending with a white space character. 313 // ns- 314 // A production starting and ending with a non-space character. 315 // l- 316 // A production matching complete line(s). 317 318 /// @brief Skip a single nb-char[27] starting at Position. 319 /// 320 /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE] 321 /// | [0xFF00-0xFFFD] | [0x10000-0x10FFFF] 322 /// 323 /// @returns The code unit after the nb-char, or Position if it's not an 324 /// nb-char. 325 StringRef::iterator skip_nb_char(StringRef::iterator Position); 326 327 /// @brief Skip a single b-break[28] starting at Position. 328 /// 329 /// A b-break is 0xD 0xA | 0xD | 0xA 330 /// 331 /// @returns The code unit after the b-break, or Position if it's not a 332 /// b-break. 333 StringRef::iterator skip_b_break(StringRef::iterator Position); 334 335 /// Skip a single s-space[31] starting at Position. 336 /// 337 /// An s-space is 0x20 338 /// 339 /// @returns The code unit after the s-space, or Position if it's not a 340 /// s-space. 341 StringRef::iterator skip_s_space(StringRef::iterator Position); 342 343 /// @brief Skip a single s-white[33] starting at Position. 344 /// 345 /// A s-white is 0x20 | 0x9 346 /// 347 /// @returns The code unit after the s-white, or Position if it's not a 348 /// s-white. 349 StringRef::iterator skip_s_white(StringRef::iterator Position); 350 351 /// @brief Skip a single ns-char[34] starting at Position. 352 /// 353 /// A ns-char is nb-char - s-white 354 /// 355 /// @returns The code unit after the ns-char, or Position if it's not a 356 /// ns-char. 357 StringRef::iterator skip_ns_char(StringRef::iterator Position); 358 359 typedef StringRef::iterator (Scanner::*SkipWhileFunc)(StringRef::iterator); 360 /// @brief Skip minimal well-formed code unit subsequences until Func 361 /// returns its input. 362 /// 363 /// @returns The code unit after the last minimal well-formed code unit 364 /// subsequence that Func accepted. 365 StringRef::iterator skip_while( SkipWhileFunc Func 366 , StringRef::iterator Position); 367 368 /// Skip minimal well-formed code unit subsequences until Func returns its 369 /// input. 370 void advanceWhile(SkipWhileFunc Func); 371 372 /// @brief Scan ns-uri-char[39]s starting at Cur. 373 /// 374 /// This updates Cur and Column while scanning. 375 /// 376 /// @returns A StringRef starting at Cur which covers the longest contiguous 377 /// sequence of ns-uri-char. 378 StringRef scan_ns_uri_char(); 379 380 /// @brief Consume a minimal well-formed code unit subsequence starting at 381 /// \a Cur. Return false if it is not the same Unicode scalar value as 382 /// \a Expected. This updates \a Column. 383 bool consume(uint32_t Expected); 384 385 /// @brief Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column. 386 void skip(uint32_t Distance); 387 388 /// @brief Return true if the minimal well-formed code unit subsequence at 389 /// Pos is whitespace or a new line 390 bool isBlankOrBreak(StringRef::iterator Position); 391 392 /// Consume a single b-break[28] if it's present at the current position. 393 /// 394 /// Return false if the code unit at the current position isn't a line break. 395 bool consumeLineBreakIfPresent(); 396 397 /// @brief If IsSimpleKeyAllowed, create and push_back a new SimpleKey. 398 void saveSimpleKeyCandidate( TokenQueueT::iterator Tok 399 , unsigned AtColumn 400 , bool IsRequired); 401 402 /// @brief Remove simple keys that can no longer be valid simple keys. 403 /// 404 /// Invalid simple keys are not on the current line or are further than 1024 405 /// columns back. 406 void removeStaleSimpleKeyCandidates(); 407 408 /// @brief Remove all simple keys on FlowLevel \a Level. 409 void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level); 410 411 /// @brief Unroll indentation in \a Indents back to \a Col. Creates BlockEnd 412 /// tokens if needed. 413 bool unrollIndent(int ToColumn); 414 415 /// @brief Increase indent to \a Col. Creates \a Kind token at \a InsertPoint 416 /// if needed. 417 bool rollIndent( int ToColumn 418 , Token::TokenKind Kind 419 , TokenQueueT::iterator InsertPoint); 420 421 /// @brief Skip a single-line comment when the comment starts at the current 422 /// position of the scanner. 423 void skipComment(); 424 425 /// @brief Skip whitespace and comments until the start of the next token. 426 void scanToNextToken(); 427 428 /// @brief Must be the first token generated. 429 bool scanStreamStart(); 430 431 /// @brief Generate tokens needed to close out the stream. 432 bool scanStreamEnd(); 433 434 /// @brief Scan a %BLAH directive. 435 bool scanDirective(); 436 437 /// @brief Scan a ... or ---. 438 bool scanDocumentIndicator(bool IsStart); 439 440 /// @brief Scan a [ or { and generate the proper flow collection start token. 441 bool scanFlowCollectionStart(bool IsSequence); 442 443 /// @brief Scan a ] or } and generate the proper flow collection end token. 444 bool scanFlowCollectionEnd(bool IsSequence); 445 446 /// @brief Scan the , that separates entries in a flow collection. 447 bool scanFlowEntry(); 448 449 /// @brief Scan the - that starts block sequence entries. 450 bool scanBlockEntry(); 451 452 /// @brief Scan an explicit ? indicating a key. 453 bool scanKey(); 454 455 /// @brief Scan an explicit : indicating a value. 456 bool scanValue(); 457 458 /// @brief Scan a quoted scalar. 459 bool scanFlowScalar(bool IsDoubleQuoted); 460 461 /// @brief Scan an unquoted scalar. 462 bool scanPlainScalar(); 463 464 /// @brief Scan an Alias or Anchor starting with * or &. 465 bool scanAliasOrAnchor(bool IsAlias); 466 467 /// @brief Scan a block scalar starting with | or >. 468 bool scanBlockScalar(bool IsLiteral); 469 470 /// Scan a chomping indicator in a block scalar header. 471 char scanBlockChompingIndicator(); 472 473 /// Scan an indentation indicator in a block scalar header. 474 unsigned scanBlockIndentationIndicator(); 475 476 /// Scan a block scalar header. 477 /// 478 /// Return false if an error occurred. 479 bool scanBlockScalarHeader(char &ChompingIndicator, unsigned &IndentIndicator, 480 bool &IsDone); 481 482 /// Look for the indentation level of a block scalar. 483 /// 484 /// Return false if an error occurred. 485 bool findBlockScalarIndent(unsigned &BlockIndent, unsigned BlockExitIndent, 486 unsigned &LineBreaks, bool &IsDone); 487 488 /// Scan the indentation of a text line in a block scalar. 489 /// 490 /// Return false if an error occurred. 491 bool scanBlockScalarIndent(unsigned BlockIndent, unsigned BlockExitIndent, 492 bool &IsDone); 493 494 /// @brief Scan a tag of the form !stuff. 495 bool scanTag(); 496 497 /// @brief Dispatch to the next scanning function based on \a *Cur. 498 bool fetchMoreTokens(); 499 500 /// @brief The SourceMgr used for diagnostics and buffer management. 501 SourceMgr &SM; 502 503 /// @brief The original input. 504 MemoryBufferRef InputBuffer; 505 506 /// @brief The current position of the scanner. 507 StringRef::iterator Current; 508 509 /// @brief The end of the input (one past the last character). 510 StringRef::iterator End; 511 512 /// @brief Current YAML indentation level in spaces. 513 int Indent; 514 515 /// @brief Current column number in Unicode code points. 516 unsigned Column; 517 518 /// @brief Current line number. 519 unsigned Line; 520 521 /// @brief How deep we are in flow style containers. 0 Means at block level. 522 unsigned FlowLevel; 523 524 /// @brief Are we at the start of the stream? 525 bool IsStartOfStream; 526 527 /// @brief Can the next token be the start of a simple key? 528 bool IsSimpleKeyAllowed; 529 530 /// @brief True if an error has occurred. 531 bool Failed; 532 533 /// @brief Should colors be used when printing out the diagnostic messages? 534 bool ShowColors; 535 536 /// @brief Queue of tokens. This is required to queue up tokens while looking 537 /// for the end of a simple key. And for cases where a single character 538 /// can produce multiple tokens (e.g. BlockEnd). 539 TokenQueueT TokenQueue; 540 541 /// @brief Indentation levels. 542 SmallVector<int, 4> Indents; 543 544 /// @brief Potential simple keys. 545 SmallVector<SimpleKey, 4> SimpleKeys; 546 }; 547 548 } // end namespace yaml 549 } // end namespace llvm 550 551 /// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result. 552 static void encodeUTF8( uint32_t UnicodeScalarValue 553 , SmallVectorImpl<char> &Result) { 554 if (UnicodeScalarValue <= 0x7F) { 555 Result.push_back(UnicodeScalarValue & 0x7F); 556 } else if (UnicodeScalarValue <= 0x7FF) { 557 uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6); 558 uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F); 559 Result.push_back(FirstByte); 560 Result.push_back(SecondByte); 561 } else if (UnicodeScalarValue <= 0xFFFF) { 562 uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12); 563 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 564 uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F); 565 Result.push_back(FirstByte); 566 Result.push_back(SecondByte); 567 Result.push_back(ThirdByte); 568 } else if (UnicodeScalarValue <= 0x10FFFF) { 569 uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18); 570 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12); 571 uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 572 uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F); 573 Result.push_back(FirstByte); 574 Result.push_back(SecondByte); 575 Result.push_back(ThirdByte); 576 Result.push_back(FourthByte); 577 } 578 } 579 580 bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) { 581 SourceMgr SM; 582 Scanner scanner(Input, SM); 583 while (true) { 584 Token T = scanner.getNext(); 585 switch (T.Kind) { 586 case Token::TK_StreamStart: 587 OS << "Stream-Start: "; 588 break; 589 case Token::TK_StreamEnd: 590 OS << "Stream-End: "; 591 break; 592 case Token::TK_VersionDirective: 593 OS << "Version-Directive: "; 594 break; 595 case Token::TK_TagDirective: 596 OS << "Tag-Directive: "; 597 break; 598 case Token::TK_DocumentStart: 599 OS << "Document-Start: "; 600 break; 601 case Token::TK_DocumentEnd: 602 OS << "Document-End: "; 603 break; 604 case Token::TK_BlockEntry: 605 OS << "Block-Entry: "; 606 break; 607 case Token::TK_BlockEnd: 608 OS << "Block-End: "; 609 break; 610 case Token::TK_BlockSequenceStart: 611 OS << "Block-Sequence-Start: "; 612 break; 613 case Token::TK_BlockMappingStart: 614 OS << "Block-Mapping-Start: "; 615 break; 616 case Token::TK_FlowEntry: 617 OS << "Flow-Entry: "; 618 break; 619 case Token::TK_FlowSequenceStart: 620 OS << "Flow-Sequence-Start: "; 621 break; 622 case Token::TK_FlowSequenceEnd: 623 OS << "Flow-Sequence-End: "; 624 break; 625 case Token::TK_FlowMappingStart: 626 OS << "Flow-Mapping-Start: "; 627 break; 628 case Token::TK_FlowMappingEnd: 629 OS << "Flow-Mapping-End: "; 630 break; 631 case Token::TK_Key: 632 OS << "Key: "; 633 break; 634 case Token::TK_Value: 635 OS << "Value: "; 636 break; 637 case Token::TK_Scalar: 638 OS << "Scalar: "; 639 break; 640 case Token::TK_BlockScalar: 641 OS << "Block Scalar: "; 642 break; 643 case Token::TK_Alias: 644 OS << "Alias: "; 645 break; 646 case Token::TK_Anchor: 647 OS << "Anchor: "; 648 break; 649 case Token::TK_Tag: 650 OS << "Tag: "; 651 break; 652 case Token::TK_Error: 653 break; 654 } 655 OS << T.Range << "\n"; 656 if (T.Kind == Token::TK_StreamEnd) 657 break; 658 else if (T.Kind == Token::TK_Error) 659 return false; 660 } 661 return true; 662 } 663 664 bool yaml::scanTokens(StringRef Input) { 665 llvm::SourceMgr SM; 666 llvm::yaml::Scanner scanner(Input, SM); 667 for (;;) { 668 llvm::yaml::Token T = scanner.getNext(); 669 if (T.Kind == Token::TK_StreamEnd) 670 break; 671 else if (T.Kind == Token::TK_Error) 672 return false; 673 } 674 return true; 675 } 676 677 std::string yaml::escape(StringRef Input) { 678 std::string EscapedInput; 679 for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) { 680 if (*i == '\\') 681 EscapedInput += "\\\\"; 682 else if (*i == '"') 683 EscapedInput += "\\\""; 684 else if (*i == 0) 685 EscapedInput += "\\0"; 686 else if (*i == 0x07) 687 EscapedInput += "\\a"; 688 else if (*i == 0x08) 689 EscapedInput += "\\b"; 690 else if (*i == 0x09) 691 EscapedInput += "\\t"; 692 else if (*i == 0x0A) 693 EscapedInput += "\\n"; 694 else if (*i == 0x0B) 695 EscapedInput += "\\v"; 696 else if (*i == 0x0C) 697 EscapedInput += "\\f"; 698 else if (*i == 0x0D) 699 EscapedInput += "\\r"; 700 else if (*i == 0x1B) 701 EscapedInput += "\\e"; 702 else if ((unsigned char)*i < 0x20) { // Control characters not handled above. 703 std::string HexStr = utohexstr(*i); 704 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 705 } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence. 706 UTF8Decoded UnicodeScalarValue 707 = decodeUTF8(StringRef(i, Input.end() - i)); 708 if (UnicodeScalarValue.second == 0) { 709 // Found invalid char. 710 SmallString<4> Val; 711 encodeUTF8(0xFFFD, Val); 712 EscapedInput.insert(EscapedInput.end(), Val.begin(), Val.end()); 713 // FIXME: Error reporting. 714 return EscapedInput; 715 } 716 if (UnicodeScalarValue.first == 0x85) 717 EscapedInput += "\\N"; 718 else if (UnicodeScalarValue.first == 0xA0) 719 EscapedInput += "\\_"; 720 else if (UnicodeScalarValue.first == 0x2028) 721 EscapedInput += "\\L"; 722 else if (UnicodeScalarValue.first == 0x2029) 723 EscapedInput += "\\P"; 724 else { 725 std::string HexStr = utohexstr(UnicodeScalarValue.first); 726 if (HexStr.size() <= 2) 727 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 728 else if (HexStr.size() <= 4) 729 EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr; 730 else if (HexStr.size() <= 8) 731 EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr; 732 } 733 i += UnicodeScalarValue.second - 1; 734 } else 735 EscapedInput.push_back(*i); 736 } 737 return EscapedInput; 738 } 739 740 Scanner::Scanner(StringRef Input, SourceMgr &sm, bool ShowColors) 741 : SM(sm), ShowColors(ShowColors) { 742 init(MemoryBufferRef(Input, "YAML")); 743 } 744 745 Scanner::Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors) 746 : SM(SM_), ShowColors(ShowColors) { 747 init(Buffer); 748 } 749 750 void Scanner::init(MemoryBufferRef Buffer) { 751 InputBuffer = Buffer; 752 Current = InputBuffer.getBufferStart(); 753 End = InputBuffer.getBufferEnd(); 754 Indent = -1; 755 Column = 0; 756 Line = 0; 757 FlowLevel = 0; 758 IsStartOfStream = true; 759 IsSimpleKeyAllowed = true; 760 Failed = false; 761 std::unique_ptr<MemoryBuffer> InputBufferOwner = 762 MemoryBuffer::getMemBuffer(Buffer); 763 SM.AddNewSourceBuffer(std::move(InputBufferOwner), SMLoc()); 764 } 765 766 Token &Scanner::peekNext() { 767 // If the current token is a possible simple key, keep parsing until we 768 // can confirm. 769 bool NeedMore = false; 770 while (true) { 771 if (TokenQueue.empty() || NeedMore) { 772 if (!fetchMoreTokens()) { 773 TokenQueue.clear(); 774 TokenQueue.push_back(Token()); 775 return TokenQueue.front(); 776 } 777 } 778 assert(!TokenQueue.empty() && 779 "fetchMoreTokens lied about getting tokens!"); 780 781 removeStaleSimpleKeyCandidates(); 782 SimpleKey SK; 783 SK.Tok = TokenQueue.begin(); 784 if (!is_contained(SimpleKeys, SK)) 785 break; 786 else 787 NeedMore = true; 788 } 789 return TokenQueue.front(); 790 } 791 792 Token Scanner::getNext() { 793 Token Ret = peekNext(); 794 // TokenQueue can be empty if there was an error getting the next token. 795 if (!TokenQueue.empty()) 796 TokenQueue.pop_front(); 797 798 // There cannot be any referenced Token's if the TokenQueue is empty. So do a 799 // quick deallocation of them all. 800 if (TokenQueue.empty()) { 801 TokenQueue.Alloc.Reset(); 802 } 803 804 return Ret; 805 } 806 807 StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) { 808 if (Position == End) 809 return Position; 810 // Check 7 bit c-printable - b-char. 811 if ( *Position == 0x09 812 || (*Position >= 0x20 && *Position <= 0x7E)) 813 return Position + 1; 814 815 // Check for valid UTF-8. 816 if (uint8_t(*Position) & 0x80) { 817 UTF8Decoded u8d = decodeUTF8(Position); 818 if ( u8d.second != 0 819 && u8d.first != 0xFEFF 820 && ( u8d.first == 0x85 821 || ( u8d.first >= 0xA0 822 && u8d.first <= 0xD7FF) 823 || ( u8d.first >= 0xE000 824 && u8d.first <= 0xFFFD) 825 || ( u8d.first >= 0x10000 826 && u8d.first <= 0x10FFFF))) 827 return Position + u8d.second; 828 } 829 return Position; 830 } 831 832 StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) { 833 if (Position == End) 834 return Position; 835 if (*Position == 0x0D) { 836 if (Position + 1 != End && *(Position + 1) == 0x0A) 837 return Position + 2; 838 return Position + 1; 839 } 840 841 if (*Position == 0x0A) 842 return Position + 1; 843 return Position; 844 } 845 846 StringRef::iterator Scanner::skip_s_space(StringRef::iterator Position) { 847 if (Position == End) 848 return Position; 849 if (*Position == ' ') 850 return Position + 1; 851 return Position; 852 } 853 854 StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) { 855 if (Position == End) 856 return Position; 857 if (*Position == ' ' || *Position == '\t') 858 return Position + 1; 859 return Position; 860 } 861 862 StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) { 863 if (Position == End) 864 return Position; 865 if (*Position == ' ' || *Position == '\t') 866 return Position; 867 return skip_nb_char(Position); 868 } 869 870 StringRef::iterator Scanner::skip_while( SkipWhileFunc Func 871 , StringRef::iterator Position) { 872 while (true) { 873 StringRef::iterator i = (this->*Func)(Position); 874 if (i == Position) 875 break; 876 Position = i; 877 } 878 return Position; 879 } 880 881 void Scanner::advanceWhile(SkipWhileFunc Func) { 882 auto Final = skip_while(Func, Current); 883 Column += Final - Current; 884 Current = Final; 885 } 886 887 static bool is_ns_hex_digit(const char C) { 888 return (C >= '0' && C <= '9') 889 || (C >= 'a' && C <= 'z') 890 || (C >= 'A' && C <= 'Z'); 891 } 892 893 static bool is_ns_word_char(const char C) { 894 return C == '-' 895 || (C >= 'a' && C <= 'z') 896 || (C >= 'A' && C <= 'Z'); 897 } 898 899 StringRef Scanner::scan_ns_uri_char() { 900 StringRef::iterator Start = Current; 901 while (true) { 902 if (Current == End) 903 break; 904 if (( *Current == '%' 905 && Current + 2 < End 906 && is_ns_hex_digit(*(Current + 1)) 907 && is_ns_hex_digit(*(Current + 2))) 908 || is_ns_word_char(*Current) 909 || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]") 910 != StringRef::npos) { 911 ++Current; 912 ++Column; 913 } else 914 break; 915 } 916 return StringRef(Start, Current - Start); 917 } 918 919 bool Scanner::consume(uint32_t Expected) { 920 if (Expected >= 0x80) 921 report_fatal_error("Not dealing with this yet"); 922 if (Current == End) 923 return false; 924 if (uint8_t(*Current) >= 0x80) 925 report_fatal_error("Not dealing with this yet"); 926 if (uint8_t(*Current) == Expected) { 927 ++Current; 928 ++Column; 929 return true; 930 } 931 return false; 932 } 933 934 void Scanner::skip(uint32_t Distance) { 935 Current += Distance; 936 Column += Distance; 937 assert(Current <= End && "Skipped past the end"); 938 } 939 940 bool Scanner::isBlankOrBreak(StringRef::iterator Position) { 941 if (Position == End) 942 return false; 943 return *Position == ' ' || *Position == '\t' || *Position == '\r' || 944 *Position == '\n'; 945 } 946 947 bool Scanner::consumeLineBreakIfPresent() { 948 auto Next = skip_b_break(Current); 949 if (Next == Current) 950 return false; 951 Column = 0; 952 ++Line; 953 Current = Next; 954 return true; 955 } 956 957 void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok 958 , unsigned AtColumn 959 , bool IsRequired) { 960 if (IsSimpleKeyAllowed) { 961 SimpleKey SK; 962 SK.Tok = Tok; 963 SK.Line = Line; 964 SK.Column = AtColumn; 965 SK.IsRequired = IsRequired; 966 SK.FlowLevel = FlowLevel; 967 SimpleKeys.push_back(SK); 968 } 969 } 970 971 void Scanner::removeStaleSimpleKeyCandidates() { 972 for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin(); 973 i != SimpleKeys.end();) { 974 if (i->Line != Line || i->Column + 1024 < Column) { 975 if (i->IsRequired) 976 setError( "Could not find expected : for simple key" 977 , i->Tok->Range.begin()); 978 i = SimpleKeys.erase(i); 979 } else 980 ++i; 981 } 982 } 983 984 void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) { 985 if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level) 986 SimpleKeys.pop_back(); 987 } 988 989 bool Scanner::unrollIndent(int ToColumn) { 990 Token T; 991 // Indentation is ignored in flow. 992 if (FlowLevel != 0) 993 return true; 994 995 while (Indent > ToColumn) { 996 T.Kind = Token::TK_BlockEnd; 997 T.Range = StringRef(Current, 1); 998 TokenQueue.push_back(T); 999 Indent = Indents.pop_back_val(); 1000 } 1001 1002 return true; 1003 } 1004 1005 bool Scanner::rollIndent( int ToColumn 1006 , Token::TokenKind Kind 1007 , TokenQueueT::iterator InsertPoint) { 1008 if (FlowLevel) 1009 return true; 1010 if (Indent < ToColumn) { 1011 Indents.push_back(Indent); 1012 Indent = ToColumn; 1013 1014 Token T; 1015 T.Kind = Kind; 1016 T.Range = StringRef(Current, 0); 1017 TokenQueue.insert(InsertPoint, T); 1018 } 1019 return true; 1020 } 1021 1022 void Scanner::skipComment() { 1023 if (*Current != '#') 1024 return; 1025 while (true) { 1026 // This may skip more than one byte, thus Column is only incremented 1027 // for code points. 1028 StringRef::iterator I = skip_nb_char(Current); 1029 if (I == Current) 1030 break; 1031 Current = I; 1032 ++Column; 1033 } 1034 } 1035 1036 void Scanner::scanToNextToken() { 1037 while (true) { 1038 while (*Current == ' ' || *Current == '\t') { 1039 skip(1); 1040 } 1041 1042 skipComment(); 1043 1044 // Skip EOL. 1045 StringRef::iterator i = skip_b_break(Current); 1046 if (i == Current) 1047 break; 1048 Current = i; 1049 ++Line; 1050 Column = 0; 1051 // New lines may start a simple key. 1052 if (!FlowLevel) 1053 IsSimpleKeyAllowed = true; 1054 } 1055 } 1056 1057 bool Scanner::scanStreamStart() { 1058 IsStartOfStream = false; 1059 1060 EncodingInfo EI = getUnicodeEncoding(currentInput()); 1061 1062 Token T; 1063 T.Kind = Token::TK_StreamStart; 1064 T.Range = StringRef(Current, EI.second); 1065 TokenQueue.push_back(T); 1066 Current += EI.second; 1067 return true; 1068 } 1069 1070 bool Scanner::scanStreamEnd() { 1071 // Force an ending new line if one isn't present. 1072 if (Column != 0) { 1073 Column = 0; 1074 ++Line; 1075 } 1076 1077 unrollIndent(-1); 1078 SimpleKeys.clear(); 1079 IsSimpleKeyAllowed = false; 1080 1081 Token T; 1082 T.Kind = Token::TK_StreamEnd; 1083 T.Range = StringRef(Current, 0); 1084 TokenQueue.push_back(T); 1085 return true; 1086 } 1087 1088 bool Scanner::scanDirective() { 1089 // Reset the indentation level. 1090 unrollIndent(-1); 1091 SimpleKeys.clear(); 1092 IsSimpleKeyAllowed = false; 1093 1094 StringRef::iterator Start = Current; 1095 consume('%'); 1096 StringRef::iterator NameStart = Current; 1097 Current = skip_while(&Scanner::skip_ns_char, Current); 1098 StringRef Name(NameStart, Current - NameStart); 1099 Current = skip_while(&Scanner::skip_s_white, Current); 1100 1101 Token T; 1102 if (Name == "YAML") { 1103 Current = skip_while(&Scanner::skip_ns_char, Current); 1104 T.Kind = Token::TK_VersionDirective; 1105 T.Range = StringRef(Start, Current - Start); 1106 TokenQueue.push_back(T); 1107 return true; 1108 } else if(Name == "TAG") { 1109 Current = skip_while(&Scanner::skip_ns_char, Current); 1110 Current = skip_while(&Scanner::skip_s_white, Current); 1111 Current = skip_while(&Scanner::skip_ns_char, Current); 1112 T.Kind = Token::TK_TagDirective; 1113 T.Range = StringRef(Start, Current - Start); 1114 TokenQueue.push_back(T); 1115 return true; 1116 } 1117 return false; 1118 } 1119 1120 bool Scanner::scanDocumentIndicator(bool IsStart) { 1121 unrollIndent(-1); 1122 SimpleKeys.clear(); 1123 IsSimpleKeyAllowed = false; 1124 1125 Token T; 1126 T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd; 1127 T.Range = StringRef(Current, 3); 1128 skip(3); 1129 TokenQueue.push_back(T); 1130 return true; 1131 } 1132 1133 bool Scanner::scanFlowCollectionStart(bool IsSequence) { 1134 Token T; 1135 T.Kind = IsSequence ? Token::TK_FlowSequenceStart 1136 : Token::TK_FlowMappingStart; 1137 T.Range = StringRef(Current, 1); 1138 skip(1); 1139 TokenQueue.push_back(T); 1140 1141 // [ and { may begin a simple key. 1142 saveSimpleKeyCandidate(--TokenQueue.end(), Column - 1, false); 1143 1144 // And may also be followed by a simple key. 1145 IsSimpleKeyAllowed = true; 1146 ++FlowLevel; 1147 return true; 1148 } 1149 1150 bool Scanner::scanFlowCollectionEnd(bool IsSequence) { 1151 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1152 IsSimpleKeyAllowed = false; 1153 Token T; 1154 T.Kind = IsSequence ? Token::TK_FlowSequenceEnd 1155 : Token::TK_FlowMappingEnd; 1156 T.Range = StringRef(Current, 1); 1157 skip(1); 1158 TokenQueue.push_back(T); 1159 if (FlowLevel) 1160 --FlowLevel; 1161 return true; 1162 } 1163 1164 bool Scanner::scanFlowEntry() { 1165 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1166 IsSimpleKeyAllowed = true; 1167 Token T; 1168 T.Kind = Token::TK_FlowEntry; 1169 T.Range = StringRef(Current, 1); 1170 skip(1); 1171 TokenQueue.push_back(T); 1172 return true; 1173 } 1174 1175 bool Scanner::scanBlockEntry() { 1176 rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end()); 1177 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1178 IsSimpleKeyAllowed = true; 1179 Token T; 1180 T.Kind = Token::TK_BlockEntry; 1181 T.Range = StringRef(Current, 1); 1182 skip(1); 1183 TokenQueue.push_back(T); 1184 return true; 1185 } 1186 1187 bool Scanner::scanKey() { 1188 if (!FlowLevel) 1189 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1190 1191 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1192 IsSimpleKeyAllowed = !FlowLevel; 1193 1194 Token T; 1195 T.Kind = Token::TK_Key; 1196 T.Range = StringRef(Current, 1); 1197 skip(1); 1198 TokenQueue.push_back(T); 1199 return true; 1200 } 1201 1202 bool Scanner::scanValue() { 1203 // If the previous token could have been a simple key, insert the key token 1204 // into the token queue. 1205 if (!SimpleKeys.empty()) { 1206 SimpleKey SK = SimpleKeys.pop_back_val(); 1207 Token T; 1208 T.Kind = Token::TK_Key; 1209 T.Range = SK.Tok->Range; 1210 TokenQueueT::iterator i, e; 1211 for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) { 1212 if (i == SK.Tok) 1213 break; 1214 } 1215 assert(i != e && "SimpleKey not in token queue!"); 1216 i = TokenQueue.insert(i, T); 1217 1218 // We may also need to add a Block-Mapping-Start token. 1219 rollIndent(SK.Column, Token::TK_BlockMappingStart, i); 1220 1221 IsSimpleKeyAllowed = false; 1222 } else { 1223 if (!FlowLevel) 1224 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1225 IsSimpleKeyAllowed = !FlowLevel; 1226 } 1227 1228 Token T; 1229 T.Kind = Token::TK_Value; 1230 T.Range = StringRef(Current, 1); 1231 skip(1); 1232 TokenQueue.push_back(T); 1233 return true; 1234 } 1235 1236 // Forbidding inlining improves performance by roughly 20%. 1237 // FIXME: Remove once llvm optimizes this to the faster version without hints. 1238 LLVM_ATTRIBUTE_NOINLINE static bool 1239 wasEscaped(StringRef::iterator First, StringRef::iterator Position); 1240 1241 // Returns whether a character at 'Position' was escaped with a leading '\'. 1242 // 'First' specifies the position of the first character in the string. 1243 static bool wasEscaped(StringRef::iterator First, 1244 StringRef::iterator Position) { 1245 assert(Position - 1 >= First); 1246 StringRef::iterator I = Position - 1; 1247 // We calculate the number of consecutive '\'s before the current position 1248 // by iterating backwards through our string. 1249 while (I >= First && *I == '\\') --I; 1250 // (Position - 1 - I) now contains the number of '\'s before the current 1251 // position. If it is odd, the character at 'Position' was escaped. 1252 return (Position - 1 - I) % 2 == 1; 1253 } 1254 1255 bool Scanner::scanFlowScalar(bool IsDoubleQuoted) { 1256 StringRef::iterator Start = Current; 1257 unsigned ColStart = Column; 1258 if (IsDoubleQuoted) { 1259 do { 1260 ++Current; 1261 while (Current != End && *Current != '"') 1262 ++Current; 1263 // Repeat until the previous character was not a '\' or was an escaped 1264 // backslash. 1265 } while ( Current != End 1266 && *(Current - 1) == '\\' 1267 && wasEscaped(Start + 1, Current)); 1268 } else { 1269 skip(1); 1270 while (true) { 1271 // Skip a ' followed by another '. 1272 if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') { 1273 skip(2); 1274 continue; 1275 } else if (*Current == '\'') 1276 break; 1277 StringRef::iterator i = skip_nb_char(Current); 1278 if (i == Current) { 1279 i = skip_b_break(Current); 1280 if (i == Current) 1281 break; 1282 Current = i; 1283 Column = 0; 1284 ++Line; 1285 } else { 1286 if (i == End) 1287 break; 1288 Current = i; 1289 ++Column; 1290 } 1291 } 1292 } 1293 1294 if (Current == End) { 1295 setError("Expected quote at end of scalar", Current); 1296 return false; 1297 } 1298 1299 skip(1); // Skip ending quote. 1300 Token T; 1301 T.Kind = Token::TK_Scalar; 1302 T.Range = StringRef(Start, Current - Start); 1303 TokenQueue.push_back(T); 1304 1305 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1306 1307 IsSimpleKeyAllowed = false; 1308 1309 return true; 1310 } 1311 1312 bool Scanner::scanPlainScalar() { 1313 StringRef::iterator Start = Current; 1314 unsigned ColStart = Column; 1315 unsigned LeadingBlanks = 0; 1316 assert(Indent >= -1 && "Indent must be >= -1 !"); 1317 unsigned indent = static_cast<unsigned>(Indent + 1); 1318 while (true) { 1319 if (*Current == '#') 1320 break; 1321 1322 while (!isBlankOrBreak(Current)) { 1323 if ( FlowLevel && *Current == ':' 1324 && !(isBlankOrBreak(Current + 1) || *(Current + 1) == ',')) { 1325 setError("Found unexpected ':' while scanning a plain scalar", Current); 1326 return false; 1327 } 1328 1329 // Check for the end of the plain scalar. 1330 if ( (*Current == ':' && isBlankOrBreak(Current + 1)) 1331 || ( FlowLevel 1332 && (StringRef(Current, 1).find_first_of(",:?[]{}") 1333 != StringRef::npos))) 1334 break; 1335 1336 StringRef::iterator i = skip_nb_char(Current); 1337 if (i == Current) 1338 break; 1339 Current = i; 1340 ++Column; 1341 } 1342 1343 // Are we at the end? 1344 if (!isBlankOrBreak(Current)) 1345 break; 1346 1347 // Eat blanks. 1348 StringRef::iterator Tmp = Current; 1349 while (isBlankOrBreak(Tmp)) { 1350 StringRef::iterator i = skip_s_white(Tmp); 1351 if (i != Tmp) { 1352 if (LeadingBlanks && (Column < indent) && *Tmp == '\t') { 1353 setError("Found invalid tab character in indentation", Tmp); 1354 return false; 1355 } 1356 Tmp = i; 1357 ++Column; 1358 } else { 1359 i = skip_b_break(Tmp); 1360 if (!LeadingBlanks) 1361 LeadingBlanks = 1; 1362 Tmp = i; 1363 Column = 0; 1364 ++Line; 1365 } 1366 } 1367 1368 if (!FlowLevel && Column < indent) 1369 break; 1370 1371 Current = Tmp; 1372 } 1373 if (Start == Current) { 1374 setError("Got empty plain scalar", Start); 1375 return false; 1376 } 1377 Token T; 1378 T.Kind = Token::TK_Scalar; 1379 T.Range = StringRef(Start, Current - Start); 1380 TokenQueue.push_back(T); 1381 1382 // Plain scalars can be simple keys. 1383 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1384 1385 IsSimpleKeyAllowed = false; 1386 1387 return true; 1388 } 1389 1390 bool Scanner::scanAliasOrAnchor(bool IsAlias) { 1391 StringRef::iterator Start = Current; 1392 unsigned ColStart = Column; 1393 skip(1); 1394 while(true) { 1395 if ( *Current == '[' || *Current == ']' 1396 || *Current == '{' || *Current == '}' 1397 || *Current == ',' 1398 || *Current == ':') 1399 break; 1400 StringRef::iterator i = skip_ns_char(Current); 1401 if (i == Current) 1402 break; 1403 Current = i; 1404 ++Column; 1405 } 1406 1407 if (Start == Current) { 1408 setError("Got empty alias or anchor", Start); 1409 return false; 1410 } 1411 1412 Token T; 1413 T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor; 1414 T.Range = StringRef(Start, Current - Start); 1415 TokenQueue.push_back(T); 1416 1417 // Alias and anchors can be simple keys. 1418 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1419 1420 IsSimpleKeyAllowed = false; 1421 1422 return true; 1423 } 1424 1425 char Scanner::scanBlockChompingIndicator() { 1426 char Indicator = ' '; 1427 if (Current != End && (*Current == '+' || *Current == '-')) { 1428 Indicator = *Current; 1429 skip(1); 1430 } 1431 return Indicator; 1432 } 1433 1434 /// Get the number of line breaks after chomping. 1435 /// 1436 /// Return the number of trailing line breaks to emit, depending on 1437 /// \p ChompingIndicator. 1438 static unsigned getChompedLineBreaks(char ChompingIndicator, 1439 unsigned LineBreaks, StringRef Str) { 1440 if (ChompingIndicator == '-') // Strip all line breaks. 1441 return 0; 1442 if (ChompingIndicator == '+') // Keep all line breaks. 1443 return LineBreaks; 1444 // Clip trailing lines. 1445 return Str.empty() ? 0 : 1; 1446 } 1447 1448 unsigned Scanner::scanBlockIndentationIndicator() { 1449 unsigned Indent = 0; 1450 if (Current != End && (*Current >= '1' && *Current <= '9')) { 1451 Indent = unsigned(*Current - '0'); 1452 skip(1); 1453 } 1454 return Indent; 1455 } 1456 1457 bool Scanner::scanBlockScalarHeader(char &ChompingIndicator, 1458 unsigned &IndentIndicator, bool &IsDone) { 1459 auto Start = Current; 1460 1461 ChompingIndicator = scanBlockChompingIndicator(); 1462 IndentIndicator = scanBlockIndentationIndicator(); 1463 // Check for the chomping indicator once again. 1464 if (ChompingIndicator == ' ') 1465 ChompingIndicator = scanBlockChompingIndicator(); 1466 Current = skip_while(&Scanner::skip_s_white, Current); 1467 skipComment(); 1468 1469 if (Current == End) { // EOF, we have an empty scalar. 1470 Token T; 1471 T.Kind = Token::TK_BlockScalar; 1472 T.Range = StringRef(Start, Current - Start); 1473 TokenQueue.push_back(T); 1474 IsDone = true; 1475 return true; 1476 } 1477 1478 if (!consumeLineBreakIfPresent()) { 1479 setError("Expected a line break after block scalar header", Current); 1480 return false; 1481 } 1482 return true; 1483 } 1484 1485 bool Scanner::findBlockScalarIndent(unsigned &BlockIndent, 1486 unsigned BlockExitIndent, 1487 unsigned &LineBreaks, bool &IsDone) { 1488 unsigned MaxAllSpaceLineCharacters = 0; 1489 StringRef::iterator LongestAllSpaceLine; 1490 1491 while (true) { 1492 advanceWhile(&Scanner::skip_s_space); 1493 if (skip_nb_char(Current) != Current) { 1494 // This line isn't empty, so try and find the indentation. 1495 if (Column <= BlockExitIndent) { // End of the block literal. 1496 IsDone = true; 1497 return true; 1498 } 1499 // We found the block's indentation. 1500 BlockIndent = Column; 1501 if (MaxAllSpaceLineCharacters > BlockIndent) { 1502 setError( 1503 "Leading all-spaces line must be smaller than the block indent", 1504 LongestAllSpaceLine); 1505 return false; 1506 } 1507 return true; 1508 } 1509 if (skip_b_break(Current) != Current && 1510 Column > MaxAllSpaceLineCharacters) { 1511 // Record the longest all-space line in case it's longer than the 1512 // discovered block indent. 1513 MaxAllSpaceLineCharacters = Column; 1514 LongestAllSpaceLine = Current; 1515 } 1516 1517 // Check for EOF. 1518 if (Current == End) { 1519 IsDone = true; 1520 return true; 1521 } 1522 1523 if (!consumeLineBreakIfPresent()) { 1524 IsDone = true; 1525 return true; 1526 } 1527 ++LineBreaks; 1528 } 1529 return true; 1530 } 1531 1532 bool Scanner::scanBlockScalarIndent(unsigned BlockIndent, 1533 unsigned BlockExitIndent, bool &IsDone) { 1534 // Skip the indentation. 1535 while (Column < BlockIndent) { 1536 auto I = skip_s_space(Current); 1537 if (I == Current) 1538 break; 1539 Current = I; 1540 ++Column; 1541 } 1542 1543 if (skip_nb_char(Current) == Current) 1544 return true; 1545 1546 if (Column <= BlockExitIndent) { // End of the block literal. 1547 IsDone = true; 1548 return true; 1549 } 1550 1551 if (Column < BlockIndent) { 1552 if (Current != End && *Current == '#') { // Trailing comment. 1553 IsDone = true; 1554 return true; 1555 } 1556 setError("A text line is less indented than the block scalar", Current); 1557 return false; 1558 } 1559 return true; // A normal text line. 1560 } 1561 1562 bool Scanner::scanBlockScalar(bool IsLiteral) { 1563 // Eat '|' or '>' 1564 assert(*Current == '|' || *Current == '>'); 1565 skip(1); 1566 1567 char ChompingIndicator; 1568 unsigned BlockIndent; 1569 bool IsDone = false; 1570 if (!scanBlockScalarHeader(ChompingIndicator, BlockIndent, IsDone)) 1571 return false; 1572 if (IsDone) 1573 return true; 1574 1575 auto Start = Current; 1576 unsigned BlockExitIndent = Indent < 0 ? 0 : (unsigned)Indent; 1577 unsigned LineBreaks = 0; 1578 if (BlockIndent == 0) { 1579 if (!findBlockScalarIndent(BlockIndent, BlockExitIndent, LineBreaks, 1580 IsDone)) 1581 return false; 1582 } 1583 1584 // Scan the block's scalars body. 1585 SmallString<256> Str; 1586 while (!IsDone) { 1587 if (!scanBlockScalarIndent(BlockIndent, BlockExitIndent, IsDone)) 1588 return false; 1589 if (IsDone) 1590 break; 1591 1592 // Parse the current line. 1593 auto LineStart = Current; 1594 advanceWhile(&Scanner::skip_nb_char); 1595 if (LineStart != Current) { 1596 Str.append(LineBreaks, '\n'); 1597 Str.append(StringRef(LineStart, Current - LineStart)); 1598 LineBreaks = 0; 1599 } 1600 1601 // Check for EOF. 1602 if (Current == End) 1603 break; 1604 1605 if (!consumeLineBreakIfPresent()) 1606 break; 1607 ++LineBreaks; 1608 } 1609 1610 if (Current == End && !LineBreaks) 1611 // Ensure that there is at least one line break before the end of file. 1612 LineBreaks = 1; 1613 Str.append(getChompedLineBreaks(ChompingIndicator, LineBreaks, Str), '\n'); 1614 1615 // New lines may start a simple key. 1616 if (!FlowLevel) 1617 IsSimpleKeyAllowed = true; 1618 1619 Token T; 1620 T.Kind = Token::TK_BlockScalar; 1621 T.Range = StringRef(Start, Current - Start); 1622 T.Value = Str.str().str(); 1623 TokenQueue.push_back(T); 1624 return true; 1625 } 1626 1627 bool Scanner::scanTag() { 1628 StringRef::iterator Start = Current; 1629 unsigned ColStart = Column; 1630 skip(1); // Eat !. 1631 if (Current == End || isBlankOrBreak(Current)); // An empty tag. 1632 else if (*Current == '<') { 1633 skip(1); 1634 scan_ns_uri_char(); 1635 if (!consume('>')) 1636 return false; 1637 } else { 1638 // FIXME: Actually parse the c-ns-shorthand-tag rule. 1639 Current = skip_while(&Scanner::skip_ns_char, Current); 1640 } 1641 1642 Token T; 1643 T.Kind = Token::TK_Tag; 1644 T.Range = StringRef(Start, Current - Start); 1645 TokenQueue.push_back(T); 1646 1647 // Tags can be simple keys. 1648 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1649 1650 IsSimpleKeyAllowed = false; 1651 1652 return true; 1653 } 1654 1655 bool Scanner::fetchMoreTokens() { 1656 if (IsStartOfStream) 1657 return scanStreamStart(); 1658 1659 scanToNextToken(); 1660 1661 if (Current == End) 1662 return scanStreamEnd(); 1663 1664 removeStaleSimpleKeyCandidates(); 1665 1666 unrollIndent(Column); 1667 1668 if (Column == 0 && *Current == '%') 1669 return scanDirective(); 1670 1671 if (Column == 0 && Current + 4 <= End 1672 && *Current == '-' 1673 && *(Current + 1) == '-' 1674 && *(Current + 2) == '-' 1675 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1676 return scanDocumentIndicator(true); 1677 1678 if (Column == 0 && Current + 4 <= End 1679 && *Current == '.' 1680 && *(Current + 1) == '.' 1681 && *(Current + 2) == '.' 1682 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1683 return scanDocumentIndicator(false); 1684 1685 if (*Current == '[') 1686 return scanFlowCollectionStart(true); 1687 1688 if (*Current == '{') 1689 return scanFlowCollectionStart(false); 1690 1691 if (*Current == ']') 1692 return scanFlowCollectionEnd(true); 1693 1694 if (*Current == '}') 1695 return scanFlowCollectionEnd(false); 1696 1697 if (*Current == ',') 1698 return scanFlowEntry(); 1699 1700 if (*Current == '-' && isBlankOrBreak(Current + 1)) 1701 return scanBlockEntry(); 1702 1703 if (*Current == '?' && (FlowLevel || isBlankOrBreak(Current + 1))) 1704 return scanKey(); 1705 1706 if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1))) 1707 return scanValue(); 1708 1709 if (*Current == '*') 1710 return scanAliasOrAnchor(true); 1711 1712 if (*Current == '&') 1713 return scanAliasOrAnchor(false); 1714 1715 if (*Current == '!') 1716 return scanTag(); 1717 1718 if (*Current == '|' && !FlowLevel) 1719 return scanBlockScalar(true); 1720 1721 if (*Current == '>' && !FlowLevel) 1722 return scanBlockScalar(false); 1723 1724 if (*Current == '\'') 1725 return scanFlowScalar(false); 1726 1727 if (*Current == '"') 1728 return scanFlowScalar(true); 1729 1730 // Get a plain scalar. 1731 StringRef FirstChar(Current, 1); 1732 if (!(isBlankOrBreak(Current) 1733 || FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos) 1734 || (*Current == '-' && !isBlankOrBreak(Current + 1)) 1735 || (!FlowLevel && (*Current == '?' || *Current == ':') 1736 && isBlankOrBreak(Current + 1)) 1737 || (!FlowLevel && *Current == ':' 1738 && Current + 2 < End 1739 && *(Current + 1) == ':' 1740 && !isBlankOrBreak(Current + 2))) 1741 return scanPlainScalar(); 1742 1743 setError("Unrecognized character while tokenizing."); 1744 return false; 1745 } 1746 1747 Stream::Stream(StringRef Input, SourceMgr &SM, bool ShowColors) 1748 : scanner(new Scanner(Input, SM, ShowColors)), CurrentDoc() {} 1749 1750 Stream::Stream(MemoryBufferRef InputBuffer, SourceMgr &SM, bool ShowColors) 1751 : scanner(new Scanner(InputBuffer, SM, ShowColors)), CurrentDoc() {} 1752 1753 Stream::~Stream() {} 1754 1755 bool Stream::failed() { return scanner->failed(); } 1756 1757 void Stream::printError(Node *N, const Twine &Msg) { 1758 scanner->printError( N->getSourceRange().Start 1759 , SourceMgr::DK_Error 1760 , Msg 1761 , N->getSourceRange()); 1762 } 1763 1764 document_iterator Stream::begin() { 1765 if (CurrentDoc) 1766 report_fatal_error("Can only iterate over the stream once"); 1767 1768 // Skip Stream-Start. 1769 scanner->getNext(); 1770 1771 CurrentDoc.reset(new Document(*this)); 1772 return document_iterator(CurrentDoc); 1773 } 1774 1775 document_iterator Stream::end() { 1776 return document_iterator(); 1777 } 1778 1779 void Stream::skip() { 1780 for (document_iterator i = begin(), e = end(); i != e; ++i) 1781 i->skip(); 1782 } 1783 1784 Node::Node(unsigned int Type, std::unique_ptr<Document> &D, StringRef A, 1785 StringRef T) 1786 : Doc(D), TypeID(Type), Anchor(A), Tag(T) { 1787 SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin()); 1788 SourceRange = SMRange(Start, Start); 1789 } 1790 1791 std::string Node::getVerbatimTag() const { 1792 StringRef Raw = getRawTag(); 1793 if (!Raw.empty() && Raw != "!") { 1794 std::string Ret; 1795 if (Raw.find_last_of('!') == 0) { 1796 Ret = Doc->getTagMap().find("!")->second; 1797 Ret += Raw.substr(1); 1798 return Ret; 1799 } else if (Raw.startswith("!!")) { 1800 Ret = Doc->getTagMap().find("!!")->second; 1801 Ret += Raw.substr(2); 1802 return Ret; 1803 } else { 1804 StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1); 1805 std::map<StringRef, StringRef>::const_iterator It = 1806 Doc->getTagMap().find(TagHandle); 1807 if (It != Doc->getTagMap().end()) 1808 Ret = It->second; 1809 else { 1810 Token T; 1811 T.Kind = Token::TK_Tag; 1812 T.Range = TagHandle; 1813 setError(Twine("Unknown tag handle ") + TagHandle, T); 1814 } 1815 Ret += Raw.substr(Raw.find_last_of('!') + 1); 1816 return Ret; 1817 } 1818 } 1819 1820 switch (getType()) { 1821 case NK_Null: 1822 return "tag:yaml.org,2002:null"; 1823 case NK_Scalar: 1824 case NK_BlockScalar: 1825 // TODO: Tag resolution. 1826 return "tag:yaml.org,2002:str"; 1827 case NK_Mapping: 1828 return "tag:yaml.org,2002:map"; 1829 case NK_Sequence: 1830 return "tag:yaml.org,2002:seq"; 1831 } 1832 1833 return ""; 1834 } 1835 1836 Token &Node::peekNext() { 1837 return Doc->peekNext(); 1838 } 1839 1840 Token Node::getNext() { 1841 return Doc->getNext(); 1842 } 1843 1844 Node *Node::parseBlockNode() { 1845 return Doc->parseBlockNode(); 1846 } 1847 1848 BumpPtrAllocator &Node::getAllocator() { 1849 return Doc->NodeAllocator; 1850 } 1851 1852 void Node::setError(const Twine &Msg, Token &Tok) const { 1853 Doc->setError(Msg, Tok); 1854 } 1855 1856 bool Node::failed() const { 1857 return Doc->failed(); 1858 } 1859 1860 1861 1862 StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const { 1863 // TODO: Handle newlines properly. We need to remove leading whitespace. 1864 if (Value[0] == '"') { // Double quoted. 1865 // Pull off the leading and trailing "s. 1866 StringRef UnquotedValue = Value.substr(1, Value.size() - 2); 1867 // Search for characters that would require unescaping the value. 1868 StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n"); 1869 if (i != StringRef::npos) 1870 return unescapeDoubleQuoted(UnquotedValue, i, Storage); 1871 return UnquotedValue; 1872 } else if (Value[0] == '\'') { // Single quoted. 1873 // Pull off the leading and trailing 's. 1874 StringRef UnquotedValue = Value.substr(1, Value.size() - 2); 1875 StringRef::size_type i = UnquotedValue.find('\''); 1876 if (i != StringRef::npos) { 1877 // We're going to need Storage. 1878 Storage.clear(); 1879 Storage.reserve(UnquotedValue.size()); 1880 for (; i != StringRef::npos; i = UnquotedValue.find('\'')) { 1881 StringRef Valid(UnquotedValue.begin(), i); 1882 Storage.insert(Storage.end(), Valid.begin(), Valid.end()); 1883 Storage.push_back('\''); 1884 UnquotedValue = UnquotedValue.substr(i + 2); 1885 } 1886 Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); 1887 return StringRef(Storage.begin(), Storage.size()); 1888 } 1889 return UnquotedValue; 1890 } 1891 // Plain or block. 1892 return Value.rtrim(' '); 1893 } 1894 1895 StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue 1896 , StringRef::size_type i 1897 , SmallVectorImpl<char> &Storage) 1898 const { 1899 // Use Storage to build proper value. 1900 Storage.clear(); 1901 Storage.reserve(UnquotedValue.size()); 1902 for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) { 1903 // Insert all previous chars into Storage. 1904 StringRef Valid(UnquotedValue.begin(), i); 1905 Storage.insert(Storage.end(), Valid.begin(), Valid.end()); 1906 // Chop off inserted chars. 1907 UnquotedValue = UnquotedValue.substr(i); 1908 1909 assert(!UnquotedValue.empty() && "Can't be empty!"); 1910 1911 // Parse escape or line break. 1912 switch (UnquotedValue[0]) { 1913 case '\r': 1914 case '\n': 1915 Storage.push_back('\n'); 1916 if ( UnquotedValue.size() > 1 1917 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 1918 UnquotedValue = UnquotedValue.substr(1); 1919 UnquotedValue = UnquotedValue.substr(1); 1920 break; 1921 default: 1922 if (UnquotedValue.size() == 1) 1923 // TODO: Report error. 1924 break; 1925 UnquotedValue = UnquotedValue.substr(1); 1926 switch (UnquotedValue[0]) { 1927 default: { 1928 Token T; 1929 T.Range = StringRef(UnquotedValue.begin(), 1); 1930 setError("Unrecognized escape code!", T); 1931 return ""; 1932 } 1933 case '\r': 1934 case '\n': 1935 // Remove the new line. 1936 if ( UnquotedValue.size() > 1 1937 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 1938 UnquotedValue = UnquotedValue.substr(1); 1939 // If this was just a single byte newline, it will get skipped 1940 // below. 1941 break; 1942 case '0': 1943 Storage.push_back(0x00); 1944 break; 1945 case 'a': 1946 Storage.push_back(0x07); 1947 break; 1948 case 'b': 1949 Storage.push_back(0x08); 1950 break; 1951 case 't': 1952 case 0x09: 1953 Storage.push_back(0x09); 1954 break; 1955 case 'n': 1956 Storage.push_back(0x0A); 1957 break; 1958 case 'v': 1959 Storage.push_back(0x0B); 1960 break; 1961 case 'f': 1962 Storage.push_back(0x0C); 1963 break; 1964 case 'r': 1965 Storage.push_back(0x0D); 1966 break; 1967 case 'e': 1968 Storage.push_back(0x1B); 1969 break; 1970 case ' ': 1971 Storage.push_back(0x20); 1972 break; 1973 case '"': 1974 Storage.push_back(0x22); 1975 break; 1976 case '/': 1977 Storage.push_back(0x2F); 1978 break; 1979 case '\\': 1980 Storage.push_back(0x5C); 1981 break; 1982 case 'N': 1983 encodeUTF8(0x85, Storage); 1984 break; 1985 case '_': 1986 encodeUTF8(0xA0, Storage); 1987 break; 1988 case 'L': 1989 encodeUTF8(0x2028, Storage); 1990 break; 1991 case 'P': 1992 encodeUTF8(0x2029, Storage); 1993 break; 1994 case 'x': { 1995 if (UnquotedValue.size() < 3) 1996 // TODO: Report error. 1997 break; 1998 unsigned int UnicodeScalarValue; 1999 if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue)) 2000 // TODO: Report error. 2001 UnicodeScalarValue = 0xFFFD; 2002 encodeUTF8(UnicodeScalarValue, Storage); 2003 UnquotedValue = UnquotedValue.substr(2); 2004 break; 2005 } 2006 case 'u': { 2007 if (UnquotedValue.size() < 5) 2008 // TODO: Report error. 2009 break; 2010 unsigned int UnicodeScalarValue; 2011 if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue)) 2012 // TODO: Report error. 2013 UnicodeScalarValue = 0xFFFD; 2014 encodeUTF8(UnicodeScalarValue, Storage); 2015 UnquotedValue = UnquotedValue.substr(4); 2016 break; 2017 } 2018 case 'U': { 2019 if (UnquotedValue.size() < 9) 2020 // TODO: Report error. 2021 break; 2022 unsigned int UnicodeScalarValue; 2023 if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue)) 2024 // TODO: Report error. 2025 UnicodeScalarValue = 0xFFFD; 2026 encodeUTF8(UnicodeScalarValue, Storage); 2027 UnquotedValue = UnquotedValue.substr(8); 2028 break; 2029 } 2030 } 2031 UnquotedValue = UnquotedValue.substr(1); 2032 } 2033 } 2034 Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); 2035 return StringRef(Storage.begin(), Storage.size()); 2036 } 2037 2038 Node *KeyValueNode::getKey() { 2039 if (Key) 2040 return Key; 2041 // Handle implicit null keys. 2042 { 2043 Token &t = peekNext(); 2044 if ( t.Kind == Token::TK_BlockEnd 2045 || t.Kind == Token::TK_Value 2046 || t.Kind == Token::TK_Error) { 2047 return Key = new (getAllocator()) NullNode(Doc); 2048 } 2049 if (t.Kind == Token::TK_Key) 2050 getNext(); // skip TK_Key. 2051 } 2052 2053 // Handle explicit null keys. 2054 Token &t = peekNext(); 2055 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) { 2056 return Key = new (getAllocator()) NullNode(Doc); 2057 } 2058 2059 // We've got a normal key. 2060 return Key = parseBlockNode(); 2061 } 2062 2063 Node *KeyValueNode::getValue() { 2064 if (Value) 2065 return Value; 2066 getKey()->skip(); 2067 if (failed()) 2068 return Value = new (getAllocator()) NullNode(Doc); 2069 2070 // Handle implicit null values. 2071 { 2072 Token &t = peekNext(); 2073 if ( t.Kind == Token::TK_BlockEnd 2074 || t.Kind == Token::TK_FlowMappingEnd 2075 || t.Kind == Token::TK_Key 2076 || t.Kind == Token::TK_FlowEntry 2077 || t.Kind == Token::TK_Error) { 2078 return Value = new (getAllocator()) NullNode(Doc); 2079 } 2080 2081 if (t.Kind != Token::TK_Value) { 2082 setError("Unexpected token in Key Value.", t); 2083 return Value = new (getAllocator()) NullNode(Doc); 2084 } 2085 getNext(); // skip TK_Value. 2086 } 2087 2088 // Handle explicit null values. 2089 Token &t = peekNext(); 2090 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) { 2091 return Value = new (getAllocator()) NullNode(Doc); 2092 } 2093 2094 // We got a normal value. 2095 return Value = parseBlockNode(); 2096 } 2097 2098 void MappingNode::increment() { 2099 if (failed()) { 2100 IsAtEnd = true; 2101 CurrentEntry = nullptr; 2102 return; 2103 } 2104 if (CurrentEntry) { 2105 CurrentEntry->skip(); 2106 if (Type == MT_Inline) { 2107 IsAtEnd = true; 2108 CurrentEntry = nullptr; 2109 return; 2110 } 2111 } 2112 Token T = peekNext(); 2113 if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) { 2114 // KeyValueNode eats the TK_Key. That way it can detect null keys. 2115 CurrentEntry = new (getAllocator()) KeyValueNode(Doc); 2116 } else if (Type == MT_Block) { 2117 switch (T.Kind) { 2118 case Token::TK_BlockEnd: 2119 getNext(); 2120 IsAtEnd = true; 2121 CurrentEntry = nullptr; 2122 break; 2123 default: 2124 setError("Unexpected token. Expected Key or Block End", T); 2125 case Token::TK_Error: 2126 IsAtEnd = true; 2127 CurrentEntry = nullptr; 2128 } 2129 } else { 2130 switch (T.Kind) { 2131 case Token::TK_FlowEntry: 2132 // Eat the flow entry and recurse. 2133 getNext(); 2134 return increment(); 2135 case Token::TK_FlowMappingEnd: 2136 getNext(); 2137 case Token::TK_Error: 2138 // Set this to end iterator. 2139 IsAtEnd = true; 2140 CurrentEntry = nullptr; 2141 break; 2142 default: 2143 setError( "Unexpected token. Expected Key, Flow Entry, or Flow " 2144 "Mapping End." 2145 , T); 2146 IsAtEnd = true; 2147 CurrentEntry = nullptr; 2148 } 2149 } 2150 } 2151 2152 void SequenceNode::increment() { 2153 if (failed()) { 2154 IsAtEnd = true; 2155 CurrentEntry = nullptr; 2156 return; 2157 } 2158 if (CurrentEntry) 2159 CurrentEntry->skip(); 2160 Token T = peekNext(); 2161 if (SeqType == ST_Block) { 2162 switch (T.Kind) { 2163 case Token::TK_BlockEntry: 2164 getNext(); 2165 CurrentEntry = parseBlockNode(); 2166 if (!CurrentEntry) { // An error occurred. 2167 IsAtEnd = true; 2168 CurrentEntry = nullptr; 2169 } 2170 break; 2171 case Token::TK_BlockEnd: 2172 getNext(); 2173 IsAtEnd = true; 2174 CurrentEntry = nullptr; 2175 break; 2176 default: 2177 setError( "Unexpected token. Expected Block Entry or Block End." 2178 , T); 2179 case Token::TK_Error: 2180 IsAtEnd = true; 2181 CurrentEntry = nullptr; 2182 } 2183 } else if (SeqType == ST_Indentless) { 2184 switch (T.Kind) { 2185 case Token::TK_BlockEntry: 2186 getNext(); 2187 CurrentEntry = parseBlockNode(); 2188 if (!CurrentEntry) { // An error occurred. 2189 IsAtEnd = true; 2190 CurrentEntry = nullptr; 2191 } 2192 break; 2193 default: 2194 case Token::TK_Error: 2195 IsAtEnd = true; 2196 CurrentEntry = nullptr; 2197 } 2198 } else if (SeqType == ST_Flow) { 2199 switch (T.Kind) { 2200 case Token::TK_FlowEntry: 2201 // Eat the flow entry and recurse. 2202 getNext(); 2203 WasPreviousTokenFlowEntry = true; 2204 return increment(); 2205 case Token::TK_FlowSequenceEnd: 2206 getNext(); 2207 case Token::TK_Error: 2208 // Set this to end iterator. 2209 IsAtEnd = true; 2210 CurrentEntry = nullptr; 2211 break; 2212 case Token::TK_StreamEnd: 2213 case Token::TK_DocumentEnd: 2214 case Token::TK_DocumentStart: 2215 setError("Could not find closing ]!", T); 2216 // Set this to end iterator. 2217 IsAtEnd = true; 2218 CurrentEntry = nullptr; 2219 break; 2220 default: 2221 if (!WasPreviousTokenFlowEntry) { 2222 setError("Expected , between entries!", T); 2223 IsAtEnd = true; 2224 CurrentEntry = nullptr; 2225 break; 2226 } 2227 // Otherwise it must be a flow entry. 2228 CurrentEntry = parseBlockNode(); 2229 if (!CurrentEntry) { 2230 IsAtEnd = true; 2231 } 2232 WasPreviousTokenFlowEntry = false; 2233 break; 2234 } 2235 } 2236 } 2237 2238 Document::Document(Stream &S) : stream(S), Root(nullptr) { 2239 // Tag maps starts with two default mappings. 2240 TagMap["!"] = "!"; 2241 TagMap["!!"] = "tag:yaml.org,2002:"; 2242 2243 if (parseDirectives()) 2244 expectToken(Token::TK_DocumentStart); 2245 Token &T = peekNext(); 2246 if (T.Kind == Token::TK_DocumentStart) 2247 getNext(); 2248 } 2249 2250 bool Document::skip() { 2251 if (stream.scanner->failed()) 2252 return false; 2253 if (!Root) 2254 getRoot(); 2255 Root->skip(); 2256 Token &T = peekNext(); 2257 if (T.Kind == Token::TK_StreamEnd) 2258 return false; 2259 if (T.Kind == Token::TK_DocumentEnd) { 2260 getNext(); 2261 return skip(); 2262 } 2263 return true; 2264 } 2265 2266 Token &Document::peekNext() { 2267 return stream.scanner->peekNext(); 2268 } 2269 2270 Token Document::getNext() { 2271 return stream.scanner->getNext(); 2272 } 2273 2274 void Document::setError(const Twine &Message, Token &Location) const { 2275 stream.scanner->setError(Message, Location.Range.begin()); 2276 } 2277 2278 bool Document::failed() const { 2279 return stream.scanner->failed(); 2280 } 2281 2282 Node *Document::parseBlockNode() { 2283 Token T = peekNext(); 2284 // Handle properties. 2285 Token AnchorInfo; 2286 Token TagInfo; 2287 parse_property: 2288 switch (T.Kind) { 2289 case Token::TK_Alias: 2290 getNext(); 2291 return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1)); 2292 case Token::TK_Anchor: 2293 if (AnchorInfo.Kind == Token::TK_Anchor) { 2294 setError("Already encountered an anchor for this node!", T); 2295 return nullptr; 2296 } 2297 AnchorInfo = getNext(); // Consume TK_Anchor. 2298 T = peekNext(); 2299 goto parse_property; 2300 case Token::TK_Tag: 2301 if (TagInfo.Kind == Token::TK_Tag) { 2302 setError("Already encountered a tag for this node!", T); 2303 return nullptr; 2304 } 2305 TagInfo = getNext(); // Consume TK_Tag. 2306 T = peekNext(); 2307 goto parse_property; 2308 default: 2309 break; 2310 } 2311 2312 switch (T.Kind) { 2313 case Token::TK_BlockEntry: 2314 // We got an unindented BlockEntry sequence. This is not terminated with 2315 // a BlockEnd. 2316 // Don't eat the TK_BlockEntry, SequenceNode needs it. 2317 return new (NodeAllocator) SequenceNode( stream.CurrentDoc 2318 , AnchorInfo.Range.substr(1) 2319 , TagInfo.Range 2320 , SequenceNode::ST_Indentless); 2321 case Token::TK_BlockSequenceStart: 2322 getNext(); 2323 return new (NodeAllocator) 2324 SequenceNode( stream.CurrentDoc 2325 , AnchorInfo.Range.substr(1) 2326 , TagInfo.Range 2327 , SequenceNode::ST_Block); 2328 case Token::TK_BlockMappingStart: 2329 getNext(); 2330 return new (NodeAllocator) 2331 MappingNode( stream.CurrentDoc 2332 , AnchorInfo.Range.substr(1) 2333 , TagInfo.Range 2334 , MappingNode::MT_Block); 2335 case Token::TK_FlowSequenceStart: 2336 getNext(); 2337 return new (NodeAllocator) 2338 SequenceNode( stream.CurrentDoc 2339 , AnchorInfo.Range.substr(1) 2340 , TagInfo.Range 2341 , SequenceNode::ST_Flow); 2342 case Token::TK_FlowMappingStart: 2343 getNext(); 2344 return new (NodeAllocator) 2345 MappingNode( stream.CurrentDoc 2346 , AnchorInfo.Range.substr(1) 2347 , TagInfo.Range 2348 , MappingNode::MT_Flow); 2349 case Token::TK_Scalar: 2350 getNext(); 2351 return new (NodeAllocator) 2352 ScalarNode( stream.CurrentDoc 2353 , AnchorInfo.Range.substr(1) 2354 , TagInfo.Range 2355 , T.Range); 2356 case Token::TK_BlockScalar: { 2357 getNext(); 2358 StringRef NullTerminatedStr(T.Value.c_str(), T.Value.length() + 1); 2359 StringRef StrCopy = NullTerminatedStr.copy(NodeAllocator).drop_back(); 2360 return new (NodeAllocator) 2361 BlockScalarNode(stream.CurrentDoc, AnchorInfo.Range.substr(1), 2362 TagInfo.Range, StrCopy, T.Range); 2363 } 2364 case Token::TK_Key: 2365 // Don't eat the TK_Key, KeyValueNode expects it. 2366 return new (NodeAllocator) 2367 MappingNode( stream.CurrentDoc 2368 , AnchorInfo.Range.substr(1) 2369 , TagInfo.Range 2370 , MappingNode::MT_Inline); 2371 case Token::TK_DocumentStart: 2372 case Token::TK_DocumentEnd: 2373 case Token::TK_StreamEnd: 2374 default: 2375 // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not 2376 // !!null null. 2377 return new (NodeAllocator) NullNode(stream.CurrentDoc); 2378 case Token::TK_Error: 2379 return nullptr; 2380 } 2381 llvm_unreachable("Control flow shouldn't reach here."); 2382 return nullptr; 2383 } 2384 2385 bool Document::parseDirectives() { 2386 bool isDirective = false; 2387 while (true) { 2388 Token T = peekNext(); 2389 if (T.Kind == Token::TK_TagDirective) { 2390 parseTAGDirective(); 2391 isDirective = true; 2392 } else if (T.Kind == Token::TK_VersionDirective) { 2393 parseYAMLDirective(); 2394 isDirective = true; 2395 } else 2396 break; 2397 } 2398 return isDirective; 2399 } 2400 2401 void Document::parseYAMLDirective() { 2402 getNext(); // Eat %YAML <version> 2403 } 2404 2405 void Document::parseTAGDirective() { 2406 Token Tag = getNext(); // %TAG <handle> <prefix> 2407 StringRef T = Tag.Range; 2408 // Strip %TAG 2409 T = T.substr(T.find_first_of(" \t")).ltrim(" \t"); 2410 std::size_t HandleEnd = T.find_first_of(" \t"); 2411 StringRef TagHandle = T.substr(0, HandleEnd); 2412 StringRef TagPrefix = T.substr(HandleEnd).ltrim(" \t"); 2413 TagMap[TagHandle] = TagPrefix; 2414 } 2415 2416 bool Document::expectToken(int TK) { 2417 Token T = getNext(); 2418 if (T.Kind != TK) { 2419 setError("Unexpected token", T); 2420 return false; 2421 } 2422 return true; 2423 } 2424