1 //===--- YAMLParser.cpp - Simple YAML parser ------------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file implements a YAML parser. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "llvm/Support/YAMLParser.h" 15 #include "llvm/ADT/STLExtras.h" 16 #include "llvm/ADT/SmallString.h" 17 #include "llvm/ADT/SmallVector.h" 18 #include "llvm/ADT/StringExtras.h" 19 #include "llvm/ADT/Twine.h" 20 #include "llvm/ADT/ilist.h" 21 #include "llvm/ADT/ilist_node.h" 22 #include "llvm/Support/ErrorHandling.h" 23 #include "llvm/Support/MemoryBuffer.h" 24 #include "llvm/Support/SourceMgr.h" 25 #include "llvm/Support/raw_ostream.h" 26 27 using namespace llvm; 28 using namespace yaml; 29 30 enum UnicodeEncodingForm { 31 UEF_UTF32_LE, ///< UTF-32 Little Endian 32 UEF_UTF32_BE, ///< UTF-32 Big Endian 33 UEF_UTF16_LE, ///< UTF-16 Little Endian 34 UEF_UTF16_BE, ///< UTF-16 Big Endian 35 UEF_UTF8, ///< UTF-8 or ascii. 36 UEF_Unknown ///< Not a valid Unicode encoding. 37 }; 38 39 /// EncodingInfo - Holds the encoding type and length of the byte order mark if 40 /// it exists. Length is in {0, 2, 3, 4}. 41 typedef std::pair<UnicodeEncodingForm, unsigned> EncodingInfo; 42 43 /// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode 44 /// encoding form of \a Input. 45 /// 46 /// @param Input A string of length 0 or more. 47 /// @returns An EncodingInfo indicating the Unicode encoding form of the input 48 /// and how long the byte order mark is if one exists. 
49 static EncodingInfo getUnicodeEncoding(StringRef Input) { 50 if (Input.size() == 0) 51 return std::make_pair(UEF_Unknown, 0); 52 53 switch (uint8_t(Input[0])) { 54 case 0x00: 55 if (Input.size() >= 4) { 56 if ( Input[1] == 0 57 && uint8_t(Input[2]) == 0xFE 58 && uint8_t(Input[3]) == 0xFF) 59 return std::make_pair(UEF_UTF32_BE, 4); 60 if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0) 61 return std::make_pair(UEF_UTF32_BE, 0); 62 } 63 64 if (Input.size() >= 2 && Input[1] != 0) 65 return std::make_pair(UEF_UTF16_BE, 0); 66 return std::make_pair(UEF_Unknown, 0); 67 case 0xFF: 68 if ( Input.size() >= 4 69 && uint8_t(Input[1]) == 0xFE 70 && Input[2] == 0 71 && Input[3] == 0) 72 return std::make_pair(UEF_UTF32_LE, 4); 73 74 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE) 75 return std::make_pair(UEF_UTF16_LE, 2); 76 return std::make_pair(UEF_Unknown, 0); 77 case 0xFE: 78 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF) 79 return std::make_pair(UEF_UTF16_BE, 2); 80 return std::make_pair(UEF_Unknown, 0); 81 case 0xEF: 82 if ( Input.size() >= 3 83 && uint8_t(Input[1]) == 0xBB 84 && uint8_t(Input[2]) == 0xBF) 85 return std::make_pair(UEF_UTF8, 3); 86 return std::make_pair(UEF_Unknown, 0); 87 } 88 89 // It could still be utf-32 or utf-16. 90 if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0) 91 return std::make_pair(UEF_UTF32_LE, 0); 92 93 if (Input.size() >= 2 && Input[1] == 0) 94 return std::make_pair(UEF_UTF16_LE, 0); 95 96 return std::make_pair(UEF_UTF8, 0); 97 } 98 99 namespace llvm { 100 namespace yaml { 101 /// Pin the vtables to this file. 102 void Node::anchor() {} 103 void NullNode::anchor() {} 104 void ScalarNode::anchor() {} 105 void BlockScalarNode::anchor() {} 106 void KeyValueNode::anchor() {} 107 void MappingNode::anchor() {} 108 void SequenceNode::anchor() {} 109 void AliasNode::anchor() {} 110 111 /// Token - A single YAML token. 
112 struct Token : ilist_node<Token> { 113 enum TokenKind { 114 TK_Error, // Uninitialized token. 115 TK_StreamStart, 116 TK_StreamEnd, 117 TK_VersionDirective, 118 TK_TagDirective, 119 TK_DocumentStart, 120 TK_DocumentEnd, 121 TK_BlockEntry, 122 TK_BlockEnd, 123 TK_BlockSequenceStart, 124 TK_BlockMappingStart, 125 TK_FlowEntry, 126 TK_FlowSequenceStart, 127 TK_FlowSequenceEnd, 128 TK_FlowMappingStart, 129 TK_FlowMappingEnd, 130 TK_Key, 131 TK_Value, 132 TK_Scalar, 133 TK_BlockScalar, 134 TK_Alias, 135 TK_Anchor, 136 TK_Tag 137 } Kind; 138 139 /// A string of length 0 or more whose begin() points to the logical location 140 /// of the token in the input. 141 StringRef Range; 142 143 /// The value of a block scalar node. 144 std::string Value; 145 146 Token() : Kind(TK_Error) {} 147 }; 148 } 149 } 150 151 namespace llvm { 152 template <> 153 struct ilist_sentinel_traits<Token> 154 : public ilist_full_embedded_sentinel_traits<Token> {}; 155 156 template<> 157 struct ilist_node_traits<Token> { 158 Token *createNode(const Token &V) { 159 return new (Alloc.Allocate<Token>()) Token(V); 160 } 161 static void deleteNode(Token *V) { V->~Token(); } 162 163 void addNodeToList(Token *) {} 164 void removeNodeFromList(Token *) {} 165 void transferNodesFromList(ilist_node_traits & /*SrcTraits*/, 166 ilist_iterator<Token> /*first*/, 167 ilist_iterator<Token> /*last*/) {} 168 169 BumpPtrAllocator Alloc; 170 }; 171 } 172 173 typedef ilist<Token> TokenQueueT; 174 175 namespace { 176 /// @brief This struct is used to track simple keys. 177 /// 178 /// Simple keys are handled by creating an entry in SimpleKeys for each Token 179 /// which could legally be the start of a simple key. When peekNext is called, 180 /// if the Token To be returned is referenced by a SimpleKey, we continue 181 /// tokenizing until that potential simple key has either been found to not be 182 /// a simple key (we moved on to the next line or went further than 1024 chars). 
183 /// Or when we run into a Value, and then insert a Key token (and possibly 184 /// others) before the SimpleKey's Tok. 185 struct SimpleKey { 186 TokenQueueT::iterator Tok; 187 unsigned Column; 188 unsigned Line; 189 unsigned FlowLevel; 190 bool IsRequired; 191 192 bool operator ==(const SimpleKey &Other) { 193 return Tok == Other.Tok; 194 } 195 }; 196 } 197 198 /// @brief The Unicode scalar value of a UTF-8 minimal well-formed code unit 199 /// subsequence and the subsequence's length in code units (uint8_t). 200 /// A length of 0 represents an error. 201 typedef std::pair<uint32_t, unsigned> UTF8Decoded; 202 203 static UTF8Decoded decodeUTF8(StringRef Range) { 204 StringRef::iterator Position= Range.begin(); 205 StringRef::iterator End = Range.end(); 206 // 1 byte: [0x00, 0x7f] 207 // Bit pattern: 0xxxxxxx 208 if ((*Position & 0x80) == 0) { 209 return std::make_pair(*Position, 1); 210 } 211 // 2 bytes: [0x80, 0x7ff] 212 // Bit pattern: 110xxxxx 10xxxxxx 213 if (Position + 1 != End && 214 ((*Position & 0xE0) == 0xC0) && 215 ((*(Position + 1) & 0xC0) == 0x80)) { 216 uint32_t codepoint = ((*Position & 0x1F) << 6) | 217 (*(Position + 1) & 0x3F); 218 if (codepoint >= 0x80) 219 return std::make_pair(codepoint, 2); 220 } 221 // 3 bytes: [0x8000, 0xffff] 222 // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx 223 if (Position + 2 != End && 224 ((*Position & 0xF0) == 0xE0) && 225 ((*(Position + 1) & 0xC0) == 0x80) && 226 ((*(Position + 2) & 0xC0) == 0x80)) { 227 uint32_t codepoint = ((*Position & 0x0F) << 12) | 228 ((*(Position + 1) & 0x3F) << 6) | 229 (*(Position + 2) & 0x3F); 230 // Codepoints between 0xD800 and 0xDFFF are invalid, as 231 // they are high / low surrogate halves used by UTF-16. 
232 if (codepoint >= 0x800 && 233 (codepoint < 0xD800 || codepoint > 0xDFFF)) 234 return std::make_pair(codepoint, 3); 235 } 236 // 4 bytes: [0x10000, 0x10FFFF] 237 // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 238 if (Position + 3 != End && 239 ((*Position & 0xF8) == 0xF0) && 240 ((*(Position + 1) & 0xC0) == 0x80) && 241 ((*(Position + 2) & 0xC0) == 0x80) && 242 ((*(Position + 3) & 0xC0) == 0x80)) { 243 uint32_t codepoint = ((*Position & 0x07) << 18) | 244 ((*(Position + 1) & 0x3F) << 12) | 245 ((*(Position + 2) & 0x3F) << 6) | 246 (*(Position + 3) & 0x3F); 247 if (codepoint >= 0x10000 && codepoint <= 0x10FFFF) 248 return std::make_pair(codepoint, 4); 249 } 250 return std::make_pair(0, 0); 251 } 252 253 namespace llvm { 254 namespace yaml { 255 /// @brief Scans YAML tokens from a MemoryBuffer. 256 class Scanner { 257 public: 258 Scanner(StringRef Input, SourceMgr &SM, bool ShowColors = true); 259 Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors = true); 260 261 /// @brief Parse the next token and return it without popping it. 262 Token &peekNext(); 263 264 /// @brief Parse the next token and pop it from the queue. 265 Token getNext(); 266 267 void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message, 268 ArrayRef<SMRange> Ranges = None) { 269 SM.PrintMessage(Loc, Kind, Message, Ranges, /* FixIts= */ None, ShowColors); 270 } 271 272 void setError(const Twine &Message, StringRef::iterator Position) { 273 if (Current >= End) 274 Current = End - 1; 275 276 // Don't print out more errors after the first one we encounter. The rest 277 // are just the result of the first, and have no meaning. 278 if (!Failed) 279 printError(SMLoc::getFromPointer(Current), SourceMgr::DK_Error, Message); 280 Failed = true; 281 } 282 283 void setError(const Twine &Message) { 284 setError(Message, Current); 285 } 286 287 /// @brief Returns true if an error occurred while parsing. 
288 bool failed() { 289 return Failed; 290 } 291 292 private: 293 void init(MemoryBufferRef Buffer); 294 295 StringRef currentInput() { 296 return StringRef(Current, End - Current); 297 } 298 299 /// @brief Decode a UTF-8 minimal well-formed code unit subsequence starting 300 /// at \a Position. 301 /// 302 /// If the UTF-8 code units starting at Position do not form a well-formed 303 /// code unit subsequence, then the Unicode scalar value is 0, and the length 304 /// is 0. 305 UTF8Decoded decodeUTF8(StringRef::iterator Position) { 306 return ::decodeUTF8(StringRef(Position, End - Position)); 307 } 308 309 // The following functions are based on the gramar rules in the YAML spec. The 310 // style of the function names it meant to closely match how they are written 311 // in the spec. The number within the [] is the number of the grammar rule in 312 // the spec. 313 // 314 // See 4.2 [Production Naming Conventions] for the meaning of the prefixes. 315 // 316 // c- 317 // A production starting and ending with a special character. 318 // b- 319 // A production matching a single line break. 320 // nb- 321 // A production starting and ending with a non-break character. 322 // s- 323 // A production starting and ending with a white space character. 324 // ns- 325 // A production starting and ending with a non-space character. 326 // l- 327 // A production matching complete line(s). 328 329 /// @brief Skip a single nb-char[27] starting at Position. 330 /// 331 /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE] 332 /// | [0xFF00-0xFFFD] | [0x10000-0x10FFFF] 333 /// 334 /// @returns The code unit after the nb-char, or Position if it's not an 335 /// nb-char. 336 StringRef::iterator skip_nb_char(StringRef::iterator Position); 337 338 /// @brief Skip a single b-break[28] starting at Position. 339 /// 340 /// A b-break is 0xD 0xA | 0xD | 0xA 341 /// 342 /// @returns The code unit after the b-break, or Position if it's not a 343 /// b-break. 
344 StringRef::iterator skip_b_break(StringRef::iterator Position); 345 346 /// Skip a single s-space[31] starting at Position. 347 /// 348 /// An s-space is 0x20 349 /// 350 /// @returns The code unit after the s-space, or Position if it's not a 351 /// s-space. 352 StringRef::iterator skip_s_space(StringRef::iterator Position); 353 354 /// @brief Skip a single s-white[33] starting at Position. 355 /// 356 /// A s-white is 0x20 | 0x9 357 /// 358 /// @returns The code unit after the s-white, or Position if it's not a 359 /// s-white. 360 StringRef::iterator skip_s_white(StringRef::iterator Position); 361 362 /// @brief Skip a single ns-char[34] starting at Position. 363 /// 364 /// A ns-char is nb-char - s-white 365 /// 366 /// @returns The code unit after the ns-char, or Position if it's not a 367 /// ns-char. 368 StringRef::iterator skip_ns_char(StringRef::iterator Position); 369 370 typedef StringRef::iterator (Scanner::*SkipWhileFunc)(StringRef::iterator); 371 /// @brief Skip minimal well-formed code unit subsequences until Func 372 /// returns its input. 373 /// 374 /// @returns The code unit after the last minimal well-formed code unit 375 /// subsequence that Func accepted. 376 StringRef::iterator skip_while( SkipWhileFunc Func 377 , StringRef::iterator Position); 378 379 /// Skip minimal well-formed code unit subsequences until Func returns its 380 /// input. 381 void advanceWhile(SkipWhileFunc Func); 382 383 /// @brief Scan ns-uri-char[39]s starting at Cur. 384 /// 385 /// This updates Cur and Column while scanning. 386 /// 387 /// @returns A StringRef starting at Cur which covers the longest contiguous 388 /// sequence of ns-uri-char. 389 StringRef scan_ns_uri_char(); 390 391 /// @brief Consume a minimal well-formed code unit subsequence starting at 392 /// \a Cur. Return false if it is not the same Unicode scalar value as 393 /// \a Expected. This updates \a Column. 
394 bool consume(uint32_t Expected); 395 396 /// @brief Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column. 397 void skip(uint32_t Distance); 398 399 /// @brief Return true if the minimal well-formed code unit subsequence at 400 /// Pos is whitespace or a new line 401 bool isBlankOrBreak(StringRef::iterator Position); 402 403 /// Consume a single b-break[28] if it's present at the current position. 404 /// 405 /// Return false if the code unit at the current position isn't a line break. 406 bool consumeLineBreakIfPresent(); 407 408 /// @brief If IsSimpleKeyAllowed, create and push_back a new SimpleKey. 409 void saveSimpleKeyCandidate( TokenQueueT::iterator Tok 410 , unsigned AtColumn 411 , bool IsRequired); 412 413 /// @brief Remove simple keys that can no longer be valid simple keys. 414 /// 415 /// Invalid simple keys are not on the current line or are further than 1024 416 /// columns back. 417 void removeStaleSimpleKeyCandidates(); 418 419 /// @brief Remove all simple keys on FlowLevel \a Level. 420 void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level); 421 422 /// @brief Unroll indentation in \a Indents back to \a Col. Creates BlockEnd 423 /// tokens if needed. 424 bool unrollIndent(int ToColumn); 425 426 /// @brief Increase indent to \a Col. Creates \a Kind token at \a InsertPoint 427 /// if needed. 428 bool rollIndent( int ToColumn 429 , Token::TokenKind Kind 430 , TokenQueueT::iterator InsertPoint); 431 432 /// @brief Skip a single-line comment when the comment starts at the current 433 /// position of the scanner. 434 void skipComment(); 435 436 /// @brief Skip whitespace and comments until the start of the next token. 437 void scanToNextToken(); 438 439 /// @brief Must be the first token generated. 440 bool scanStreamStart(); 441 442 /// @brief Generate tokens needed to close out the stream. 443 bool scanStreamEnd(); 444 445 /// @brief Scan a %BLAH directive. 446 bool scanDirective(); 447 448 /// @brief Scan a ... or ---. 
449 bool scanDocumentIndicator(bool IsStart); 450 451 /// @brief Scan a [ or { and generate the proper flow collection start token. 452 bool scanFlowCollectionStart(bool IsSequence); 453 454 /// @brief Scan a ] or } and generate the proper flow collection end token. 455 bool scanFlowCollectionEnd(bool IsSequence); 456 457 /// @brief Scan the , that separates entries in a flow collection. 458 bool scanFlowEntry(); 459 460 /// @brief Scan the - that starts block sequence entries. 461 bool scanBlockEntry(); 462 463 /// @brief Scan an explicit ? indicating a key. 464 bool scanKey(); 465 466 /// @brief Scan an explicit : indicating a value. 467 bool scanValue(); 468 469 /// @brief Scan a quoted scalar. 470 bool scanFlowScalar(bool IsDoubleQuoted); 471 472 /// @brief Scan an unquoted scalar. 473 bool scanPlainScalar(); 474 475 /// @brief Scan an Alias or Anchor starting with * or &. 476 bool scanAliasOrAnchor(bool IsAlias); 477 478 /// @brief Scan a block scalar starting with | or >. 479 bool scanBlockScalar(bool IsLiteral); 480 481 /// Scan a chomping indicator in a block scalar header. 482 char scanBlockChompingIndicator(); 483 484 /// Scan an indentation indicator in a block scalar header. 485 unsigned scanBlockIndentationIndicator(); 486 487 /// Scan a block scalar header. 488 /// 489 /// Return false if an error occurred. 490 bool scanBlockScalarHeader(char &ChompingIndicator, unsigned &IndentIndicator, 491 bool &IsDone); 492 493 /// Look for the indentation level of a block scalar. 494 /// 495 /// Return false if an error occurred. 496 bool findBlockScalarIndent(unsigned &BlockIndent, unsigned BlockExitIndent, 497 unsigned &LineBreaks, bool &IsDone); 498 499 /// Scan the indentation of a text line in a block scalar. 500 /// 501 /// Return false if an error occurred. 502 bool scanBlockScalarIndent(unsigned BlockIndent, unsigned BlockExitIndent, 503 bool &IsDone); 504 505 /// @brief Scan a tag of the form !stuff. 
506 bool scanTag(); 507 508 /// @brief Dispatch to the next scanning function based on \a *Cur. 509 bool fetchMoreTokens(); 510 511 /// @brief The SourceMgr used for diagnostics and buffer management. 512 SourceMgr &SM; 513 514 /// @brief The original input. 515 MemoryBufferRef InputBuffer; 516 517 /// @brief The current position of the scanner. 518 StringRef::iterator Current; 519 520 /// @brief The end of the input (one past the last character). 521 StringRef::iterator End; 522 523 /// @brief Current YAML indentation level in spaces. 524 int Indent; 525 526 /// @brief Current column number in Unicode code points. 527 unsigned Column; 528 529 /// @brief Current line number. 530 unsigned Line; 531 532 /// @brief How deep we are in flow style containers. 0 Means at block level. 533 unsigned FlowLevel; 534 535 /// @brief Are we at the start of the stream? 536 bool IsStartOfStream; 537 538 /// @brief Can the next token be the start of a simple key? 539 bool IsSimpleKeyAllowed; 540 541 /// @brief True if an error has occurred. 542 bool Failed; 543 544 /// @brief Should colors be used when printing out the diagnostic messages? 545 bool ShowColors; 546 547 /// @brief Queue of tokens. This is required to queue up tokens while looking 548 /// for the end of a simple key. And for cases where a single character 549 /// can produce multiple tokens (e.g. BlockEnd). 550 TokenQueueT TokenQueue; 551 552 /// @brief Indentation levels. 553 SmallVector<int, 4> Indents; 554 555 /// @brief Potential simple keys. 556 SmallVector<SimpleKey, 4> SimpleKeys; 557 }; 558 559 } // end namespace yaml 560 } // end namespace llvm 561 562 /// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result. 
563 static void encodeUTF8( uint32_t UnicodeScalarValue 564 , SmallVectorImpl<char> &Result) { 565 if (UnicodeScalarValue <= 0x7F) { 566 Result.push_back(UnicodeScalarValue & 0x7F); 567 } else if (UnicodeScalarValue <= 0x7FF) { 568 uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6); 569 uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F); 570 Result.push_back(FirstByte); 571 Result.push_back(SecondByte); 572 } else if (UnicodeScalarValue <= 0xFFFF) { 573 uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12); 574 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 575 uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F); 576 Result.push_back(FirstByte); 577 Result.push_back(SecondByte); 578 Result.push_back(ThirdByte); 579 } else if (UnicodeScalarValue <= 0x10FFFF) { 580 uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18); 581 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12); 582 uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 583 uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F); 584 Result.push_back(FirstByte); 585 Result.push_back(SecondByte); 586 Result.push_back(ThirdByte); 587 Result.push_back(FourthByte); 588 } 589 } 590 591 bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) { 592 SourceMgr SM; 593 Scanner scanner(Input, SM); 594 while (true) { 595 Token T = scanner.getNext(); 596 switch (T.Kind) { 597 case Token::TK_StreamStart: 598 OS << "Stream-Start: "; 599 break; 600 case Token::TK_StreamEnd: 601 OS << "Stream-End: "; 602 break; 603 case Token::TK_VersionDirective: 604 OS << "Version-Directive: "; 605 break; 606 case Token::TK_TagDirective: 607 OS << "Tag-Directive: "; 608 break; 609 case Token::TK_DocumentStart: 610 OS << "Document-Start: "; 611 break; 612 case Token::TK_DocumentEnd: 613 OS << "Document-End: "; 614 break; 615 case Token::TK_BlockEntry: 616 OS << "Block-Entry: "; 617 break; 618 case Token::TK_BlockEnd: 619 OS << "Block-End: "; 620 break; 
621 case Token::TK_BlockSequenceStart: 622 OS << "Block-Sequence-Start: "; 623 break; 624 case Token::TK_BlockMappingStart: 625 OS << "Block-Mapping-Start: "; 626 break; 627 case Token::TK_FlowEntry: 628 OS << "Flow-Entry: "; 629 break; 630 case Token::TK_FlowSequenceStart: 631 OS << "Flow-Sequence-Start: "; 632 break; 633 case Token::TK_FlowSequenceEnd: 634 OS << "Flow-Sequence-End: "; 635 break; 636 case Token::TK_FlowMappingStart: 637 OS << "Flow-Mapping-Start: "; 638 break; 639 case Token::TK_FlowMappingEnd: 640 OS << "Flow-Mapping-End: "; 641 break; 642 case Token::TK_Key: 643 OS << "Key: "; 644 break; 645 case Token::TK_Value: 646 OS << "Value: "; 647 break; 648 case Token::TK_Scalar: 649 OS << "Scalar: "; 650 break; 651 case Token::TK_BlockScalar: 652 OS << "Block Scalar: "; 653 break; 654 case Token::TK_Alias: 655 OS << "Alias: "; 656 break; 657 case Token::TK_Anchor: 658 OS << "Anchor: "; 659 break; 660 case Token::TK_Tag: 661 OS << "Tag: "; 662 break; 663 case Token::TK_Error: 664 break; 665 } 666 OS << T.Range << "\n"; 667 if (T.Kind == Token::TK_StreamEnd) 668 break; 669 else if (T.Kind == Token::TK_Error) 670 return false; 671 } 672 return true; 673 } 674 675 bool yaml::scanTokens(StringRef Input) { 676 llvm::SourceMgr SM; 677 llvm::yaml::Scanner scanner(Input, SM); 678 for (;;) { 679 llvm::yaml::Token T = scanner.getNext(); 680 if (T.Kind == Token::TK_StreamEnd) 681 break; 682 else if (T.Kind == Token::TK_Error) 683 return false; 684 } 685 return true; 686 } 687 688 std::string yaml::escape(StringRef Input) { 689 std::string EscapedInput; 690 for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) { 691 if (*i == '\\') 692 EscapedInput += "\\\\"; 693 else if (*i == '"') 694 EscapedInput += "\\\""; 695 else if (*i == 0) 696 EscapedInput += "\\0"; 697 else if (*i == 0x07) 698 EscapedInput += "\\a"; 699 else if (*i == 0x08) 700 EscapedInput += "\\b"; 701 else if (*i == 0x09) 702 EscapedInput += "\\t"; 703 else if (*i == 0x0A) 704 
EscapedInput += "\\n"; 705 else if (*i == 0x0B) 706 EscapedInput += "\\v"; 707 else if (*i == 0x0C) 708 EscapedInput += "\\f"; 709 else if (*i == 0x0D) 710 EscapedInput += "\\r"; 711 else if (*i == 0x1B) 712 EscapedInput += "\\e"; 713 else if ((unsigned char)*i < 0x20) { // Control characters not handled above. 714 std::string HexStr = utohexstr(*i); 715 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 716 } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence. 717 UTF8Decoded UnicodeScalarValue 718 = decodeUTF8(StringRef(i, Input.end() - i)); 719 if (UnicodeScalarValue.second == 0) { 720 // Found invalid char. 721 SmallString<4> Val; 722 encodeUTF8(0xFFFD, Val); 723 EscapedInput.insert(EscapedInput.end(), Val.begin(), Val.end()); 724 // FIXME: Error reporting. 725 return EscapedInput; 726 } 727 if (UnicodeScalarValue.first == 0x85) 728 EscapedInput += "\\N"; 729 else if (UnicodeScalarValue.first == 0xA0) 730 EscapedInput += "\\_"; 731 else if (UnicodeScalarValue.first == 0x2028) 732 EscapedInput += "\\L"; 733 else if (UnicodeScalarValue.first == 0x2029) 734 EscapedInput += "\\P"; 735 else { 736 std::string HexStr = utohexstr(UnicodeScalarValue.first); 737 if (HexStr.size() <= 2) 738 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 739 else if (HexStr.size() <= 4) 740 EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr; 741 else if (HexStr.size() <= 8) 742 EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr; 743 } 744 i += UnicodeScalarValue.second - 1; 745 } else 746 EscapedInput.push_back(*i); 747 } 748 return EscapedInput; 749 } 750 751 Scanner::Scanner(StringRef Input, SourceMgr &sm, bool ShowColors) 752 : SM(sm), ShowColors(ShowColors) { 753 init(MemoryBufferRef(Input, "YAML")); 754 } 755 756 Scanner::Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors) 757 : SM(SM_), ShowColors(ShowColors) { 758 init(Buffer); 759 } 760 761 void Scanner::init(MemoryBufferRef 
Buffer) { 762 InputBuffer = Buffer; 763 Current = InputBuffer.getBufferStart(); 764 End = InputBuffer.getBufferEnd(); 765 Indent = -1; 766 Column = 0; 767 Line = 0; 768 FlowLevel = 0; 769 IsStartOfStream = true; 770 IsSimpleKeyAllowed = true; 771 Failed = false; 772 std::unique_ptr<MemoryBuffer> InputBufferOwner = 773 MemoryBuffer::getMemBuffer(Buffer); 774 SM.AddNewSourceBuffer(std::move(InputBufferOwner), SMLoc()); 775 } 776 777 Token &Scanner::peekNext() { 778 // If the current token is a possible simple key, keep parsing until we 779 // can confirm. 780 bool NeedMore = false; 781 while (true) { 782 if (TokenQueue.empty() || NeedMore) { 783 if (!fetchMoreTokens()) { 784 TokenQueue.clear(); 785 TokenQueue.push_back(Token()); 786 return TokenQueue.front(); 787 } 788 } 789 assert(!TokenQueue.empty() && 790 "fetchMoreTokens lied about getting tokens!"); 791 792 removeStaleSimpleKeyCandidates(); 793 SimpleKey SK; 794 SK.Tok = TokenQueue.begin(); 795 if (!is_contained(SimpleKeys, SK)) 796 break; 797 else 798 NeedMore = true; 799 } 800 return TokenQueue.front(); 801 } 802 803 Token Scanner::getNext() { 804 Token Ret = peekNext(); 805 // TokenQueue can be empty if there was an error getting the next token. 806 if (!TokenQueue.empty()) 807 TokenQueue.pop_front(); 808 809 // There cannot be any referenced Token's if the TokenQueue is empty. So do a 810 // quick deallocation of them all. 811 if (TokenQueue.empty()) { 812 TokenQueue.Alloc.Reset(); 813 } 814 815 return Ret; 816 } 817 818 StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) { 819 if (Position == End) 820 return Position; 821 // Check 7 bit c-printable - b-char. 822 if ( *Position == 0x09 823 || (*Position >= 0x20 && *Position <= 0x7E)) 824 return Position + 1; 825 826 // Check for valid UTF-8. 
827 if (uint8_t(*Position) & 0x80) { 828 UTF8Decoded u8d = decodeUTF8(Position); 829 if ( u8d.second != 0 830 && u8d.first != 0xFEFF 831 && ( u8d.first == 0x85 832 || ( u8d.first >= 0xA0 833 && u8d.first <= 0xD7FF) 834 || ( u8d.first >= 0xE000 835 && u8d.first <= 0xFFFD) 836 || ( u8d.first >= 0x10000 837 && u8d.first <= 0x10FFFF))) 838 return Position + u8d.second; 839 } 840 return Position; 841 } 842 843 StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) { 844 if (Position == End) 845 return Position; 846 if (*Position == 0x0D) { 847 if (Position + 1 != End && *(Position + 1) == 0x0A) 848 return Position + 2; 849 return Position + 1; 850 } 851 852 if (*Position == 0x0A) 853 return Position + 1; 854 return Position; 855 } 856 857 StringRef::iterator Scanner::skip_s_space(StringRef::iterator Position) { 858 if (Position == End) 859 return Position; 860 if (*Position == ' ') 861 return Position + 1; 862 return Position; 863 } 864 865 StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) { 866 if (Position == End) 867 return Position; 868 if (*Position == ' ' || *Position == '\t') 869 return Position + 1; 870 return Position; 871 } 872 873 StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) { 874 if (Position == End) 875 return Position; 876 if (*Position == ' ' || *Position == '\t') 877 return Position; 878 return skip_nb_char(Position); 879 } 880 881 StringRef::iterator Scanner::skip_while( SkipWhileFunc Func 882 , StringRef::iterator Position) { 883 while (true) { 884 StringRef::iterator i = (this->*Func)(Position); 885 if (i == Position) 886 break; 887 Position = i; 888 } 889 return Position; 890 } 891 892 void Scanner::advanceWhile(SkipWhileFunc Func) { 893 auto Final = skip_while(Func, Current); 894 Column += Final - Current; 895 Current = Final; 896 } 897 898 static bool is_ns_hex_digit(const char C) { 899 return (C >= '0' && C <= '9') 900 || (C >= 'a' && C <= 'z') 901 || (C >= 'A' && C <= 'Z'); 902 } 
903 904 static bool is_ns_word_char(const char C) { 905 return C == '-' 906 || (C >= 'a' && C <= 'z') 907 || (C >= 'A' && C <= 'Z'); 908 } 909 910 StringRef Scanner::scan_ns_uri_char() { 911 StringRef::iterator Start = Current; 912 while (true) { 913 if (Current == End) 914 break; 915 if (( *Current == '%' 916 && Current + 2 < End 917 && is_ns_hex_digit(*(Current + 1)) 918 && is_ns_hex_digit(*(Current + 2))) 919 || is_ns_word_char(*Current) 920 || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]") 921 != StringRef::npos) { 922 ++Current; 923 ++Column; 924 } else 925 break; 926 } 927 return StringRef(Start, Current - Start); 928 } 929 930 bool Scanner::consume(uint32_t Expected) { 931 if (Expected >= 0x80) 932 report_fatal_error("Not dealing with this yet"); 933 if (Current == End) 934 return false; 935 if (uint8_t(*Current) >= 0x80) 936 report_fatal_error("Not dealing with this yet"); 937 if (uint8_t(*Current) == Expected) { 938 ++Current; 939 ++Column; 940 return true; 941 } 942 return false; 943 } 944 945 void Scanner::skip(uint32_t Distance) { 946 Current += Distance; 947 Column += Distance; 948 assert(Current <= End && "Skipped past the end"); 949 } 950 951 bool Scanner::isBlankOrBreak(StringRef::iterator Position) { 952 if (Position == End) 953 return false; 954 return *Position == ' ' || *Position == '\t' || *Position == '\r' || 955 *Position == '\n'; 956 } 957 958 bool Scanner::consumeLineBreakIfPresent() { 959 auto Next = skip_b_break(Current); 960 if (Next == Current) 961 return false; 962 Column = 0; 963 ++Line; 964 Current = Next; 965 return true; 966 } 967 968 void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok 969 , unsigned AtColumn 970 , bool IsRequired) { 971 if (IsSimpleKeyAllowed) { 972 SimpleKey SK; 973 SK.Tok = Tok; 974 SK.Line = Line; 975 SK.Column = AtColumn; 976 SK.IsRequired = IsRequired; 977 SK.FlowLevel = FlowLevel; 978 SimpleKeys.push_back(SK); 979 } 980 } 981 982 void Scanner::removeStaleSimpleKeyCandidates() { 983 
for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin(); 984 i != SimpleKeys.end();) { 985 if (i->Line != Line || i->Column + 1024 < Column) { 986 if (i->IsRequired) 987 setError( "Could not find expected : for simple key" 988 , i->Tok->Range.begin()); 989 i = SimpleKeys.erase(i); 990 } else 991 ++i; 992 } 993 } 994 995 void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) { 996 if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level) 997 SimpleKeys.pop_back(); 998 } 999 1000 bool Scanner::unrollIndent(int ToColumn) { 1001 Token T; 1002 // Indentation is ignored in flow. 1003 if (FlowLevel != 0) 1004 return true; 1005 1006 while (Indent > ToColumn) { 1007 T.Kind = Token::TK_BlockEnd; 1008 T.Range = StringRef(Current, 1); 1009 TokenQueue.push_back(T); 1010 Indent = Indents.pop_back_val(); 1011 } 1012 1013 return true; 1014 } 1015 1016 bool Scanner::rollIndent( int ToColumn 1017 , Token::TokenKind Kind 1018 , TokenQueueT::iterator InsertPoint) { 1019 if (FlowLevel) 1020 return true; 1021 if (Indent < ToColumn) { 1022 Indents.push_back(Indent); 1023 Indent = ToColumn; 1024 1025 Token T; 1026 T.Kind = Kind; 1027 T.Range = StringRef(Current, 0); 1028 TokenQueue.insert(InsertPoint, T); 1029 } 1030 return true; 1031 } 1032 1033 void Scanner::skipComment() { 1034 if (*Current != '#') 1035 return; 1036 while (true) { 1037 // This may skip more than one byte, thus Column is only incremented 1038 // for code points. 1039 StringRef::iterator I = skip_nb_char(Current); 1040 if (I == Current) 1041 break; 1042 Current = I; 1043 ++Column; 1044 } 1045 } 1046 1047 void Scanner::scanToNextToken() { 1048 while (true) { 1049 while (*Current == ' ' || *Current == '\t') { 1050 skip(1); 1051 } 1052 1053 skipComment(); 1054 1055 // Skip EOL. 1056 StringRef::iterator i = skip_b_break(Current); 1057 if (i == Current) 1058 break; 1059 Current = i; 1060 ++Line; 1061 Column = 0; 1062 // New lines may start a simple key. 
1063 if (!FlowLevel) 1064 IsSimpleKeyAllowed = true; 1065 } 1066 } 1067 1068 bool Scanner::scanStreamStart() { 1069 IsStartOfStream = false; 1070 1071 EncodingInfo EI = getUnicodeEncoding(currentInput()); 1072 1073 Token T; 1074 T.Kind = Token::TK_StreamStart; 1075 T.Range = StringRef(Current, EI.second); 1076 TokenQueue.push_back(T); 1077 Current += EI.second; 1078 return true; 1079 } 1080 1081 bool Scanner::scanStreamEnd() { 1082 // Force an ending new line if one isn't present. 1083 if (Column != 0) { 1084 Column = 0; 1085 ++Line; 1086 } 1087 1088 unrollIndent(-1); 1089 SimpleKeys.clear(); 1090 IsSimpleKeyAllowed = false; 1091 1092 Token T; 1093 T.Kind = Token::TK_StreamEnd; 1094 T.Range = StringRef(Current, 0); 1095 TokenQueue.push_back(T); 1096 return true; 1097 } 1098 1099 bool Scanner::scanDirective() { 1100 // Reset the indentation level. 1101 unrollIndent(-1); 1102 SimpleKeys.clear(); 1103 IsSimpleKeyAllowed = false; 1104 1105 StringRef::iterator Start = Current; 1106 consume('%'); 1107 StringRef::iterator NameStart = Current; 1108 Current = skip_while(&Scanner::skip_ns_char, Current); 1109 StringRef Name(NameStart, Current - NameStart); 1110 Current = skip_while(&Scanner::skip_s_white, Current); 1111 1112 Token T; 1113 if (Name == "YAML") { 1114 Current = skip_while(&Scanner::skip_ns_char, Current); 1115 T.Kind = Token::TK_VersionDirective; 1116 T.Range = StringRef(Start, Current - Start); 1117 TokenQueue.push_back(T); 1118 return true; 1119 } else if(Name == "TAG") { 1120 Current = skip_while(&Scanner::skip_ns_char, Current); 1121 Current = skip_while(&Scanner::skip_s_white, Current); 1122 Current = skip_while(&Scanner::skip_ns_char, Current); 1123 T.Kind = Token::TK_TagDirective; 1124 T.Range = StringRef(Start, Current - Start); 1125 TokenQueue.push_back(T); 1126 return true; 1127 } 1128 return false; 1129 } 1130 1131 bool Scanner::scanDocumentIndicator(bool IsStart) { 1132 unrollIndent(-1); 1133 SimpleKeys.clear(); 1134 IsSimpleKeyAllowed = false; 1135 
1136 Token T; 1137 T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd; 1138 T.Range = StringRef(Current, 3); 1139 skip(3); 1140 TokenQueue.push_back(T); 1141 return true; 1142 } 1143 1144 bool Scanner::scanFlowCollectionStart(bool IsSequence) { 1145 Token T; 1146 T.Kind = IsSequence ? Token::TK_FlowSequenceStart 1147 : Token::TK_FlowMappingStart; 1148 T.Range = StringRef(Current, 1); 1149 skip(1); 1150 TokenQueue.push_back(T); 1151 1152 // [ and { may begin a simple key. 1153 saveSimpleKeyCandidate(--TokenQueue.end(), Column - 1, false); 1154 1155 // And may also be followed by a simple key. 1156 IsSimpleKeyAllowed = true; 1157 ++FlowLevel; 1158 return true; 1159 } 1160 1161 bool Scanner::scanFlowCollectionEnd(bool IsSequence) { 1162 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1163 IsSimpleKeyAllowed = false; 1164 Token T; 1165 T.Kind = IsSequence ? Token::TK_FlowSequenceEnd 1166 : Token::TK_FlowMappingEnd; 1167 T.Range = StringRef(Current, 1); 1168 skip(1); 1169 TokenQueue.push_back(T); 1170 if (FlowLevel) 1171 --FlowLevel; 1172 return true; 1173 } 1174 1175 bool Scanner::scanFlowEntry() { 1176 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1177 IsSimpleKeyAllowed = true; 1178 Token T; 1179 T.Kind = Token::TK_FlowEntry; 1180 T.Range = StringRef(Current, 1); 1181 skip(1); 1182 TokenQueue.push_back(T); 1183 return true; 1184 } 1185 1186 bool Scanner::scanBlockEntry() { 1187 rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end()); 1188 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1189 IsSimpleKeyAllowed = true; 1190 Token T; 1191 T.Kind = Token::TK_BlockEntry; 1192 T.Range = StringRef(Current, 1); 1193 skip(1); 1194 TokenQueue.push_back(T); 1195 return true; 1196 } 1197 1198 bool Scanner::scanKey() { 1199 if (!FlowLevel) 1200 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1201 1202 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1203 IsSimpleKeyAllowed = !FlowLevel; 1204 1205 Token T; 1206 T.Kind = 
Token::TK_Key; 1207 T.Range = StringRef(Current, 1); 1208 skip(1); 1209 TokenQueue.push_back(T); 1210 return true; 1211 } 1212 1213 bool Scanner::scanValue() { 1214 // If the previous token could have been a simple key, insert the key token 1215 // into the token queue. 1216 if (!SimpleKeys.empty()) { 1217 SimpleKey SK = SimpleKeys.pop_back_val(); 1218 Token T; 1219 T.Kind = Token::TK_Key; 1220 T.Range = SK.Tok->Range; 1221 TokenQueueT::iterator i, e; 1222 for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) { 1223 if (i == SK.Tok) 1224 break; 1225 } 1226 assert(i != e && "SimpleKey not in token queue!"); 1227 i = TokenQueue.insert(i, T); 1228 1229 // We may also need to add a Block-Mapping-Start token. 1230 rollIndent(SK.Column, Token::TK_BlockMappingStart, i); 1231 1232 IsSimpleKeyAllowed = false; 1233 } else { 1234 if (!FlowLevel) 1235 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1236 IsSimpleKeyAllowed = !FlowLevel; 1237 } 1238 1239 Token T; 1240 T.Kind = Token::TK_Value; 1241 T.Range = StringRef(Current, 1); 1242 skip(1); 1243 TokenQueue.push_back(T); 1244 return true; 1245 } 1246 1247 // Forbidding inlining improves performance by roughly 20%. 1248 // FIXME: Remove once llvm optimizes this to the faster version without hints. 1249 LLVM_ATTRIBUTE_NOINLINE static bool 1250 wasEscaped(StringRef::iterator First, StringRef::iterator Position); 1251 1252 // Returns whether a character at 'Position' was escaped with a leading '\'. 1253 // 'First' specifies the position of the first character in the string. 1254 static bool wasEscaped(StringRef::iterator First, 1255 StringRef::iterator Position) { 1256 assert(Position - 1 >= First); 1257 StringRef::iterator I = Position - 1; 1258 // We calculate the number of consecutive '\'s before the current position 1259 // by iterating backwards through our string. 
1260 while (I >= First && *I == '\\') --I; 1261 // (Position - 1 - I) now contains the number of '\'s before the current 1262 // position. If it is odd, the character at 'Position' was escaped. 1263 return (Position - 1 - I) % 2 == 1; 1264 } 1265 1266 bool Scanner::scanFlowScalar(bool IsDoubleQuoted) { 1267 StringRef::iterator Start = Current; 1268 unsigned ColStart = Column; 1269 if (IsDoubleQuoted) { 1270 do { 1271 ++Current; 1272 while (Current != End && *Current != '"') 1273 ++Current; 1274 // Repeat until the previous character was not a '\' or was an escaped 1275 // backslash. 1276 } while ( Current != End 1277 && *(Current - 1) == '\\' 1278 && wasEscaped(Start + 1, Current)); 1279 } else { 1280 skip(1); 1281 while (true) { 1282 // Skip a ' followed by another '. 1283 if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') { 1284 skip(2); 1285 continue; 1286 } else if (*Current == '\'') 1287 break; 1288 StringRef::iterator i = skip_nb_char(Current); 1289 if (i == Current) { 1290 i = skip_b_break(Current); 1291 if (i == Current) 1292 break; 1293 Current = i; 1294 Column = 0; 1295 ++Line; 1296 } else { 1297 if (i == End) 1298 break; 1299 Current = i; 1300 ++Column; 1301 } 1302 } 1303 } 1304 1305 if (Current == End) { 1306 setError("Expected quote at end of scalar", Current); 1307 return false; 1308 } 1309 1310 skip(1); // Skip ending quote. 
1311 Token T; 1312 T.Kind = Token::TK_Scalar; 1313 T.Range = StringRef(Start, Current - Start); 1314 TokenQueue.push_back(T); 1315 1316 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1317 1318 IsSimpleKeyAllowed = false; 1319 1320 return true; 1321 } 1322 1323 bool Scanner::scanPlainScalar() { 1324 StringRef::iterator Start = Current; 1325 unsigned ColStart = Column; 1326 unsigned LeadingBlanks = 0; 1327 assert(Indent >= -1 && "Indent must be >= -1 !"); 1328 unsigned indent = static_cast<unsigned>(Indent + 1); 1329 while (true) { 1330 if (*Current == '#') 1331 break; 1332 1333 while (!isBlankOrBreak(Current)) { 1334 if ( FlowLevel && *Current == ':' 1335 && !(isBlankOrBreak(Current + 1) || *(Current + 1) == ',')) { 1336 setError("Found unexpected ':' while scanning a plain scalar", Current); 1337 return false; 1338 } 1339 1340 // Check for the end of the plain scalar. 1341 if ( (*Current == ':' && isBlankOrBreak(Current + 1)) 1342 || ( FlowLevel 1343 && (StringRef(Current, 1).find_first_of(",:?[]{}") 1344 != StringRef::npos))) 1345 break; 1346 1347 StringRef::iterator i = skip_nb_char(Current); 1348 if (i == Current) 1349 break; 1350 Current = i; 1351 ++Column; 1352 } 1353 1354 // Are we at the end? 1355 if (!isBlankOrBreak(Current)) 1356 break; 1357 1358 // Eat blanks. 
1359 StringRef::iterator Tmp = Current; 1360 while (isBlankOrBreak(Tmp)) { 1361 StringRef::iterator i = skip_s_white(Tmp); 1362 if (i != Tmp) { 1363 if (LeadingBlanks && (Column < indent) && *Tmp == '\t') { 1364 setError("Found invalid tab character in indentation", Tmp); 1365 return false; 1366 } 1367 Tmp = i; 1368 ++Column; 1369 } else { 1370 i = skip_b_break(Tmp); 1371 if (!LeadingBlanks) 1372 LeadingBlanks = 1; 1373 Tmp = i; 1374 Column = 0; 1375 ++Line; 1376 } 1377 } 1378 1379 if (!FlowLevel && Column < indent) 1380 break; 1381 1382 Current = Tmp; 1383 } 1384 if (Start == Current) { 1385 setError("Got empty plain scalar", Start); 1386 return false; 1387 } 1388 Token T; 1389 T.Kind = Token::TK_Scalar; 1390 T.Range = StringRef(Start, Current - Start); 1391 TokenQueue.push_back(T); 1392 1393 // Plain scalars can be simple keys. 1394 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1395 1396 IsSimpleKeyAllowed = false; 1397 1398 return true; 1399 } 1400 1401 bool Scanner::scanAliasOrAnchor(bool IsAlias) { 1402 StringRef::iterator Start = Current; 1403 unsigned ColStart = Column; 1404 skip(1); 1405 while(true) { 1406 if ( *Current == '[' || *Current == ']' 1407 || *Current == '{' || *Current == '}' 1408 || *Current == ',' 1409 || *Current == ':') 1410 break; 1411 StringRef::iterator i = skip_ns_char(Current); 1412 if (i == Current) 1413 break; 1414 Current = i; 1415 ++Column; 1416 } 1417 1418 if (Start == Current) { 1419 setError("Got empty alias or anchor", Start); 1420 return false; 1421 } 1422 1423 Token T; 1424 T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor; 1425 T.Range = StringRef(Start, Current - Start); 1426 TokenQueue.push_back(T); 1427 1428 // Alias and anchors can be simple keys. 
1429 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1430 1431 IsSimpleKeyAllowed = false; 1432 1433 return true; 1434 } 1435 1436 char Scanner::scanBlockChompingIndicator() { 1437 char Indicator = ' '; 1438 if (Current != End && (*Current == '+' || *Current == '-')) { 1439 Indicator = *Current; 1440 skip(1); 1441 } 1442 return Indicator; 1443 } 1444 1445 /// Get the number of line breaks after chomping. 1446 /// 1447 /// Return the number of trailing line breaks to emit, depending on 1448 /// \p ChompingIndicator. 1449 static unsigned getChompedLineBreaks(char ChompingIndicator, 1450 unsigned LineBreaks, StringRef Str) { 1451 if (ChompingIndicator == '-') // Strip all line breaks. 1452 return 0; 1453 if (ChompingIndicator == '+') // Keep all line breaks. 1454 return LineBreaks; 1455 // Clip trailing lines. 1456 return Str.empty() ? 0 : 1; 1457 } 1458 1459 unsigned Scanner::scanBlockIndentationIndicator() { 1460 unsigned Indent = 0; 1461 if (Current != End && (*Current >= '1' && *Current <= '9')) { 1462 Indent = unsigned(*Current - '0'); 1463 skip(1); 1464 } 1465 return Indent; 1466 } 1467 1468 bool Scanner::scanBlockScalarHeader(char &ChompingIndicator, 1469 unsigned &IndentIndicator, bool &IsDone) { 1470 auto Start = Current; 1471 1472 ChompingIndicator = scanBlockChompingIndicator(); 1473 IndentIndicator = scanBlockIndentationIndicator(); 1474 // Check for the chomping indicator once again. 1475 if (ChompingIndicator == ' ') 1476 ChompingIndicator = scanBlockChompingIndicator(); 1477 Current = skip_while(&Scanner::skip_s_white, Current); 1478 skipComment(); 1479 1480 if (Current == End) { // EOF, we have an empty scalar. 
1481 Token T; 1482 T.Kind = Token::TK_BlockScalar; 1483 T.Range = StringRef(Start, Current - Start); 1484 TokenQueue.push_back(T); 1485 IsDone = true; 1486 return true; 1487 } 1488 1489 if (!consumeLineBreakIfPresent()) { 1490 setError("Expected a line break after block scalar header", Current); 1491 return false; 1492 } 1493 return true; 1494 } 1495 1496 bool Scanner::findBlockScalarIndent(unsigned &BlockIndent, 1497 unsigned BlockExitIndent, 1498 unsigned &LineBreaks, bool &IsDone) { 1499 unsigned MaxAllSpaceLineCharacters = 0; 1500 StringRef::iterator LongestAllSpaceLine; 1501 1502 while (true) { 1503 advanceWhile(&Scanner::skip_s_space); 1504 if (skip_nb_char(Current) != Current) { 1505 // This line isn't empty, so try and find the indentation. 1506 if (Column <= BlockExitIndent) { // End of the block literal. 1507 IsDone = true; 1508 return true; 1509 } 1510 // We found the block's indentation. 1511 BlockIndent = Column; 1512 if (MaxAllSpaceLineCharacters > BlockIndent) { 1513 setError( 1514 "Leading all-spaces line must be smaller than the block indent", 1515 LongestAllSpaceLine); 1516 return false; 1517 } 1518 return true; 1519 } 1520 if (skip_b_break(Current) != Current && 1521 Column > MaxAllSpaceLineCharacters) { 1522 // Record the longest all-space line in case it's longer than the 1523 // discovered block indent. 1524 MaxAllSpaceLineCharacters = Column; 1525 LongestAllSpaceLine = Current; 1526 } 1527 1528 // Check for EOF. 1529 if (Current == End) { 1530 IsDone = true; 1531 return true; 1532 } 1533 1534 if (!consumeLineBreakIfPresent()) { 1535 IsDone = true; 1536 return true; 1537 } 1538 ++LineBreaks; 1539 } 1540 return true; 1541 } 1542 1543 bool Scanner::scanBlockScalarIndent(unsigned BlockIndent, 1544 unsigned BlockExitIndent, bool &IsDone) { 1545 // Skip the indentation. 
1546 while (Column < BlockIndent) { 1547 auto I = skip_s_space(Current); 1548 if (I == Current) 1549 break; 1550 Current = I; 1551 ++Column; 1552 } 1553 1554 if (skip_nb_char(Current) == Current) 1555 return true; 1556 1557 if (Column <= BlockExitIndent) { // End of the block literal. 1558 IsDone = true; 1559 return true; 1560 } 1561 1562 if (Column < BlockIndent) { 1563 if (Current != End && *Current == '#') { // Trailing comment. 1564 IsDone = true; 1565 return true; 1566 } 1567 setError("A text line is less indented than the block scalar", Current); 1568 return false; 1569 } 1570 return true; // A normal text line. 1571 } 1572 1573 bool Scanner::scanBlockScalar(bool IsLiteral) { 1574 // Eat '|' or '>' 1575 assert(*Current == '|' || *Current == '>'); 1576 skip(1); 1577 1578 char ChompingIndicator; 1579 unsigned BlockIndent; 1580 bool IsDone = false; 1581 if (!scanBlockScalarHeader(ChompingIndicator, BlockIndent, IsDone)) 1582 return false; 1583 if (IsDone) 1584 return true; 1585 1586 auto Start = Current; 1587 unsigned BlockExitIndent = Indent < 0 ? 0 : (unsigned)Indent; 1588 unsigned LineBreaks = 0; 1589 if (BlockIndent == 0) { 1590 if (!findBlockScalarIndent(BlockIndent, BlockExitIndent, LineBreaks, 1591 IsDone)) 1592 return false; 1593 } 1594 1595 // Scan the block's scalars body. 1596 SmallString<256> Str; 1597 while (!IsDone) { 1598 if (!scanBlockScalarIndent(BlockIndent, BlockExitIndent, IsDone)) 1599 return false; 1600 if (IsDone) 1601 break; 1602 1603 // Parse the current line. 1604 auto LineStart = Current; 1605 advanceWhile(&Scanner::skip_nb_char); 1606 if (LineStart != Current) { 1607 Str.append(LineBreaks, '\n'); 1608 Str.append(StringRef(LineStart, Current - LineStart)); 1609 LineBreaks = 0; 1610 } 1611 1612 // Check for EOF. 
1613 if (Current == End) 1614 break; 1615 1616 if (!consumeLineBreakIfPresent()) 1617 break; 1618 ++LineBreaks; 1619 } 1620 1621 if (Current == End && !LineBreaks) 1622 // Ensure that there is at least one line break before the end of file. 1623 LineBreaks = 1; 1624 Str.append(getChompedLineBreaks(ChompingIndicator, LineBreaks, Str), '\n'); 1625 1626 // New lines may start a simple key. 1627 if (!FlowLevel) 1628 IsSimpleKeyAllowed = true; 1629 1630 Token T; 1631 T.Kind = Token::TK_BlockScalar; 1632 T.Range = StringRef(Start, Current - Start); 1633 T.Value = Str.str().str(); 1634 TokenQueue.push_back(T); 1635 return true; 1636 } 1637 1638 bool Scanner::scanTag() { 1639 StringRef::iterator Start = Current; 1640 unsigned ColStart = Column; 1641 skip(1); // Eat !. 1642 if (Current == End || isBlankOrBreak(Current)); // An empty tag. 1643 else if (*Current == '<') { 1644 skip(1); 1645 scan_ns_uri_char(); 1646 if (!consume('>')) 1647 return false; 1648 } else { 1649 // FIXME: Actually parse the c-ns-shorthand-tag rule. 1650 Current = skip_while(&Scanner::skip_ns_char, Current); 1651 } 1652 1653 Token T; 1654 T.Kind = Token::TK_Tag; 1655 T.Range = StringRef(Start, Current - Start); 1656 TokenQueue.push_back(T); 1657 1658 // Tags can be simple keys. 
1659 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1660 1661 IsSimpleKeyAllowed = false; 1662 1663 return true; 1664 } 1665 1666 bool Scanner::fetchMoreTokens() { 1667 if (IsStartOfStream) 1668 return scanStreamStart(); 1669 1670 scanToNextToken(); 1671 1672 if (Current == End) 1673 return scanStreamEnd(); 1674 1675 removeStaleSimpleKeyCandidates(); 1676 1677 unrollIndent(Column); 1678 1679 if (Column == 0 && *Current == '%') 1680 return scanDirective(); 1681 1682 if (Column == 0 && Current + 4 <= End 1683 && *Current == '-' 1684 && *(Current + 1) == '-' 1685 && *(Current + 2) == '-' 1686 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1687 return scanDocumentIndicator(true); 1688 1689 if (Column == 0 && Current + 4 <= End 1690 && *Current == '.' 1691 && *(Current + 1) == '.' 1692 && *(Current + 2) == '.' 1693 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1694 return scanDocumentIndicator(false); 1695 1696 if (*Current == '[') 1697 return scanFlowCollectionStart(true); 1698 1699 if (*Current == '{') 1700 return scanFlowCollectionStart(false); 1701 1702 if (*Current == ']') 1703 return scanFlowCollectionEnd(true); 1704 1705 if (*Current == '}') 1706 return scanFlowCollectionEnd(false); 1707 1708 if (*Current == ',') 1709 return scanFlowEntry(); 1710 1711 if (*Current == '-' && isBlankOrBreak(Current + 1)) 1712 return scanBlockEntry(); 1713 1714 if (*Current == '?' 
&& (FlowLevel || isBlankOrBreak(Current + 1))) 1715 return scanKey(); 1716 1717 if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1))) 1718 return scanValue(); 1719 1720 if (*Current == '*') 1721 return scanAliasOrAnchor(true); 1722 1723 if (*Current == '&') 1724 return scanAliasOrAnchor(false); 1725 1726 if (*Current == '!') 1727 return scanTag(); 1728 1729 if (*Current == '|' && !FlowLevel) 1730 return scanBlockScalar(true); 1731 1732 if (*Current == '>' && !FlowLevel) 1733 return scanBlockScalar(false); 1734 1735 if (*Current == '\'') 1736 return scanFlowScalar(false); 1737 1738 if (*Current == '"') 1739 return scanFlowScalar(true); 1740 1741 // Get a plain scalar. 1742 StringRef FirstChar(Current, 1); 1743 if (!(isBlankOrBreak(Current) 1744 || FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos) 1745 || (*Current == '-' && !isBlankOrBreak(Current + 1)) 1746 || (!FlowLevel && (*Current == '?' || *Current == ':') 1747 && isBlankOrBreak(Current + 1)) 1748 || (!FlowLevel && *Current == ':' 1749 && Current + 2 < End 1750 && *(Current + 1) == ':' 1751 && !isBlankOrBreak(Current + 2))) 1752 return scanPlainScalar(); 1753 1754 setError("Unrecognized character while tokenizing."); 1755 return false; 1756 } 1757 1758 Stream::Stream(StringRef Input, SourceMgr &SM, bool ShowColors) 1759 : scanner(new Scanner(Input, SM, ShowColors)), CurrentDoc() {} 1760 1761 Stream::Stream(MemoryBufferRef InputBuffer, SourceMgr &SM, bool ShowColors) 1762 : scanner(new Scanner(InputBuffer, SM, ShowColors)), CurrentDoc() {} 1763 1764 Stream::~Stream() {} 1765 1766 bool Stream::failed() { return scanner->failed(); } 1767 1768 void Stream::printError(Node *N, const Twine &Msg) { 1769 scanner->printError( N->getSourceRange().Start 1770 , SourceMgr::DK_Error 1771 , Msg 1772 , N->getSourceRange()); 1773 } 1774 1775 document_iterator Stream::begin() { 1776 if (CurrentDoc) 1777 report_fatal_error("Can only iterate over the stream once"); 1778 1779 // Skip Stream-Start. 
1780 scanner->getNext(); 1781 1782 CurrentDoc.reset(new Document(*this)); 1783 return document_iterator(CurrentDoc); 1784 } 1785 1786 document_iterator Stream::end() { 1787 return document_iterator(); 1788 } 1789 1790 void Stream::skip() { 1791 for (document_iterator i = begin(), e = end(); i != e; ++i) 1792 i->skip(); 1793 } 1794 1795 Node::Node(unsigned int Type, std::unique_ptr<Document> &D, StringRef A, 1796 StringRef T) 1797 : Doc(D), TypeID(Type), Anchor(A), Tag(T) { 1798 SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin()); 1799 SourceRange = SMRange(Start, Start); 1800 } 1801 1802 std::string Node::getVerbatimTag() const { 1803 StringRef Raw = getRawTag(); 1804 if (!Raw.empty() && Raw != "!") { 1805 std::string Ret; 1806 if (Raw.find_last_of('!') == 0) { 1807 Ret = Doc->getTagMap().find("!")->second; 1808 Ret += Raw.substr(1); 1809 return Ret; 1810 } else if (Raw.startswith("!!")) { 1811 Ret = Doc->getTagMap().find("!!")->second; 1812 Ret += Raw.substr(2); 1813 return Ret; 1814 } else { 1815 StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1); 1816 std::map<StringRef, StringRef>::const_iterator It = 1817 Doc->getTagMap().find(TagHandle); 1818 if (It != Doc->getTagMap().end()) 1819 Ret = It->second; 1820 else { 1821 Token T; 1822 T.Kind = Token::TK_Tag; 1823 T.Range = TagHandle; 1824 setError(Twine("Unknown tag handle ") + TagHandle, T); 1825 } 1826 Ret += Raw.substr(Raw.find_last_of('!') + 1); 1827 return Ret; 1828 } 1829 } 1830 1831 switch (getType()) { 1832 case NK_Null: 1833 return "tag:yaml.org,2002:null"; 1834 case NK_Scalar: 1835 case NK_BlockScalar: 1836 // TODO: Tag resolution. 
1837 return "tag:yaml.org,2002:str"; 1838 case NK_Mapping: 1839 return "tag:yaml.org,2002:map"; 1840 case NK_Sequence: 1841 return "tag:yaml.org,2002:seq"; 1842 } 1843 1844 return ""; 1845 } 1846 1847 Token &Node::peekNext() { 1848 return Doc->peekNext(); 1849 } 1850 1851 Token Node::getNext() { 1852 return Doc->getNext(); 1853 } 1854 1855 Node *Node::parseBlockNode() { 1856 return Doc->parseBlockNode(); 1857 } 1858 1859 BumpPtrAllocator &Node::getAllocator() { 1860 return Doc->NodeAllocator; 1861 } 1862 1863 void Node::setError(const Twine &Msg, Token &Tok) const { 1864 Doc->setError(Msg, Tok); 1865 } 1866 1867 bool Node::failed() const { 1868 return Doc->failed(); 1869 } 1870 1871 1872 1873 StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const { 1874 // TODO: Handle newlines properly. We need to remove leading whitespace. 1875 if (Value[0] == '"') { // Double quoted. 1876 // Pull off the leading and trailing "s. 1877 StringRef UnquotedValue = Value.substr(1, Value.size() - 2); 1878 // Search for characters that would require unescaping the value. 1879 StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n"); 1880 if (i != StringRef::npos) 1881 return unescapeDoubleQuoted(UnquotedValue, i, Storage); 1882 return UnquotedValue; 1883 } else if (Value[0] == '\'') { // Single quoted. 1884 // Pull off the leading and trailing 's. 1885 StringRef UnquotedValue = Value.substr(1, Value.size() - 2); 1886 StringRef::size_type i = UnquotedValue.find('\''); 1887 if (i != StringRef::npos) { 1888 // We're going to need Storage. 
1889 Storage.clear(); 1890 Storage.reserve(UnquotedValue.size()); 1891 for (; i != StringRef::npos; i = UnquotedValue.find('\'')) { 1892 StringRef Valid(UnquotedValue.begin(), i); 1893 Storage.insert(Storage.end(), Valid.begin(), Valid.end()); 1894 Storage.push_back('\''); 1895 UnquotedValue = UnquotedValue.substr(i + 2); 1896 } 1897 Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); 1898 return StringRef(Storage.begin(), Storage.size()); 1899 } 1900 return UnquotedValue; 1901 } 1902 // Plain or block. 1903 return Value.rtrim(' '); 1904 } 1905 1906 StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue 1907 , StringRef::size_type i 1908 , SmallVectorImpl<char> &Storage) 1909 const { 1910 // Use Storage to build proper value. 1911 Storage.clear(); 1912 Storage.reserve(UnquotedValue.size()); 1913 for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) { 1914 // Insert all previous chars into Storage. 1915 StringRef Valid(UnquotedValue.begin(), i); 1916 Storage.insert(Storage.end(), Valid.begin(), Valid.end()); 1917 // Chop off inserted chars. 1918 UnquotedValue = UnquotedValue.substr(i); 1919 1920 assert(!UnquotedValue.empty() && "Can't be empty!"); 1921 1922 // Parse escape or line break. 1923 switch (UnquotedValue[0]) { 1924 case '\r': 1925 case '\n': 1926 Storage.push_back('\n'); 1927 if ( UnquotedValue.size() > 1 1928 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 1929 UnquotedValue = UnquotedValue.substr(1); 1930 UnquotedValue = UnquotedValue.substr(1); 1931 break; 1932 default: 1933 if (UnquotedValue.size() == 1) 1934 // TODO: Report error. 1935 break; 1936 UnquotedValue = UnquotedValue.substr(1); 1937 switch (UnquotedValue[0]) { 1938 default: { 1939 Token T; 1940 T.Range = StringRef(UnquotedValue.begin(), 1); 1941 setError("Unrecognized escape code!", T); 1942 return ""; 1943 } 1944 case '\r': 1945 case '\n': 1946 // Remove the new line. 
1947 if ( UnquotedValue.size() > 1 1948 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 1949 UnquotedValue = UnquotedValue.substr(1); 1950 // If this was just a single byte newline, it will get skipped 1951 // below. 1952 break; 1953 case '0': 1954 Storage.push_back(0x00); 1955 break; 1956 case 'a': 1957 Storage.push_back(0x07); 1958 break; 1959 case 'b': 1960 Storage.push_back(0x08); 1961 break; 1962 case 't': 1963 case 0x09: 1964 Storage.push_back(0x09); 1965 break; 1966 case 'n': 1967 Storage.push_back(0x0A); 1968 break; 1969 case 'v': 1970 Storage.push_back(0x0B); 1971 break; 1972 case 'f': 1973 Storage.push_back(0x0C); 1974 break; 1975 case 'r': 1976 Storage.push_back(0x0D); 1977 break; 1978 case 'e': 1979 Storage.push_back(0x1B); 1980 break; 1981 case ' ': 1982 Storage.push_back(0x20); 1983 break; 1984 case '"': 1985 Storage.push_back(0x22); 1986 break; 1987 case '/': 1988 Storage.push_back(0x2F); 1989 break; 1990 case '\\': 1991 Storage.push_back(0x5C); 1992 break; 1993 case 'N': 1994 encodeUTF8(0x85, Storage); 1995 break; 1996 case '_': 1997 encodeUTF8(0xA0, Storage); 1998 break; 1999 case 'L': 2000 encodeUTF8(0x2028, Storage); 2001 break; 2002 case 'P': 2003 encodeUTF8(0x2029, Storage); 2004 break; 2005 case 'x': { 2006 if (UnquotedValue.size() < 3) 2007 // TODO: Report error. 2008 break; 2009 unsigned int UnicodeScalarValue; 2010 if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue)) 2011 // TODO: Report error. 2012 UnicodeScalarValue = 0xFFFD; 2013 encodeUTF8(UnicodeScalarValue, Storage); 2014 UnquotedValue = UnquotedValue.substr(2); 2015 break; 2016 } 2017 case 'u': { 2018 if (UnquotedValue.size() < 5) 2019 // TODO: Report error. 2020 break; 2021 unsigned int UnicodeScalarValue; 2022 if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue)) 2023 // TODO: Report error. 
2024 UnicodeScalarValue = 0xFFFD; 2025 encodeUTF8(UnicodeScalarValue, Storage); 2026 UnquotedValue = UnquotedValue.substr(4); 2027 break; 2028 } 2029 case 'U': { 2030 if (UnquotedValue.size() < 9) 2031 // TODO: Report error. 2032 break; 2033 unsigned int UnicodeScalarValue; 2034 if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue)) 2035 // TODO: Report error. 2036 UnicodeScalarValue = 0xFFFD; 2037 encodeUTF8(UnicodeScalarValue, Storage); 2038 UnquotedValue = UnquotedValue.substr(8); 2039 break; 2040 } 2041 } 2042 UnquotedValue = UnquotedValue.substr(1); 2043 } 2044 } 2045 Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); 2046 return StringRef(Storage.begin(), Storage.size()); 2047 } 2048 2049 Node *KeyValueNode::getKey() { 2050 if (Key) 2051 return Key; 2052 // Handle implicit null keys. 2053 { 2054 Token &t = peekNext(); 2055 if ( t.Kind == Token::TK_BlockEnd 2056 || t.Kind == Token::TK_Value 2057 || t.Kind == Token::TK_Error) { 2058 return Key = new (getAllocator()) NullNode(Doc); 2059 } 2060 if (t.Kind == Token::TK_Key) 2061 getNext(); // skip TK_Key. 2062 } 2063 2064 // Handle explicit null keys. 2065 Token &t = peekNext(); 2066 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) { 2067 return Key = new (getAllocator()) NullNode(Doc); 2068 } 2069 2070 // We've got a normal key. 2071 return Key = parseBlockNode(); 2072 } 2073 2074 Node *KeyValueNode::getValue() { 2075 if (Value) 2076 return Value; 2077 getKey()->skip(); 2078 if (failed()) 2079 return Value = new (getAllocator()) NullNode(Doc); 2080 2081 // Handle implicit null values. 
{
    // The reference returned by peekNext() is only used inside this scope,
    // before getNext() consumes the token.
    Token &t = peekNext();
    if (   t.Kind == Token::TK_BlockEnd
        || t.Kind == Token::TK_FlowMappingEnd
        || t.Kind == Token::TK_Key
        || t.Kind == Token::TK_FlowEntry
        || t.Kind == Token::TK_Error) {
      return Value = new (getAllocator()) NullNode(Doc);
    }

    if (t.Kind != Token::TK_Value) {
      setError("Unexpected token in Key Value.", t);
      return Value = new (getAllocator()) NullNode(Doc);
    }
    getNext(); // skip TK_Value.
  }

  // Handle explicit null values.
  Token &t = peekNext();
  if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) {
    return Value = new (getAllocator()) NullNode(Doc);
  }

  // We got a normal value.
  return Value = parseBlockNode();
}

/// Advances to the next entry of the mapping: skips the current entry, then
/// either creates the next KeyValueNode or marks the end of the mapping.
void MappingNode::increment() {
  if (failed()) {
    IsAtEnd = true;
    CurrentEntry = nullptr;
    return;
  }
  if (CurrentEntry) {
    CurrentEntry->skip();
    // An inline mapping ends after its first entry.
    if (Type == MT_Inline) {
      IsAtEnd = true;
      CurrentEntry = nullptr;
      return;
    }
  }
  Token T = peekNext();
  if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) {
    // KeyValueNode eats the TK_Key. That way it can detect null keys.
    CurrentEntry = new (getAllocator()) KeyValueNode(Doc);
  } else if (Type == MT_Block) {
    switch (T.Kind) {
    case Token::TK_BlockEnd:
      getNext();
      IsAtEnd = true;
      CurrentEntry = nullptr;
      break;
    default:
      setError("Unexpected token. Expected Key or Block End", T);
      // Falls through: after reporting, end the iteration like TK_Error.
    case Token::TK_Error:
      IsAtEnd = true;
      CurrentEntry = nullptr;
    }
  } else {
    switch (T.Kind) {
    case Token::TK_FlowEntry:
      // Eat the flow entry and recurse.
      getNext();
      return increment();
    case Token::TK_FlowMappingEnd:
      getNext();
      // Falls through: the end token has been consumed, now end the
      // iteration.
    case Token::TK_Error:
      // Set this to end iterator.
      IsAtEnd = true;
      CurrentEntry = nullptr;
      break;
    default:
      setError( "Unexpected token. Expected Key, Flow Entry, or Flow "
                "Mapping End."
              , T);
      IsAtEnd = true;
      CurrentEntry = nullptr;
    }
  }
}

/// Advances to the next entry of the sequence: skips the current entry, then
/// either parses the next one or marks the end of the sequence. The tokens
/// accepted depend on SeqType.
void SequenceNode::increment() {
  if (failed()) {
    IsAtEnd = true;
    CurrentEntry = nullptr;
    return;
  }
  if (CurrentEntry)
    CurrentEntry->skip();
  Token T = peekNext();
  if (SeqType == ST_Block) {
    switch (T.Kind) {
    case Token::TK_BlockEntry:
      getNext();
      CurrentEntry = parseBlockNode();
      if (!CurrentEntry) { // An error occurred.
        IsAtEnd = true;
        CurrentEntry = nullptr;
      }
      break;
    case Token::TK_BlockEnd:
      getNext();
      IsAtEnd = true;
      CurrentEntry = nullptr;
      break;
    default:
      setError( "Unexpected token. Expected Block Entry or Block End."
              , T);
      // Falls through: after reporting, end the iteration like TK_Error.
    case Token::TK_Error:
      IsAtEnd = true;
      CurrentEntry = nullptr;
    }
  } else if (SeqType == ST_Indentless) {
    switch (T.Kind) {
    case Token::TK_BlockEntry:
      getNext();
      CurrentEntry = parseBlockNode();
      if (!CurrentEntry) { // An error occurred.
        IsAtEnd = true;
        CurrentEntry = nullptr;
      }
      break;
    // Any other token ends the sequence; the token is not consumed and no
    // error is reported.
    default:
    case Token::TK_Error:
      IsAtEnd = true;
      CurrentEntry = nullptr;
    }
  } else if (SeqType == ST_Flow) {
    switch (T.Kind) {
    case Token::TK_FlowEntry:
      // Eat the flow entry and recurse.
      getNext();
      WasPreviousTokenFlowEntry = true;
      return increment();
    case Token::TK_FlowSequenceEnd:
      getNext();
      // Falls through: the end token has been consumed, now end the
      // iteration.
    case Token::TK_Error:
      // Set this to end iterator.
      IsAtEnd = true;
      CurrentEntry = nullptr;
      break;
    case Token::TK_StreamEnd:
    case Token::TK_DocumentEnd:
    case Token::TK_DocumentStart:
      setError("Could not find closing ]!", T);
      // Set this to end iterator.
2228 IsAtEnd = true; 2229 CurrentEntry = nullptr; 2230 break; 2231 default: 2232 if (!WasPreviousTokenFlowEntry) { 2233 setError("Expected , between entries!", T); 2234 IsAtEnd = true; 2235 CurrentEntry = nullptr; 2236 break; 2237 } 2238 // Otherwise it must be a flow entry. 2239 CurrentEntry = parseBlockNode(); 2240 if (!CurrentEntry) { 2241 IsAtEnd = true; 2242 } 2243 WasPreviousTokenFlowEntry = false; 2244 break; 2245 } 2246 } 2247 } 2248 2249 Document::Document(Stream &S) : stream(S), Root(nullptr) { 2250 // Tag maps starts with two default mappings. 2251 TagMap["!"] = "!"; 2252 TagMap["!!"] = "tag:yaml.org,2002:"; 2253 2254 if (parseDirectives()) 2255 expectToken(Token::TK_DocumentStart); 2256 Token &T = peekNext(); 2257 if (T.Kind == Token::TK_DocumentStart) 2258 getNext(); 2259 } 2260 2261 bool Document::skip() { 2262 if (stream.scanner->failed()) 2263 return false; 2264 if (!Root) 2265 getRoot(); 2266 Root->skip(); 2267 Token &T = peekNext(); 2268 if (T.Kind == Token::TK_StreamEnd) 2269 return false; 2270 if (T.Kind == Token::TK_DocumentEnd) { 2271 getNext(); 2272 return skip(); 2273 } 2274 return true; 2275 } 2276 2277 Token &Document::peekNext() { 2278 return stream.scanner->peekNext(); 2279 } 2280 2281 Token Document::getNext() { 2282 return stream.scanner->getNext(); 2283 } 2284 2285 void Document::setError(const Twine &Message, Token &Location) const { 2286 stream.scanner->setError(Message, Location.Range.begin()); 2287 } 2288 2289 bool Document::failed() const { 2290 return stream.scanner->failed(); 2291 } 2292 2293 Node *Document::parseBlockNode() { 2294 Token T = peekNext(); 2295 // Handle properties. 
2296 Token AnchorInfo; 2297 Token TagInfo; 2298 parse_property: 2299 switch (T.Kind) { 2300 case Token::TK_Alias: 2301 getNext(); 2302 return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1)); 2303 case Token::TK_Anchor: 2304 if (AnchorInfo.Kind == Token::TK_Anchor) { 2305 setError("Already encountered an anchor for this node!", T); 2306 return nullptr; 2307 } 2308 AnchorInfo = getNext(); // Consume TK_Anchor. 2309 T = peekNext(); 2310 goto parse_property; 2311 case Token::TK_Tag: 2312 if (TagInfo.Kind == Token::TK_Tag) { 2313 setError("Already encountered a tag for this node!", T); 2314 return nullptr; 2315 } 2316 TagInfo = getNext(); // Consume TK_Tag. 2317 T = peekNext(); 2318 goto parse_property; 2319 default: 2320 break; 2321 } 2322 2323 switch (T.Kind) { 2324 case Token::TK_BlockEntry: 2325 // We got an unindented BlockEntry sequence. This is not terminated with 2326 // a BlockEnd. 2327 // Don't eat the TK_BlockEntry, SequenceNode needs it. 2328 return new (NodeAllocator) SequenceNode( stream.CurrentDoc 2329 , AnchorInfo.Range.substr(1) 2330 , TagInfo.Range 2331 , SequenceNode::ST_Indentless); 2332 case Token::TK_BlockSequenceStart: 2333 getNext(); 2334 return new (NodeAllocator) 2335 SequenceNode( stream.CurrentDoc 2336 , AnchorInfo.Range.substr(1) 2337 , TagInfo.Range 2338 , SequenceNode::ST_Block); 2339 case Token::TK_BlockMappingStart: 2340 getNext(); 2341 return new (NodeAllocator) 2342 MappingNode( stream.CurrentDoc 2343 , AnchorInfo.Range.substr(1) 2344 , TagInfo.Range 2345 , MappingNode::MT_Block); 2346 case Token::TK_FlowSequenceStart: 2347 getNext(); 2348 return new (NodeAllocator) 2349 SequenceNode( stream.CurrentDoc 2350 , AnchorInfo.Range.substr(1) 2351 , TagInfo.Range 2352 , SequenceNode::ST_Flow); 2353 case Token::TK_FlowMappingStart: 2354 getNext(); 2355 return new (NodeAllocator) 2356 MappingNode( stream.CurrentDoc 2357 , AnchorInfo.Range.substr(1) 2358 , TagInfo.Range 2359 , MappingNode::MT_Flow); 2360 case 
Token::TK_Scalar: 2361 getNext(); 2362 return new (NodeAllocator) 2363 ScalarNode( stream.CurrentDoc 2364 , AnchorInfo.Range.substr(1) 2365 , TagInfo.Range 2366 , T.Range); 2367 case Token::TK_BlockScalar: { 2368 getNext(); 2369 StringRef NullTerminatedStr(T.Value.c_str(), T.Value.length() + 1); 2370 StringRef StrCopy = NullTerminatedStr.copy(NodeAllocator).drop_back(); 2371 return new (NodeAllocator) 2372 BlockScalarNode(stream.CurrentDoc, AnchorInfo.Range.substr(1), 2373 TagInfo.Range, StrCopy, T.Range); 2374 } 2375 case Token::TK_Key: 2376 // Don't eat the TK_Key, KeyValueNode expects it. 2377 return new (NodeAllocator) 2378 MappingNode( stream.CurrentDoc 2379 , AnchorInfo.Range.substr(1) 2380 , TagInfo.Range 2381 , MappingNode::MT_Inline); 2382 case Token::TK_DocumentStart: 2383 case Token::TK_DocumentEnd: 2384 case Token::TK_StreamEnd: 2385 default: 2386 // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not 2387 // !!null null. 2388 return new (NodeAllocator) NullNode(stream.CurrentDoc); 2389 case Token::TK_Error: 2390 return nullptr; 2391 } 2392 llvm_unreachable("Control flow shouldn't reach here."); 2393 return nullptr; 2394 } 2395 2396 bool Document::parseDirectives() { 2397 bool isDirective = false; 2398 while (true) { 2399 Token T = peekNext(); 2400 if (T.Kind == Token::TK_TagDirective) { 2401 parseTAGDirective(); 2402 isDirective = true; 2403 } else if (T.Kind == Token::TK_VersionDirective) { 2404 parseYAMLDirective(); 2405 isDirective = true; 2406 } else 2407 break; 2408 } 2409 return isDirective; 2410 } 2411 2412 void Document::parseYAMLDirective() { 2413 getNext(); // Eat %YAML <version> 2414 } 2415 2416 void Document::parseTAGDirective() { 2417 Token Tag = getNext(); // %TAG <handle> <prefix> 2418 StringRef T = Tag.Range; 2419 // Strip %TAG 2420 T = T.substr(T.find_first_of(" \t")).ltrim(" \t"); 2421 std::size_t HandleEnd = T.find_first_of(" \t"); 2422 StringRef TagHandle = T.substr(0, HandleEnd); 2423 StringRef TagPrefix = 
T.substr(HandleEnd).ltrim(" \t"); 2424 TagMap[TagHandle] = TagPrefix; 2425 } 2426 2427 bool Document::expectToken(int TK) { 2428 Token T = getNext(); 2429 if (T.Kind != TK) { 2430 setError("Unexpected token", T); 2431 return false; 2432 } 2433 return true; 2434 } 2435