1 //===--- YAMLParser.cpp - Simple YAML parser ------------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file implements a YAML parser. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "llvm/Support/YAMLParser.h" 15 #include "llvm/ADT/STLExtras.h" 16 #include "llvm/ADT/SmallString.h" 17 #include "llvm/ADT/SmallVector.h" 18 #include "llvm/ADT/StringExtras.h" 19 #include "llvm/ADT/Twine.h" 20 #include "llvm/ADT/ilist.h" 21 #include "llvm/ADT/ilist_node.h" 22 #include "llvm/Support/ErrorHandling.h" 23 #include "llvm/Support/MemoryBuffer.h" 24 #include "llvm/Support/SourceMgr.h" 25 #include "llvm/Support/raw_ostream.h" 26 27 using namespace llvm; 28 using namespace yaml; 29 30 enum UnicodeEncodingForm { 31 UEF_UTF32_LE, ///< UTF-32 Little Endian 32 UEF_UTF32_BE, ///< UTF-32 Big Endian 33 UEF_UTF16_LE, ///< UTF-16 Little Endian 34 UEF_UTF16_BE, ///< UTF-16 Big Endian 35 UEF_UTF8, ///< UTF-8 or ascii. 36 UEF_Unknown ///< Not a valid Unicode encoding. 37 }; 38 39 /// EncodingInfo - Holds the encoding type and length of the byte order mark if 40 /// it exists. Length is in {0, 2, 3, 4}. 41 typedef std::pair<UnicodeEncodingForm, unsigned> EncodingInfo; 42 43 /// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode 44 /// encoding form of \a Input. 45 /// 46 /// @param Input A string of length 0 or more. 47 /// @returns An EncodingInfo indicating the Unicode encoding form of the input 48 /// and how long the byte order mark is if one exists. 49 static EncodingInfo getUnicodeEncoding(StringRef Input) { 50 if (Input.size() == 0) 51 return std::make_pair(UEF_Unknown, 0); 52 53 switch (uint8_t(Input[0])) { 54 case 0x00: 55 if (Input.size() >= 4) { 56 if ( Input[1] == 0 57 && uint8_t(Input[2]) == 0xFE 58 && uint8_t(Input[3]) == 0xFF) 59 return std::make_pair(UEF_UTF32_BE, 4); 60 if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0) 61 return std::make_pair(UEF_UTF32_BE, 0); 62 } 63 64 if (Input.size() >= 2 && Input[1] != 0) 65 return std::make_pair(UEF_UTF16_BE, 0); 66 return std::make_pair(UEF_Unknown, 0); 67 case 0xFF: 68 if ( Input.size() >= 4 69 && uint8_t(Input[1]) == 0xFE 70 && Input[2] == 0 71 && Input[3] == 0) 72 return std::make_pair(UEF_UTF32_LE, 4); 73 74 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE) 75 return std::make_pair(UEF_UTF16_LE, 2); 76 return std::make_pair(UEF_Unknown, 0); 77 case 0xFE: 78 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF) 79 return std::make_pair(UEF_UTF16_BE, 2); 80 return std::make_pair(UEF_Unknown, 0); 81 case 0xEF: 82 if ( Input.size() >= 3 83 && uint8_t(Input[1]) == 0xBB 84 && uint8_t(Input[2]) == 0xBF) 85 return std::make_pair(UEF_UTF8, 3); 86 return std::make_pair(UEF_Unknown, 0); 87 } 88 89 // It could still be utf-32 or utf-16. 90 if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0) 91 return std::make_pair(UEF_UTF32_LE, 0); 92 93 if (Input.size() >= 2 && Input[1] == 0) 94 return std::make_pair(UEF_UTF16_LE, 0); 95 96 return std::make_pair(UEF_UTF8, 0); 97 } 98 99 namespace llvm { 100 namespace yaml { 101 /// Pin the vtables to this file. 102 void Node::anchor() {} 103 void NullNode::anchor() {} 104 void ScalarNode::anchor() {} 105 void BlockScalarNode::anchor() {} 106 void KeyValueNode::anchor() {} 107 void MappingNode::anchor() {} 108 void SequenceNode::anchor() {} 109 void AliasNode::anchor() {} 110 111 /// Token - A single YAML token. 112 struct Token : ilist_node<Token> { 113 enum TokenKind { 114 TK_Error, // Uninitialized token. 115 TK_StreamStart, 116 TK_StreamEnd, 117 TK_VersionDirective, 118 TK_TagDirective, 119 TK_DocumentStart, 120 TK_DocumentEnd, 121 TK_BlockEntry, 122 TK_BlockEnd, 123 TK_BlockSequenceStart, 124 TK_BlockMappingStart, 125 TK_FlowEntry, 126 TK_FlowSequenceStart, 127 TK_FlowSequenceEnd, 128 TK_FlowMappingStart, 129 TK_FlowMappingEnd, 130 TK_Key, 131 TK_Value, 132 TK_Scalar, 133 TK_BlockScalar, 134 TK_Alias, 135 TK_Anchor, 136 TK_Tag 137 } Kind; 138 139 /// A string of length 0 or more whose begin() points to the logical location 140 /// of the token in the input. 141 StringRef Range; 142 143 /// The value of a block scalar node. 144 std::string Value; 145 146 Token() : Kind(TK_Error) {} 147 }; 148 } 149 } 150 151 namespace llvm { 152 template<> 153 struct ilist_node_traits<Token> { 154 Token *createNode(const Token &V) { 155 return new (Alloc.Allocate<Token>()) Token(V); 156 } 157 static void deleteNode(Token *V) { V->~Token(); } 158 159 void addNodeToList(Token *) {} 160 void removeNodeFromList(Token *) {} 161 void transferNodesFromList(ilist_node_traits & /*SrcTraits*/, 162 ilist_iterator<Token> /*first*/, 163 ilist_iterator<Token> /*last*/) {} 164 165 BumpPtrAllocator Alloc; 166 }; 167 } 168 169 typedef ilist<Token> TokenQueueT; 170 171 namespace { 172 /// @brief This struct is used to track simple keys. 173 /// 174 /// Simple keys are handled by creating an entry in SimpleKeys for each Token 175 /// which could legally be the start of a simple key. When peekNext is called, 176 /// if the Token To be returned is referenced by a SimpleKey, we continue 177 /// tokenizing until that potential simple key has either been found to not be 178 /// a simple key (we moved on to the next line or went further than 1024 chars). 179 /// Or when we run into a Value, and then insert a Key token (and possibly 180 /// others) before the SimpleKey's Tok. 181 struct SimpleKey { 182 TokenQueueT::iterator Tok; 183 unsigned Column; 184 unsigned Line; 185 unsigned FlowLevel; 186 bool IsRequired; 187 188 bool operator ==(const SimpleKey &Other) { 189 return Tok == Other.Tok; 190 } 191 }; 192 } 193 194 /// @brief The Unicode scalar value of a UTF-8 minimal well-formed code unit 195 /// subsequence and the subsequence's length in code units (uint8_t). 196 /// A length of 0 represents an error. 197 typedef std::pair<uint32_t, unsigned> UTF8Decoded; 198 199 static UTF8Decoded decodeUTF8(StringRef Range) { 200 StringRef::iterator Position= Range.begin(); 201 StringRef::iterator End = Range.end(); 202 // 1 byte: [0x00, 0x7f] 203 // Bit pattern: 0xxxxxxx 204 if ((*Position & 0x80) == 0) { 205 return std::make_pair(*Position, 1); 206 } 207 // 2 bytes: [0x80, 0x7ff] 208 // Bit pattern: 110xxxxx 10xxxxxx 209 if (Position + 1 != End && 210 ((*Position & 0xE0) == 0xC0) && 211 ((*(Position + 1) & 0xC0) == 0x80)) { 212 uint32_t codepoint = ((*Position & 0x1F) << 6) | 213 (*(Position + 1) & 0x3F); 214 if (codepoint >= 0x80) 215 return std::make_pair(codepoint, 2); 216 } 217 // 3 bytes: [0x8000, 0xffff] 218 // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx 219 if (Position + 2 != End && 220 ((*Position & 0xF0) == 0xE0) && 221 ((*(Position + 1) & 0xC0) == 0x80) && 222 ((*(Position + 2) & 0xC0) == 0x80)) { 223 uint32_t codepoint = ((*Position & 0x0F) << 12) | 224 ((*(Position + 1) & 0x3F) << 6) | 225 (*(Position + 2) & 0x3F); 226 // Codepoints between 0xD800 and 0xDFFF are invalid, as 227 // they are high / low surrogate halves used by UTF-16. 228 if (codepoint >= 0x800 && 229 (codepoint < 0xD800 || codepoint > 0xDFFF)) 230 return std::make_pair(codepoint, 3); 231 } 232 // 4 bytes: [0x10000, 0x10FFFF] 233 // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 234 if (Position + 3 != End && 235 ((*Position & 0xF8) == 0xF0) && 236 ((*(Position + 1) & 0xC0) == 0x80) && 237 ((*(Position + 2) & 0xC0) == 0x80) && 238 ((*(Position + 3) & 0xC0) == 0x80)) { 239 uint32_t codepoint = ((*Position & 0x07) << 18) | 240 ((*(Position + 1) & 0x3F) << 12) | 241 ((*(Position + 2) & 0x3F) << 6) | 242 (*(Position + 3) & 0x3F); 243 if (codepoint >= 0x10000 && codepoint <= 0x10FFFF) 244 return std::make_pair(codepoint, 4); 245 } 246 return std::make_pair(0, 0); 247 } 248 249 namespace llvm { 250 namespace yaml { 251 /// @brief Scans YAML tokens from a MemoryBuffer. 252 class Scanner { 253 public: 254 Scanner(StringRef Input, SourceMgr &SM, bool ShowColors = true); 255 Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors = true); 256 257 /// @brief Parse the next token and return it without popping it. 258 Token &peekNext(); 259 260 /// @brief Parse the next token and pop it from the queue. 261 Token getNext(); 262 263 void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message, 264 ArrayRef<SMRange> Ranges = None) { 265 SM.PrintMessage(Loc, Kind, Message, Ranges, /* FixIts= */ None, ShowColors); 266 } 267 268 void setError(const Twine &Message, StringRef::iterator Position) { 269 if (Current >= End) 270 Current = End - 1; 271 272 // Don't print out more errors after the first one we encounter. The rest 273 // are just the result of the first, and have no meaning. 274 if (!Failed) 275 printError(SMLoc::getFromPointer(Current), SourceMgr::DK_Error, Message); 276 Failed = true; 277 } 278 279 void setError(const Twine &Message) { 280 setError(Message, Current); 281 } 282 283 /// @brief Returns true if an error occurred while parsing. 284 bool failed() { 285 return Failed; 286 } 287 288 private: 289 void init(MemoryBufferRef Buffer); 290 291 StringRef currentInput() { 292 return StringRef(Current, End - Current); 293 } 294 295 /// @brief Decode a UTF-8 minimal well-formed code unit subsequence starting 296 /// at \a Position. 297 /// 298 /// If the UTF-8 code units starting at Position do not form a well-formed 299 /// code unit subsequence, then the Unicode scalar value is 0, and the length 300 /// is 0. 301 UTF8Decoded decodeUTF8(StringRef::iterator Position) { 302 return ::decodeUTF8(StringRef(Position, End - Position)); 303 } 304 305 // The following functions are based on the gramar rules in the YAML spec. The 306 // style of the function names it meant to closely match how they are written 307 // in the spec. The number within the [] is the number of the grammar rule in 308 // the spec. 309 // 310 // See 4.2 [Production Naming Conventions] for the meaning of the prefixes. 311 // 312 // c- 313 // A production starting and ending with a special character. 314 // b- 315 // A production matching a single line break. 316 // nb- 317 // A production starting and ending with a non-break character. 318 // s- 319 // A production starting and ending with a white space character. 320 // ns- 321 // A production starting and ending with a non-space character. 322 // l- 323 // A production matching complete line(s). 324 325 /// @brief Skip a single nb-char[27] starting at Position. 326 /// 327 /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE] 328 /// | [0xFF00-0xFFFD] | [0x10000-0x10FFFF] 329 /// 330 /// @returns The code unit after the nb-char, or Position if it's not an 331 /// nb-char. 332 StringRef::iterator skip_nb_char(StringRef::iterator Position); 333 334 /// @brief Skip a single b-break[28] starting at Position. 335 /// 336 /// A b-break is 0xD 0xA | 0xD | 0xA 337 /// 338 /// @returns The code unit after the b-break, or Position if it's not a 339 /// b-break. 340 StringRef::iterator skip_b_break(StringRef::iterator Position); 341 342 /// Skip a single s-space[31] starting at Position. 343 /// 344 /// An s-space is 0x20 345 /// 346 /// @returns The code unit after the s-space, or Position if it's not a 347 /// s-space. 348 StringRef::iterator skip_s_space(StringRef::iterator Position); 349 350 /// @brief Skip a single s-white[33] starting at Position. 351 /// 352 /// A s-white is 0x20 | 0x9 353 /// 354 /// @returns The code unit after the s-white, or Position if it's not a 355 /// s-white. 356 StringRef::iterator skip_s_white(StringRef::iterator Position); 357 358 /// @brief Skip a single ns-char[34] starting at Position. 359 /// 360 /// A ns-char is nb-char - s-white 361 /// 362 /// @returns The code unit after the ns-char, or Position if it's not a 363 /// ns-char. 364 StringRef::iterator skip_ns_char(StringRef::iterator Position); 365 366 typedef StringRef::iterator (Scanner::*SkipWhileFunc)(StringRef::iterator); 367 /// @brief Skip minimal well-formed code unit subsequences until Func 368 /// returns its input. 369 /// 370 /// @returns The code unit after the last minimal well-formed code unit 371 /// subsequence that Func accepted. 372 StringRef::iterator skip_while( SkipWhileFunc Func 373 , StringRef::iterator Position); 374 375 /// Skip minimal well-formed code unit subsequences until Func returns its 376 /// input. 377 void advanceWhile(SkipWhileFunc Func); 378 379 /// @brief Scan ns-uri-char[39]s starting at Cur. 380 /// 381 /// This updates Cur and Column while scanning. 382 /// 383 /// @returns A StringRef starting at Cur which covers the longest contiguous 384 /// sequence of ns-uri-char. 385 StringRef scan_ns_uri_char(); 386 387 /// @brief Consume a minimal well-formed code unit subsequence starting at 388 /// \a Cur. Return false if it is not the same Unicode scalar value as 389 /// \a Expected. This updates \a Column. 390 bool consume(uint32_t Expected); 391 392 /// @brief Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column. 393 void skip(uint32_t Distance); 394 395 /// @brief Return true if the minimal well-formed code unit subsequence at 396 /// Pos is whitespace or a new line 397 bool isBlankOrBreak(StringRef::iterator Position); 398 399 /// Consume a single b-break[28] if it's present at the current position. 400 /// 401 /// Return false if the code unit at the current position isn't a line break. 402 bool consumeLineBreakIfPresent(); 403 404 /// @brief If IsSimpleKeyAllowed, create and push_back a new SimpleKey. 405 void saveSimpleKeyCandidate( TokenQueueT::iterator Tok 406 , unsigned AtColumn 407 , bool IsRequired); 408 409 /// @brief Remove simple keys that can no longer be valid simple keys. 410 /// 411 /// Invalid simple keys are not on the current line or are further than 1024 412 /// columns back. 413 void removeStaleSimpleKeyCandidates(); 414 415 /// @brief Remove all simple keys on FlowLevel \a Level. 416 void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level); 417 418 /// @brief Unroll indentation in \a Indents back to \a Col. Creates BlockEnd 419 /// tokens if needed. 420 bool unrollIndent(int ToColumn); 421 422 /// @brief Increase indent to \a Col. Creates \a Kind token at \a InsertPoint 423 /// if needed. 424 bool rollIndent( int ToColumn 425 , Token::TokenKind Kind 426 , TokenQueueT::iterator InsertPoint); 427 428 /// @brief Skip a single-line comment when the comment starts at the current 429 /// position of the scanner. 430 void skipComment(); 431 432 /// @brief Skip whitespace and comments until the start of the next token. 433 void scanToNextToken(); 434 435 /// @brief Must be the first token generated. 436 bool scanStreamStart(); 437 438 /// @brief Generate tokens needed to close out the stream. 439 bool scanStreamEnd(); 440 441 /// @brief Scan a %BLAH directive. 442 bool scanDirective(); 443 444 /// @brief Scan a ... or ---. 445 bool scanDocumentIndicator(bool IsStart); 446 447 /// @brief Scan a [ or { and generate the proper flow collection start token. 448 bool scanFlowCollectionStart(bool IsSequence); 449 450 /// @brief Scan a ] or } and generate the proper flow collection end token. 451 bool scanFlowCollectionEnd(bool IsSequence); 452 453 /// @brief Scan the , that separates entries in a flow collection. 454 bool scanFlowEntry(); 455 456 /// @brief Scan the - that starts block sequence entries. 457 bool scanBlockEntry(); 458 459 /// @brief Scan an explicit ? indicating a key. 460 bool scanKey(); 461 462 /// @brief Scan an explicit : indicating a value. 463 bool scanValue(); 464 465 /// @brief Scan a quoted scalar. 466 bool scanFlowScalar(bool IsDoubleQuoted); 467 468 /// @brief Scan an unquoted scalar. 469 bool scanPlainScalar(); 470 471 /// @brief Scan an Alias or Anchor starting with * or &. 472 bool scanAliasOrAnchor(bool IsAlias); 473 474 /// @brief Scan a block scalar starting with | or >. 475 bool scanBlockScalar(bool IsLiteral); 476 477 /// Scan a chomping indicator in a block scalar header. 478 char scanBlockChompingIndicator(); 479 480 /// Scan an indentation indicator in a block scalar header. 481 unsigned scanBlockIndentationIndicator(); 482 483 /// Scan a block scalar header. 484 /// 485 /// Return false if an error occurred. 486 bool scanBlockScalarHeader(char &ChompingIndicator, unsigned &IndentIndicator, 487 bool &IsDone); 488 489 /// Look for the indentation level of a block scalar. 490 /// 491 /// Return false if an error occurred. 492 bool findBlockScalarIndent(unsigned &BlockIndent, unsigned BlockExitIndent, 493 unsigned &LineBreaks, bool &IsDone); 494 495 /// Scan the indentation of a text line in a block scalar. 496 /// 497 /// Return false if an error occurred. 498 bool scanBlockScalarIndent(unsigned BlockIndent, unsigned BlockExitIndent, 499 bool &IsDone); 500 501 /// @brief Scan a tag of the form !stuff. 502 bool scanTag(); 503 504 /// @brief Dispatch to the next scanning function based on \a *Cur. 505 bool fetchMoreTokens(); 506 507 /// @brief The SourceMgr used for diagnostics and buffer management. 508 SourceMgr &SM; 509 510 /// @brief The original input. 511 MemoryBufferRef InputBuffer; 512 513 /// @brief The current position of the scanner. 514 StringRef::iterator Current; 515 516 /// @brief The end of the input (one past the last character). 517 StringRef::iterator End; 518 519 /// @brief Current YAML indentation level in spaces. 520 int Indent; 521 522 /// @brief Current column number in Unicode code points. 523 unsigned Column; 524 525 /// @brief Current line number. 526 unsigned Line; 527 528 /// @brief How deep we are in flow style containers. 0 Means at block level. 529 unsigned FlowLevel; 530 531 /// @brief Are we at the start of the stream? 532 bool IsStartOfStream; 533 534 /// @brief Can the next token be the start of a simple key? 535 bool IsSimpleKeyAllowed; 536 537 /// @brief True if an error has occurred. 538 bool Failed; 539 540 /// @brief Should colors be used when printing out the diagnostic messages? 541 bool ShowColors; 542 543 /// @brief Queue of tokens. This is required to queue up tokens while looking 544 /// for the end of a simple key. And for cases where a single character 545 /// can produce multiple tokens (e.g. BlockEnd). 546 TokenQueueT TokenQueue; 547 548 /// @brief Indentation levels. 549 SmallVector<int, 4> Indents; 550 551 /// @brief Potential simple keys. 552 SmallVector<SimpleKey, 4> SimpleKeys; 553 }; 554 555 } // end namespace yaml 556 } // end namespace llvm 557 558 /// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result. 559 static void encodeUTF8( uint32_t UnicodeScalarValue 560 , SmallVectorImpl<char> &Result) { 561 if (UnicodeScalarValue <= 0x7F) { 562 Result.push_back(UnicodeScalarValue & 0x7F); 563 } else if (UnicodeScalarValue <= 0x7FF) { 564 uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6); 565 uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F); 566 Result.push_back(FirstByte); 567 Result.push_back(SecondByte); 568 } else if (UnicodeScalarValue <= 0xFFFF) { 569 uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12); 570 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 571 uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F); 572 Result.push_back(FirstByte); 573 Result.push_back(SecondByte); 574 Result.push_back(ThirdByte); 575 } else if (UnicodeScalarValue <= 0x10FFFF) { 576 uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18); 577 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12); 578 uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 579 uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F); 580 Result.push_back(FirstByte); 581 Result.push_back(SecondByte); 582 Result.push_back(ThirdByte); 583 Result.push_back(FourthByte); 584 } 585 } 586 587 bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) { 588 SourceMgr SM; 589 Scanner scanner(Input, SM); 590 while (true) { 591 Token T = scanner.getNext(); 592 switch (T.Kind) { 593 case Token::TK_StreamStart: 594 OS << "Stream-Start: "; 595 break; 596 case Token::TK_StreamEnd: 597 OS << "Stream-End: "; 598 break; 599 case Token::TK_VersionDirective: 600 OS << "Version-Directive: "; 601 break; 602 case Token::TK_TagDirective: 603 OS << "Tag-Directive: "; 604 break; 605 case Token::TK_DocumentStart: 606 OS << "Document-Start: "; 607 break; 608 case Token::TK_DocumentEnd: 609 OS << "Document-End: "; 610 break; 611 case Token::TK_BlockEntry: 612 OS << "Block-Entry: "; 613 break; 614 case Token::TK_BlockEnd: 615 OS << "Block-End: "; 616 break; 617 case Token::TK_BlockSequenceStart: 618 OS << "Block-Sequence-Start: "; 619 break; 620 case Token::TK_BlockMappingStart: 621 OS << "Block-Mapping-Start: "; 622 break; 623 case Token::TK_FlowEntry: 624 OS << "Flow-Entry: "; 625 break; 626 case Token::TK_FlowSequenceStart: 627 OS << "Flow-Sequence-Start: "; 628 break; 629 case Token::TK_FlowSequenceEnd: 630 OS << "Flow-Sequence-End: "; 631 break; 632 case Token::TK_FlowMappingStart: 633 OS << "Flow-Mapping-Start: "; 634 break; 635 case Token::TK_FlowMappingEnd: 636 OS << "Flow-Mapping-End: "; 637 break; 638 case Token::TK_Key: 639 OS << "Key: "; 640 break; 641 case Token::TK_Value: 642 OS << "Value: "; 643 break; 644 case Token::TK_Scalar: 645 OS << "Scalar: "; 646 break; 647 case Token::TK_BlockScalar: 648 OS << "Block Scalar: "; 649 break; 650 case Token::TK_Alias: 651 OS << "Alias: "; 652 break; 653 case Token::TK_Anchor: 654 OS << "Anchor: "; 655 break; 656 case Token::TK_Tag: 657 OS << "Tag: "; 658 break; 659 case Token::TK_Error: 660 break; 661 } 662 OS << T.Range << "\n"; 663 if (T.Kind == Token::TK_StreamEnd) 664 break; 665 else if (T.Kind == Token::TK_Error) 666 return false; 667 } 668 return true; 669 } 670 671 bool yaml::scanTokens(StringRef Input) { 672 llvm::SourceMgr SM; 673 llvm::yaml::Scanner scanner(Input, SM); 674 for (;;) { 675 llvm::yaml::Token T = scanner.getNext(); 676 if (T.Kind == Token::TK_StreamEnd) 677 break; 678 else if (T.Kind == Token::TK_Error) 679 return false; 680 } 681 return true; 682 } 683 684 std::string yaml::escape(StringRef Input) { 685 std::string EscapedInput; 686 for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) { 687 if (*i == '\\') 688 EscapedInput += "\\\\"; 689 else if (*i == '"') 690 EscapedInput += "\\\""; 691 else if (*i == 0) 692 EscapedInput += "\\0"; 693 else if (*i == 0x07) 694 EscapedInput += "\\a"; 695 else if (*i == 0x08) 696 EscapedInput += "\\b"; 697 else if (*i == 0x09) 698 EscapedInput += "\\t"; 699 else if (*i == 0x0A) 700 EscapedInput += "\\n"; 701 else if (*i == 0x0B) 702 EscapedInput += "\\v"; 703 else if (*i == 0x0C) 704 EscapedInput += "\\f"; 705 else if (*i == 0x0D) 706 EscapedInput += "\\r"; 707 else if (*i == 0x1B) 708 EscapedInput += "\\e"; 709 else if ((unsigned char)*i < 0x20) { // Control characters not handled above. 710 std::string HexStr = utohexstr(*i); 711 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 712 } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence. 713 UTF8Decoded UnicodeScalarValue 714 = decodeUTF8(StringRef(i, Input.end() - i)); 715 if (UnicodeScalarValue.second == 0) { 716 // Found invalid char. 717 SmallString<4> Val; 718 encodeUTF8(0xFFFD, Val); 719 EscapedInput.insert(EscapedInput.end(), Val.begin(), Val.end()); 720 // FIXME: Error reporting. 721 return EscapedInput; 722 } 723 if (UnicodeScalarValue.first == 0x85) 724 EscapedInput += "\\N"; 725 else if (UnicodeScalarValue.first == 0xA0) 726 EscapedInput += "\\_"; 727 else if (UnicodeScalarValue.first == 0x2028) 728 EscapedInput += "\\L"; 729 else if (UnicodeScalarValue.first == 0x2029) 730 EscapedInput += "\\P"; 731 else { 732 std::string HexStr = utohexstr(UnicodeScalarValue.first); 733 if (HexStr.size() <= 2) 734 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 735 else if (HexStr.size() <= 4) 736 EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr; 737 else if (HexStr.size() <= 8) 738 EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr; 739 } 740 i += UnicodeScalarValue.second - 1; 741 } else 742 EscapedInput.push_back(*i); 743 } 744 return EscapedInput; 745 } 746 747 Scanner::Scanner(StringRef Input, SourceMgr &sm, bool ShowColors) 748 : SM(sm), ShowColors(ShowColors) { 749 init(MemoryBufferRef(Input, "YAML")); 750 } 751 752 Scanner::Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors) 753 : SM(SM_), ShowColors(ShowColors) { 754 init(Buffer); 755 } 756 757 void Scanner::init(MemoryBufferRef Buffer) { 758 InputBuffer = Buffer; 759 Current = InputBuffer.getBufferStart(); 760 End = InputBuffer.getBufferEnd(); 761 Indent = -1; 762 Column = 0; 763 Line = 0; 764 FlowLevel = 0; 765 IsStartOfStream = true; 766 IsSimpleKeyAllowed = true; 767 Failed = false; 768 std::unique_ptr<MemoryBuffer> InputBufferOwner = 769 MemoryBuffer::getMemBuffer(Buffer); 770 SM.AddNewSourceBuffer(std::move(InputBufferOwner), SMLoc()); 771 } 772 773 Token &Scanner::peekNext() { 774 // If the current token is a possible simple key, keep parsing until we 775 // can confirm. 776 bool NeedMore = false; 777 while (true) { 778 if (TokenQueue.empty() || NeedMore) { 779 if (!fetchMoreTokens()) { 780 TokenQueue.clear(); 781 TokenQueue.push_back(Token()); 782 return TokenQueue.front(); 783 } 784 } 785 assert(!TokenQueue.empty() && 786 "fetchMoreTokens lied about getting tokens!"); 787 788 removeStaleSimpleKeyCandidates(); 789 SimpleKey SK; 790 SK.Tok = TokenQueue.begin(); 791 if (!is_contained(SimpleKeys, SK)) 792 break; 793 else 794 NeedMore = true; 795 } 796 return TokenQueue.front(); 797 } 798 799 Token Scanner::getNext() { 800 Token Ret = peekNext(); 801 // TokenQueue can be empty if there was an error getting the next token. 802 if (!TokenQueue.empty()) 803 TokenQueue.pop_front(); 804 805 // There cannot be any referenced Token's if the TokenQueue is empty. So do a 806 // quick deallocation of them all. 807 if (TokenQueue.empty()) { 808 TokenQueue.Alloc.Reset(); 809 } 810 811 return Ret; 812 } 813 814 StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) { 815 if (Position == End) 816 return Position; 817 // Check 7 bit c-printable - b-char. 818 if ( *Position == 0x09 819 || (*Position >= 0x20 && *Position <= 0x7E)) 820 return Position + 1; 821 822 // Check for valid UTF-8. 823 if (uint8_t(*Position) & 0x80) { 824 UTF8Decoded u8d = decodeUTF8(Position); 825 if ( u8d.second != 0 826 && u8d.first != 0xFEFF 827 && ( u8d.first == 0x85 828 || ( u8d.first >= 0xA0 829 && u8d.first <= 0xD7FF) 830 || ( u8d.first >= 0xE000 831 && u8d.first <= 0xFFFD) 832 || ( u8d.first >= 0x10000 833 && u8d.first <= 0x10FFFF))) 834 return Position + u8d.second; 835 } 836 return Position; 837 } 838 839 StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) { 840 if (Position == End) 841 return Position; 842 if (*Position == 0x0D) { 843 if (Position + 1 != End && *(Position + 1) == 0x0A) 844 return Position + 2; 845 return Position + 1; 846 } 847 848 if (*Position == 0x0A) 849 return Position + 1; 850 return Position; 851 } 852 853 StringRef::iterator Scanner::skip_s_space(StringRef::iterator Position) { 854 if (Position == End) 855 return Position; 856 if (*Position == ' ') 857 return Position + 1; 858 return Position; 859 } 860 861 StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) { 862 if (Position == End) 863 return Position; 864 if (*Position == ' ' || *Position == '\t') 865 return Position + 1; 866 return Position; 867 } 868 869 StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) { 870 if (Position == End) 871 return Position; 872 if (*Position == ' ' || *Position == '\t') 873 return Position; 874 return skip_nb_char(Position); 875 } 876 877 StringRef::iterator Scanner::skip_while( SkipWhileFunc Func 878 , StringRef::iterator Position) { 879 while (true) { 880 StringRef::iterator i = (this->*Func)(Position); 881 if (i == Position) 882 break; 883 Position = i; 884 } 885 return Position; 886 } 887 888 void Scanner::advanceWhile(SkipWhileFunc Func) { 889 auto Final = skip_while(Func, Current); 890 Column += Final - Current; 891 Current = Final; 892 } 893 894 static bool is_ns_hex_digit(const char C) { 895 return (C >= '0' && C <= '9') 896 || (C >= 'a' && C <= 'z') 897 || (C >= 'A' && C <= 'Z'); 898 } 899 900 static bool is_ns_word_char(const char C) { 901 return C == '-' 902 || (C >= 'a' && C <= 'z') 903 || (C >= 'A' && C <= 'Z'); 904 } 905 906 StringRef Scanner::scan_ns_uri_char() { 907 StringRef::iterator Start = Current; 908 while (true) { 909 if (Current == End) 910 break; 911 if (( *Current == '%' 912 && Current + 2 < End 913 && is_ns_hex_digit(*(Current + 1)) 914 && is_ns_hex_digit(*(Current + 2))) 915 || is_ns_word_char(*Current) 916 || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]") 917 != StringRef::npos) { 918 ++Current; 919 ++Column; 920 } else 921 break; 922 } 923 return StringRef(Start, Current - Start); 924 } 925 926 bool Scanner::consume(uint32_t Expected) { 927 if (Expected >= 0x80) 928 report_fatal_error("Not dealing with this yet"); 929 if (Current == End) 930 return false; 931 if (uint8_t(*Current) >= 0x80) 932 report_fatal_error("Not dealing with this yet"); 933 if (uint8_t(*Current) == Expected) { 934 ++Current; 935 ++Column; 936 return true; 937 } 938 return false; 939 } 940 941 void Scanner::skip(uint32_t Distance) { 942 Current += Distance; 943 Column += Distance; 944 assert(Current <= End && "Skipped past the end"); 945 } 946 947 bool Scanner::isBlankOrBreak(StringRef::iterator Position) { 948 if (Position == End) 949 return false; 950 return *Position == ' ' || *Position == '\t' || *Position == '\r' || 951 *Position == '\n'; 952 } 953 954 bool Scanner::consumeLineBreakIfPresent() { 955 auto Next = skip_b_break(Current); 956 if (Next == Current) 957 return false; 958 Column = 0; 959 ++Line; 960 Current = Next; 961 return true; 962 } 963 964 void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok 965 , unsigned AtColumn 966 , bool IsRequired) { 967 if (IsSimpleKeyAllowed) { 968 SimpleKey SK; 969 SK.Tok = Tok; 970 SK.Line = Line; 971 SK.Column = AtColumn; 972 SK.IsRequired = IsRequired; 973 SK.FlowLevel = FlowLevel; 974 SimpleKeys.push_back(SK); 975 } 976 } 977 978 void Scanner::removeStaleSimpleKeyCandidates() { 979 for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin(); 980 i != SimpleKeys.end();) { 981 if (i->Line != Line || i->Column + 1024 < Column) { 982 if (i->IsRequired) 983 setError( "Could not find expected : for simple key" 984 , i->Tok->Range.begin()); 985 i = SimpleKeys.erase(i); 986 } else 987 ++i; 988 } 989 } 990 991 void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) { 992 if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level) 993 SimpleKeys.pop_back(); 994 } 995 996 bool Scanner::unrollIndent(int ToColumn) { 997 Token T; 998 // Indentation is ignored in flow. 999 if (FlowLevel != 0) 1000 return true; 1001 1002 while (Indent > ToColumn) { 1003 T.Kind = Token::TK_BlockEnd; 1004 T.Range = StringRef(Current, 1); 1005 TokenQueue.push_back(T); 1006 Indent = Indents.pop_back_val(); 1007 } 1008 1009 return true; 1010 } 1011 1012 bool Scanner::rollIndent( int ToColumn 1013 , Token::TokenKind Kind 1014 , TokenQueueT::iterator InsertPoint) { 1015 if (FlowLevel) 1016 return true; 1017 if (Indent < ToColumn) { 1018 Indents.push_back(Indent); 1019 Indent = ToColumn; 1020 1021 Token T; 1022 T.Kind = Kind; 1023 T.Range = StringRef(Current, 0); 1024 TokenQueue.insert(InsertPoint, T); 1025 } 1026 return true; 1027 } 1028 1029 void Scanner::skipComment() { 1030 if (*Current != '#') 1031 return; 1032 while (true) { 1033 // This may skip more than one byte, thus Column is only incremented 1034 // for code points. 1035 StringRef::iterator I = skip_nb_char(Current); 1036 if (I == Current) 1037 break; 1038 Current = I; 1039 ++Column; 1040 } 1041 } 1042 1043 void Scanner::scanToNextToken() { 1044 while (true) { 1045 while (*Current == ' ' || *Current == '\t') { 1046 skip(1); 1047 } 1048 1049 skipComment(); 1050 1051 // Skip EOL. 1052 StringRef::iterator i = skip_b_break(Current); 1053 if (i == Current) 1054 break; 1055 Current = i; 1056 ++Line; 1057 Column = 0; 1058 // New lines may start a simple key. 1059 if (!FlowLevel) 1060 IsSimpleKeyAllowed = true; 1061 } 1062 } 1063 1064 bool Scanner::scanStreamStart() { 1065 IsStartOfStream = false; 1066 1067 EncodingInfo EI = getUnicodeEncoding(currentInput()); 1068 1069 Token T; 1070 T.Kind = Token::TK_StreamStart; 1071 T.Range = StringRef(Current, EI.second); 1072 TokenQueue.push_back(T); 1073 Current += EI.second; 1074 return true; 1075 } 1076 1077 bool Scanner::scanStreamEnd() { 1078 // Force an ending new line if one isn't present. 1079 if (Column != 0) { 1080 Column = 0; 1081 ++Line; 1082 } 1083 1084 unrollIndent(-1); 1085 SimpleKeys.clear(); 1086 IsSimpleKeyAllowed = false; 1087 1088 Token T; 1089 T.Kind = Token::TK_StreamEnd; 1090 T.Range = StringRef(Current, 0); 1091 TokenQueue.push_back(T); 1092 return true; 1093 } 1094 1095 bool Scanner::scanDirective() { 1096 // Reset the indentation level. 1097 unrollIndent(-1); 1098 SimpleKeys.clear(); 1099 IsSimpleKeyAllowed = false; 1100 1101 StringRef::iterator Start = Current; 1102 consume('%'); 1103 StringRef::iterator NameStart = Current; 1104 Current = skip_while(&Scanner::skip_ns_char, Current); 1105 StringRef Name(NameStart, Current - NameStart); 1106 Current = skip_while(&Scanner::skip_s_white, Current); 1107 1108 Token T; 1109 if (Name == "YAML") { 1110 Current = skip_while(&Scanner::skip_ns_char, Current); 1111 T.Kind = Token::TK_VersionDirective; 1112 T.Range = StringRef(Start, Current - Start); 1113 TokenQueue.push_back(T); 1114 return true; 1115 } else if(Name == "TAG") { 1116 Current = skip_while(&Scanner::skip_ns_char, Current); 1117 Current = skip_while(&Scanner::skip_s_white, Current); 1118 Current = skip_while(&Scanner::skip_ns_char, Current); 1119 T.Kind = Token::TK_TagDirective; 1120 T.Range = StringRef(Start, Current - Start); 1121 TokenQueue.push_back(T); 1122 return true; 1123 } 1124 return false; 1125 } 1126 1127 bool Scanner::scanDocumentIndicator(bool IsStart) { 1128 unrollIndent(-1); 1129 SimpleKeys.clear(); 1130 IsSimpleKeyAllowed = false; 1131 1132 Token T; 1133 T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd; 1134 T.Range = StringRef(Current, 3); 1135 skip(3); 1136 TokenQueue.push_back(T); 1137 return true; 1138 } 1139 1140 bool Scanner::scanFlowCollectionStart(bool IsSequence) { 1141 Token T; 1142 T.Kind = IsSequence ? Token::TK_FlowSequenceStart 1143 : Token::TK_FlowMappingStart; 1144 T.Range = StringRef(Current, 1); 1145 skip(1); 1146 TokenQueue.push_back(T); 1147 1148 // [ and { may begin a simple key. 1149 saveSimpleKeyCandidate(--TokenQueue.end(), Column - 1, false); 1150 1151 // And may also be followed by a simple key. 1152 IsSimpleKeyAllowed = true; 1153 ++FlowLevel; 1154 return true; 1155 } 1156 1157 bool Scanner::scanFlowCollectionEnd(bool IsSequence) { 1158 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1159 IsSimpleKeyAllowed = false; 1160 Token T; 1161 T.Kind = IsSequence ? Token::TK_FlowSequenceEnd 1162 : Token::TK_FlowMappingEnd; 1163 T.Range = StringRef(Current, 1); 1164 skip(1); 1165 TokenQueue.push_back(T); 1166 if (FlowLevel) 1167 --FlowLevel; 1168 return true; 1169 } 1170 1171 bool Scanner::scanFlowEntry() { 1172 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1173 IsSimpleKeyAllowed = true; 1174 Token T; 1175 T.Kind = Token::TK_FlowEntry; 1176 T.Range = StringRef(Current, 1); 1177 skip(1); 1178 TokenQueue.push_back(T); 1179 return true; 1180 } 1181 1182 bool Scanner::scanBlockEntry() { 1183 rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end()); 1184 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1185 IsSimpleKeyAllowed = true; 1186 Token T; 1187 T.Kind = Token::TK_BlockEntry; 1188 T.Range = StringRef(Current, 1); 1189 skip(1); 1190 TokenQueue.push_back(T); 1191 return true; 1192 } 1193 1194 bool Scanner::scanKey() { 1195 if (!FlowLevel) 1196 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1197 1198 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1199 IsSimpleKeyAllowed = !FlowLevel; 1200 1201 Token T; 1202 T.Kind = Token::TK_Key; 1203 T.Range = StringRef(Current, 1); 1204 skip(1); 1205 TokenQueue.push_back(T); 1206 return true; 1207 } 1208 1209 bool Scanner::scanValue() { 1210 // If the previous token could have been a simple key, insert the key token 1211 // into the token queue. 1212 if (!SimpleKeys.empty()) { 1213 SimpleKey SK = SimpleKeys.pop_back_val(); 1214 Token T; 1215 T.Kind = Token::TK_Key; 1216 T.Range = SK.Tok->Range; 1217 TokenQueueT::iterator i, e; 1218 for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) { 1219 if (i == SK.Tok) 1220 break; 1221 } 1222 assert(i != e && "SimpleKey not in token queue!"); 1223 i = TokenQueue.insert(i, T); 1224 1225 // We may also need to add a Block-Mapping-Start token. 1226 rollIndent(SK.Column, Token::TK_BlockMappingStart, i); 1227 1228 IsSimpleKeyAllowed = false; 1229 } else { 1230 if (!FlowLevel) 1231 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1232 IsSimpleKeyAllowed = !FlowLevel; 1233 } 1234 1235 Token T; 1236 T.Kind = Token::TK_Value; 1237 T.Range = StringRef(Current, 1); 1238 skip(1); 1239 TokenQueue.push_back(T); 1240 return true; 1241 } 1242 1243 // Forbidding inlining improves performance by roughly 20%. 1244 // FIXME: Remove once llvm optimizes this to the faster version without hints. 1245 LLVM_ATTRIBUTE_NOINLINE static bool 1246 wasEscaped(StringRef::iterator First, StringRef::iterator Position); 1247 1248 // Returns whether a character at 'Position' was escaped with a leading '\'. 1249 // 'First' specifies the position of the first character in the string. 1250 static bool wasEscaped(StringRef::iterator First, 1251 StringRef::iterator Position) { 1252 assert(Position - 1 >= First); 1253 StringRef::iterator I = Position - 1; 1254 // We calculate the number of consecutive '\'s before the current position 1255 // by iterating backwards through our string. 1256 while (I >= First && *I == '\\') --I; 1257 // (Position - 1 - I) now contains the number of '\'s before the current 1258 // position. If it is odd, the character at 'Position' was escaped. 1259 return (Position - 1 - I) % 2 == 1; 1260 } 1261 1262 bool Scanner::scanFlowScalar(bool IsDoubleQuoted) { 1263 StringRef::iterator Start = Current; 1264 unsigned ColStart = Column; 1265 if (IsDoubleQuoted) { 1266 do { 1267 ++Current; 1268 while (Current != End && *Current != '"') 1269 ++Current; 1270 // Repeat until the previous character was not a '\' or was an escaped 1271 // backslash. 1272 } while ( Current != End 1273 && *(Current - 1) == '\\' 1274 && wasEscaped(Start + 1, Current)); 1275 } else { 1276 skip(1); 1277 while (true) { 1278 // Skip a ' followed by another '. 1279 if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') { 1280 skip(2); 1281 continue; 1282 } else if (*Current == '\'') 1283 break; 1284 StringRef::iterator i = skip_nb_char(Current); 1285 if (i == Current) { 1286 i = skip_b_break(Current); 1287 if (i == Current) 1288 break; 1289 Current = i; 1290 Column = 0; 1291 ++Line; 1292 } else { 1293 if (i == End) 1294 break; 1295 Current = i; 1296 ++Column; 1297 } 1298 } 1299 } 1300 1301 if (Current == End) { 1302 setError("Expected quote at end of scalar", Current); 1303 return false; 1304 } 1305 1306 skip(1); // Skip ending quote. 1307 Token T; 1308 T.Kind = Token::TK_Scalar; 1309 T.Range = StringRef(Start, Current - Start); 1310 TokenQueue.push_back(T); 1311 1312 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1313 1314 IsSimpleKeyAllowed = false; 1315 1316 return true; 1317 } 1318 1319 bool Scanner::scanPlainScalar() { 1320 StringRef::iterator Start = Current; 1321 unsigned ColStart = Column; 1322 unsigned LeadingBlanks = 0; 1323 assert(Indent >= -1 && "Indent must be >= -1 !"); 1324 unsigned indent = static_cast<unsigned>(Indent + 1); 1325 while (true) { 1326 if (*Current == '#') 1327 break; 1328 1329 while (!isBlankOrBreak(Current)) { 1330 if ( FlowLevel && *Current == ':' 1331 && !(isBlankOrBreak(Current + 1) || *(Current + 1) == ',')) { 1332 setError("Found unexpected ':' while scanning a plain scalar", Current); 1333 return false; 1334 } 1335 1336 // Check for the end of the plain scalar. 1337 if ( (*Current == ':' && isBlankOrBreak(Current + 1)) 1338 || ( FlowLevel 1339 && (StringRef(Current, 1).find_first_of(",:?[]{}") 1340 != StringRef::npos))) 1341 break; 1342 1343 StringRef::iterator i = skip_nb_char(Current); 1344 if (i == Current) 1345 break; 1346 Current = i; 1347 ++Column; 1348 } 1349 1350 // Are we at the end? 1351 if (!isBlankOrBreak(Current)) 1352 break; 1353 1354 // Eat blanks. 1355 StringRef::iterator Tmp = Current; 1356 while (isBlankOrBreak(Tmp)) { 1357 StringRef::iterator i = skip_s_white(Tmp); 1358 if (i != Tmp) { 1359 if (LeadingBlanks && (Column < indent) && *Tmp == '\t') { 1360 setError("Found invalid tab character in indentation", Tmp); 1361 return false; 1362 } 1363 Tmp = i; 1364 ++Column; 1365 } else { 1366 i = skip_b_break(Tmp); 1367 if (!LeadingBlanks) 1368 LeadingBlanks = 1; 1369 Tmp = i; 1370 Column = 0; 1371 ++Line; 1372 } 1373 } 1374 1375 if (!FlowLevel && Column < indent) 1376 break; 1377 1378 Current = Tmp; 1379 } 1380 if (Start == Current) { 1381 setError("Got empty plain scalar", Start); 1382 return false; 1383 } 1384 Token T; 1385 T.Kind = Token::TK_Scalar; 1386 T.Range = StringRef(Start, Current - Start); 1387 TokenQueue.push_back(T); 1388 1389 // Plain scalars can be simple keys. 1390 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1391 1392 IsSimpleKeyAllowed = false; 1393 1394 return true; 1395 } 1396 1397 bool Scanner::scanAliasOrAnchor(bool IsAlias) { 1398 StringRef::iterator Start = Current; 1399 unsigned ColStart = Column; 1400 skip(1); 1401 while(true) { 1402 if ( *Current == '[' || *Current == ']' 1403 || *Current == '{' || *Current == '}' 1404 || *Current == ',' 1405 || *Current == ':') 1406 break; 1407 StringRef::iterator i = skip_ns_char(Current); 1408 if (i == Current) 1409 break; 1410 Current = i; 1411 ++Column; 1412 } 1413 1414 if (Start == Current) { 1415 setError("Got empty alias or anchor", Start); 1416 return false; 1417 } 1418 1419 Token T; 1420 T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor; 1421 T.Range = StringRef(Start, Current - Start); 1422 TokenQueue.push_back(T); 1423 1424 // Alias and anchors can be simple keys. 1425 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1426 1427 IsSimpleKeyAllowed = false; 1428 1429 return true; 1430 } 1431 1432 char Scanner::scanBlockChompingIndicator() { 1433 char Indicator = ' '; 1434 if (Current != End && (*Current == '+' || *Current == '-')) { 1435 Indicator = *Current; 1436 skip(1); 1437 } 1438 return Indicator; 1439 } 1440 1441 /// Get the number of line breaks after chomping. 1442 /// 1443 /// Return the number of trailing line breaks to emit, depending on 1444 /// \p ChompingIndicator. 1445 static unsigned getChompedLineBreaks(char ChompingIndicator, 1446 unsigned LineBreaks, StringRef Str) { 1447 if (ChompingIndicator == '-') // Strip all line breaks. 1448 return 0; 1449 if (ChompingIndicator == '+') // Keep all line breaks. 1450 return LineBreaks; 1451 // Clip trailing lines. 1452 return Str.empty() ? 0 : 1; 1453 } 1454 1455 unsigned Scanner::scanBlockIndentationIndicator() { 1456 unsigned Indent = 0; 1457 if (Current != End && (*Current >= '1' && *Current <= '9')) { 1458 Indent = unsigned(*Current - '0'); 1459 skip(1); 1460 } 1461 return Indent; 1462 } 1463 1464 bool Scanner::scanBlockScalarHeader(char &ChompingIndicator, 1465 unsigned &IndentIndicator, bool &IsDone) { 1466 auto Start = Current; 1467 1468 ChompingIndicator = scanBlockChompingIndicator(); 1469 IndentIndicator = scanBlockIndentationIndicator(); 1470 // Check for the chomping indicator once again. 1471 if (ChompingIndicator == ' ') 1472 ChompingIndicator = scanBlockChompingIndicator(); 1473 Current = skip_while(&Scanner::skip_s_white, Current); 1474 skipComment(); 1475 1476 if (Current == End) { // EOF, we have an empty scalar. 1477 Token T; 1478 T.Kind = Token::TK_BlockScalar; 1479 T.Range = StringRef(Start, Current - Start); 1480 TokenQueue.push_back(T); 1481 IsDone = true; 1482 return true; 1483 } 1484 1485 if (!consumeLineBreakIfPresent()) { 1486 setError("Expected a line break after block scalar header", Current); 1487 return false; 1488 } 1489 return true; 1490 } 1491 1492 bool Scanner::findBlockScalarIndent(unsigned &BlockIndent, 1493 unsigned BlockExitIndent, 1494 unsigned &LineBreaks, bool &IsDone) { 1495 unsigned MaxAllSpaceLineCharacters = 0; 1496 StringRef::iterator LongestAllSpaceLine; 1497 1498 while (true) { 1499 advanceWhile(&Scanner::skip_s_space); 1500 if (skip_nb_char(Current) != Current) { 1501 // This line isn't empty, so try and find the indentation. 1502 if (Column <= BlockExitIndent) { // End of the block literal. 1503 IsDone = true; 1504 return true; 1505 } 1506 // We found the block's indentation. 1507 BlockIndent = Column; 1508 if (MaxAllSpaceLineCharacters > BlockIndent) { 1509 setError( 1510 "Leading all-spaces line must be smaller than the block indent", 1511 LongestAllSpaceLine); 1512 return false; 1513 } 1514 return true; 1515 } 1516 if (skip_b_break(Current) != Current && 1517 Column > MaxAllSpaceLineCharacters) { 1518 // Record the longest all-space line in case it's longer than the 1519 // discovered block indent. 1520 MaxAllSpaceLineCharacters = Column; 1521 LongestAllSpaceLine = Current; 1522 } 1523 1524 // Check for EOF. 1525 if (Current == End) { 1526 IsDone = true; 1527 return true; 1528 } 1529 1530 if (!consumeLineBreakIfPresent()) { 1531 IsDone = true; 1532 return true; 1533 } 1534 ++LineBreaks; 1535 } 1536 return true; 1537 } 1538 1539 bool Scanner::scanBlockScalarIndent(unsigned BlockIndent, 1540 unsigned BlockExitIndent, bool &IsDone) { 1541 // Skip the indentation. 1542 while (Column < BlockIndent) { 1543 auto I = skip_s_space(Current); 1544 if (I == Current) 1545 break; 1546 Current = I; 1547 ++Column; 1548 } 1549 1550 if (skip_nb_char(Current) == Current) 1551 return true; 1552 1553 if (Column <= BlockExitIndent) { // End of the block literal. 1554 IsDone = true; 1555 return true; 1556 } 1557 1558 if (Column < BlockIndent) { 1559 if (Current != End && *Current == '#') { // Trailing comment. 1560 IsDone = true; 1561 return true; 1562 } 1563 setError("A text line is less indented than the block scalar", Current); 1564 return false; 1565 } 1566 return true; // A normal text line. 1567 } 1568 1569 bool Scanner::scanBlockScalar(bool IsLiteral) { 1570 // Eat '|' or '>' 1571 assert(*Current == '|' || *Current == '>'); 1572 skip(1); 1573 1574 char ChompingIndicator; 1575 unsigned BlockIndent; 1576 bool IsDone = false; 1577 if (!scanBlockScalarHeader(ChompingIndicator, BlockIndent, IsDone)) 1578 return false; 1579 if (IsDone) 1580 return true; 1581 1582 auto Start = Current; 1583 unsigned BlockExitIndent = Indent < 0 ? 0 : (unsigned)Indent; 1584 unsigned LineBreaks = 0; 1585 if (BlockIndent == 0) { 1586 if (!findBlockScalarIndent(BlockIndent, BlockExitIndent, LineBreaks, 1587 IsDone)) 1588 return false; 1589 } 1590 1591 // Scan the block's scalars body. 1592 SmallString<256> Str; 1593 while (!IsDone) { 1594 if (!scanBlockScalarIndent(BlockIndent, BlockExitIndent, IsDone)) 1595 return false; 1596 if (IsDone) 1597 break; 1598 1599 // Parse the current line. 1600 auto LineStart = Current; 1601 advanceWhile(&Scanner::skip_nb_char); 1602 if (LineStart != Current) { 1603 Str.append(LineBreaks, '\n'); 1604 Str.append(StringRef(LineStart, Current - LineStart)); 1605 LineBreaks = 0; 1606 } 1607 1608 // Check for EOF. 1609 if (Current == End) 1610 break; 1611 1612 if (!consumeLineBreakIfPresent()) 1613 break; 1614 ++LineBreaks; 1615 } 1616 1617 if (Current == End && !LineBreaks) 1618 // Ensure that there is at least one line break before the end of file. 1619 LineBreaks = 1; 1620 Str.append(getChompedLineBreaks(ChompingIndicator, LineBreaks, Str), '\n'); 1621 1622 // New lines may start a simple key. 1623 if (!FlowLevel) 1624 IsSimpleKeyAllowed = true; 1625 1626 Token T; 1627 T.Kind = Token::TK_BlockScalar; 1628 T.Range = StringRef(Start, Current - Start); 1629 T.Value = Str.str().str(); 1630 TokenQueue.push_back(T); 1631 return true; 1632 } 1633 1634 bool Scanner::scanTag() { 1635 StringRef::iterator Start = Current; 1636 unsigned ColStart = Column; 1637 skip(1); // Eat !. 1638 if (Current == End || isBlankOrBreak(Current)); // An empty tag. 1639 else if (*Current == '<') { 1640 skip(1); 1641 scan_ns_uri_char(); 1642 if (!consume('>')) 1643 return false; 1644 } else { 1645 // FIXME: Actually parse the c-ns-shorthand-tag rule. 1646 Current = skip_while(&Scanner::skip_ns_char, Current); 1647 } 1648 1649 Token T; 1650 T.Kind = Token::TK_Tag; 1651 T.Range = StringRef(Start, Current - Start); 1652 TokenQueue.push_back(T); 1653 1654 // Tags can be simple keys. 1655 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1656 1657 IsSimpleKeyAllowed = false; 1658 1659 return true; 1660 } 1661 1662 bool Scanner::fetchMoreTokens() { 1663 if (IsStartOfStream) 1664 return scanStreamStart(); 1665 1666 scanToNextToken(); 1667 1668 if (Current == End) 1669 return scanStreamEnd(); 1670 1671 removeStaleSimpleKeyCandidates(); 1672 1673 unrollIndent(Column); 1674 1675 if (Column == 0 && *Current == '%') 1676 return scanDirective(); 1677 1678 if (Column == 0 && Current + 4 <= End 1679 && *Current == '-' 1680 && *(Current + 1) == '-' 1681 && *(Current + 2) == '-' 1682 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1683 return scanDocumentIndicator(true); 1684 1685 if (Column == 0 && Current + 4 <= End 1686 && *Current == '.' 1687 && *(Current + 1) == '.' 1688 && *(Current + 2) == '.' 1689 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1690 return scanDocumentIndicator(false); 1691 1692 if (*Current == '[') 1693 return scanFlowCollectionStart(true); 1694 1695 if (*Current == '{') 1696 return scanFlowCollectionStart(false); 1697 1698 if (*Current == ']') 1699 return scanFlowCollectionEnd(true); 1700 1701 if (*Current == '}') 1702 return scanFlowCollectionEnd(false); 1703 1704 if (*Current == ',') 1705 return scanFlowEntry(); 1706 1707 if (*Current == '-' && isBlankOrBreak(Current + 1)) 1708 return scanBlockEntry(); 1709 1710 if (*Current == '?' && (FlowLevel || isBlankOrBreak(Current + 1))) 1711 return scanKey(); 1712 1713 if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1))) 1714 return scanValue(); 1715 1716 if (*Current == '*') 1717 return scanAliasOrAnchor(true); 1718 1719 if (*Current == '&') 1720 return scanAliasOrAnchor(false); 1721 1722 if (*Current == '!') 1723 return scanTag(); 1724 1725 if (*Current == '|' && !FlowLevel) 1726 return scanBlockScalar(true); 1727 1728 if (*Current == '>' && !FlowLevel) 1729 return scanBlockScalar(false); 1730 1731 if (*Current == '\'') 1732 return scanFlowScalar(false); 1733 1734 if (*Current == '"') 1735 return scanFlowScalar(true); 1736 1737 // Get a plain scalar. 1738 StringRef FirstChar(Current, 1); 1739 if (!(isBlankOrBreak(Current) 1740 || FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos) 1741 || (*Current == '-' && !isBlankOrBreak(Current + 1)) 1742 || (!FlowLevel && (*Current == '?' || *Current == ':') 1743 && isBlankOrBreak(Current + 1)) 1744 || (!FlowLevel && *Current == ':' 1745 && Current + 2 < End 1746 && *(Current + 1) == ':' 1747 && !isBlankOrBreak(Current + 2))) 1748 return scanPlainScalar(); 1749 1750 setError("Unrecognized character while tokenizing."); 1751 return false; 1752 } 1753 1754 Stream::Stream(StringRef Input, SourceMgr &SM, bool ShowColors) 1755 : scanner(new Scanner(Input, SM, ShowColors)), CurrentDoc() {} 1756 1757 Stream::Stream(MemoryBufferRef InputBuffer, SourceMgr &SM, bool ShowColors) 1758 : scanner(new Scanner(InputBuffer, SM, ShowColors)), CurrentDoc() {} 1759 1760 Stream::~Stream() {} 1761 1762 bool Stream::failed() { return scanner->failed(); } 1763 1764 void Stream::printError(Node *N, const Twine &Msg) { 1765 scanner->printError( N->getSourceRange().Start 1766 , SourceMgr::DK_Error 1767 , Msg 1768 , N->getSourceRange()); 1769 } 1770 1771 document_iterator Stream::begin() { 1772 if (CurrentDoc) 1773 report_fatal_error("Can only iterate over the stream once"); 1774 1775 // Skip Stream-Start. 1776 scanner->getNext(); 1777 1778 CurrentDoc.reset(new Document(*this)); 1779 return document_iterator(CurrentDoc); 1780 } 1781 1782 document_iterator Stream::end() { 1783 return document_iterator(); 1784 } 1785 1786 void Stream::skip() { 1787 for (document_iterator i = begin(), e = end(); i != e; ++i) 1788 i->skip(); 1789 } 1790 1791 Node::Node(unsigned int Type, std::unique_ptr<Document> &D, StringRef A, 1792 StringRef T) 1793 : Doc(D), TypeID(Type), Anchor(A), Tag(T) { 1794 SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin()); 1795 SourceRange = SMRange(Start, Start); 1796 } 1797 1798 std::string Node::getVerbatimTag() const { 1799 StringRef Raw = getRawTag(); 1800 if (!Raw.empty() && Raw != "!") { 1801 std::string Ret; 1802 if (Raw.find_last_of('!') == 0) { 1803 Ret = Doc->getTagMap().find("!")->second; 1804 Ret += Raw.substr(1); 1805 return Ret; 1806 } else if (Raw.startswith("!!")) { 1807 Ret = Doc->getTagMap().find("!!")->second; 1808 Ret += Raw.substr(2); 1809 return Ret; 1810 } else { 1811 StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1); 1812 std::map<StringRef, StringRef>::const_iterator It = 1813 Doc->getTagMap().find(TagHandle); 1814 if (It != Doc->getTagMap().end()) 1815 Ret = It->second; 1816 else { 1817 Token T; 1818 T.Kind = Token::TK_Tag; 1819 T.Range = TagHandle; 1820 setError(Twine("Unknown tag handle ") + TagHandle, T); 1821 } 1822 Ret += Raw.substr(Raw.find_last_of('!') + 1); 1823 return Ret; 1824 } 1825 } 1826 1827 switch (getType()) { 1828 case NK_Null: 1829 return "tag:yaml.org,2002:null"; 1830 case NK_Scalar: 1831 case NK_BlockScalar: 1832 // TODO: Tag resolution. 1833 return "tag:yaml.org,2002:str"; 1834 case NK_Mapping: 1835 return "tag:yaml.org,2002:map"; 1836 case NK_Sequence: 1837 return "tag:yaml.org,2002:seq"; 1838 } 1839 1840 return ""; 1841 } 1842 1843 Token &Node::peekNext() { 1844 return Doc->peekNext(); 1845 } 1846 1847 Token Node::getNext() { 1848 return Doc->getNext(); 1849 } 1850 1851 Node *Node::parseBlockNode() { 1852 return Doc->parseBlockNode(); 1853 } 1854 1855 BumpPtrAllocator &Node::getAllocator() { 1856 return Doc->NodeAllocator; 1857 } 1858 1859 void Node::setError(const Twine &Msg, Token &Tok) const { 1860 Doc->setError(Msg, Tok); 1861 } 1862 1863 bool Node::failed() const { 1864 return Doc->failed(); 1865 } 1866 1867 1868 1869 StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const { 1870 // TODO: Handle newlines properly. We need to remove leading whitespace. 1871 if (Value[0] == '"') { // Double quoted. 1872 // Pull off the leading and trailing "s. 1873 StringRef UnquotedValue = Value.substr(1, Value.size() - 2); 1874 // Search for characters that would require unescaping the value. 1875 StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n"); 1876 if (i != StringRef::npos) 1877 return unescapeDoubleQuoted(UnquotedValue, i, Storage); 1878 return UnquotedValue; 1879 } else if (Value[0] == '\'') { // Single quoted. 1880 // Pull off the leading and trailing 's. 1881 StringRef UnquotedValue = Value.substr(1, Value.size() - 2); 1882 StringRef::size_type i = UnquotedValue.find('\''); 1883 if (i != StringRef::npos) { 1884 // We're going to need Storage. 1885 Storage.clear(); 1886 Storage.reserve(UnquotedValue.size()); 1887 for (; i != StringRef::npos; i = UnquotedValue.find('\'')) { 1888 StringRef Valid(UnquotedValue.begin(), i); 1889 Storage.insert(Storage.end(), Valid.begin(), Valid.end()); 1890 Storage.push_back('\''); 1891 UnquotedValue = UnquotedValue.substr(i + 2); 1892 } 1893 Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); 1894 return StringRef(Storage.begin(), Storage.size()); 1895 } 1896 return UnquotedValue; 1897 } 1898 // Plain or block. 1899 return Value.rtrim(' '); 1900 } 1901 1902 StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue 1903 , StringRef::size_type i 1904 , SmallVectorImpl<char> &Storage) 1905 const { 1906 // Use Storage to build proper value. 1907 Storage.clear(); 1908 Storage.reserve(UnquotedValue.size()); 1909 for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) { 1910 // Insert all previous chars into Storage. 1911 StringRef Valid(UnquotedValue.begin(), i); 1912 Storage.insert(Storage.end(), Valid.begin(), Valid.end()); 1913 // Chop off inserted chars. 1914 UnquotedValue = UnquotedValue.substr(i); 1915 1916 assert(!UnquotedValue.empty() && "Can't be empty!"); 1917 1918 // Parse escape or line break. 1919 switch (UnquotedValue[0]) { 1920 case '\r': 1921 case '\n': 1922 Storage.push_back('\n'); 1923 if ( UnquotedValue.size() > 1 1924 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 1925 UnquotedValue = UnquotedValue.substr(1); 1926 UnquotedValue = UnquotedValue.substr(1); 1927 break; 1928 default: 1929 if (UnquotedValue.size() == 1) 1930 // TODO: Report error. 1931 break; 1932 UnquotedValue = UnquotedValue.substr(1); 1933 switch (UnquotedValue[0]) { 1934 default: { 1935 Token T; 1936 T.Range = StringRef(UnquotedValue.begin(), 1); 1937 setError("Unrecognized escape code!", T); 1938 return ""; 1939 } 1940 case '\r': 1941 case '\n': 1942 // Remove the new line. 1943 if ( UnquotedValue.size() > 1 1944 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 1945 UnquotedValue = UnquotedValue.substr(1); 1946 // If this was just a single byte newline, it will get skipped 1947 // below. 1948 break; 1949 case '0': 1950 Storage.push_back(0x00); 1951 break; 1952 case 'a': 1953 Storage.push_back(0x07); 1954 break; 1955 case 'b': 1956 Storage.push_back(0x08); 1957 break; 1958 case 't': 1959 case 0x09: 1960 Storage.push_back(0x09); 1961 break; 1962 case 'n': 1963 Storage.push_back(0x0A); 1964 break; 1965 case 'v': 1966 Storage.push_back(0x0B); 1967 break; 1968 case 'f': 1969 Storage.push_back(0x0C); 1970 break; 1971 case 'r': 1972 Storage.push_back(0x0D); 1973 break; 1974 case 'e': 1975 Storage.push_back(0x1B); 1976 break; 1977 case ' ': 1978 Storage.push_back(0x20); 1979 break; 1980 case '"': 1981 Storage.push_back(0x22); 1982 break; 1983 case '/': 1984 Storage.push_back(0x2F); 1985 break; 1986 case '\\': 1987 Storage.push_back(0x5C); 1988 break; 1989 case 'N': 1990 encodeUTF8(0x85, Storage); 1991 break; 1992 case '_': 1993 encodeUTF8(0xA0, Storage); 1994 break; 1995 case 'L': 1996 encodeUTF8(0x2028, Storage); 1997 break; 1998 case 'P': 1999 encodeUTF8(0x2029, Storage); 2000 break; 2001 case 'x': { 2002 if (UnquotedValue.size() < 3) 2003 // TODO: Report error. 2004 break; 2005 unsigned int UnicodeScalarValue; 2006 if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue)) 2007 // TODO: Report error. 2008 UnicodeScalarValue = 0xFFFD; 2009 encodeUTF8(UnicodeScalarValue, Storage); 2010 UnquotedValue = UnquotedValue.substr(2); 2011 break; 2012 } 2013 case 'u': { 2014 if (UnquotedValue.size() < 5) 2015 // TODO: Report error. 2016 break; 2017 unsigned int UnicodeScalarValue; 2018 if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue)) 2019 // TODO: Report error. 2020 UnicodeScalarValue = 0xFFFD; 2021 encodeUTF8(UnicodeScalarValue, Storage); 2022 UnquotedValue = UnquotedValue.substr(4); 2023 break; 2024 } 2025 case 'U': { 2026 if (UnquotedValue.size() < 9) 2027 // TODO: Report error. 2028 break; 2029 unsigned int UnicodeScalarValue; 2030 if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue)) 2031 // TODO: Report error. 2032 UnicodeScalarValue = 0xFFFD; 2033 encodeUTF8(UnicodeScalarValue, Storage); 2034 UnquotedValue = UnquotedValue.substr(8); 2035 break; 2036 } 2037 } 2038 UnquotedValue = UnquotedValue.substr(1); 2039 } 2040 } 2041 Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); 2042 return StringRef(Storage.begin(), Storage.size()); 2043 } 2044 2045 Node *KeyValueNode::getKey() { 2046 if (Key) 2047 return Key; 2048 // Handle implicit null keys. 2049 { 2050 Token &t = peekNext(); 2051 if ( t.Kind == Token::TK_BlockEnd 2052 || t.Kind == Token::TK_Value 2053 || t.Kind == Token::TK_Error) { 2054 return Key = new (getAllocator()) NullNode(Doc); 2055 } 2056 if (t.Kind == Token::TK_Key) 2057 getNext(); // skip TK_Key. 2058 } 2059 2060 // Handle explicit null keys. 2061 Token &t = peekNext(); 2062 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) { 2063 return Key = new (getAllocator()) NullNode(Doc); 2064 } 2065 2066 // We've got a normal key. 2067 return Key = parseBlockNode(); 2068 } 2069 2070 Node *KeyValueNode::getValue() { 2071 if (Value) 2072 return Value; 2073 getKey()->skip(); 2074 if (failed()) 2075 return Value = new (getAllocator()) NullNode(Doc); 2076 2077 // Handle implicit null values. 2078 { 2079 Token &t = peekNext(); 2080 if ( t.Kind == Token::TK_BlockEnd 2081 || t.Kind == Token::TK_FlowMappingEnd 2082 || t.Kind == Token::TK_Key 2083 || t.Kind == Token::TK_FlowEntry 2084 || t.Kind == Token::TK_Error) { 2085 return Value = new (getAllocator()) NullNode(Doc); 2086 } 2087 2088 if (t.Kind != Token::TK_Value) { 2089 setError("Unexpected token in Key Value.", t); 2090 return Value = new (getAllocator()) NullNode(Doc); 2091 } 2092 getNext(); // skip TK_Value. 2093 } 2094 2095 // Handle explicit null values. 2096 Token &t = peekNext(); 2097 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) { 2098 return Value = new (getAllocator()) NullNode(Doc); 2099 } 2100 2101 // We got a normal value. 2102 return Value = parseBlockNode(); 2103 } 2104 2105 void MappingNode::increment() { 2106 if (failed()) { 2107 IsAtEnd = true; 2108 CurrentEntry = nullptr; 2109 return; 2110 } 2111 if (CurrentEntry) { 2112 CurrentEntry->skip(); 2113 if (Type == MT_Inline) { 2114 IsAtEnd = true; 2115 CurrentEntry = nullptr; 2116 return; 2117 } 2118 } 2119 Token T = peekNext(); 2120 if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) { 2121 // KeyValueNode eats the TK_Key. That way it can detect null keys. 2122 CurrentEntry = new (getAllocator()) KeyValueNode(Doc); 2123 } else if (Type == MT_Block) { 2124 switch (T.Kind) { 2125 case Token::TK_BlockEnd: 2126 getNext(); 2127 IsAtEnd = true; 2128 CurrentEntry = nullptr; 2129 break; 2130 default: 2131 setError("Unexpected token. Expected Key or Block End", T); 2132 case Token::TK_Error: 2133 IsAtEnd = true; 2134 CurrentEntry = nullptr; 2135 } 2136 } else { 2137 switch (T.Kind) { 2138 case Token::TK_FlowEntry: 2139 // Eat the flow entry and recurse. 2140 getNext(); 2141 return increment(); 2142 case Token::TK_FlowMappingEnd: 2143 getNext(); 2144 case Token::TK_Error: 2145 // Set this to end iterator. 2146 IsAtEnd = true; 2147 CurrentEntry = nullptr; 2148 break; 2149 default: 2150 setError( "Unexpected token. Expected Key, Flow Entry, or Flow " 2151 "Mapping End." 2152 , T); 2153 IsAtEnd = true; 2154 CurrentEntry = nullptr; 2155 } 2156 } 2157 } 2158 2159 void SequenceNode::increment() { 2160 if (failed()) { 2161 IsAtEnd = true; 2162 CurrentEntry = nullptr; 2163 return; 2164 } 2165 if (CurrentEntry) 2166 CurrentEntry->skip(); 2167 Token T = peekNext(); 2168 if (SeqType == ST_Block) { 2169 switch (T.Kind) { 2170 case Token::TK_BlockEntry: 2171 getNext(); 2172 CurrentEntry = parseBlockNode(); 2173 if (!CurrentEntry) { // An error occurred. 2174 IsAtEnd = true; 2175 CurrentEntry = nullptr; 2176 } 2177 break; 2178 case Token::TK_BlockEnd: 2179 getNext(); 2180 IsAtEnd = true; 2181 CurrentEntry = nullptr; 2182 break; 2183 default: 2184 setError( "Unexpected token. Expected Block Entry or Block End." 2185 , T); 2186 case Token::TK_Error: 2187 IsAtEnd = true; 2188 CurrentEntry = nullptr; 2189 } 2190 } else if (SeqType == ST_Indentless) { 2191 switch (T.Kind) { 2192 case Token::TK_BlockEntry: 2193 getNext(); 2194 CurrentEntry = parseBlockNode(); 2195 if (!CurrentEntry) { // An error occurred. 2196 IsAtEnd = true; 2197 CurrentEntry = nullptr; 2198 } 2199 break; 2200 default: 2201 case Token::TK_Error: 2202 IsAtEnd = true; 2203 CurrentEntry = nullptr; 2204 } 2205 } else if (SeqType == ST_Flow) { 2206 switch (T.Kind) { 2207 case Token::TK_FlowEntry: 2208 // Eat the flow entry and recurse. 2209 getNext(); 2210 WasPreviousTokenFlowEntry = true; 2211 return increment(); 2212 case Token::TK_FlowSequenceEnd: 2213 getNext(); 2214 case Token::TK_Error: 2215 // Set this to end iterator. 2216 IsAtEnd = true; 2217 CurrentEntry = nullptr; 2218 break; 2219 case Token::TK_StreamEnd: 2220 case Token::TK_DocumentEnd: 2221 case Token::TK_DocumentStart: 2222 setError("Could not find closing ]!", T); 2223 // Set this to end iterator. 2224 IsAtEnd = true; 2225 CurrentEntry = nullptr; 2226 break; 2227 default: 2228 if (!WasPreviousTokenFlowEntry) { 2229 setError("Expected , between entries!", T); 2230 IsAtEnd = true; 2231 CurrentEntry = nullptr; 2232 break; 2233 } 2234 // Otherwise it must be a flow entry. 2235 CurrentEntry = parseBlockNode(); 2236 if (!CurrentEntry) { 2237 IsAtEnd = true; 2238 } 2239 WasPreviousTokenFlowEntry = false; 2240 break; 2241 } 2242 } 2243 } 2244 2245 Document::Document(Stream &S) : stream(S), Root(nullptr) { 2246 // Tag maps starts with two default mappings. 2247 TagMap["!"] = "!"; 2248 TagMap["!!"] = "tag:yaml.org,2002:"; 2249 2250 if (parseDirectives()) 2251 expectToken(Token::TK_DocumentStart); 2252 Token &T = peekNext(); 2253 if (T.Kind == Token::TK_DocumentStart) 2254 getNext(); 2255 } 2256 2257 bool Document::skip() { 2258 if (stream.scanner->failed()) 2259 return false; 2260 if (!Root) 2261 getRoot(); 2262 Root->skip(); 2263 Token &T = peekNext(); 2264 if (T.Kind == Token::TK_StreamEnd) 2265 return false; 2266 if (T.Kind == Token::TK_DocumentEnd) { 2267 getNext(); 2268 return skip(); 2269 } 2270 return true; 2271 } 2272 2273 Token &Document::peekNext() { 2274 return stream.scanner->peekNext(); 2275 } 2276 2277 Token Document::getNext() { 2278 return stream.scanner->getNext(); 2279 } 2280 2281 void Document::setError(const Twine &Message, Token &Location) const { 2282 stream.scanner->setError(Message, Location.Range.begin()); 2283 } 2284 2285 bool Document::failed() const { 2286 return stream.scanner->failed(); 2287 } 2288 2289 Node *Document::parseBlockNode() { 2290 Token T = peekNext(); 2291 // Handle properties. 2292 Token AnchorInfo; 2293 Token TagInfo; 2294 parse_property: 2295 switch (T.Kind) { 2296 case Token::TK_Alias: 2297 getNext(); 2298 return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1)); 2299 case Token::TK_Anchor: 2300 if (AnchorInfo.Kind == Token::TK_Anchor) { 2301 setError("Already encountered an anchor for this node!", T); 2302 return nullptr; 2303 } 2304 AnchorInfo = getNext(); // Consume TK_Anchor. 2305 T = peekNext(); 2306 goto parse_property; 2307 case Token::TK_Tag: 2308 if (TagInfo.Kind == Token::TK_Tag) { 2309 setError("Already encountered a tag for this node!", T); 2310 return nullptr; 2311 } 2312 TagInfo = getNext(); // Consume TK_Tag. 2313 T = peekNext(); 2314 goto parse_property; 2315 default: 2316 break; 2317 } 2318 2319 switch (T.Kind) { 2320 case Token::TK_BlockEntry: 2321 // We got an unindented BlockEntry sequence. This is not terminated with 2322 // a BlockEnd. 2323 // Don't eat the TK_BlockEntry, SequenceNode needs it. 2324 return new (NodeAllocator) SequenceNode( stream.CurrentDoc 2325 , AnchorInfo.Range.substr(1) 2326 , TagInfo.Range 2327 , SequenceNode::ST_Indentless); 2328 case Token::TK_BlockSequenceStart: 2329 getNext(); 2330 return new (NodeAllocator) 2331 SequenceNode( stream.CurrentDoc 2332 , AnchorInfo.Range.substr(1) 2333 , TagInfo.Range 2334 , SequenceNode::ST_Block); 2335 case Token::TK_BlockMappingStart: 2336 getNext(); 2337 return new (NodeAllocator) 2338 MappingNode( stream.CurrentDoc 2339 , AnchorInfo.Range.substr(1) 2340 , TagInfo.Range 2341 , MappingNode::MT_Block); 2342 case Token::TK_FlowSequenceStart: 2343 getNext(); 2344 return new (NodeAllocator) 2345 SequenceNode( stream.CurrentDoc 2346 , AnchorInfo.Range.substr(1) 2347 , TagInfo.Range 2348 , SequenceNode::ST_Flow); 2349 case Token::TK_FlowMappingStart: 2350 getNext(); 2351 return new (NodeAllocator) 2352 MappingNode( stream.CurrentDoc 2353 , AnchorInfo.Range.substr(1) 2354 , TagInfo.Range 2355 , MappingNode::MT_Flow); 2356 case Token::TK_Scalar: 2357 getNext(); 2358 return new (NodeAllocator) 2359 ScalarNode( stream.CurrentDoc 2360 , AnchorInfo.Range.substr(1) 2361 , TagInfo.Range 2362 , T.Range); 2363 case Token::TK_BlockScalar: { 2364 getNext(); 2365 StringRef NullTerminatedStr(T.Value.c_str(), T.Value.length() + 1); 2366 StringRef StrCopy = NullTerminatedStr.copy(NodeAllocator).drop_back(); 2367 return new (NodeAllocator) 2368 BlockScalarNode(stream.CurrentDoc, AnchorInfo.Range.substr(1), 2369 TagInfo.Range, StrCopy, T.Range); 2370 } 2371 case Token::TK_Key: 2372 // Don't eat the TK_Key, KeyValueNode expects it. 2373 return new (NodeAllocator) 2374 MappingNode( stream.CurrentDoc 2375 , AnchorInfo.Range.substr(1) 2376 , TagInfo.Range 2377 , MappingNode::MT_Inline); 2378 case Token::TK_DocumentStart: 2379 case Token::TK_DocumentEnd: 2380 case Token::TK_StreamEnd: 2381 default: 2382 // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not 2383 // !!null null. 2384 return new (NodeAllocator) NullNode(stream.CurrentDoc); 2385 case Token::TK_Error: 2386 return nullptr; 2387 } 2388 llvm_unreachable("Control flow shouldn't reach here."); 2389 return nullptr; 2390 } 2391 2392 bool Document::parseDirectives() { 2393 bool isDirective = false; 2394 while (true) { 2395 Token T = peekNext(); 2396 if (T.Kind == Token::TK_TagDirective) { 2397 parseTAGDirective(); 2398 isDirective = true; 2399 } else if (T.Kind == Token::TK_VersionDirective) { 2400 parseYAMLDirective(); 2401 isDirective = true; 2402 } else 2403 break; 2404 } 2405 return isDirective; 2406 } 2407 2408 void Document::parseYAMLDirective() { 2409 getNext(); // Eat %YAML <version> 2410 } 2411 2412 void Document::parseTAGDirective() { 2413 Token Tag = getNext(); // %TAG <handle> <prefix> 2414 StringRef T = Tag.Range; 2415 // Strip %TAG 2416 T = T.substr(T.find_first_of(" \t")).ltrim(" \t"); 2417 std::size_t HandleEnd = T.find_first_of(" \t"); 2418 StringRef TagHandle = T.substr(0, HandleEnd); 2419 StringRef TagPrefix = T.substr(HandleEnd).ltrim(" \t"); 2420 TagMap[TagHandle] = TagPrefix; 2421 } 2422 2423 bool Document::expectToken(int TK) { 2424 Token T = getNext(); 2425 if (T.Kind != TK) { 2426 setError("Unexpected token", T); 2427 return false; 2428 } 2429 return true; 2430 } 2431