//===--- YAMLParser.cpp - Simple YAML parser ------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//  This file implements a YAML parser.
//
//===----------------------------------------------------------------------===//

#include "llvm/Support/YAMLParser.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/AllocatorList.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;
using namespace yaml;

/// UnicodeEncodingForm - The Unicode encoding forms that the byte-order-mark
/// sniffing in this file can report for an input buffer.
enum UnicodeEncodingForm {
  UEF_UTF32_LE, ///< UTF-32 Little Endian
  UEF_UTF32_BE, ///< UTF-32 Big Endian
  UEF_UTF16_LE, ///< UTF-16 Little Endian
  UEF_UTF16_BE, ///< UTF-16 Big Endian
  UEF_UTF8,     ///< UTF-8 or ascii.
  UEF_Unknown   ///< Not a valid Unicode encoding.
};

/// EncodingInfo - Holds the encoding type and length of the byte order mark if
///                it exists. Length is in {0, 2, 3, 4}.
typedef std::pair<UnicodeEncodingForm, unsigned> EncodingInfo;

/// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode
///                      encoding form of \a Input.
///
/// @param Input A string of length 0 or more.
/// @returns An EncodingInfo indicating the Unicode encoding form of the input
///          and how long the byte order mark is if one exists.
48 static EncodingInfo getUnicodeEncoding(StringRef Input) { 49 if (Input.size() == 0) 50 return std::make_pair(UEF_Unknown, 0); 51 52 switch (uint8_t(Input[0])) { 53 case 0x00: 54 if (Input.size() >= 4) { 55 if ( Input[1] == 0 56 && uint8_t(Input[2]) == 0xFE 57 && uint8_t(Input[3]) == 0xFF) 58 return std::make_pair(UEF_UTF32_BE, 4); 59 if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0) 60 return std::make_pair(UEF_UTF32_BE, 0); 61 } 62 63 if (Input.size() >= 2 && Input[1] != 0) 64 return std::make_pair(UEF_UTF16_BE, 0); 65 return std::make_pair(UEF_Unknown, 0); 66 case 0xFF: 67 if ( Input.size() >= 4 68 && uint8_t(Input[1]) == 0xFE 69 && Input[2] == 0 70 && Input[3] == 0) 71 return std::make_pair(UEF_UTF32_LE, 4); 72 73 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE) 74 return std::make_pair(UEF_UTF16_LE, 2); 75 return std::make_pair(UEF_Unknown, 0); 76 case 0xFE: 77 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF) 78 return std::make_pair(UEF_UTF16_BE, 2); 79 return std::make_pair(UEF_Unknown, 0); 80 case 0xEF: 81 if ( Input.size() >= 3 82 && uint8_t(Input[1]) == 0xBB 83 && uint8_t(Input[2]) == 0xBF) 84 return std::make_pair(UEF_UTF8, 3); 85 return std::make_pair(UEF_Unknown, 0); 86 } 87 88 // It could still be utf-32 or utf-16. 89 if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0) 90 return std::make_pair(UEF_UTF32_LE, 0); 91 92 if (Input.size() >= 2 && Input[1] == 0) 93 return std::make_pair(UEF_UTF16_LE, 0); 94 95 return std::make_pair(UEF_UTF8, 0); 96 } 97 98 namespace llvm { 99 namespace yaml { 100 /// Pin the vtables to this file. 101 void Node::anchor() {} 102 void NullNode::anchor() {} 103 void ScalarNode::anchor() {} 104 void BlockScalarNode::anchor() {} 105 void KeyValueNode::anchor() {} 106 void MappingNode::anchor() {} 107 void SequenceNode::anchor() {} 108 void AliasNode::anchor() {} 109 110 /// Token - A single YAML token. 111 struct Token { 112 enum TokenKind { 113 TK_Error, // Uninitialized token. 
114 TK_StreamStart, 115 TK_StreamEnd, 116 TK_VersionDirective, 117 TK_TagDirective, 118 TK_DocumentStart, 119 TK_DocumentEnd, 120 TK_BlockEntry, 121 TK_BlockEnd, 122 TK_BlockSequenceStart, 123 TK_BlockMappingStart, 124 TK_FlowEntry, 125 TK_FlowSequenceStart, 126 TK_FlowSequenceEnd, 127 TK_FlowMappingStart, 128 TK_FlowMappingEnd, 129 TK_Key, 130 TK_Value, 131 TK_Scalar, 132 TK_BlockScalar, 133 TK_Alias, 134 TK_Anchor, 135 TK_Tag 136 } Kind; 137 138 /// A string of length 0 or more whose begin() points to the logical location 139 /// of the token in the input. 140 StringRef Range; 141 142 /// The value of a block scalar node. 143 std::string Value; 144 145 Token() : Kind(TK_Error) {} 146 }; 147 } 148 } 149 150 typedef llvm::BumpPtrList<Token> TokenQueueT; 151 152 namespace { 153 /// @brief This struct is used to track simple keys. 154 /// 155 /// Simple keys are handled by creating an entry in SimpleKeys for each Token 156 /// which could legally be the start of a simple key. When peekNext is called, 157 /// if the Token To be returned is referenced by a SimpleKey, we continue 158 /// tokenizing until that potential simple key has either been found to not be 159 /// a simple key (we moved on to the next line or went further than 1024 chars). 160 /// Or when we run into a Value, and then insert a Key token (and possibly 161 /// others) before the SimpleKey's Tok. 162 struct SimpleKey { 163 TokenQueueT::iterator Tok; 164 unsigned Column; 165 unsigned Line; 166 unsigned FlowLevel; 167 bool IsRequired; 168 169 bool operator ==(const SimpleKey &Other) { 170 return Tok == Other.Tok; 171 } 172 }; 173 } 174 175 /// @brief The Unicode scalar value of a UTF-8 minimal well-formed code unit 176 /// subsequence and the subsequence's length in code units (uint8_t). 177 /// A length of 0 represents an error. 
178 typedef std::pair<uint32_t, unsigned> UTF8Decoded; 179 180 static UTF8Decoded decodeUTF8(StringRef Range) { 181 StringRef::iterator Position= Range.begin(); 182 StringRef::iterator End = Range.end(); 183 // 1 byte: [0x00, 0x7f] 184 // Bit pattern: 0xxxxxxx 185 if ((*Position & 0x80) == 0) { 186 return std::make_pair(*Position, 1); 187 } 188 // 2 bytes: [0x80, 0x7ff] 189 // Bit pattern: 110xxxxx 10xxxxxx 190 if (Position + 1 != End && 191 ((*Position & 0xE0) == 0xC0) && 192 ((*(Position + 1) & 0xC0) == 0x80)) { 193 uint32_t codepoint = ((*Position & 0x1F) << 6) | 194 (*(Position + 1) & 0x3F); 195 if (codepoint >= 0x80) 196 return std::make_pair(codepoint, 2); 197 } 198 // 3 bytes: [0x8000, 0xffff] 199 // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx 200 if (Position + 2 != End && 201 ((*Position & 0xF0) == 0xE0) && 202 ((*(Position + 1) & 0xC0) == 0x80) && 203 ((*(Position + 2) & 0xC0) == 0x80)) { 204 uint32_t codepoint = ((*Position & 0x0F) << 12) | 205 ((*(Position + 1) & 0x3F) << 6) | 206 (*(Position + 2) & 0x3F); 207 // Codepoints between 0xD800 and 0xDFFF are invalid, as 208 // they are high / low surrogate halves used by UTF-16. 209 if (codepoint >= 0x800 && 210 (codepoint < 0xD800 || codepoint > 0xDFFF)) 211 return std::make_pair(codepoint, 3); 212 } 213 // 4 bytes: [0x10000, 0x10FFFF] 214 // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 215 if (Position + 3 != End && 216 ((*Position & 0xF8) == 0xF0) && 217 ((*(Position + 1) & 0xC0) == 0x80) && 218 ((*(Position + 2) & 0xC0) == 0x80) && 219 ((*(Position + 3) & 0xC0) == 0x80)) { 220 uint32_t codepoint = ((*Position & 0x07) << 18) | 221 ((*(Position + 1) & 0x3F) << 12) | 222 ((*(Position + 2) & 0x3F) << 6) | 223 (*(Position + 3) & 0x3F); 224 if (codepoint >= 0x10000 && codepoint <= 0x10FFFF) 225 return std::make_pair(codepoint, 4); 226 } 227 return std::make_pair(0, 0); 228 } 229 230 namespace llvm { 231 namespace yaml { 232 /// @brief Scans YAML tokens from a MemoryBuffer. 
233 class Scanner { 234 public: 235 Scanner(StringRef Input, SourceMgr &SM, bool ShowColors = true); 236 Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors = true); 237 238 /// @brief Parse the next token and return it without popping it. 239 Token &peekNext(); 240 241 /// @brief Parse the next token and pop it from the queue. 242 Token getNext(); 243 244 void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message, 245 ArrayRef<SMRange> Ranges = None) { 246 SM.PrintMessage(Loc, Kind, Message, Ranges, /* FixIts= */ None, ShowColors); 247 } 248 249 void setError(const Twine &Message, StringRef::iterator Position) { 250 if (Current >= End) 251 Current = End - 1; 252 253 // Don't print out more errors after the first one we encounter. The rest 254 // are just the result of the first, and have no meaning. 255 if (!Failed) 256 printError(SMLoc::getFromPointer(Current), SourceMgr::DK_Error, Message); 257 Failed = true; 258 } 259 260 void setError(const Twine &Message) { 261 setError(Message, Current); 262 } 263 264 /// @brief Returns true if an error occurred while parsing. 265 bool failed() { 266 return Failed; 267 } 268 269 private: 270 void init(MemoryBufferRef Buffer); 271 272 StringRef currentInput() { 273 return StringRef(Current, End - Current); 274 } 275 276 /// @brief Decode a UTF-8 minimal well-formed code unit subsequence starting 277 /// at \a Position. 278 /// 279 /// If the UTF-8 code units starting at Position do not form a well-formed 280 /// code unit subsequence, then the Unicode scalar value is 0, and the length 281 /// is 0. 282 UTF8Decoded decodeUTF8(StringRef::iterator Position) { 283 return ::decodeUTF8(StringRef(Position, End - Position)); 284 } 285 286 // The following functions are based on the gramar rules in the YAML spec. The 287 // style of the function names it meant to closely match how they are written 288 // in the spec. The number within the [] is the number of the grammar rule in 289 // the spec. 
290 // 291 // See 4.2 [Production Naming Conventions] for the meaning of the prefixes. 292 // 293 // c- 294 // A production starting and ending with a special character. 295 // b- 296 // A production matching a single line break. 297 // nb- 298 // A production starting and ending with a non-break character. 299 // s- 300 // A production starting and ending with a white space character. 301 // ns- 302 // A production starting and ending with a non-space character. 303 // l- 304 // A production matching complete line(s). 305 306 /// @brief Skip a single nb-char[27] starting at Position. 307 /// 308 /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE] 309 /// | [0xFF00-0xFFFD] | [0x10000-0x10FFFF] 310 /// 311 /// @returns The code unit after the nb-char, or Position if it's not an 312 /// nb-char. 313 StringRef::iterator skip_nb_char(StringRef::iterator Position); 314 315 /// @brief Skip a single b-break[28] starting at Position. 316 /// 317 /// A b-break is 0xD 0xA | 0xD | 0xA 318 /// 319 /// @returns The code unit after the b-break, or Position if it's not a 320 /// b-break. 321 StringRef::iterator skip_b_break(StringRef::iterator Position); 322 323 /// Skip a single s-space[31] starting at Position. 324 /// 325 /// An s-space is 0x20 326 /// 327 /// @returns The code unit after the s-space, or Position if it's not a 328 /// s-space. 329 StringRef::iterator skip_s_space(StringRef::iterator Position); 330 331 /// @brief Skip a single s-white[33] starting at Position. 332 /// 333 /// A s-white is 0x20 | 0x9 334 /// 335 /// @returns The code unit after the s-white, or Position if it's not a 336 /// s-white. 337 StringRef::iterator skip_s_white(StringRef::iterator Position); 338 339 /// @brief Skip a single ns-char[34] starting at Position. 340 /// 341 /// A ns-char is nb-char - s-white 342 /// 343 /// @returns The code unit after the ns-char, or Position if it's not a 344 /// ns-char. 
345 StringRef::iterator skip_ns_char(StringRef::iterator Position); 346 347 typedef StringRef::iterator (Scanner::*SkipWhileFunc)(StringRef::iterator); 348 /// @brief Skip minimal well-formed code unit subsequences until Func 349 /// returns its input. 350 /// 351 /// @returns The code unit after the last minimal well-formed code unit 352 /// subsequence that Func accepted. 353 StringRef::iterator skip_while( SkipWhileFunc Func 354 , StringRef::iterator Position); 355 356 /// Skip minimal well-formed code unit subsequences until Func returns its 357 /// input. 358 void advanceWhile(SkipWhileFunc Func); 359 360 /// @brief Scan ns-uri-char[39]s starting at Cur. 361 /// 362 /// This updates Cur and Column while scanning. 363 /// 364 /// @returns A StringRef starting at Cur which covers the longest contiguous 365 /// sequence of ns-uri-char. 366 StringRef scan_ns_uri_char(); 367 368 /// @brief Consume a minimal well-formed code unit subsequence starting at 369 /// \a Cur. Return false if it is not the same Unicode scalar value as 370 /// \a Expected. This updates \a Column. 371 bool consume(uint32_t Expected); 372 373 /// @brief Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column. 374 void skip(uint32_t Distance); 375 376 /// @brief Return true if the minimal well-formed code unit subsequence at 377 /// Pos is whitespace or a new line 378 bool isBlankOrBreak(StringRef::iterator Position); 379 380 /// Consume a single b-break[28] if it's present at the current position. 381 /// 382 /// Return false if the code unit at the current position isn't a line break. 383 bool consumeLineBreakIfPresent(); 384 385 /// @brief If IsSimpleKeyAllowed, create and push_back a new SimpleKey. 386 void saveSimpleKeyCandidate( TokenQueueT::iterator Tok 387 , unsigned AtColumn 388 , bool IsRequired); 389 390 /// @brief Remove simple keys that can no longer be valid simple keys. 
391 /// 392 /// Invalid simple keys are not on the current line or are further than 1024 393 /// columns back. 394 void removeStaleSimpleKeyCandidates(); 395 396 /// @brief Remove all simple keys on FlowLevel \a Level. 397 void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level); 398 399 /// @brief Unroll indentation in \a Indents back to \a Col. Creates BlockEnd 400 /// tokens if needed. 401 bool unrollIndent(int ToColumn); 402 403 /// @brief Increase indent to \a Col. Creates \a Kind token at \a InsertPoint 404 /// if needed. 405 bool rollIndent( int ToColumn 406 , Token::TokenKind Kind 407 , TokenQueueT::iterator InsertPoint); 408 409 /// @brief Skip a single-line comment when the comment starts at the current 410 /// position of the scanner. 411 void skipComment(); 412 413 /// @brief Skip whitespace and comments until the start of the next token. 414 void scanToNextToken(); 415 416 /// @brief Must be the first token generated. 417 bool scanStreamStart(); 418 419 /// @brief Generate tokens needed to close out the stream. 420 bool scanStreamEnd(); 421 422 /// @brief Scan a %BLAH directive. 423 bool scanDirective(); 424 425 /// @brief Scan a ... or ---. 426 bool scanDocumentIndicator(bool IsStart); 427 428 /// @brief Scan a [ or { and generate the proper flow collection start token. 429 bool scanFlowCollectionStart(bool IsSequence); 430 431 /// @brief Scan a ] or } and generate the proper flow collection end token. 432 bool scanFlowCollectionEnd(bool IsSequence); 433 434 /// @brief Scan the , that separates entries in a flow collection. 435 bool scanFlowEntry(); 436 437 /// @brief Scan the - that starts block sequence entries. 438 bool scanBlockEntry(); 439 440 /// @brief Scan an explicit ? indicating a key. 441 bool scanKey(); 442 443 /// @brief Scan an explicit : indicating a value. 444 bool scanValue(); 445 446 /// @brief Scan a quoted scalar. 447 bool scanFlowScalar(bool IsDoubleQuoted); 448 449 /// @brief Scan an unquoted scalar. 
450 bool scanPlainScalar(); 451 452 /// @brief Scan an Alias or Anchor starting with * or &. 453 bool scanAliasOrAnchor(bool IsAlias); 454 455 /// @brief Scan a block scalar starting with | or >. 456 bool scanBlockScalar(bool IsLiteral); 457 458 /// Scan a chomping indicator in a block scalar header. 459 char scanBlockChompingIndicator(); 460 461 /// Scan an indentation indicator in a block scalar header. 462 unsigned scanBlockIndentationIndicator(); 463 464 /// Scan a block scalar header. 465 /// 466 /// Return false if an error occurred. 467 bool scanBlockScalarHeader(char &ChompingIndicator, unsigned &IndentIndicator, 468 bool &IsDone); 469 470 /// Look for the indentation level of a block scalar. 471 /// 472 /// Return false if an error occurred. 473 bool findBlockScalarIndent(unsigned &BlockIndent, unsigned BlockExitIndent, 474 unsigned &LineBreaks, bool &IsDone); 475 476 /// Scan the indentation of a text line in a block scalar. 477 /// 478 /// Return false if an error occurred. 479 bool scanBlockScalarIndent(unsigned BlockIndent, unsigned BlockExitIndent, 480 bool &IsDone); 481 482 /// @brief Scan a tag of the form !stuff. 483 bool scanTag(); 484 485 /// @brief Dispatch to the next scanning function based on \a *Cur. 486 bool fetchMoreTokens(); 487 488 /// @brief The SourceMgr used for diagnostics and buffer management. 489 SourceMgr &SM; 490 491 /// @brief The original input. 492 MemoryBufferRef InputBuffer; 493 494 /// @brief The current position of the scanner. 495 StringRef::iterator Current; 496 497 /// @brief The end of the input (one past the last character). 498 StringRef::iterator End; 499 500 /// @brief Current YAML indentation level in spaces. 501 int Indent; 502 503 /// @brief Current column number in Unicode code points. 504 unsigned Column; 505 506 /// @brief Current line number. 507 unsigned Line; 508 509 /// @brief How deep we are in flow style containers. 0 Means at block level. 
510 unsigned FlowLevel; 511 512 /// @brief Are we at the start of the stream? 513 bool IsStartOfStream; 514 515 /// @brief Can the next token be the start of a simple key? 516 bool IsSimpleKeyAllowed; 517 518 /// @brief True if an error has occurred. 519 bool Failed; 520 521 /// @brief Should colors be used when printing out the diagnostic messages? 522 bool ShowColors; 523 524 /// @brief Queue of tokens. This is required to queue up tokens while looking 525 /// for the end of a simple key. And for cases where a single character 526 /// can produce multiple tokens (e.g. BlockEnd). 527 TokenQueueT TokenQueue; 528 529 /// @brief Indentation levels. 530 SmallVector<int, 4> Indents; 531 532 /// @brief Potential simple keys. 533 SmallVector<SimpleKey, 4> SimpleKeys; 534 }; 535 536 } // end namespace yaml 537 } // end namespace llvm 538 539 /// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result. 540 static void encodeUTF8( uint32_t UnicodeScalarValue 541 , SmallVectorImpl<char> &Result) { 542 if (UnicodeScalarValue <= 0x7F) { 543 Result.push_back(UnicodeScalarValue & 0x7F); 544 } else if (UnicodeScalarValue <= 0x7FF) { 545 uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6); 546 uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F); 547 Result.push_back(FirstByte); 548 Result.push_back(SecondByte); 549 } else if (UnicodeScalarValue <= 0xFFFF) { 550 uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12); 551 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 552 uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F); 553 Result.push_back(FirstByte); 554 Result.push_back(SecondByte); 555 Result.push_back(ThirdByte); 556 } else if (UnicodeScalarValue <= 0x10FFFF) { 557 uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18); 558 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12); 559 uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 560 uint8_t FourthByte = 0x80 | 
(UnicodeScalarValue & 0x3F); 561 Result.push_back(FirstByte); 562 Result.push_back(SecondByte); 563 Result.push_back(ThirdByte); 564 Result.push_back(FourthByte); 565 } 566 } 567 568 bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) { 569 SourceMgr SM; 570 Scanner scanner(Input, SM); 571 while (true) { 572 Token T = scanner.getNext(); 573 switch (T.Kind) { 574 case Token::TK_StreamStart: 575 OS << "Stream-Start: "; 576 break; 577 case Token::TK_StreamEnd: 578 OS << "Stream-End: "; 579 break; 580 case Token::TK_VersionDirective: 581 OS << "Version-Directive: "; 582 break; 583 case Token::TK_TagDirective: 584 OS << "Tag-Directive: "; 585 break; 586 case Token::TK_DocumentStart: 587 OS << "Document-Start: "; 588 break; 589 case Token::TK_DocumentEnd: 590 OS << "Document-End: "; 591 break; 592 case Token::TK_BlockEntry: 593 OS << "Block-Entry: "; 594 break; 595 case Token::TK_BlockEnd: 596 OS << "Block-End: "; 597 break; 598 case Token::TK_BlockSequenceStart: 599 OS << "Block-Sequence-Start: "; 600 break; 601 case Token::TK_BlockMappingStart: 602 OS << "Block-Mapping-Start: "; 603 break; 604 case Token::TK_FlowEntry: 605 OS << "Flow-Entry: "; 606 break; 607 case Token::TK_FlowSequenceStart: 608 OS << "Flow-Sequence-Start: "; 609 break; 610 case Token::TK_FlowSequenceEnd: 611 OS << "Flow-Sequence-End: "; 612 break; 613 case Token::TK_FlowMappingStart: 614 OS << "Flow-Mapping-Start: "; 615 break; 616 case Token::TK_FlowMappingEnd: 617 OS << "Flow-Mapping-End: "; 618 break; 619 case Token::TK_Key: 620 OS << "Key: "; 621 break; 622 case Token::TK_Value: 623 OS << "Value: "; 624 break; 625 case Token::TK_Scalar: 626 OS << "Scalar: "; 627 break; 628 case Token::TK_BlockScalar: 629 OS << "Block Scalar: "; 630 break; 631 case Token::TK_Alias: 632 OS << "Alias: "; 633 break; 634 case Token::TK_Anchor: 635 OS << "Anchor: "; 636 break; 637 case Token::TK_Tag: 638 OS << "Tag: "; 639 break; 640 case Token::TK_Error: 641 break; 642 } 643 OS << T.Range << "\n"; 644 if (T.Kind 
== Token::TK_StreamEnd) 645 break; 646 else if (T.Kind == Token::TK_Error) 647 return false; 648 } 649 return true; 650 } 651 652 bool yaml::scanTokens(StringRef Input) { 653 llvm::SourceMgr SM; 654 llvm::yaml::Scanner scanner(Input, SM); 655 for (;;) { 656 llvm::yaml::Token T = scanner.getNext(); 657 if (T.Kind == Token::TK_StreamEnd) 658 break; 659 else if (T.Kind == Token::TK_Error) 660 return false; 661 } 662 return true; 663 } 664 665 std::string yaml::escape(StringRef Input) { 666 std::string EscapedInput; 667 for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) { 668 if (*i == '\\') 669 EscapedInput += "\\\\"; 670 else if (*i == '"') 671 EscapedInput += "\\\""; 672 else if (*i == 0) 673 EscapedInput += "\\0"; 674 else if (*i == 0x07) 675 EscapedInput += "\\a"; 676 else if (*i == 0x08) 677 EscapedInput += "\\b"; 678 else if (*i == 0x09) 679 EscapedInput += "\\t"; 680 else if (*i == 0x0A) 681 EscapedInput += "\\n"; 682 else if (*i == 0x0B) 683 EscapedInput += "\\v"; 684 else if (*i == 0x0C) 685 EscapedInput += "\\f"; 686 else if (*i == 0x0D) 687 EscapedInput += "\\r"; 688 else if (*i == 0x1B) 689 EscapedInput += "\\e"; 690 else if ((unsigned char)*i < 0x20) { // Control characters not handled above. 691 std::string HexStr = utohexstr(*i); 692 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 693 } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence. 694 UTF8Decoded UnicodeScalarValue 695 = decodeUTF8(StringRef(i, Input.end() - i)); 696 if (UnicodeScalarValue.second == 0) { 697 // Found invalid char. 698 SmallString<4> Val; 699 encodeUTF8(0xFFFD, Val); 700 EscapedInput.insert(EscapedInput.end(), Val.begin(), Val.end()); 701 // FIXME: Error reporting. 
702 return EscapedInput; 703 } 704 if (UnicodeScalarValue.first == 0x85) 705 EscapedInput += "\\N"; 706 else if (UnicodeScalarValue.first == 0xA0) 707 EscapedInput += "\\_"; 708 else if (UnicodeScalarValue.first == 0x2028) 709 EscapedInput += "\\L"; 710 else if (UnicodeScalarValue.first == 0x2029) 711 EscapedInput += "\\P"; 712 else { 713 std::string HexStr = utohexstr(UnicodeScalarValue.first); 714 if (HexStr.size() <= 2) 715 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 716 else if (HexStr.size() <= 4) 717 EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr; 718 else if (HexStr.size() <= 8) 719 EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr; 720 } 721 i += UnicodeScalarValue.second - 1; 722 } else 723 EscapedInput.push_back(*i); 724 } 725 return EscapedInput; 726 } 727 728 Scanner::Scanner(StringRef Input, SourceMgr &sm, bool ShowColors) 729 : SM(sm), ShowColors(ShowColors) { 730 init(MemoryBufferRef(Input, "YAML")); 731 } 732 733 Scanner::Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors) 734 : SM(SM_), ShowColors(ShowColors) { 735 init(Buffer); 736 } 737 738 void Scanner::init(MemoryBufferRef Buffer) { 739 InputBuffer = Buffer; 740 Current = InputBuffer.getBufferStart(); 741 End = InputBuffer.getBufferEnd(); 742 Indent = -1; 743 Column = 0; 744 Line = 0; 745 FlowLevel = 0; 746 IsStartOfStream = true; 747 IsSimpleKeyAllowed = true; 748 Failed = false; 749 std::unique_ptr<MemoryBuffer> InputBufferOwner = 750 MemoryBuffer::getMemBuffer(Buffer); 751 SM.AddNewSourceBuffer(std::move(InputBufferOwner), SMLoc()); 752 } 753 754 Token &Scanner::peekNext() { 755 // If the current token is a possible simple key, keep parsing until we 756 // can confirm. 
757 bool NeedMore = false; 758 while (true) { 759 if (TokenQueue.empty() || NeedMore) { 760 if (!fetchMoreTokens()) { 761 TokenQueue.clear(); 762 TokenQueue.push_back(Token()); 763 return TokenQueue.front(); 764 } 765 } 766 assert(!TokenQueue.empty() && 767 "fetchMoreTokens lied about getting tokens!"); 768 769 removeStaleSimpleKeyCandidates(); 770 SimpleKey SK; 771 SK.Tok = TokenQueue.begin(); 772 if (!is_contained(SimpleKeys, SK)) 773 break; 774 else 775 NeedMore = true; 776 } 777 return TokenQueue.front(); 778 } 779 780 Token Scanner::getNext() { 781 Token Ret = peekNext(); 782 // TokenQueue can be empty if there was an error getting the next token. 783 if (!TokenQueue.empty()) 784 TokenQueue.pop_front(); 785 786 // There cannot be any referenced Token's if the TokenQueue is empty. So do a 787 // quick deallocation of them all. 788 if (TokenQueue.empty()) 789 TokenQueue.resetAlloc(); 790 791 return Ret; 792 } 793 794 StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) { 795 if (Position == End) 796 return Position; 797 // Check 7 bit c-printable - b-char. 798 if ( *Position == 0x09 799 || (*Position >= 0x20 && *Position <= 0x7E)) 800 return Position + 1; 801 802 // Check for valid UTF-8. 
803 if (uint8_t(*Position) & 0x80) { 804 UTF8Decoded u8d = decodeUTF8(Position); 805 if ( u8d.second != 0 806 && u8d.first != 0xFEFF 807 && ( u8d.first == 0x85 808 || ( u8d.first >= 0xA0 809 && u8d.first <= 0xD7FF) 810 || ( u8d.first >= 0xE000 811 && u8d.first <= 0xFFFD) 812 || ( u8d.first >= 0x10000 813 && u8d.first <= 0x10FFFF))) 814 return Position + u8d.second; 815 } 816 return Position; 817 } 818 819 StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) { 820 if (Position == End) 821 return Position; 822 if (*Position == 0x0D) { 823 if (Position + 1 != End && *(Position + 1) == 0x0A) 824 return Position + 2; 825 return Position + 1; 826 } 827 828 if (*Position == 0x0A) 829 return Position + 1; 830 return Position; 831 } 832 833 StringRef::iterator Scanner::skip_s_space(StringRef::iterator Position) { 834 if (Position == End) 835 return Position; 836 if (*Position == ' ') 837 return Position + 1; 838 return Position; 839 } 840 841 StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) { 842 if (Position == End) 843 return Position; 844 if (*Position == ' ' || *Position == '\t') 845 return Position + 1; 846 return Position; 847 } 848 849 StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) { 850 if (Position == End) 851 return Position; 852 if (*Position == ' ' || *Position == '\t') 853 return Position; 854 return skip_nb_char(Position); 855 } 856 857 StringRef::iterator Scanner::skip_while( SkipWhileFunc Func 858 , StringRef::iterator Position) { 859 while (true) { 860 StringRef::iterator i = (this->*Func)(Position); 861 if (i == Position) 862 break; 863 Position = i; 864 } 865 return Position; 866 } 867 868 void Scanner::advanceWhile(SkipWhileFunc Func) { 869 auto Final = skip_while(Func, Current); 870 Column += Final - Current; 871 Current = Final; 872 } 873 874 static bool is_ns_hex_digit(const char C) { 875 return (C >= '0' && C <= '9') 876 || (C >= 'a' && C <= 'z') 877 || (C >= 'A' && C <= 'Z'); 878 } 
879 880 static bool is_ns_word_char(const char C) { 881 return C == '-' 882 || (C >= 'a' && C <= 'z') 883 || (C >= 'A' && C <= 'Z'); 884 } 885 886 StringRef Scanner::scan_ns_uri_char() { 887 StringRef::iterator Start = Current; 888 while (true) { 889 if (Current == End) 890 break; 891 if (( *Current == '%' 892 && Current + 2 < End 893 && is_ns_hex_digit(*(Current + 1)) 894 && is_ns_hex_digit(*(Current + 2))) 895 || is_ns_word_char(*Current) 896 || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]") 897 != StringRef::npos) { 898 ++Current; 899 ++Column; 900 } else 901 break; 902 } 903 return StringRef(Start, Current - Start); 904 } 905 906 bool Scanner::consume(uint32_t Expected) { 907 if (Expected >= 0x80) 908 report_fatal_error("Not dealing with this yet"); 909 if (Current == End) 910 return false; 911 if (uint8_t(*Current) >= 0x80) 912 report_fatal_error("Not dealing with this yet"); 913 if (uint8_t(*Current) == Expected) { 914 ++Current; 915 ++Column; 916 return true; 917 } 918 return false; 919 } 920 921 void Scanner::skip(uint32_t Distance) { 922 Current += Distance; 923 Column += Distance; 924 assert(Current <= End && "Skipped past the end"); 925 } 926 927 bool Scanner::isBlankOrBreak(StringRef::iterator Position) { 928 if (Position == End) 929 return false; 930 return *Position == ' ' || *Position == '\t' || *Position == '\r' || 931 *Position == '\n'; 932 } 933 934 bool Scanner::consumeLineBreakIfPresent() { 935 auto Next = skip_b_break(Current); 936 if (Next == Current) 937 return false; 938 Column = 0; 939 ++Line; 940 Current = Next; 941 return true; 942 } 943 944 void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok 945 , unsigned AtColumn 946 , bool IsRequired) { 947 if (IsSimpleKeyAllowed) { 948 SimpleKey SK; 949 SK.Tok = Tok; 950 SK.Line = Line; 951 SK.Column = AtColumn; 952 SK.IsRequired = IsRequired; 953 SK.FlowLevel = FlowLevel; 954 SimpleKeys.push_back(SK); 955 } 956 } 957 958 void Scanner::removeStaleSimpleKeyCandidates() { 959 
for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin(); 960 i != SimpleKeys.end();) { 961 if (i->Line != Line || i->Column + 1024 < Column) { 962 if (i->IsRequired) 963 setError( "Could not find expected : for simple key" 964 , i->Tok->Range.begin()); 965 i = SimpleKeys.erase(i); 966 } else 967 ++i; 968 } 969 } 970 971 void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) { 972 if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level) 973 SimpleKeys.pop_back(); 974 } 975 976 bool Scanner::unrollIndent(int ToColumn) { 977 Token T; 978 // Indentation is ignored in flow. 979 if (FlowLevel != 0) 980 return true; 981 982 while (Indent > ToColumn) { 983 T.Kind = Token::TK_BlockEnd; 984 T.Range = StringRef(Current, 1); 985 TokenQueue.push_back(T); 986 Indent = Indents.pop_back_val(); 987 } 988 989 return true; 990 } 991 992 bool Scanner::rollIndent( int ToColumn 993 , Token::TokenKind Kind 994 , TokenQueueT::iterator InsertPoint) { 995 if (FlowLevel) 996 return true; 997 if (Indent < ToColumn) { 998 Indents.push_back(Indent); 999 Indent = ToColumn; 1000 1001 Token T; 1002 T.Kind = Kind; 1003 T.Range = StringRef(Current, 0); 1004 TokenQueue.insert(InsertPoint, T); 1005 } 1006 return true; 1007 } 1008 1009 void Scanner::skipComment() { 1010 if (*Current != '#') 1011 return; 1012 while (true) { 1013 // This may skip more than one byte, thus Column is only incremented 1014 // for code points. 1015 StringRef::iterator I = skip_nb_char(Current); 1016 if (I == Current) 1017 break; 1018 Current = I; 1019 ++Column; 1020 } 1021 } 1022 1023 void Scanner::scanToNextToken() { 1024 while (true) { 1025 while (*Current == ' ' || *Current == '\t') { 1026 skip(1); 1027 } 1028 1029 skipComment(); 1030 1031 // Skip EOL. 1032 StringRef::iterator i = skip_b_break(Current); 1033 if (i == Current) 1034 break; 1035 Current = i; 1036 ++Line; 1037 Column = 0; 1038 // New lines may start a simple key. 
1039 if (!FlowLevel) 1040 IsSimpleKeyAllowed = true; 1041 } 1042 } 1043 1044 bool Scanner::scanStreamStart() { 1045 IsStartOfStream = false; 1046 1047 EncodingInfo EI = getUnicodeEncoding(currentInput()); 1048 1049 Token T; 1050 T.Kind = Token::TK_StreamStart; 1051 T.Range = StringRef(Current, EI.second); 1052 TokenQueue.push_back(T); 1053 Current += EI.second; 1054 return true; 1055 } 1056 1057 bool Scanner::scanStreamEnd() { 1058 // Force an ending new line if one isn't present. 1059 if (Column != 0) { 1060 Column = 0; 1061 ++Line; 1062 } 1063 1064 unrollIndent(-1); 1065 SimpleKeys.clear(); 1066 IsSimpleKeyAllowed = false; 1067 1068 Token T; 1069 T.Kind = Token::TK_StreamEnd; 1070 T.Range = StringRef(Current, 0); 1071 TokenQueue.push_back(T); 1072 return true; 1073 } 1074 1075 bool Scanner::scanDirective() { 1076 // Reset the indentation level. 1077 unrollIndent(-1); 1078 SimpleKeys.clear(); 1079 IsSimpleKeyAllowed = false; 1080 1081 StringRef::iterator Start = Current; 1082 consume('%'); 1083 StringRef::iterator NameStart = Current; 1084 Current = skip_while(&Scanner::skip_ns_char, Current); 1085 StringRef Name(NameStart, Current - NameStart); 1086 Current = skip_while(&Scanner::skip_s_white, Current); 1087 1088 Token T; 1089 if (Name == "YAML") { 1090 Current = skip_while(&Scanner::skip_ns_char, Current); 1091 T.Kind = Token::TK_VersionDirective; 1092 T.Range = StringRef(Start, Current - Start); 1093 TokenQueue.push_back(T); 1094 return true; 1095 } else if(Name == "TAG") { 1096 Current = skip_while(&Scanner::skip_ns_char, Current); 1097 Current = skip_while(&Scanner::skip_s_white, Current); 1098 Current = skip_while(&Scanner::skip_ns_char, Current); 1099 T.Kind = Token::TK_TagDirective; 1100 T.Range = StringRef(Start, Current - Start); 1101 TokenQueue.push_back(T); 1102 return true; 1103 } 1104 return false; 1105 } 1106 1107 bool Scanner::scanDocumentIndicator(bool IsStart) { 1108 unrollIndent(-1); 1109 SimpleKeys.clear(); 1110 IsSimpleKeyAllowed = false; 1111 
1112 Token T; 1113 T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd; 1114 T.Range = StringRef(Current, 3); 1115 skip(3); 1116 TokenQueue.push_back(T); 1117 return true; 1118 } 1119 1120 bool Scanner::scanFlowCollectionStart(bool IsSequence) { 1121 Token T; 1122 T.Kind = IsSequence ? Token::TK_FlowSequenceStart 1123 : Token::TK_FlowMappingStart; 1124 T.Range = StringRef(Current, 1); 1125 skip(1); 1126 TokenQueue.push_back(T); 1127 1128 // [ and { may begin a simple key. 1129 saveSimpleKeyCandidate(--TokenQueue.end(), Column - 1, false); 1130 1131 // And may also be followed by a simple key. 1132 IsSimpleKeyAllowed = true; 1133 ++FlowLevel; 1134 return true; 1135 } 1136 1137 bool Scanner::scanFlowCollectionEnd(bool IsSequence) { 1138 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1139 IsSimpleKeyAllowed = false; 1140 Token T; 1141 T.Kind = IsSequence ? Token::TK_FlowSequenceEnd 1142 : Token::TK_FlowMappingEnd; 1143 T.Range = StringRef(Current, 1); 1144 skip(1); 1145 TokenQueue.push_back(T); 1146 if (FlowLevel) 1147 --FlowLevel; 1148 return true; 1149 } 1150 1151 bool Scanner::scanFlowEntry() { 1152 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1153 IsSimpleKeyAllowed = true; 1154 Token T; 1155 T.Kind = Token::TK_FlowEntry; 1156 T.Range = StringRef(Current, 1); 1157 skip(1); 1158 TokenQueue.push_back(T); 1159 return true; 1160 } 1161 1162 bool Scanner::scanBlockEntry() { 1163 rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end()); 1164 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1165 IsSimpleKeyAllowed = true; 1166 Token T; 1167 T.Kind = Token::TK_BlockEntry; 1168 T.Range = StringRef(Current, 1); 1169 skip(1); 1170 TokenQueue.push_back(T); 1171 return true; 1172 } 1173 1174 bool Scanner::scanKey() { 1175 if (!FlowLevel) 1176 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1177 1178 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1179 IsSimpleKeyAllowed = !FlowLevel; 1180 1181 Token T; 1182 T.Kind = 
Token::TK_Key; 1183 T.Range = StringRef(Current, 1); 1184 skip(1); 1185 TokenQueue.push_back(T); 1186 return true; 1187 } 1188 1189 bool Scanner::scanValue() { 1190 // If the previous token could have been a simple key, insert the key token 1191 // into the token queue. 1192 if (!SimpleKeys.empty()) { 1193 SimpleKey SK = SimpleKeys.pop_back_val(); 1194 Token T; 1195 T.Kind = Token::TK_Key; 1196 T.Range = SK.Tok->Range; 1197 TokenQueueT::iterator i, e; 1198 for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) { 1199 if (i == SK.Tok) 1200 break; 1201 } 1202 assert(i != e && "SimpleKey not in token queue!"); 1203 i = TokenQueue.insert(i, T); 1204 1205 // We may also need to add a Block-Mapping-Start token. 1206 rollIndent(SK.Column, Token::TK_BlockMappingStart, i); 1207 1208 IsSimpleKeyAllowed = false; 1209 } else { 1210 if (!FlowLevel) 1211 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1212 IsSimpleKeyAllowed = !FlowLevel; 1213 } 1214 1215 Token T; 1216 T.Kind = Token::TK_Value; 1217 T.Range = StringRef(Current, 1); 1218 skip(1); 1219 TokenQueue.push_back(T); 1220 return true; 1221 } 1222 1223 // Forbidding inlining improves performance by roughly 20%. 1224 // FIXME: Remove once llvm optimizes this to the faster version without hints. 1225 LLVM_ATTRIBUTE_NOINLINE static bool 1226 wasEscaped(StringRef::iterator First, StringRef::iterator Position); 1227 1228 // Returns whether a character at 'Position' was escaped with a leading '\'. 1229 // 'First' specifies the position of the first character in the string. 1230 static bool wasEscaped(StringRef::iterator First, 1231 StringRef::iterator Position) { 1232 assert(Position - 1 >= First); 1233 StringRef::iterator I = Position - 1; 1234 // We calculate the number of consecutive '\'s before the current position 1235 // by iterating backwards through our string. 
1236 while (I >= First && *I == '\\') --I; 1237 // (Position - 1 - I) now contains the number of '\'s before the current 1238 // position. If it is odd, the character at 'Position' was escaped. 1239 return (Position - 1 - I) % 2 == 1; 1240 } 1241 1242 bool Scanner::scanFlowScalar(bool IsDoubleQuoted) { 1243 StringRef::iterator Start = Current; 1244 unsigned ColStart = Column; 1245 if (IsDoubleQuoted) { 1246 do { 1247 ++Current; 1248 while (Current != End && *Current != '"') 1249 ++Current; 1250 // Repeat until the previous character was not a '\' or was an escaped 1251 // backslash. 1252 } while ( Current != End 1253 && *(Current - 1) == '\\' 1254 && wasEscaped(Start + 1, Current)); 1255 } else { 1256 skip(1); 1257 while (true) { 1258 // Skip a ' followed by another '. 1259 if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') { 1260 skip(2); 1261 continue; 1262 } else if (*Current == '\'') 1263 break; 1264 StringRef::iterator i = skip_nb_char(Current); 1265 if (i == Current) { 1266 i = skip_b_break(Current); 1267 if (i == Current) 1268 break; 1269 Current = i; 1270 Column = 0; 1271 ++Line; 1272 } else { 1273 if (i == End) 1274 break; 1275 Current = i; 1276 ++Column; 1277 } 1278 } 1279 } 1280 1281 if (Current == End) { 1282 setError("Expected quote at end of scalar", Current); 1283 return false; 1284 } 1285 1286 skip(1); // Skip ending quote. 
1287 Token T; 1288 T.Kind = Token::TK_Scalar; 1289 T.Range = StringRef(Start, Current - Start); 1290 TokenQueue.push_back(T); 1291 1292 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1293 1294 IsSimpleKeyAllowed = false; 1295 1296 return true; 1297 } 1298 1299 bool Scanner::scanPlainScalar() { 1300 StringRef::iterator Start = Current; 1301 unsigned ColStart = Column; 1302 unsigned LeadingBlanks = 0; 1303 assert(Indent >= -1 && "Indent must be >= -1 !"); 1304 unsigned indent = static_cast<unsigned>(Indent + 1); 1305 while (true) { 1306 if (*Current == '#') 1307 break; 1308 1309 while (!isBlankOrBreak(Current)) { 1310 if ( FlowLevel && *Current == ':' 1311 && !(isBlankOrBreak(Current + 1) || *(Current + 1) == ',')) { 1312 setError("Found unexpected ':' while scanning a plain scalar", Current); 1313 return false; 1314 } 1315 1316 // Check for the end of the plain scalar. 1317 if ( (*Current == ':' && isBlankOrBreak(Current + 1)) 1318 || ( FlowLevel 1319 && (StringRef(Current, 1).find_first_of(",:?[]{}") 1320 != StringRef::npos))) 1321 break; 1322 1323 StringRef::iterator i = skip_nb_char(Current); 1324 if (i == Current) 1325 break; 1326 Current = i; 1327 ++Column; 1328 } 1329 1330 // Are we at the end? 1331 if (!isBlankOrBreak(Current)) 1332 break; 1333 1334 // Eat blanks. 
1335 StringRef::iterator Tmp = Current; 1336 while (isBlankOrBreak(Tmp)) { 1337 StringRef::iterator i = skip_s_white(Tmp); 1338 if (i != Tmp) { 1339 if (LeadingBlanks && (Column < indent) && *Tmp == '\t') { 1340 setError("Found invalid tab character in indentation", Tmp); 1341 return false; 1342 } 1343 Tmp = i; 1344 ++Column; 1345 } else { 1346 i = skip_b_break(Tmp); 1347 if (!LeadingBlanks) 1348 LeadingBlanks = 1; 1349 Tmp = i; 1350 Column = 0; 1351 ++Line; 1352 } 1353 } 1354 1355 if (!FlowLevel && Column < indent) 1356 break; 1357 1358 Current = Tmp; 1359 } 1360 if (Start == Current) { 1361 setError("Got empty plain scalar", Start); 1362 return false; 1363 } 1364 Token T; 1365 T.Kind = Token::TK_Scalar; 1366 T.Range = StringRef(Start, Current - Start); 1367 TokenQueue.push_back(T); 1368 1369 // Plain scalars can be simple keys. 1370 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1371 1372 IsSimpleKeyAllowed = false; 1373 1374 return true; 1375 } 1376 1377 bool Scanner::scanAliasOrAnchor(bool IsAlias) { 1378 StringRef::iterator Start = Current; 1379 unsigned ColStart = Column; 1380 skip(1); 1381 while(true) { 1382 if ( *Current == '[' || *Current == ']' 1383 || *Current == '{' || *Current == '}' 1384 || *Current == ',' 1385 || *Current == ':') 1386 break; 1387 StringRef::iterator i = skip_ns_char(Current); 1388 if (i == Current) 1389 break; 1390 Current = i; 1391 ++Column; 1392 } 1393 1394 if (Start == Current) { 1395 setError("Got empty alias or anchor", Start); 1396 return false; 1397 } 1398 1399 Token T; 1400 T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor; 1401 T.Range = StringRef(Start, Current - Start); 1402 TokenQueue.push_back(T); 1403 1404 // Alias and anchors can be simple keys. 
1405 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1406 1407 IsSimpleKeyAllowed = false; 1408 1409 return true; 1410 } 1411 1412 char Scanner::scanBlockChompingIndicator() { 1413 char Indicator = ' '; 1414 if (Current != End && (*Current == '+' || *Current == '-')) { 1415 Indicator = *Current; 1416 skip(1); 1417 } 1418 return Indicator; 1419 } 1420 1421 /// Get the number of line breaks after chomping. 1422 /// 1423 /// Return the number of trailing line breaks to emit, depending on 1424 /// \p ChompingIndicator. 1425 static unsigned getChompedLineBreaks(char ChompingIndicator, 1426 unsigned LineBreaks, StringRef Str) { 1427 if (ChompingIndicator == '-') // Strip all line breaks. 1428 return 0; 1429 if (ChompingIndicator == '+') // Keep all line breaks. 1430 return LineBreaks; 1431 // Clip trailing lines. 1432 return Str.empty() ? 0 : 1; 1433 } 1434 1435 unsigned Scanner::scanBlockIndentationIndicator() { 1436 unsigned Indent = 0; 1437 if (Current != End && (*Current >= '1' && *Current <= '9')) { 1438 Indent = unsigned(*Current - '0'); 1439 skip(1); 1440 } 1441 return Indent; 1442 } 1443 1444 bool Scanner::scanBlockScalarHeader(char &ChompingIndicator, 1445 unsigned &IndentIndicator, bool &IsDone) { 1446 auto Start = Current; 1447 1448 ChompingIndicator = scanBlockChompingIndicator(); 1449 IndentIndicator = scanBlockIndentationIndicator(); 1450 // Check for the chomping indicator once again. 1451 if (ChompingIndicator == ' ') 1452 ChompingIndicator = scanBlockChompingIndicator(); 1453 Current = skip_while(&Scanner::skip_s_white, Current); 1454 skipComment(); 1455 1456 if (Current == End) { // EOF, we have an empty scalar. 
1457 Token T; 1458 T.Kind = Token::TK_BlockScalar; 1459 T.Range = StringRef(Start, Current - Start); 1460 TokenQueue.push_back(T); 1461 IsDone = true; 1462 return true; 1463 } 1464 1465 if (!consumeLineBreakIfPresent()) { 1466 setError("Expected a line break after block scalar header", Current); 1467 return false; 1468 } 1469 return true; 1470 } 1471 1472 bool Scanner::findBlockScalarIndent(unsigned &BlockIndent, 1473 unsigned BlockExitIndent, 1474 unsigned &LineBreaks, bool &IsDone) { 1475 unsigned MaxAllSpaceLineCharacters = 0; 1476 StringRef::iterator LongestAllSpaceLine; 1477 1478 while (true) { 1479 advanceWhile(&Scanner::skip_s_space); 1480 if (skip_nb_char(Current) != Current) { 1481 // This line isn't empty, so try and find the indentation. 1482 if (Column <= BlockExitIndent) { // End of the block literal. 1483 IsDone = true; 1484 return true; 1485 } 1486 // We found the block's indentation. 1487 BlockIndent = Column; 1488 if (MaxAllSpaceLineCharacters > BlockIndent) { 1489 setError( 1490 "Leading all-spaces line must be smaller than the block indent", 1491 LongestAllSpaceLine); 1492 return false; 1493 } 1494 return true; 1495 } 1496 if (skip_b_break(Current) != Current && 1497 Column > MaxAllSpaceLineCharacters) { 1498 // Record the longest all-space line in case it's longer than the 1499 // discovered block indent. 1500 MaxAllSpaceLineCharacters = Column; 1501 LongestAllSpaceLine = Current; 1502 } 1503 1504 // Check for EOF. 1505 if (Current == End) { 1506 IsDone = true; 1507 return true; 1508 } 1509 1510 if (!consumeLineBreakIfPresent()) { 1511 IsDone = true; 1512 return true; 1513 } 1514 ++LineBreaks; 1515 } 1516 return true; 1517 } 1518 1519 bool Scanner::scanBlockScalarIndent(unsigned BlockIndent, 1520 unsigned BlockExitIndent, bool &IsDone) { 1521 // Skip the indentation. 
1522 while (Column < BlockIndent) { 1523 auto I = skip_s_space(Current); 1524 if (I == Current) 1525 break; 1526 Current = I; 1527 ++Column; 1528 } 1529 1530 if (skip_nb_char(Current) == Current) 1531 return true; 1532 1533 if (Column <= BlockExitIndent) { // End of the block literal. 1534 IsDone = true; 1535 return true; 1536 } 1537 1538 if (Column < BlockIndent) { 1539 if (Current != End && *Current == '#') { // Trailing comment. 1540 IsDone = true; 1541 return true; 1542 } 1543 setError("A text line is less indented than the block scalar", Current); 1544 return false; 1545 } 1546 return true; // A normal text line. 1547 } 1548 1549 bool Scanner::scanBlockScalar(bool IsLiteral) { 1550 // Eat '|' or '>' 1551 assert(*Current == '|' || *Current == '>'); 1552 skip(1); 1553 1554 char ChompingIndicator; 1555 unsigned BlockIndent; 1556 bool IsDone = false; 1557 if (!scanBlockScalarHeader(ChompingIndicator, BlockIndent, IsDone)) 1558 return false; 1559 if (IsDone) 1560 return true; 1561 1562 auto Start = Current; 1563 unsigned BlockExitIndent = Indent < 0 ? 0 : (unsigned)Indent; 1564 unsigned LineBreaks = 0; 1565 if (BlockIndent == 0) { 1566 if (!findBlockScalarIndent(BlockIndent, BlockExitIndent, LineBreaks, 1567 IsDone)) 1568 return false; 1569 } 1570 1571 // Scan the block's scalars body. 1572 SmallString<256> Str; 1573 while (!IsDone) { 1574 if (!scanBlockScalarIndent(BlockIndent, BlockExitIndent, IsDone)) 1575 return false; 1576 if (IsDone) 1577 break; 1578 1579 // Parse the current line. 1580 auto LineStart = Current; 1581 advanceWhile(&Scanner::skip_nb_char); 1582 if (LineStart != Current) { 1583 Str.append(LineBreaks, '\n'); 1584 Str.append(StringRef(LineStart, Current - LineStart)); 1585 LineBreaks = 0; 1586 } 1587 1588 // Check for EOF. 
1589 if (Current == End) 1590 break; 1591 1592 if (!consumeLineBreakIfPresent()) 1593 break; 1594 ++LineBreaks; 1595 } 1596 1597 if (Current == End && !LineBreaks) 1598 // Ensure that there is at least one line break before the end of file. 1599 LineBreaks = 1; 1600 Str.append(getChompedLineBreaks(ChompingIndicator, LineBreaks, Str), '\n'); 1601 1602 // New lines may start a simple key. 1603 if (!FlowLevel) 1604 IsSimpleKeyAllowed = true; 1605 1606 Token T; 1607 T.Kind = Token::TK_BlockScalar; 1608 T.Range = StringRef(Start, Current - Start); 1609 T.Value = Str.str().str(); 1610 TokenQueue.push_back(T); 1611 return true; 1612 } 1613 1614 bool Scanner::scanTag() { 1615 StringRef::iterator Start = Current; 1616 unsigned ColStart = Column; 1617 skip(1); // Eat !. 1618 if (Current == End || isBlankOrBreak(Current)); // An empty tag. 1619 else if (*Current == '<') { 1620 skip(1); 1621 scan_ns_uri_char(); 1622 if (!consume('>')) 1623 return false; 1624 } else { 1625 // FIXME: Actually parse the c-ns-shorthand-tag rule. 1626 Current = skip_while(&Scanner::skip_ns_char, Current); 1627 } 1628 1629 Token T; 1630 T.Kind = Token::TK_Tag; 1631 T.Range = StringRef(Start, Current - Start); 1632 TokenQueue.push_back(T); 1633 1634 // Tags can be simple keys. 
1635 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1636 1637 IsSimpleKeyAllowed = false; 1638 1639 return true; 1640 } 1641 1642 bool Scanner::fetchMoreTokens() { 1643 if (IsStartOfStream) 1644 return scanStreamStart(); 1645 1646 scanToNextToken(); 1647 1648 if (Current == End) 1649 return scanStreamEnd(); 1650 1651 removeStaleSimpleKeyCandidates(); 1652 1653 unrollIndent(Column); 1654 1655 if (Column == 0 && *Current == '%') 1656 return scanDirective(); 1657 1658 if (Column == 0 && Current + 4 <= End 1659 && *Current == '-' 1660 && *(Current + 1) == '-' 1661 && *(Current + 2) == '-' 1662 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1663 return scanDocumentIndicator(true); 1664 1665 if (Column == 0 && Current + 4 <= End 1666 && *Current == '.' 1667 && *(Current + 1) == '.' 1668 && *(Current + 2) == '.' 1669 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1670 return scanDocumentIndicator(false); 1671 1672 if (*Current == '[') 1673 return scanFlowCollectionStart(true); 1674 1675 if (*Current == '{') 1676 return scanFlowCollectionStart(false); 1677 1678 if (*Current == ']') 1679 return scanFlowCollectionEnd(true); 1680 1681 if (*Current == '}') 1682 return scanFlowCollectionEnd(false); 1683 1684 if (*Current == ',') 1685 return scanFlowEntry(); 1686 1687 if (*Current == '-' && isBlankOrBreak(Current + 1)) 1688 return scanBlockEntry(); 1689 1690 if (*Current == '?' 
&& (FlowLevel || isBlankOrBreak(Current + 1))) 1691 return scanKey(); 1692 1693 if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1))) 1694 return scanValue(); 1695 1696 if (*Current == '*') 1697 return scanAliasOrAnchor(true); 1698 1699 if (*Current == '&') 1700 return scanAliasOrAnchor(false); 1701 1702 if (*Current == '!') 1703 return scanTag(); 1704 1705 if (*Current == '|' && !FlowLevel) 1706 return scanBlockScalar(true); 1707 1708 if (*Current == '>' && !FlowLevel) 1709 return scanBlockScalar(false); 1710 1711 if (*Current == '\'') 1712 return scanFlowScalar(false); 1713 1714 if (*Current == '"') 1715 return scanFlowScalar(true); 1716 1717 // Get a plain scalar. 1718 StringRef FirstChar(Current, 1); 1719 if (!(isBlankOrBreak(Current) 1720 || FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos) 1721 || (*Current == '-' && !isBlankOrBreak(Current + 1)) 1722 || (!FlowLevel && (*Current == '?' || *Current == ':') 1723 && isBlankOrBreak(Current + 1)) 1724 || (!FlowLevel && *Current == ':' 1725 && Current + 2 < End 1726 && *(Current + 1) == ':' 1727 && !isBlankOrBreak(Current + 2))) 1728 return scanPlainScalar(); 1729 1730 setError("Unrecognized character while tokenizing."); 1731 return false; 1732 } 1733 1734 Stream::Stream(StringRef Input, SourceMgr &SM, bool ShowColors) 1735 : scanner(new Scanner(Input, SM, ShowColors)), CurrentDoc() {} 1736 1737 Stream::Stream(MemoryBufferRef InputBuffer, SourceMgr &SM, bool ShowColors) 1738 : scanner(new Scanner(InputBuffer, SM, ShowColors)), CurrentDoc() {} 1739 1740 Stream::~Stream() {} 1741 1742 bool Stream::failed() { return scanner->failed(); } 1743 1744 void Stream::printError(Node *N, const Twine &Msg) { 1745 scanner->printError( N->getSourceRange().Start 1746 , SourceMgr::DK_Error 1747 , Msg 1748 , N->getSourceRange()); 1749 } 1750 1751 document_iterator Stream::begin() { 1752 if (CurrentDoc) 1753 report_fatal_error("Can only iterate over the stream once"); 1754 1755 // Skip Stream-Start. 
1756 scanner->getNext(); 1757 1758 CurrentDoc.reset(new Document(*this)); 1759 return document_iterator(CurrentDoc); 1760 } 1761 1762 document_iterator Stream::end() { 1763 return document_iterator(); 1764 } 1765 1766 void Stream::skip() { 1767 for (document_iterator i = begin(), e = end(); i != e; ++i) 1768 i->skip(); 1769 } 1770 1771 Node::Node(unsigned int Type, std::unique_ptr<Document> &D, StringRef A, 1772 StringRef T) 1773 : Doc(D), TypeID(Type), Anchor(A), Tag(T) { 1774 SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin()); 1775 SourceRange = SMRange(Start, Start); 1776 } 1777 1778 std::string Node::getVerbatimTag() const { 1779 StringRef Raw = getRawTag(); 1780 if (!Raw.empty() && Raw != "!") { 1781 std::string Ret; 1782 if (Raw.find_last_of('!') == 0) { 1783 Ret = Doc->getTagMap().find("!")->second; 1784 Ret += Raw.substr(1); 1785 return Ret; 1786 } else if (Raw.startswith("!!")) { 1787 Ret = Doc->getTagMap().find("!!")->second; 1788 Ret += Raw.substr(2); 1789 return Ret; 1790 } else { 1791 StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1); 1792 std::map<StringRef, StringRef>::const_iterator It = 1793 Doc->getTagMap().find(TagHandle); 1794 if (It != Doc->getTagMap().end()) 1795 Ret = It->second; 1796 else { 1797 Token T; 1798 T.Kind = Token::TK_Tag; 1799 T.Range = TagHandle; 1800 setError(Twine("Unknown tag handle ") + TagHandle, T); 1801 } 1802 Ret += Raw.substr(Raw.find_last_of('!') + 1); 1803 return Ret; 1804 } 1805 } 1806 1807 switch (getType()) { 1808 case NK_Null: 1809 return "tag:yaml.org,2002:null"; 1810 case NK_Scalar: 1811 case NK_BlockScalar: 1812 // TODO: Tag resolution. 
1813 return "tag:yaml.org,2002:str"; 1814 case NK_Mapping: 1815 return "tag:yaml.org,2002:map"; 1816 case NK_Sequence: 1817 return "tag:yaml.org,2002:seq"; 1818 } 1819 1820 return ""; 1821 } 1822 1823 Token &Node::peekNext() { 1824 return Doc->peekNext(); 1825 } 1826 1827 Token Node::getNext() { 1828 return Doc->getNext(); 1829 } 1830 1831 Node *Node::parseBlockNode() { 1832 return Doc->parseBlockNode(); 1833 } 1834 1835 BumpPtrAllocator &Node::getAllocator() { 1836 return Doc->NodeAllocator; 1837 } 1838 1839 void Node::setError(const Twine &Msg, Token &Tok) const { 1840 Doc->setError(Msg, Tok); 1841 } 1842 1843 bool Node::failed() const { 1844 return Doc->failed(); 1845 } 1846 1847 1848 1849 StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const { 1850 // TODO: Handle newlines properly. We need to remove leading whitespace. 1851 if (Value[0] == '"') { // Double quoted. 1852 // Pull off the leading and trailing "s. 1853 StringRef UnquotedValue = Value.substr(1, Value.size() - 2); 1854 // Search for characters that would require unescaping the value. 1855 StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n"); 1856 if (i != StringRef::npos) 1857 return unescapeDoubleQuoted(UnquotedValue, i, Storage); 1858 return UnquotedValue; 1859 } else if (Value[0] == '\'') { // Single quoted. 1860 // Pull off the leading and trailing 's. 1861 StringRef UnquotedValue = Value.substr(1, Value.size() - 2); 1862 StringRef::size_type i = UnquotedValue.find('\''); 1863 if (i != StringRef::npos) { 1864 // We're going to need Storage. 
1865 Storage.clear(); 1866 Storage.reserve(UnquotedValue.size()); 1867 for (; i != StringRef::npos; i = UnquotedValue.find('\'')) { 1868 StringRef Valid(UnquotedValue.begin(), i); 1869 Storage.insert(Storage.end(), Valid.begin(), Valid.end()); 1870 Storage.push_back('\''); 1871 UnquotedValue = UnquotedValue.substr(i + 2); 1872 } 1873 Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); 1874 return StringRef(Storage.begin(), Storage.size()); 1875 } 1876 return UnquotedValue; 1877 } 1878 // Plain or block. 1879 return Value.rtrim(' '); 1880 } 1881 1882 StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue 1883 , StringRef::size_type i 1884 , SmallVectorImpl<char> &Storage) 1885 const { 1886 // Use Storage to build proper value. 1887 Storage.clear(); 1888 Storage.reserve(UnquotedValue.size()); 1889 for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) { 1890 // Insert all previous chars into Storage. 1891 StringRef Valid(UnquotedValue.begin(), i); 1892 Storage.insert(Storage.end(), Valid.begin(), Valid.end()); 1893 // Chop off inserted chars. 1894 UnquotedValue = UnquotedValue.substr(i); 1895 1896 assert(!UnquotedValue.empty() && "Can't be empty!"); 1897 1898 // Parse escape or line break. 1899 switch (UnquotedValue[0]) { 1900 case '\r': 1901 case '\n': 1902 Storage.push_back('\n'); 1903 if ( UnquotedValue.size() > 1 1904 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 1905 UnquotedValue = UnquotedValue.substr(1); 1906 UnquotedValue = UnquotedValue.substr(1); 1907 break; 1908 default: 1909 if (UnquotedValue.size() == 1) 1910 // TODO: Report error. 1911 break; 1912 UnquotedValue = UnquotedValue.substr(1); 1913 switch (UnquotedValue[0]) { 1914 default: { 1915 Token T; 1916 T.Range = StringRef(UnquotedValue.begin(), 1); 1917 setError("Unrecognized escape code!", T); 1918 return ""; 1919 } 1920 case '\r': 1921 case '\n': 1922 // Remove the new line. 
1923 if ( UnquotedValue.size() > 1 1924 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 1925 UnquotedValue = UnquotedValue.substr(1); 1926 // If this was just a single byte newline, it will get skipped 1927 // below. 1928 break; 1929 case '0': 1930 Storage.push_back(0x00); 1931 break; 1932 case 'a': 1933 Storage.push_back(0x07); 1934 break; 1935 case 'b': 1936 Storage.push_back(0x08); 1937 break; 1938 case 't': 1939 case 0x09: 1940 Storage.push_back(0x09); 1941 break; 1942 case 'n': 1943 Storage.push_back(0x0A); 1944 break; 1945 case 'v': 1946 Storage.push_back(0x0B); 1947 break; 1948 case 'f': 1949 Storage.push_back(0x0C); 1950 break; 1951 case 'r': 1952 Storage.push_back(0x0D); 1953 break; 1954 case 'e': 1955 Storage.push_back(0x1B); 1956 break; 1957 case ' ': 1958 Storage.push_back(0x20); 1959 break; 1960 case '"': 1961 Storage.push_back(0x22); 1962 break; 1963 case '/': 1964 Storage.push_back(0x2F); 1965 break; 1966 case '\\': 1967 Storage.push_back(0x5C); 1968 break; 1969 case 'N': 1970 encodeUTF8(0x85, Storage); 1971 break; 1972 case '_': 1973 encodeUTF8(0xA0, Storage); 1974 break; 1975 case 'L': 1976 encodeUTF8(0x2028, Storage); 1977 break; 1978 case 'P': 1979 encodeUTF8(0x2029, Storage); 1980 break; 1981 case 'x': { 1982 if (UnquotedValue.size() < 3) 1983 // TODO: Report error. 1984 break; 1985 unsigned int UnicodeScalarValue; 1986 if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue)) 1987 // TODO: Report error. 1988 UnicodeScalarValue = 0xFFFD; 1989 encodeUTF8(UnicodeScalarValue, Storage); 1990 UnquotedValue = UnquotedValue.substr(2); 1991 break; 1992 } 1993 case 'u': { 1994 if (UnquotedValue.size() < 5) 1995 // TODO: Report error. 1996 break; 1997 unsigned int UnicodeScalarValue; 1998 if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue)) 1999 // TODO: Report error. 
2000 UnicodeScalarValue = 0xFFFD; 2001 encodeUTF8(UnicodeScalarValue, Storage); 2002 UnquotedValue = UnquotedValue.substr(4); 2003 break; 2004 } 2005 case 'U': { 2006 if (UnquotedValue.size() < 9) 2007 // TODO: Report error. 2008 break; 2009 unsigned int UnicodeScalarValue; 2010 if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue)) 2011 // TODO: Report error. 2012 UnicodeScalarValue = 0xFFFD; 2013 encodeUTF8(UnicodeScalarValue, Storage); 2014 UnquotedValue = UnquotedValue.substr(8); 2015 break; 2016 } 2017 } 2018 UnquotedValue = UnquotedValue.substr(1); 2019 } 2020 } 2021 Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); 2022 return StringRef(Storage.begin(), Storage.size()); 2023 } 2024 2025 Node *KeyValueNode::getKey() { 2026 if (Key) 2027 return Key; 2028 // Handle implicit null keys. 2029 { 2030 Token &t = peekNext(); 2031 if ( t.Kind == Token::TK_BlockEnd 2032 || t.Kind == Token::TK_Value 2033 || t.Kind == Token::TK_Error) { 2034 return Key = new (getAllocator()) NullNode(Doc); 2035 } 2036 if (t.Kind == Token::TK_Key) 2037 getNext(); // skip TK_Key. 2038 } 2039 2040 // Handle explicit null keys. 2041 Token &t = peekNext(); 2042 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) { 2043 return Key = new (getAllocator()) NullNode(Doc); 2044 } 2045 2046 // We've got a normal key. 2047 return Key = parseBlockNode(); 2048 } 2049 2050 Node *KeyValueNode::getValue() { 2051 if (Value) 2052 return Value; 2053 getKey()->skip(); 2054 if (failed()) 2055 return Value = new (getAllocator()) NullNode(Doc); 2056 2057 // Handle implicit null values. 
2058 { 2059 Token &t = peekNext(); 2060 if ( t.Kind == Token::TK_BlockEnd 2061 || t.Kind == Token::TK_FlowMappingEnd 2062 || t.Kind == Token::TK_Key 2063 || t.Kind == Token::TK_FlowEntry 2064 || t.Kind == Token::TK_Error) { 2065 return Value = new (getAllocator()) NullNode(Doc); 2066 } 2067 2068 if (t.Kind != Token::TK_Value) { 2069 setError("Unexpected token in Key Value.", t); 2070 return Value = new (getAllocator()) NullNode(Doc); 2071 } 2072 getNext(); // skip TK_Value. 2073 } 2074 2075 // Handle explicit null values. 2076 Token &t = peekNext(); 2077 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) { 2078 return Value = new (getAllocator()) NullNode(Doc); 2079 } 2080 2081 // We got a normal value. 2082 return Value = parseBlockNode(); 2083 } 2084 2085 void MappingNode::increment() { 2086 if (failed()) { 2087 IsAtEnd = true; 2088 CurrentEntry = nullptr; 2089 return; 2090 } 2091 if (CurrentEntry) { 2092 CurrentEntry->skip(); 2093 if (Type == MT_Inline) { 2094 IsAtEnd = true; 2095 CurrentEntry = nullptr; 2096 return; 2097 } 2098 } 2099 Token T = peekNext(); 2100 if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) { 2101 // KeyValueNode eats the TK_Key. That way it can detect null keys. 2102 CurrentEntry = new (getAllocator()) KeyValueNode(Doc); 2103 } else if (Type == MT_Block) { 2104 switch (T.Kind) { 2105 case Token::TK_BlockEnd: 2106 getNext(); 2107 IsAtEnd = true; 2108 CurrentEntry = nullptr; 2109 break; 2110 default: 2111 setError("Unexpected token. Expected Key or Block End", T); 2112 case Token::TK_Error: 2113 IsAtEnd = true; 2114 CurrentEntry = nullptr; 2115 } 2116 } else { 2117 switch (T.Kind) { 2118 case Token::TK_FlowEntry: 2119 // Eat the flow entry and recurse. 2120 getNext(); 2121 return increment(); 2122 case Token::TK_FlowMappingEnd: 2123 getNext(); 2124 case Token::TK_Error: 2125 // Set this to end iterator. 2126 IsAtEnd = true; 2127 CurrentEntry = nullptr; 2128 break; 2129 default: 2130 setError( "Unexpected token. 
Expected Key, Flow Entry, or Flow " 2131 "Mapping End." 2132 , T); 2133 IsAtEnd = true; 2134 CurrentEntry = nullptr; 2135 } 2136 } 2137 } 2138 2139 void SequenceNode::increment() { 2140 if (failed()) { 2141 IsAtEnd = true; 2142 CurrentEntry = nullptr; 2143 return; 2144 } 2145 if (CurrentEntry) 2146 CurrentEntry->skip(); 2147 Token T = peekNext(); 2148 if (SeqType == ST_Block) { 2149 switch (T.Kind) { 2150 case Token::TK_BlockEntry: 2151 getNext(); 2152 CurrentEntry = parseBlockNode(); 2153 if (!CurrentEntry) { // An error occurred. 2154 IsAtEnd = true; 2155 CurrentEntry = nullptr; 2156 } 2157 break; 2158 case Token::TK_BlockEnd: 2159 getNext(); 2160 IsAtEnd = true; 2161 CurrentEntry = nullptr; 2162 break; 2163 default: 2164 setError( "Unexpected token. Expected Block Entry or Block End." 2165 , T); 2166 case Token::TK_Error: 2167 IsAtEnd = true; 2168 CurrentEntry = nullptr; 2169 } 2170 } else if (SeqType == ST_Indentless) { 2171 switch (T.Kind) { 2172 case Token::TK_BlockEntry: 2173 getNext(); 2174 CurrentEntry = parseBlockNode(); 2175 if (!CurrentEntry) { // An error occurred. 2176 IsAtEnd = true; 2177 CurrentEntry = nullptr; 2178 } 2179 break; 2180 default: 2181 case Token::TK_Error: 2182 IsAtEnd = true; 2183 CurrentEntry = nullptr; 2184 } 2185 } else if (SeqType == ST_Flow) { 2186 switch (T.Kind) { 2187 case Token::TK_FlowEntry: 2188 // Eat the flow entry and recurse. 2189 getNext(); 2190 WasPreviousTokenFlowEntry = true; 2191 return increment(); 2192 case Token::TK_FlowSequenceEnd: 2193 getNext(); 2194 case Token::TK_Error: 2195 // Set this to end iterator. 2196 IsAtEnd = true; 2197 CurrentEntry = nullptr; 2198 break; 2199 case Token::TK_StreamEnd: 2200 case Token::TK_DocumentEnd: 2201 case Token::TK_DocumentStart: 2202 setError("Could not find closing ]!", T); 2203 // Set this to end iterator. 
2204 IsAtEnd = true; 2205 CurrentEntry = nullptr; 2206 break; 2207 default: 2208 if (!WasPreviousTokenFlowEntry) { 2209 setError("Expected , between entries!", T); 2210 IsAtEnd = true; 2211 CurrentEntry = nullptr; 2212 break; 2213 } 2214 // Otherwise it must be a flow entry. 2215 CurrentEntry = parseBlockNode(); 2216 if (!CurrentEntry) { 2217 IsAtEnd = true; 2218 } 2219 WasPreviousTokenFlowEntry = false; 2220 break; 2221 } 2222 } 2223 } 2224 2225 Document::Document(Stream &S) : stream(S), Root(nullptr) { 2226 // Tag maps starts with two default mappings. 2227 TagMap["!"] = "!"; 2228 TagMap["!!"] = "tag:yaml.org,2002:"; 2229 2230 if (parseDirectives()) 2231 expectToken(Token::TK_DocumentStart); 2232 Token &T = peekNext(); 2233 if (T.Kind == Token::TK_DocumentStart) 2234 getNext(); 2235 } 2236 2237 bool Document::skip() { 2238 if (stream.scanner->failed()) 2239 return false; 2240 if (!Root) 2241 getRoot(); 2242 Root->skip(); 2243 Token &T = peekNext(); 2244 if (T.Kind == Token::TK_StreamEnd) 2245 return false; 2246 if (T.Kind == Token::TK_DocumentEnd) { 2247 getNext(); 2248 return skip(); 2249 } 2250 return true; 2251 } 2252 2253 Token &Document::peekNext() { 2254 return stream.scanner->peekNext(); 2255 } 2256 2257 Token Document::getNext() { 2258 return stream.scanner->getNext(); 2259 } 2260 2261 void Document::setError(const Twine &Message, Token &Location) const { 2262 stream.scanner->setError(Message, Location.Range.begin()); 2263 } 2264 2265 bool Document::failed() const { 2266 return stream.scanner->failed(); 2267 } 2268 2269 Node *Document::parseBlockNode() { 2270 Token T = peekNext(); 2271 // Handle properties. 
2272 Token AnchorInfo; 2273 Token TagInfo; 2274 parse_property: 2275 switch (T.Kind) { 2276 case Token::TK_Alias: 2277 getNext(); 2278 return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1)); 2279 case Token::TK_Anchor: 2280 if (AnchorInfo.Kind == Token::TK_Anchor) { 2281 setError("Already encountered an anchor for this node!", T); 2282 return nullptr; 2283 } 2284 AnchorInfo = getNext(); // Consume TK_Anchor. 2285 T = peekNext(); 2286 goto parse_property; 2287 case Token::TK_Tag: 2288 if (TagInfo.Kind == Token::TK_Tag) { 2289 setError("Already encountered a tag for this node!", T); 2290 return nullptr; 2291 } 2292 TagInfo = getNext(); // Consume TK_Tag. 2293 T = peekNext(); 2294 goto parse_property; 2295 default: 2296 break; 2297 } 2298 2299 switch (T.Kind) { 2300 case Token::TK_BlockEntry: 2301 // We got an unindented BlockEntry sequence. This is not terminated with 2302 // a BlockEnd. 2303 // Don't eat the TK_BlockEntry, SequenceNode needs it. 2304 return new (NodeAllocator) SequenceNode( stream.CurrentDoc 2305 , AnchorInfo.Range.substr(1) 2306 , TagInfo.Range 2307 , SequenceNode::ST_Indentless); 2308 case Token::TK_BlockSequenceStart: 2309 getNext(); 2310 return new (NodeAllocator) 2311 SequenceNode( stream.CurrentDoc 2312 , AnchorInfo.Range.substr(1) 2313 , TagInfo.Range 2314 , SequenceNode::ST_Block); 2315 case Token::TK_BlockMappingStart: 2316 getNext(); 2317 return new (NodeAllocator) 2318 MappingNode( stream.CurrentDoc 2319 , AnchorInfo.Range.substr(1) 2320 , TagInfo.Range 2321 , MappingNode::MT_Block); 2322 case Token::TK_FlowSequenceStart: 2323 getNext(); 2324 return new (NodeAllocator) 2325 SequenceNode( stream.CurrentDoc 2326 , AnchorInfo.Range.substr(1) 2327 , TagInfo.Range 2328 , SequenceNode::ST_Flow); 2329 case Token::TK_FlowMappingStart: 2330 getNext(); 2331 return new (NodeAllocator) 2332 MappingNode( stream.CurrentDoc 2333 , AnchorInfo.Range.substr(1) 2334 , TagInfo.Range 2335 , MappingNode::MT_Flow); 2336 case 
Token::TK_Scalar: 2337 getNext(); 2338 return new (NodeAllocator) 2339 ScalarNode( stream.CurrentDoc 2340 , AnchorInfo.Range.substr(1) 2341 , TagInfo.Range 2342 , T.Range); 2343 case Token::TK_BlockScalar: { 2344 getNext(); 2345 StringRef NullTerminatedStr(T.Value.c_str(), T.Value.length() + 1); 2346 StringRef StrCopy = NullTerminatedStr.copy(NodeAllocator).drop_back(); 2347 return new (NodeAllocator) 2348 BlockScalarNode(stream.CurrentDoc, AnchorInfo.Range.substr(1), 2349 TagInfo.Range, StrCopy, T.Range); 2350 } 2351 case Token::TK_Key: 2352 // Don't eat the TK_Key, KeyValueNode expects it. 2353 return new (NodeAllocator) 2354 MappingNode( stream.CurrentDoc 2355 , AnchorInfo.Range.substr(1) 2356 , TagInfo.Range 2357 , MappingNode::MT_Inline); 2358 case Token::TK_DocumentStart: 2359 case Token::TK_DocumentEnd: 2360 case Token::TK_StreamEnd: 2361 default: 2362 // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not 2363 // !!null null. 2364 return new (NodeAllocator) NullNode(stream.CurrentDoc); 2365 case Token::TK_Error: 2366 return nullptr; 2367 } 2368 llvm_unreachable("Control flow shouldn't reach here."); 2369 return nullptr; 2370 } 2371 2372 bool Document::parseDirectives() { 2373 bool isDirective = false; 2374 while (true) { 2375 Token T = peekNext(); 2376 if (T.Kind == Token::TK_TagDirective) { 2377 parseTAGDirective(); 2378 isDirective = true; 2379 } else if (T.Kind == Token::TK_VersionDirective) { 2380 parseYAMLDirective(); 2381 isDirective = true; 2382 } else 2383 break; 2384 } 2385 return isDirective; 2386 } 2387 2388 void Document::parseYAMLDirective() { 2389 getNext(); // Eat %YAML <version> 2390 } 2391 2392 void Document::parseTAGDirective() { 2393 Token Tag = getNext(); // %TAG <handle> <prefix> 2394 StringRef T = Tag.Range; 2395 // Strip %TAG 2396 T = T.substr(T.find_first_of(" \t")).ltrim(" \t"); 2397 std::size_t HandleEnd = T.find_first_of(" \t"); 2398 StringRef TagHandle = T.substr(0, HandleEnd); 2399 StringRef TagPrefix = 
T.substr(HandleEnd).ltrim(" \t"); 2400 TagMap[TagHandle] = TagPrefix; 2401 } 2402 2403 bool Document::expectToken(int TK) { 2404 Token T = getNext(); 2405 if (T.Kind != TK) { 2406 setError("Unexpected token", T); 2407 return false; 2408 } 2409 return true; 2410 } 2411