1 //===--- YAMLParser.cpp - Simple YAML parser ------------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file implements a YAML parser. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "llvm/Support/YAMLParser.h" 15 #include "llvm/ADT/STLExtras.h" 16 #include "llvm/ADT/SmallString.h" 17 #include "llvm/ADT/SmallVector.h" 18 #include "llvm/ADT/StringExtras.h" 19 #include "llvm/ADT/Twine.h" 20 #include "llvm/ADT/AllocatorList.h" 21 #include "llvm/Support/ErrorHandling.h" 22 #include "llvm/Support/MemoryBuffer.h" 23 #include "llvm/Support/SourceMgr.h" 24 #include "llvm/Support/raw_ostream.h" 25 26 using namespace llvm; 27 using namespace yaml; 28 29 enum UnicodeEncodingForm { 30 UEF_UTF32_LE, ///< UTF-32 Little Endian 31 UEF_UTF32_BE, ///< UTF-32 Big Endian 32 UEF_UTF16_LE, ///< UTF-16 Little Endian 33 UEF_UTF16_BE, ///< UTF-16 Big Endian 34 UEF_UTF8, ///< UTF-8 or ascii. 35 UEF_Unknown ///< Not a valid Unicode encoding. 36 }; 37 38 /// EncodingInfo - Holds the encoding type and length of the byte order mark if 39 /// it exists. Length is in {0, 2, 3, 4}. 40 typedef std::pair<UnicodeEncodingForm, unsigned> EncodingInfo; 41 42 /// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode 43 /// encoding form of \a Input. 44 /// 45 /// @param Input A string of length 0 or more. 46 /// @returns An EncodingInfo indicating the Unicode encoding form of the input 47 /// and how long the byte order mark is if one exists. 
48 static EncodingInfo getUnicodeEncoding(StringRef Input) { 49 if (Input.size() == 0) 50 return std::make_pair(UEF_Unknown, 0); 51 52 switch (uint8_t(Input[0])) { 53 case 0x00: 54 if (Input.size() >= 4) { 55 if ( Input[1] == 0 56 && uint8_t(Input[2]) == 0xFE 57 && uint8_t(Input[3]) == 0xFF) 58 return std::make_pair(UEF_UTF32_BE, 4); 59 if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0) 60 return std::make_pair(UEF_UTF32_BE, 0); 61 } 62 63 if (Input.size() >= 2 && Input[1] != 0) 64 return std::make_pair(UEF_UTF16_BE, 0); 65 return std::make_pair(UEF_Unknown, 0); 66 case 0xFF: 67 if ( Input.size() >= 4 68 && uint8_t(Input[1]) == 0xFE 69 && Input[2] == 0 70 && Input[3] == 0) 71 return std::make_pair(UEF_UTF32_LE, 4); 72 73 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE) 74 return std::make_pair(UEF_UTF16_LE, 2); 75 return std::make_pair(UEF_Unknown, 0); 76 case 0xFE: 77 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF) 78 return std::make_pair(UEF_UTF16_BE, 2); 79 return std::make_pair(UEF_Unknown, 0); 80 case 0xEF: 81 if ( Input.size() >= 3 82 && uint8_t(Input[1]) == 0xBB 83 && uint8_t(Input[2]) == 0xBF) 84 return std::make_pair(UEF_UTF8, 3); 85 return std::make_pair(UEF_Unknown, 0); 86 } 87 88 // It could still be utf-32 or utf-16. 89 if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0) 90 return std::make_pair(UEF_UTF32_LE, 0); 91 92 if (Input.size() >= 2 && Input[1] == 0) 93 return std::make_pair(UEF_UTF16_LE, 0); 94 95 return std::make_pair(UEF_UTF8, 0); 96 } 97 98 namespace llvm { 99 namespace yaml { 100 /// Pin the vtables to this file. 101 void Node::anchor() {} 102 void NullNode::anchor() {} 103 void ScalarNode::anchor() {} 104 void BlockScalarNode::anchor() {} 105 void KeyValueNode::anchor() {} 106 void MappingNode::anchor() {} 107 void SequenceNode::anchor() {} 108 void AliasNode::anchor() {} 109 110 /// Token - A single YAML token. 111 struct Token { 112 enum TokenKind { 113 TK_Error, // Uninitialized token. 
114 TK_StreamStart, 115 TK_StreamEnd, 116 TK_VersionDirective, 117 TK_TagDirective, 118 TK_DocumentStart, 119 TK_DocumentEnd, 120 TK_BlockEntry, 121 TK_BlockEnd, 122 TK_BlockSequenceStart, 123 TK_BlockMappingStart, 124 TK_FlowEntry, 125 TK_FlowSequenceStart, 126 TK_FlowSequenceEnd, 127 TK_FlowMappingStart, 128 TK_FlowMappingEnd, 129 TK_Key, 130 TK_Value, 131 TK_Scalar, 132 TK_BlockScalar, 133 TK_Alias, 134 TK_Anchor, 135 TK_Tag 136 } Kind; 137 138 /// A string of length 0 or more whose begin() points to the logical location 139 /// of the token in the input. 140 StringRef Range; 141 142 /// The value of a block scalar node. 143 std::string Value; 144 145 Token() : Kind(TK_Error) {} 146 }; 147 } 148 } 149 150 typedef llvm::BumpPtrList<Token> TokenQueueT; 151 152 namespace { 153 /// @brief This struct is used to track simple keys. 154 /// 155 /// Simple keys are handled by creating an entry in SimpleKeys for each Token 156 /// which could legally be the start of a simple key. When peekNext is called, 157 /// if the Token To be returned is referenced by a SimpleKey, we continue 158 /// tokenizing until that potential simple key has either been found to not be 159 /// a simple key (we moved on to the next line or went further than 1024 chars). 160 /// Or when we run into a Value, and then insert a Key token (and possibly 161 /// others) before the SimpleKey's Tok. 162 struct SimpleKey { 163 TokenQueueT::iterator Tok; 164 unsigned Column; 165 unsigned Line; 166 unsigned FlowLevel; 167 bool IsRequired; 168 169 bool operator ==(const SimpleKey &Other) { 170 return Tok == Other.Tok; 171 } 172 }; 173 } 174 175 /// @brief The Unicode scalar value of a UTF-8 minimal well-formed code unit 176 /// subsequence and the subsequence's length in code units (uint8_t). 177 /// A length of 0 represents an error. 
178 typedef std::pair<uint32_t, unsigned> UTF8Decoded; 179 180 static UTF8Decoded decodeUTF8(StringRef Range) { 181 StringRef::iterator Position= Range.begin(); 182 StringRef::iterator End = Range.end(); 183 // 1 byte: [0x00, 0x7f] 184 // Bit pattern: 0xxxxxxx 185 if ((*Position & 0x80) == 0) { 186 return std::make_pair(*Position, 1); 187 } 188 // 2 bytes: [0x80, 0x7ff] 189 // Bit pattern: 110xxxxx 10xxxxxx 190 if (Position + 1 != End && 191 ((*Position & 0xE0) == 0xC0) && 192 ((*(Position + 1) & 0xC0) == 0x80)) { 193 uint32_t codepoint = ((*Position & 0x1F) << 6) | 194 (*(Position + 1) & 0x3F); 195 if (codepoint >= 0x80) 196 return std::make_pair(codepoint, 2); 197 } 198 // 3 bytes: [0x8000, 0xffff] 199 // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx 200 if (Position + 2 != End && 201 ((*Position & 0xF0) == 0xE0) && 202 ((*(Position + 1) & 0xC0) == 0x80) && 203 ((*(Position + 2) & 0xC0) == 0x80)) { 204 uint32_t codepoint = ((*Position & 0x0F) << 12) | 205 ((*(Position + 1) & 0x3F) << 6) | 206 (*(Position + 2) & 0x3F); 207 // Codepoints between 0xD800 and 0xDFFF are invalid, as 208 // they are high / low surrogate halves used by UTF-16. 209 if (codepoint >= 0x800 && 210 (codepoint < 0xD800 || codepoint > 0xDFFF)) 211 return std::make_pair(codepoint, 3); 212 } 213 // 4 bytes: [0x10000, 0x10FFFF] 214 // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 215 if (Position + 3 != End && 216 ((*Position & 0xF8) == 0xF0) && 217 ((*(Position + 1) & 0xC0) == 0x80) && 218 ((*(Position + 2) & 0xC0) == 0x80) && 219 ((*(Position + 3) & 0xC0) == 0x80)) { 220 uint32_t codepoint = ((*Position & 0x07) << 18) | 221 ((*(Position + 1) & 0x3F) << 12) | 222 ((*(Position + 2) & 0x3F) << 6) | 223 (*(Position + 3) & 0x3F); 224 if (codepoint >= 0x10000 && codepoint <= 0x10FFFF) 225 return std::make_pair(codepoint, 4); 226 } 227 return std::make_pair(0, 0); 228 } 229 230 namespace llvm { 231 namespace yaml { 232 /// @brief Scans YAML tokens from a MemoryBuffer. 
233 class Scanner { 234 public: 235 Scanner(StringRef Input, SourceMgr &SM, bool ShowColors = true); 236 Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors = true); 237 238 /// @brief Parse the next token and return it without popping it. 239 Token &peekNext(); 240 241 /// @brief Parse the next token and pop it from the queue. 242 Token getNext(); 243 244 void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message, 245 ArrayRef<SMRange> Ranges = None) { 246 SM.PrintMessage(Loc, Kind, Message, Ranges, /* FixIts= */ None, ShowColors); 247 } 248 249 void setError(const Twine &Message, StringRef::iterator Position) { 250 if (Current >= End) 251 Current = End - 1; 252 253 // Don't print out more errors after the first one we encounter. The rest 254 // are just the result of the first, and have no meaning. 255 if (!Failed) 256 printError(SMLoc::getFromPointer(Current), SourceMgr::DK_Error, Message); 257 Failed = true; 258 } 259 260 void setError(const Twine &Message) { 261 setError(Message, Current); 262 } 263 264 /// @brief Returns true if an error occurred while parsing. 265 bool failed() { 266 return Failed; 267 } 268 269 private: 270 void init(MemoryBufferRef Buffer); 271 272 StringRef currentInput() { 273 return StringRef(Current, End - Current); 274 } 275 276 /// @brief Decode a UTF-8 minimal well-formed code unit subsequence starting 277 /// at \a Position. 278 /// 279 /// If the UTF-8 code units starting at Position do not form a well-formed 280 /// code unit subsequence, then the Unicode scalar value is 0, and the length 281 /// is 0. 282 UTF8Decoded decodeUTF8(StringRef::iterator Position) { 283 return ::decodeUTF8(StringRef(Position, End - Position)); 284 } 285 286 // The following functions are based on the gramar rules in the YAML spec. The 287 // style of the function names it meant to closely match how they are written 288 // in the spec. The number within the [] is the number of the grammar rule in 289 // the spec. 
290 // 291 // See 4.2 [Production Naming Conventions] for the meaning of the prefixes. 292 // 293 // c- 294 // A production starting and ending with a special character. 295 // b- 296 // A production matching a single line break. 297 // nb- 298 // A production starting and ending with a non-break character. 299 // s- 300 // A production starting and ending with a white space character. 301 // ns- 302 // A production starting and ending with a non-space character. 303 // l- 304 // A production matching complete line(s). 305 306 /// @brief Skip a single nb-char[27] starting at Position. 307 /// 308 /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE] 309 /// | [0xFF00-0xFFFD] | [0x10000-0x10FFFF] 310 /// 311 /// @returns The code unit after the nb-char, or Position if it's not an 312 /// nb-char. 313 StringRef::iterator skip_nb_char(StringRef::iterator Position); 314 315 /// @brief Skip a single b-break[28] starting at Position. 316 /// 317 /// A b-break is 0xD 0xA | 0xD | 0xA 318 /// 319 /// @returns The code unit after the b-break, or Position if it's not a 320 /// b-break. 321 StringRef::iterator skip_b_break(StringRef::iterator Position); 322 323 /// Skip a single s-space[31] starting at Position. 324 /// 325 /// An s-space is 0x20 326 /// 327 /// @returns The code unit after the s-space, or Position if it's not a 328 /// s-space. 329 StringRef::iterator skip_s_space(StringRef::iterator Position); 330 331 /// @brief Skip a single s-white[33] starting at Position. 332 /// 333 /// A s-white is 0x20 | 0x9 334 /// 335 /// @returns The code unit after the s-white, or Position if it's not a 336 /// s-white. 337 StringRef::iterator skip_s_white(StringRef::iterator Position); 338 339 /// @brief Skip a single ns-char[34] starting at Position. 340 /// 341 /// A ns-char is nb-char - s-white 342 /// 343 /// @returns The code unit after the ns-char, or Position if it's not a 344 /// ns-char. 
345 StringRef::iterator skip_ns_char(StringRef::iterator Position); 346 347 typedef StringRef::iterator (Scanner::*SkipWhileFunc)(StringRef::iterator); 348 /// @brief Skip minimal well-formed code unit subsequences until Func 349 /// returns its input. 350 /// 351 /// @returns The code unit after the last minimal well-formed code unit 352 /// subsequence that Func accepted. 353 StringRef::iterator skip_while( SkipWhileFunc Func 354 , StringRef::iterator Position); 355 356 /// Skip minimal well-formed code unit subsequences until Func returns its 357 /// input. 358 void advanceWhile(SkipWhileFunc Func); 359 360 /// @brief Scan ns-uri-char[39]s starting at Cur. 361 /// 362 /// This updates Cur and Column while scanning. 363 void scan_ns_uri_char(); 364 365 /// @brief Consume a minimal well-formed code unit subsequence starting at 366 /// \a Cur. Return false if it is not the same Unicode scalar value as 367 /// \a Expected. This updates \a Column. 368 bool consume(uint32_t Expected); 369 370 /// @brief Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column. 371 void skip(uint32_t Distance); 372 373 /// @brief Return true if the minimal well-formed code unit subsequence at 374 /// Pos is whitespace or a new line 375 bool isBlankOrBreak(StringRef::iterator Position); 376 377 /// Consume a single b-break[28] if it's present at the current position. 378 /// 379 /// Return false if the code unit at the current position isn't a line break. 380 bool consumeLineBreakIfPresent(); 381 382 /// @brief If IsSimpleKeyAllowed, create and push_back a new SimpleKey. 383 void saveSimpleKeyCandidate( TokenQueueT::iterator Tok 384 , unsigned AtColumn 385 , bool IsRequired); 386 387 /// @brief Remove simple keys that can no longer be valid simple keys. 388 /// 389 /// Invalid simple keys are not on the current line or are further than 1024 390 /// columns back. 391 void removeStaleSimpleKeyCandidates(); 392 393 /// @brief Remove all simple keys on FlowLevel \a Level. 
394 void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level); 395 396 /// @brief Unroll indentation in \a Indents back to \a Col. Creates BlockEnd 397 /// tokens if needed. 398 bool unrollIndent(int ToColumn); 399 400 /// @brief Increase indent to \a Col. Creates \a Kind token at \a InsertPoint 401 /// if needed. 402 bool rollIndent( int ToColumn 403 , Token::TokenKind Kind 404 , TokenQueueT::iterator InsertPoint); 405 406 /// @brief Skip a single-line comment when the comment starts at the current 407 /// position of the scanner. 408 void skipComment(); 409 410 /// @brief Skip whitespace and comments until the start of the next token. 411 void scanToNextToken(); 412 413 /// @brief Must be the first token generated. 414 bool scanStreamStart(); 415 416 /// @brief Generate tokens needed to close out the stream. 417 bool scanStreamEnd(); 418 419 /// @brief Scan a %BLAH directive. 420 bool scanDirective(); 421 422 /// @brief Scan a ... or ---. 423 bool scanDocumentIndicator(bool IsStart); 424 425 /// @brief Scan a [ or { and generate the proper flow collection start token. 426 bool scanFlowCollectionStart(bool IsSequence); 427 428 /// @brief Scan a ] or } and generate the proper flow collection end token. 429 bool scanFlowCollectionEnd(bool IsSequence); 430 431 /// @brief Scan the , that separates entries in a flow collection. 432 bool scanFlowEntry(); 433 434 /// @brief Scan the - that starts block sequence entries. 435 bool scanBlockEntry(); 436 437 /// @brief Scan an explicit ? indicating a key. 438 bool scanKey(); 439 440 /// @brief Scan an explicit : indicating a value. 441 bool scanValue(); 442 443 /// @brief Scan a quoted scalar. 444 bool scanFlowScalar(bool IsDoubleQuoted); 445 446 /// @brief Scan an unquoted scalar. 447 bool scanPlainScalar(); 448 449 /// @brief Scan an Alias or Anchor starting with * or &. 450 bool scanAliasOrAnchor(bool IsAlias); 451 452 /// @brief Scan a block scalar starting with | or >. 
453 bool scanBlockScalar(bool IsLiteral); 454 455 /// Scan a chomping indicator in a block scalar header. 456 char scanBlockChompingIndicator(); 457 458 /// Scan an indentation indicator in a block scalar header. 459 unsigned scanBlockIndentationIndicator(); 460 461 /// Scan a block scalar header. 462 /// 463 /// Return false if an error occurred. 464 bool scanBlockScalarHeader(char &ChompingIndicator, unsigned &IndentIndicator, 465 bool &IsDone); 466 467 /// Look for the indentation level of a block scalar. 468 /// 469 /// Return false if an error occurred. 470 bool findBlockScalarIndent(unsigned &BlockIndent, unsigned BlockExitIndent, 471 unsigned &LineBreaks, bool &IsDone); 472 473 /// Scan the indentation of a text line in a block scalar. 474 /// 475 /// Return false if an error occurred. 476 bool scanBlockScalarIndent(unsigned BlockIndent, unsigned BlockExitIndent, 477 bool &IsDone); 478 479 /// @brief Scan a tag of the form !stuff. 480 bool scanTag(); 481 482 /// @brief Dispatch to the next scanning function based on \a *Cur. 483 bool fetchMoreTokens(); 484 485 /// @brief The SourceMgr used for diagnostics and buffer management. 486 SourceMgr &SM; 487 488 /// @brief The original input. 489 MemoryBufferRef InputBuffer; 490 491 /// @brief The current position of the scanner. 492 StringRef::iterator Current; 493 494 /// @brief The end of the input (one past the last character). 495 StringRef::iterator End; 496 497 /// @brief Current YAML indentation level in spaces. 498 int Indent; 499 500 /// @brief Current column number in Unicode code points. 501 unsigned Column; 502 503 /// @brief Current line number. 504 unsigned Line; 505 506 /// @brief How deep we are in flow style containers. 0 Means at block level. 507 unsigned FlowLevel; 508 509 /// @brief Are we at the start of the stream? 510 bool IsStartOfStream; 511 512 /// @brief Can the next token be the start of a simple key? 513 bool IsSimpleKeyAllowed; 514 515 /// @brief True if an error has occurred. 
516 bool Failed; 517 518 /// @brief Should colors be used when printing out the diagnostic messages? 519 bool ShowColors; 520 521 /// @brief Queue of tokens. This is required to queue up tokens while looking 522 /// for the end of a simple key. And for cases where a single character 523 /// can produce multiple tokens (e.g. BlockEnd). 524 TokenQueueT TokenQueue; 525 526 /// @brief Indentation levels. 527 SmallVector<int, 4> Indents; 528 529 /// @brief Potential simple keys. 530 SmallVector<SimpleKey, 4> SimpleKeys; 531 }; 532 533 } // end namespace yaml 534 } // end namespace llvm 535 536 /// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result. 537 static void encodeUTF8( uint32_t UnicodeScalarValue 538 , SmallVectorImpl<char> &Result) { 539 if (UnicodeScalarValue <= 0x7F) { 540 Result.push_back(UnicodeScalarValue & 0x7F); 541 } else if (UnicodeScalarValue <= 0x7FF) { 542 uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6); 543 uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F); 544 Result.push_back(FirstByte); 545 Result.push_back(SecondByte); 546 } else if (UnicodeScalarValue <= 0xFFFF) { 547 uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12); 548 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 549 uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F); 550 Result.push_back(FirstByte); 551 Result.push_back(SecondByte); 552 Result.push_back(ThirdByte); 553 } else if (UnicodeScalarValue <= 0x10FFFF) { 554 uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18); 555 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12); 556 uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 557 uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F); 558 Result.push_back(FirstByte); 559 Result.push_back(SecondByte); 560 Result.push_back(ThirdByte); 561 Result.push_back(FourthByte); 562 } 563 } 564 565 bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) { 566 SourceMgr SM; 
567 Scanner scanner(Input, SM); 568 while (true) { 569 Token T = scanner.getNext(); 570 switch (T.Kind) { 571 case Token::TK_StreamStart: 572 OS << "Stream-Start: "; 573 break; 574 case Token::TK_StreamEnd: 575 OS << "Stream-End: "; 576 break; 577 case Token::TK_VersionDirective: 578 OS << "Version-Directive: "; 579 break; 580 case Token::TK_TagDirective: 581 OS << "Tag-Directive: "; 582 break; 583 case Token::TK_DocumentStart: 584 OS << "Document-Start: "; 585 break; 586 case Token::TK_DocumentEnd: 587 OS << "Document-End: "; 588 break; 589 case Token::TK_BlockEntry: 590 OS << "Block-Entry: "; 591 break; 592 case Token::TK_BlockEnd: 593 OS << "Block-End: "; 594 break; 595 case Token::TK_BlockSequenceStart: 596 OS << "Block-Sequence-Start: "; 597 break; 598 case Token::TK_BlockMappingStart: 599 OS << "Block-Mapping-Start: "; 600 break; 601 case Token::TK_FlowEntry: 602 OS << "Flow-Entry: "; 603 break; 604 case Token::TK_FlowSequenceStart: 605 OS << "Flow-Sequence-Start: "; 606 break; 607 case Token::TK_FlowSequenceEnd: 608 OS << "Flow-Sequence-End: "; 609 break; 610 case Token::TK_FlowMappingStart: 611 OS << "Flow-Mapping-Start: "; 612 break; 613 case Token::TK_FlowMappingEnd: 614 OS << "Flow-Mapping-End: "; 615 break; 616 case Token::TK_Key: 617 OS << "Key: "; 618 break; 619 case Token::TK_Value: 620 OS << "Value: "; 621 break; 622 case Token::TK_Scalar: 623 OS << "Scalar: "; 624 break; 625 case Token::TK_BlockScalar: 626 OS << "Block Scalar: "; 627 break; 628 case Token::TK_Alias: 629 OS << "Alias: "; 630 break; 631 case Token::TK_Anchor: 632 OS << "Anchor: "; 633 break; 634 case Token::TK_Tag: 635 OS << "Tag: "; 636 break; 637 case Token::TK_Error: 638 break; 639 } 640 OS << T.Range << "\n"; 641 if (T.Kind == Token::TK_StreamEnd) 642 break; 643 else if (T.Kind == Token::TK_Error) 644 return false; 645 } 646 return true; 647 } 648 649 bool yaml::scanTokens(StringRef Input) { 650 llvm::SourceMgr SM; 651 llvm::yaml::Scanner scanner(Input, SM); 652 for (;;) { 653 
llvm::yaml::Token T = scanner.getNext(); 654 if (T.Kind == Token::TK_StreamEnd) 655 break; 656 else if (T.Kind == Token::TK_Error) 657 return false; 658 } 659 return true; 660 } 661 662 std::string yaml::escape(StringRef Input) { 663 std::string EscapedInput; 664 for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) { 665 if (*i == '\\') 666 EscapedInput += "\\\\"; 667 else if (*i == '"') 668 EscapedInput += "\\\""; 669 else if (*i == 0) 670 EscapedInput += "\\0"; 671 else if (*i == 0x07) 672 EscapedInput += "\\a"; 673 else if (*i == 0x08) 674 EscapedInput += "\\b"; 675 else if (*i == 0x09) 676 EscapedInput += "\\t"; 677 else if (*i == 0x0A) 678 EscapedInput += "\\n"; 679 else if (*i == 0x0B) 680 EscapedInput += "\\v"; 681 else if (*i == 0x0C) 682 EscapedInput += "\\f"; 683 else if (*i == 0x0D) 684 EscapedInput += "\\r"; 685 else if (*i == 0x1B) 686 EscapedInput += "\\e"; 687 else if ((unsigned char)*i < 0x20) { // Control characters not handled above. 688 std::string HexStr = utohexstr(*i); 689 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 690 } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence. 691 UTF8Decoded UnicodeScalarValue 692 = decodeUTF8(StringRef(i, Input.end() - i)); 693 if (UnicodeScalarValue.second == 0) { 694 // Found invalid char. 695 SmallString<4> Val; 696 encodeUTF8(0xFFFD, Val); 697 EscapedInput.insert(EscapedInput.end(), Val.begin(), Val.end()); 698 // FIXME: Error reporting. 
699 return EscapedInput; 700 } 701 if (UnicodeScalarValue.first == 0x85) 702 EscapedInput += "\\N"; 703 else if (UnicodeScalarValue.first == 0xA0) 704 EscapedInput += "\\_"; 705 else if (UnicodeScalarValue.first == 0x2028) 706 EscapedInput += "\\L"; 707 else if (UnicodeScalarValue.first == 0x2029) 708 EscapedInput += "\\P"; 709 else { 710 std::string HexStr = utohexstr(UnicodeScalarValue.first); 711 if (HexStr.size() <= 2) 712 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 713 else if (HexStr.size() <= 4) 714 EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr; 715 else if (HexStr.size() <= 8) 716 EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr; 717 } 718 i += UnicodeScalarValue.second - 1; 719 } else 720 EscapedInput.push_back(*i); 721 } 722 return EscapedInput; 723 } 724 725 Scanner::Scanner(StringRef Input, SourceMgr &sm, bool ShowColors) 726 : SM(sm), ShowColors(ShowColors) { 727 init(MemoryBufferRef(Input, "YAML")); 728 } 729 730 Scanner::Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors) 731 : SM(SM_), ShowColors(ShowColors) { 732 init(Buffer); 733 } 734 735 void Scanner::init(MemoryBufferRef Buffer) { 736 InputBuffer = Buffer; 737 Current = InputBuffer.getBufferStart(); 738 End = InputBuffer.getBufferEnd(); 739 Indent = -1; 740 Column = 0; 741 Line = 0; 742 FlowLevel = 0; 743 IsStartOfStream = true; 744 IsSimpleKeyAllowed = true; 745 Failed = false; 746 std::unique_ptr<MemoryBuffer> InputBufferOwner = 747 MemoryBuffer::getMemBuffer(Buffer); 748 SM.AddNewSourceBuffer(std::move(InputBufferOwner), SMLoc()); 749 } 750 751 Token &Scanner::peekNext() { 752 // If the current token is a possible simple key, keep parsing until we 753 // can confirm. 
754 bool NeedMore = false; 755 while (true) { 756 if (TokenQueue.empty() || NeedMore) { 757 if (!fetchMoreTokens()) { 758 TokenQueue.clear(); 759 TokenQueue.push_back(Token()); 760 return TokenQueue.front(); 761 } 762 } 763 assert(!TokenQueue.empty() && 764 "fetchMoreTokens lied about getting tokens!"); 765 766 removeStaleSimpleKeyCandidates(); 767 SimpleKey SK; 768 SK.Tok = TokenQueue.begin(); 769 if (!is_contained(SimpleKeys, SK)) 770 break; 771 else 772 NeedMore = true; 773 } 774 return TokenQueue.front(); 775 } 776 777 Token Scanner::getNext() { 778 Token Ret = peekNext(); 779 // TokenQueue can be empty if there was an error getting the next token. 780 if (!TokenQueue.empty()) 781 TokenQueue.pop_front(); 782 783 // There cannot be any referenced Token's if the TokenQueue is empty. So do a 784 // quick deallocation of them all. 785 if (TokenQueue.empty()) 786 TokenQueue.resetAlloc(); 787 788 return Ret; 789 } 790 791 StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) { 792 if (Position == End) 793 return Position; 794 // Check 7 bit c-printable - b-char. 795 if ( *Position == 0x09 796 || (*Position >= 0x20 && *Position <= 0x7E)) 797 return Position + 1; 798 799 // Check for valid UTF-8. 
800 if (uint8_t(*Position) & 0x80) { 801 UTF8Decoded u8d = decodeUTF8(Position); 802 if ( u8d.second != 0 803 && u8d.first != 0xFEFF 804 && ( u8d.first == 0x85 805 || ( u8d.first >= 0xA0 806 && u8d.first <= 0xD7FF) 807 || ( u8d.first >= 0xE000 808 && u8d.first <= 0xFFFD) 809 || ( u8d.first >= 0x10000 810 && u8d.first <= 0x10FFFF))) 811 return Position + u8d.second; 812 } 813 return Position; 814 } 815 816 StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) { 817 if (Position == End) 818 return Position; 819 if (*Position == 0x0D) { 820 if (Position + 1 != End && *(Position + 1) == 0x0A) 821 return Position + 2; 822 return Position + 1; 823 } 824 825 if (*Position == 0x0A) 826 return Position + 1; 827 return Position; 828 } 829 830 StringRef::iterator Scanner::skip_s_space(StringRef::iterator Position) { 831 if (Position == End) 832 return Position; 833 if (*Position == ' ') 834 return Position + 1; 835 return Position; 836 } 837 838 StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) { 839 if (Position == End) 840 return Position; 841 if (*Position == ' ' || *Position == '\t') 842 return Position + 1; 843 return Position; 844 } 845 846 StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) { 847 if (Position == End) 848 return Position; 849 if (*Position == ' ' || *Position == '\t') 850 return Position; 851 return skip_nb_char(Position); 852 } 853 854 StringRef::iterator Scanner::skip_while( SkipWhileFunc Func 855 , StringRef::iterator Position) { 856 while (true) { 857 StringRef::iterator i = (this->*Func)(Position); 858 if (i == Position) 859 break; 860 Position = i; 861 } 862 return Position; 863 } 864 865 void Scanner::advanceWhile(SkipWhileFunc Func) { 866 auto Final = skip_while(Func, Current); 867 Column += Final - Current; 868 Current = Final; 869 } 870 871 static bool is_ns_hex_digit(const char C) { 872 return (C >= '0' && C <= '9') 873 || (C >= 'a' && C <= 'z') 874 || (C >= 'A' && C <= 'Z'); 875 } 
876 877 static bool is_ns_word_char(const char C) { 878 return C == '-' 879 || (C >= 'a' && C <= 'z') 880 || (C >= 'A' && C <= 'Z'); 881 } 882 883 void Scanner::scan_ns_uri_char() { 884 while (true) { 885 if (Current == End) 886 break; 887 if (( *Current == '%' 888 && Current + 2 < End 889 && is_ns_hex_digit(*(Current + 1)) 890 && is_ns_hex_digit(*(Current + 2))) 891 || is_ns_word_char(*Current) 892 || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]") 893 != StringRef::npos) { 894 ++Current; 895 ++Column; 896 } else 897 break; 898 } 899 } 900 901 bool Scanner::consume(uint32_t Expected) { 902 if (Expected >= 0x80) 903 report_fatal_error("Not dealing with this yet"); 904 if (Current == End) 905 return false; 906 if (uint8_t(*Current) >= 0x80) 907 report_fatal_error("Not dealing with this yet"); 908 if (uint8_t(*Current) == Expected) { 909 ++Current; 910 ++Column; 911 return true; 912 } 913 return false; 914 } 915 916 void Scanner::skip(uint32_t Distance) { 917 Current += Distance; 918 Column += Distance; 919 assert(Current <= End && "Skipped past the end"); 920 } 921 922 bool Scanner::isBlankOrBreak(StringRef::iterator Position) { 923 if (Position == End) 924 return false; 925 return *Position == ' ' || *Position == '\t' || *Position == '\r' || 926 *Position == '\n'; 927 } 928 929 bool Scanner::consumeLineBreakIfPresent() { 930 auto Next = skip_b_break(Current); 931 if (Next == Current) 932 return false; 933 Column = 0; 934 ++Line; 935 Current = Next; 936 return true; 937 } 938 939 void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok 940 , unsigned AtColumn 941 , bool IsRequired) { 942 if (IsSimpleKeyAllowed) { 943 SimpleKey SK; 944 SK.Tok = Tok; 945 SK.Line = Line; 946 SK.Column = AtColumn; 947 SK.IsRequired = IsRequired; 948 SK.FlowLevel = FlowLevel; 949 SimpleKeys.push_back(SK); 950 } 951 } 952 953 void Scanner::removeStaleSimpleKeyCandidates() { 954 for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin(); 955 i != 
SimpleKeys.end();) { 956 if (i->Line != Line || i->Column + 1024 < Column) { 957 if (i->IsRequired) 958 setError( "Could not find expected : for simple key" 959 , i->Tok->Range.begin()); 960 i = SimpleKeys.erase(i); 961 } else 962 ++i; 963 } 964 } 965 966 void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) { 967 if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level) 968 SimpleKeys.pop_back(); 969 } 970 971 bool Scanner::unrollIndent(int ToColumn) { 972 Token T; 973 // Indentation is ignored in flow. 974 if (FlowLevel != 0) 975 return true; 976 977 while (Indent > ToColumn) { 978 T.Kind = Token::TK_BlockEnd; 979 T.Range = StringRef(Current, 1); 980 TokenQueue.push_back(T); 981 Indent = Indents.pop_back_val(); 982 } 983 984 return true; 985 } 986 987 bool Scanner::rollIndent( int ToColumn 988 , Token::TokenKind Kind 989 , TokenQueueT::iterator InsertPoint) { 990 if (FlowLevel) 991 return true; 992 if (Indent < ToColumn) { 993 Indents.push_back(Indent); 994 Indent = ToColumn; 995 996 Token T; 997 T.Kind = Kind; 998 T.Range = StringRef(Current, 0); 999 TokenQueue.insert(InsertPoint, T); 1000 } 1001 return true; 1002 } 1003 1004 void Scanner::skipComment() { 1005 if (*Current != '#') 1006 return; 1007 while (true) { 1008 // This may skip more than one byte, thus Column is only incremented 1009 // for code points. 1010 StringRef::iterator I = skip_nb_char(Current); 1011 if (I == Current) 1012 break; 1013 Current = I; 1014 ++Column; 1015 } 1016 } 1017 1018 void Scanner::scanToNextToken() { 1019 while (true) { 1020 while (*Current == ' ' || *Current == '\t') { 1021 skip(1); 1022 } 1023 1024 skipComment(); 1025 1026 // Skip EOL. 1027 StringRef::iterator i = skip_b_break(Current); 1028 if (i == Current) 1029 break; 1030 Current = i; 1031 ++Line; 1032 Column = 0; 1033 // New lines may start a simple key. 
1034 if (!FlowLevel) 1035 IsSimpleKeyAllowed = true; 1036 } 1037 } 1038 1039 bool Scanner::scanStreamStart() { 1040 IsStartOfStream = false; 1041 1042 EncodingInfo EI = getUnicodeEncoding(currentInput()); 1043 1044 Token T; 1045 T.Kind = Token::TK_StreamStart; 1046 T.Range = StringRef(Current, EI.second); 1047 TokenQueue.push_back(T); 1048 Current += EI.second; 1049 return true; 1050 } 1051 1052 bool Scanner::scanStreamEnd() { 1053 // Force an ending new line if one isn't present. 1054 if (Column != 0) { 1055 Column = 0; 1056 ++Line; 1057 } 1058 1059 unrollIndent(-1); 1060 SimpleKeys.clear(); 1061 IsSimpleKeyAllowed = false; 1062 1063 Token T; 1064 T.Kind = Token::TK_StreamEnd; 1065 T.Range = StringRef(Current, 0); 1066 TokenQueue.push_back(T); 1067 return true; 1068 } 1069 1070 bool Scanner::scanDirective() { 1071 // Reset the indentation level. 1072 unrollIndent(-1); 1073 SimpleKeys.clear(); 1074 IsSimpleKeyAllowed = false; 1075 1076 StringRef::iterator Start = Current; 1077 consume('%'); 1078 StringRef::iterator NameStart = Current; 1079 Current = skip_while(&Scanner::skip_ns_char, Current); 1080 StringRef Name(NameStart, Current - NameStart); 1081 Current = skip_while(&Scanner::skip_s_white, Current); 1082 1083 Token T; 1084 if (Name == "YAML") { 1085 Current = skip_while(&Scanner::skip_ns_char, Current); 1086 T.Kind = Token::TK_VersionDirective; 1087 T.Range = StringRef(Start, Current - Start); 1088 TokenQueue.push_back(T); 1089 return true; 1090 } else if(Name == "TAG") { 1091 Current = skip_while(&Scanner::skip_ns_char, Current); 1092 Current = skip_while(&Scanner::skip_s_white, Current); 1093 Current = skip_while(&Scanner::skip_ns_char, Current); 1094 T.Kind = Token::TK_TagDirective; 1095 T.Range = StringRef(Start, Current - Start); 1096 TokenQueue.push_back(T); 1097 return true; 1098 } 1099 return false; 1100 } 1101 1102 bool Scanner::scanDocumentIndicator(bool IsStart) { 1103 unrollIndent(-1); 1104 SimpleKeys.clear(); 1105 IsSimpleKeyAllowed = false; 1106 
// Both document markers handled here are three characters long.
  Token T;
  T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd;
  T.Range = StringRef(Current, 3);
  skip(3);
  TokenQueue.push_back(T);
  return true;
}

/// Queue the token for an opening '[' (\p IsSequence) or '{' and enter one
/// more flow level.
bool Scanner::scanFlowCollectionStart(bool IsSequence) {
  Token T;
  T.Kind = IsSequence ? Token::TK_FlowSequenceStart
                      : Token::TK_FlowMappingStart;
  T.Range = StringRef(Current, 1);
  skip(1);
  TokenQueue.push_back(T);

  // [ and { may begin a simple key.
  // Column was already advanced by skip(1), hence Column - 1.
  saveSimpleKeyCandidate(--TokenQueue.end(), Column - 1, false);

  // And may also be followed by a simple key.
  IsSimpleKeyAllowed = true;
  ++FlowLevel;
  return true;
}

/// Queue the token for a closing ']' (\p IsSequence) or '}' and leave one flow
/// level, if any is open.
bool Scanner::scanFlowCollectionEnd(bool IsSequence) {
  removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
  IsSimpleKeyAllowed = false;
  Token T;
  T.Kind = IsSequence ? Token::TK_FlowSequenceEnd
                      : Token::TK_FlowMappingEnd;
  T.Range = StringRef(Current, 1);
  skip(1);
  TokenQueue.push_back(T);
  // Guard against an unbalanced closing bracket driving FlowLevel below zero.
  if (FlowLevel)
    --FlowLevel;
  return true;
}

/// Queue a TK_FlowEntry token for the one-character separator at Current.
bool Scanner::scanFlowEntry() {
  removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
  IsSimpleKeyAllowed = true;
  Token T;
  T.Kind = Token::TK_FlowEntry;
  T.Range = StringRef(Current, 1);
  skip(1);
  TokenQueue.push_back(T);
  return true;
}

/// Queue a TK_BlockEntry token, first rolling the indentation to the current
/// column (which may emit a block-sequence-start token).
bool Scanner::scanBlockEntry() {
  rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end());
  removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
  IsSimpleKeyAllowed = true;
  Token T;
  T.Kind = Token::TK_BlockEntry;
  T.Range = StringRef(Current, 1);
  skip(1);
  TokenQueue.push_back(T);
  return true;
}

/// Queue a TK_Key token for an explicit key indicator.
bool Scanner::scanKey() {
  // Only block context affects the indentation stack.
  if (!FlowLevel)
    rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end());

  removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
  IsSimpleKeyAllowed = !FlowLevel;

  Token T;
  T.Kind = Token::TK_Key;
  T.Range = StringRef(Current, 1);
  skip(1);
  TokenQueue.push_back(T);
  return true;
}

/// Queue a TK_Value token. If a simple key candidate is pending, a TK_Key
/// token is first inserted in front of the candidate's token.
bool Scanner::scanValue() {
  // If the previous token could have been a simple key, insert the key token
  // into the token queue.
  if (!SimpleKeys.empty()) {
    SimpleKey SK = SimpleKeys.pop_back_val();
    Token T;
    T.Kind = Token::TK_Key;
    T.Range = SK.Tok->Range;
    // Locate the candidate's token in the queue with a linear search.
    TokenQueueT::iterator i, e;
    for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) {
      if (i == SK.Tok)
        break;
    }
    assert(i != e && "SimpleKey not in token queue!");
    i = TokenQueue.insert(i, T);

    // We may also need to add a Block-Mapping-Start token.
    rollIndent(SK.Column, Token::TK_BlockMappingStart, i);

    IsSimpleKeyAllowed = false;
  } else {
    if (!FlowLevel)
      rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end());
    IsSimpleKeyAllowed = !FlowLevel;
  }

  Token T;
  T.Kind = Token::TK_Value;
  T.Range = StringRef(Current, 1);
  skip(1);
  TokenQueue.push_back(T);
  return true;
}

// Forbidding inlining improves performance by roughly 20%.
// FIXME: Remove once llvm optimizes this to the faster version without hints.
LLVM_ATTRIBUTE_NOINLINE static bool
wasEscaped(StringRef::iterator First, StringRef::iterator Position);

// Returns whether a character at 'Position' was escaped with a leading '\'.
// 'First' specifies the position of the first character in the string.
// 'Position' must lie after 'First' (see the assert below).
static bool wasEscaped(StringRef::iterator First,
                       StringRef::iterator Position) {
  assert(Position - 1 >= First);
  StringRef::iterator I = Position - 1;
  // We calculate the number of consecutive '\'s before the current position
  // by iterating backwards through our string.
1231 while (I >= First && *I == '\\') --I; 1232 // (Position - 1 - I) now contains the number of '\'s before the current 1233 // position. If it is odd, the character at 'Position' was escaped. 1234 return (Position - 1 - I) % 2 == 1; 1235 } 1236 1237 bool Scanner::scanFlowScalar(bool IsDoubleQuoted) { 1238 StringRef::iterator Start = Current; 1239 unsigned ColStart = Column; 1240 if (IsDoubleQuoted) { 1241 do { 1242 ++Current; 1243 while (Current != End && *Current != '"') 1244 ++Current; 1245 // Repeat until the previous character was not a '\' or was an escaped 1246 // backslash. 1247 } while ( Current != End 1248 && *(Current - 1) == '\\' 1249 && wasEscaped(Start + 1, Current)); 1250 } else { 1251 skip(1); 1252 while (true) { 1253 // Skip a ' followed by another '. 1254 if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') { 1255 skip(2); 1256 continue; 1257 } else if (*Current == '\'') 1258 break; 1259 StringRef::iterator i = skip_nb_char(Current); 1260 if (i == Current) { 1261 i = skip_b_break(Current); 1262 if (i == Current) 1263 break; 1264 Current = i; 1265 Column = 0; 1266 ++Line; 1267 } else { 1268 if (i == End) 1269 break; 1270 Current = i; 1271 ++Column; 1272 } 1273 } 1274 } 1275 1276 if (Current == End) { 1277 setError("Expected quote at end of scalar", Current); 1278 return false; 1279 } 1280 1281 skip(1); // Skip ending quote. 
1282 Token T; 1283 T.Kind = Token::TK_Scalar; 1284 T.Range = StringRef(Start, Current - Start); 1285 TokenQueue.push_back(T); 1286 1287 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1288 1289 IsSimpleKeyAllowed = false; 1290 1291 return true; 1292 } 1293 1294 bool Scanner::scanPlainScalar() { 1295 StringRef::iterator Start = Current; 1296 unsigned ColStart = Column; 1297 unsigned LeadingBlanks = 0; 1298 assert(Indent >= -1 && "Indent must be >= -1 !"); 1299 unsigned indent = static_cast<unsigned>(Indent + 1); 1300 while (true) { 1301 if (*Current == '#') 1302 break; 1303 1304 while (!isBlankOrBreak(Current)) { 1305 if ( FlowLevel && *Current == ':' 1306 && !(isBlankOrBreak(Current + 1) || *(Current + 1) == ',')) { 1307 setError("Found unexpected ':' while scanning a plain scalar", Current); 1308 return false; 1309 } 1310 1311 // Check for the end of the plain scalar. 1312 if ( (*Current == ':' && isBlankOrBreak(Current + 1)) 1313 || ( FlowLevel 1314 && (StringRef(Current, 1).find_first_of(",:?[]{}") 1315 != StringRef::npos))) 1316 break; 1317 1318 StringRef::iterator i = skip_nb_char(Current); 1319 if (i == Current) 1320 break; 1321 Current = i; 1322 ++Column; 1323 } 1324 1325 // Are we at the end? 1326 if (!isBlankOrBreak(Current)) 1327 break; 1328 1329 // Eat blanks. 
1330 StringRef::iterator Tmp = Current; 1331 while (isBlankOrBreak(Tmp)) { 1332 StringRef::iterator i = skip_s_white(Tmp); 1333 if (i != Tmp) { 1334 if (LeadingBlanks && (Column < indent) && *Tmp == '\t') { 1335 setError("Found invalid tab character in indentation", Tmp); 1336 return false; 1337 } 1338 Tmp = i; 1339 ++Column; 1340 } else { 1341 i = skip_b_break(Tmp); 1342 if (!LeadingBlanks) 1343 LeadingBlanks = 1; 1344 Tmp = i; 1345 Column = 0; 1346 ++Line; 1347 } 1348 } 1349 1350 if (!FlowLevel && Column < indent) 1351 break; 1352 1353 Current = Tmp; 1354 } 1355 if (Start == Current) { 1356 setError("Got empty plain scalar", Start); 1357 return false; 1358 } 1359 Token T; 1360 T.Kind = Token::TK_Scalar; 1361 T.Range = StringRef(Start, Current - Start); 1362 TokenQueue.push_back(T); 1363 1364 // Plain scalars can be simple keys. 1365 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1366 1367 IsSimpleKeyAllowed = false; 1368 1369 return true; 1370 } 1371 1372 bool Scanner::scanAliasOrAnchor(bool IsAlias) { 1373 StringRef::iterator Start = Current; 1374 unsigned ColStart = Column; 1375 skip(1); 1376 while(true) { 1377 if ( *Current == '[' || *Current == ']' 1378 || *Current == '{' || *Current == '}' 1379 || *Current == ',' 1380 || *Current == ':') 1381 break; 1382 StringRef::iterator i = skip_ns_char(Current); 1383 if (i == Current) 1384 break; 1385 Current = i; 1386 ++Column; 1387 } 1388 1389 if (Start == Current) { 1390 setError("Got empty alias or anchor", Start); 1391 return false; 1392 } 1393 1394 Token T; 1395 T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor; 1396 T.Range = StringRef(Start, Current - Start); 1397 TokenQueue.push_back(T); 1398 1399 // Alias and anchors can be simple keys. 
1400 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1401 1402 IsSimpleKeyAllowed = false; 1403 1404 return true; 1405 } 1406 1407 char Scanner::scanBlockChompingIndicator() { 1408 char Indicator = ' '; 1409 if (Current != End && (*Current == '+' || *Current == '-')) { 1410 Indicator = *Current; 1411 skip(1); 1412 } 1413 return Indicator; 1414 } 1415 1416 /// Get the number of line breaks after chomping. 1417 /// 1418 /// Return the number of trailing line breaks to emit, depending on 1419 /// \p ChompingIndicator. 1420 static unsigned getChompedLineBreaks(char ChompingIndicator, 1421 unsigned LineBreaks, StringRef Str) { 1422 if (ChompingIndicator == '-') // Strip all line breaks. 1423 return 0; 1424 if (ChompingIndicator == '+') // Keep all line breaks. 1425 return LineBreaks; 1426 // Clip trailing lines. 1427 return Str.empty() ? 0 : 1; 1428 } 1429 1430 unsigned Scanner::scanBlockIndentationIndicator() { 1431 unsigned Indent = 0; 1432 if (Current != End && (*Current >= '1' && *Current <= '9')) { 1433 Indent = unsigned(*Current - '0'); 1434 skip(1); 1435 } 1436 return Indent; 1437 } 1438 1439 bool Scanner::scanBlockScalarHeader(char &ChompingIndicator, 1440 unsigned &IndentIndicator, bool &IsDone) { 1441 auto Start = Current; 1442 1443 ChompingIndicator = scanBlockChompingIndicator(); 1444 IndentIndicator = scanBlockIndentationIndicator(); 1445 // Check for the chomping indicator once again. 1446 if (ChompingIndicator == ' ') 1447 ChompingIndicator = scanBlockChompingIndicator(); 1448 Current = skip_while(&Scanner::skip_s_white, Current); 1449 skipComment(); 1450 1451 if (Current == End) { // EOF, we have an empty scalar. 
// The token emitted here spans only the header scanned so far.
    Token T;
    T.Kind = Token::TK_BlockScalar;
    T.Range = StringRef(Start, Current - Start);
    TokenQueue.push_back(T);
    IsDone = true;
    return true;
  }

  if (!consumeLineBreakIfPresent()) {
    setError("Expected a line break after block scalar header", Current);
    return false;
  }
  return true;
}

/// Determine the indentation of a block scalar from its first non-empty line.
///
/// Lines made up only of spaces are skipped and counted in \p LineBreaks.
/// \p BlockIndent is set to the column of the first non-empty line. \p IsDone
/// is set when the scalar ends before such a line is found: end of input, no
/// line break, or a line indented no deeper than \p BlockExitIndent.
///
/// \returns false, after reporting an error, if a skipped all-space line is
/// longer than the discovered indentation.
bool Scanner::findBlockScalarIndent(unsigned &BlockIndent,
                                    unsigned BlockExitIndent,
                                    unsigned &LineBreaks, bool &IsDone) {
  unsigned MaxAllSpaceLineCharacters = 0;
  // Only read when MaxAllSpaceLineCharacters is non-zero, i.e. after it has
  // been assigned below.
  StringRef::iterator LongestAllSpaceLine;

  while (true) {
    advanceWhile(&Scanner::skip_s_space);
    if (skip_nb_char(Current) != Current) {
      // This line isn't empty, so try and find the indentation.
      if (Column <= BlockExitIndent) { // End of the block literal.
        IsDone = true;
        return true;
      }
      // We found the block's indentation.
      BlockIndent = Column;
      if (MaxAllSpaceLineCharacters > BlockIndent) {
        setError(
            "Leading all-spaces line must be smaller than the block indent",
            LongestAllSpaceLine);
        return false;
      }
      return true;
    }
    if (skip_b_break(Current) != Current &&
        Column > MaxAllSpaceLineCharacters) {
      // Record the longest all-space line in case it's longer than the
      // discovered block indent.
      MaxAllSpaceLineCharacters = Column;
      LongestAllSpaceLine = Current;
    }

    // Check for EOF.
    if (Current == End) {
      IsDone = true;
      return true;
    }

    if (!consumeLineBreakIfPresent()) {
      IsDone = true;
      return true;
    }
    ++LineBreaks;
  }
  // Not reached: the loop above only exits through return statements.
  return true;
}

/// Skip the indentation of the next block scalar line.
///
/// Sets \p IsDone when the line ends the block scalar: it is indented no
/// deeper than \p BlockExitIndent, or it is a less indented '#' comment.
///
/// \returns false, after reporting an error, for a text line indented less
/// than \p BlockIndent.
bool Scanner::scanBlockScalarIndent(unsigned BlockIndent,
                                    unsigned BlockExitIndent, bool &IsDone) {
  // Skip the indentation.
  while (Column < BlockIndent) {
    auto I = skip_s_space(Current);
    if (I == Current)
      break;
    Current = I;
    ++Column;
  }

  // No character that skip_nb_char accepts follows; leave the rest of the
  // line to the caller.
  if (skip_nb_char(Current) == Current)
    return true;

  if (Column <= BlockExitIndent) { // End of the block literal.
    IsDone = true;
    return true;
  }

  if (Column < BlockIndent) {
    if (Current != End && *Current == '#') { // Trailing comment.
      IsDone = true;
      return true;
    }
    setError("A text line is less indented than the block scalar", Current);
    return false;
  }
  return true; // A normal text line.
}

/// Scan a block scalar introduced by '|' or '>' and queue it as a
/// TK_BlockScalar token whose Value holds the collected text.
bool Scanner::scanBlockScalar(bool IsLiteral) {
  // Eat '|' or '>'
  assert(*Current == '|' || *Current == '>');
  skip(1);

  char ChompingIndicator;
  unsigned BlockIndent;
  bool IsDone = false;
  if (!scanBlockScalarHeader(ChompingIndicator, BlockIndent, IsDone))
    return false;
  if (IsDone)
    return true;

  auto Start = Current;
  unsigned BlockExitIndent = Indent < 0 ? 0 : (unsigned)Indent;
  unsigned LineBreaks = 0;
  // A zero indentation indicator means the indentation must be detected.
  if (BlockIndent == 0) {
    if (!findBlockScalarIndent(BlockIndent, BlockExitIndent, LineBreaks,
                               IsDone))
      return false;
  }

  // Scan the block's scalars body.
  SmallString<256> Str;
  while (!IsDone) {
    if (!scanBlockScalarIndent(BlockIndent, BlockExitIndent, IsDone))
      return false;
    if (IsDone)
      break;

    // Parse the current line.
    auto LineStart = Current;
    advanceWhile(&Scanner::skip_nb_char);
    if (LineStart != Current) {
      // Line breaks are emitted lazily, only once more text follows them.
      Str.append(LineBreaks, '\n');
      Str.append(StringRef(LineStart, Current - LineStart));
      LineBreaks = 0;
    }

    // Check for EOF.
// Stop at the end of the input, or when the line is not ended by a break.
    if (Current == End)
      break;

    if (!consumeLineBreakIfPresent())
      break;
    ++LineBreaks;
  }

  if (Current == End && !LineBreaks)
    // Ensure that there is at least one line break before the end of file.
    LineBreaks = 1;
  // Append the trailing line breaks selected by the chomping indicator.
  Str.append(getChompedLineBreaks(ChompingIndicator, LineBreaks, Str), '\n');

  // New lines may start a simple key.
  if (!FlowLevel)
    IsSimpleKeyAllowed = true;

  Token T;
  T.Kind = Token::TK_BlockScalar;
  T.Range = StringRef(Start, Current - Start);
  T.Value = Str.str().str();
  TokenQueue.push_back(T);
  return true;
}

/// Scan a tag starting at the '!' at Current and queue a TK_Tag token.
///
/// Three forms are handled: an empty tag (a lone '!'), a "!<...>" form that
/// must be closed by '>', and anything else, which is consumed up to the
/// first character skip_ns_char rejects.
///
/// \returns false if the "!<" form is not closed by '>'.
bool Scanner::scanTag() {
  StringRef::iterator Start = Current;
  unsigned ColStart = Column;
  skip(1); // Eat !.
  // NOTE: the first branch is intentionally an empty statement.
  if (Current == End || isBlankOrBreak(Current)); // An empty tag.
  else if (*Current == '<') {
    skip(1);
    scan_ns_uri_char();
    if (!consume('>'))
      return false;
  } else {
    // FIXME: Actually parse the c-ns-shorthand-tag rule.
    Current = skip_while(&Scanner::skip_ns_char, Current);
  }

  Token T;
  T.Kind = Token::TK_Tag;
  T.Range = StringRef(Start, Current - Start);
  TokenQueue.push_back(T);

  // Tags can be simple keys.
// ColStart is the column at which the tag began.
  saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);

  IsSimpleKeyAllowed = false;

  return true;
}

/// Scan the next token(s) into the token queue.
///
/// Dispatches on the character at Current (and on Column / FlowLevel) to the
/// matching scan* routine.
///
/// \returns the result of that routine, or false after reporting an error if
/// no routine applies.
bool Scanner::fetchMoreTokens() {
  if (IsStartOfStream)
    return scanStreamStart();

  scanToNextToken();

  if (Current == End)
    return scanStreamEnd();

  removeStaleSimpleKeyCandidates();

  unrollIndent(Column);

  // Directives and document markers are only recognized in column 0.
  if (Column == 0 && *Current == '%')
    return scanDirective();

  // NOTE(review): because Current + 4 <= End is required, the
  // Current + 3 == End alternative below can never be true, so a marker that
  // ends exactly at the end of the input is not recognized here. Confirm
  // whether Current + 3 <= End was intended.
  if (Column == 0 && Current + 4 <= End
      && *Current == '-'
      && *(Current + 1) == '-'
      && *(Current + 2) == '-'
      && (Current + 3 == End || isBlankOrBreak(Current + 3)))
    return scanDocumentIndicator(true);

  // NOTE(review): same bounds observation as for the "---" check above.
  if (Column == 0 && Current + 4 <= End
      && *Current == '.'
      && *(Current + 1) == '.'
      && *(Current + 2) == '.'
      && (Current + 3 == End || isBlankOrBreak(Current + 3)))
    return scanDocumentIndicator(false);

  if (*Current == '[')
    return scanFlowCollectionStart(true);

  if (*Current == '{')
    return scanFlowCollectionStart(false);

  if (*Current == ']')
    return scanFlowCollectionEnd(true);

  if (*Current == '}')
    return scanFlowCollectionEnd(false);

  if (*Current == ',')
    return scanFlowEntry();

  if (*Current == '-' && isBlankOrBreak(Current + 1))
    return scanBlockEntry();

  // In block context, '?' and ':' are indicators only when followed by a
  // blank or a line break.
  if (*Current == '?' && (FlowLevel || isBlankOrBreak(Current + 1)))
    return scanKey();

  if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1)))
    return scanValue();

  if (*Current == '*')
    return scanAliasOrAnchor(true);

  if (*Current == '&')
    return scanAliasOrAnchor(false);

  if (*Current == '!')
    return scanTag();

  // Block scalars are only recognized outside of flow collections.
  if (*Current == '|' && !FlowLevel)
    return scanBlockScalar(true);

  if (*Current == '>' && !FlowLevel)
    return scanBlockScalar(false);

  if (*Current == '\'')
    return scanFlowScalar(false);

  if (*Current == '"')
    return scanFlowScalar(true);

  // Get a plain scalar.
  StringRef FirstChar(Current, 1);
  if (!(isBlankOrBreak(Current)
        || FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos)
      || (*Current == '-' && !isBlankOrBreak(Current + 1))
      || (!FlowLevel && (*Current == '?' || *Current == ':')
          && isBlankOrBreak(Current + 1))
      || (!FlowLevel && *Current == ':'
                      && Current + 2 < End
                      && *(Current + 1) == ':'
                      && !isBlankOrBreak(Current + 2)))
    return scanPlainScalar();

  setError("Unrecognized character while tokenizing.");
  return false;
}

/// Create a stream that scans \p Input; CurrentDoc starts out empty.
Stream::Stream(StringRef Input, SourceMgr &SM, bool ShowColors)
    : scanner(new Scanner(Input, SM, ShowColors)), CurrentDoc() {}

/// Create a stream that scans \p InputBuffer; CurrentDoc starts out empty.
Stream::Stream(MemoryBufferRef InputBuffer, SourceMgr &SM, bool ShowColors)
    : scanner(new Scanner(InputBuffer, SM, ShowColors)), CurrentDoc() {}

Stream::~Stream() {}

/// Forward to the scanner's failure state.
bool Stream::failed() { return scanner->failed(); }

/// Report \p Msg as an error located at the source range of \p N.
void Stream::printError(Node *N, const Twine &Msg) {
  scanner->printError( N->getSourceRange().Start
                     , SourceMgr::DK_Error
                     , Msg
                     , N->getSourceRange());
}

/// Start iterating over the documents of the stream. Calling this while a
/// current document already exists is a fatal error.
document_iterator Stream::begin() {
  if (CurrentDoc)
    report_fatal_error("Can only iterate over the stream once");

  // Skip Stream-Start.
// The scanner's first token is consumed here without being inspected.
  scanner->getNext();

  CurrentDoc.reset(new Document(*this));
  return document_iterator(CurrentDoc);
}

/// Return the end iterator (a default-constructed document_iterator).
document_iterator Stream::end() {
  return document_iterator();
}

/// Skip every document in the stream.
void Stream::skip() {
  for (document_iterator i = begin(), e = end(); i != e; ++i)
    i->skip();
}

/// Construct a node of kind \p Type with anchor \p A and tag \p T. The source
/// range starts out empty, positioned at the beginning of the next token.
Node::Node(unsigned int Type, std::unique_ptr<Document> &D, StringRef A,
           StringRef T)
    : Doc(D), TypeID(Type), Anchor(A), Tag(T) {
  SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin());
  SourceRange = SMRange(Start, Start);
}

/// Resolve this node's tag to its full form.
///
/// A raw tag other than "" and "!" has its handle replaced by the prefix
/// registered in the document's tag map; an unknown handle is reported as an
/// error and contributes no prefix. Otherwise a default "tag:yaml.org,2002:"
/// tag is chosen from the node kind.
std::string Node::getVerbatimTag() const {
  StringRef Raw = getRawTag();
  if (!Raw.empty() && Raw != "!") {
    std::string Ret;
    // The only '!' is the leading one: use the "!" handle.
    if (Raw.find_last_of('!') == 0) {
      Ret = Doc->getTagMap().find("!")->second;
      Ret += Raw.substr(1);
      return Ret;
    } else if (Raw.startswith("!!")) {
      Ret = Doc->getTagMap().find("!!")->second;
      Ret += Raw.substr(2);
      return Ret;
    } else {
      // The handle is everything up to and including the last '!'.
      StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1);
      std::map<StringRef, StringRef>::const_iterator It =
          Doc->getTagMap().find(TagHandle);
      if (It != Doc->getTagMap().end())
        Ret = It->second;
      else {
        Token T;
        T.Kind = Token::TK_Tag;
        T.Range = TagHandle;
        setError(Twine("Unknown tag handle ") + TagHandle, T);
      }
      Ret += Raw.substr(Raw.find_last_of('!') + 1);
      return Ret;
    }
  }

  switch (getType()) {
  case NK_Null:
    return "tag:yaml.org,2002:null";
  case NK_Scalar:
  case NK_BlockScalar:
    // TODO: Tag resolution.
    return "tag:yaml.org,2002:str";
  case NK_Mapping:
    return "tag:yaml.org,2002:map";
  case NK_Sequence:
    return "tag:yaml.org,2002:seq";
  }

  // Any other node kind has no default tag.
  return "";
}

// The following helpers forward to the owning document.

Token &Node::peekNext() {
  return Doc->peekNext();
}

Token Node::getNext() {
  return Doc->getNext();
}

Node *Node::parseBlockNode() {
  return Doc->parseBlockNode();
}

BumpPtrAllocator &Node::getAllocator() {
  return Doc->NodeAllocator;
}

void Node::setError(const Twine &Msg, Token &Tok) const {
  Doc->setError(Msg, Tok);
}

bool Node::failed() const {
  return Doc->failed();
}



/// Return the scalar's value with quoting removed.
///
/// \p Storage is only used when the value has to be rewritten (escapes, line
/// breaks or doubled single quotes); otherwise the result refers to the
/// original text.
StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const {
  // TODO: Handle newlines properly. We need to remove leading whitespace.
  if (Value[0] == '"') { // Double quoted.
    // Pull off the leading and trailing "s.
    StringRef UnquotedValue = Value.substr(1, Value.size() - 2);
    // Search for characters that would require unescaping the value.
    StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n");
    if (i != StringRef::npos)
      return unescapeDoubleQuoted(UnquotedValue, i, Storage);
    return UnquotedValue;
  } else if (Value[0] == '\'') { // Single quoted.
    // Pull off the leading and trailing 's.
    StringRef UnquotedValue = Value.substr(1, Value.size() - 2);
    StringRef::size_type i = UnquotedValue.find('\'');
    if (i != StringRef::npos) {
      // We're going to need Storage.
1860 Storage.clear(); 1861 Storage.reserve(UnquotedValue.size()); 1862 for (; i != StringRef::npos; i = UnquotedValue.find('\'')) { 1863 StringRef Valid(UnquotedValue.begin(), i); 1864 Storage.insert(Storage.end(), Valid.begin(), Valid.end()); 1865 Storage.push_back('\''); 1866 UnquotedValue = UnquotedValue.substr(i + 2); 1867 } 1868 Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); 1869 return StringRef(Storage.begin(), Storage.size()); 1870 } 1871 return UnquotedValue; 1872 } 1873 // Plain or block. 1874 return Value.rtrim(' '); 1875 } 1876 1877 StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue 1878 , StringRef::size_type i 1879 , SmallVectorImpl<char> &Storage) 1880 const { 1881 // Use Storage to build proper value. 1882 Storage.clear(); 1883 Storage.reserve(UnquotedValue.size()); 1884 for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) { 1885 // Insert all previous chars into Storage. 1886 StringRef Valid(UnquotedValue.begin(), i); 1887 Storage.insert(Storage.end(), Valid.begin(), Valid.end()); 1888 // Chop off inserted chars. 1889 UnquotedValue = UnquotedValue.substr(i); 1890 1891 assert(!UnquotedValue.empty() && "Can't be empty!"); 1892 1893 // Parse escape or line break. 1894 switch (UnquotedValue[0]) { 1895 case '\r': 1896 case '\n': 1897 Storage.push_back('\n'); 1898 if ( UnquotedValue.size() > 1 1899 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 1900 UnquotedValue = UnquotedValue.substr(1); 1901 UnquotedValue = UnquotedValue.substr(1); 1902 break; 1903 default: 1904 if (UnquotedValue.size() == 1) 1905 // TODO: Report error. 1906 break; 1907 UnquotedValue = UnquotedValue.substr(1); 1908 switch (UnquotedValue[0]) { 1909 default: { 1910 Token T; 1911 T.Range = StringRef(UnquotedValue.begin(), 1); 1912 setError("Unrecognized escape code!", T); 1913 return ""; 1914 } 1915 case '\r': 1916 case '\n': 1917 // Remove the new line. 
1918 if ( UnquotedValue.size() > 1 1919 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 1920 UnquotedValue = UnquotedValue.substr(1); 1921 // If this was just a single byte newline, it will get skipped 1922 // below. 1923 break; 1924 case '0': 1925 Storage.push_back(0x00); 1926 break; 1927 case 'a': 1928 Storage.push_back(0x07); 1929 break; 1930 case 'b': 1931 Storage.push_back(0x08); 1932 break; 1933 case 't': 1934 case 0x09: 1935 Storage.push_back(0x09); 1936 break; 1937 case 'n': 1938 Storage.push_back(0x0A); 1939 break; 1940 case 'v': 1941 Storage.push_back(0x0B); 1942 break; 1943 case 'f': 1944 Storage.push_back(0x0C); 1945 break; 1946 case 'r': 1947 Storage.push_back(0x0D); 1948 break; 1949 case 'e': 1950 Storage.push_back(0x1B); 1951 break; 1952 case ' ': 1953 Storage.push_back(0x20); 1954 break; 1955 case '"': 1956 Storage.push_back(0x22); 1957 break; 1958 case '/': 1959 Storage.push_back(0x2F); 1960 break; 1961 case '\\': 1962 Storage.push_back(0x5C); 1963 break; 1964 case 'N': 1965 encodeUTF8(0x85, Storage); 1966 break; 1967 case '_': 1968 encodeUTF8(0xA0, Storage); 1969 break; 1970 case 'L': 1971 encodeUTF8(0x2028, Storage); 1972 break; 1973 case 'P': 1974 encodeUTF8(0x2029, Storage); 1975 break; 1976 case 'x': { 1977 if (UnquotedValue.size() < 3) 1978 // TODO: Report error. 1979 break; 1980 unsigned int UnicodeScalarValue; 1981 if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue)) 1982 // TODO: Report error. 1983 UnicodeScalarValue = 0xFFFD; 1984 encodeUTF8(UnicodeScalarValue, Storage); 1985 UnquotedValue = UnquotedValue.substr(2); 1986 break; 1987 } 1988 case 'u': { 1989 if (UnquotedValue.size() < 5) 1990 // TODO: Report error. 1991 break; 1992 unsigned int UnicodeScalarValue; 1993 if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue)) 1994 // TODO: Report error. 
1995 UnicodeScalarValue = 0xFFFD; 1996 encodeUTF8(UnicodeScalarValue, Storage); 1997 UnquotedValue = UnquotedValue.substr(4); 1998 break; 1999 } 2000 case 'U': { 2001 if (UnquotedValue.size() < 9) 2002 // TODO: Report error. 2003 break; 2004 unsigned int UnicodeScalarValue; 2005 if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue)) 2006 // TODO: Report error. 2007 UnicodeScalarValue = 0xFFFD; 2008 encodeUTF8(UnicodeScalarValue, Storage); 2009 UnquotedValue = UnquotedValue.substr(8); 2010 break; 2011 } 2012 } 2013 UnquotedValue = UnquotedValue.substr(1); 2014 } 2015 } 2016 Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); 2017 return StringRef(Storage.begin(), Storage.size()); 2018 } 2019 2020 Node *KeyValueNode::getKey() { 2021 if (Key) 2022 return Key; 2023 // Handle implicit null keys. 2024 { 2025 Token &t = peekNext(); 2026 if ( t.Kind == Token::TK_BlockEnd 2027 || t.Kind == Token::TK_Value 2028 || t.Kind == Token::TK_Error) { 2029 return Key = new (getAllocator()) NullNode(Doc); 2030 } 2031 if (t.Kind == Token::TK_Key) 2032 getNext(); // skip TK_Key. 2033 } 2034 2035 // Handle explicit null keys. 2036 Token &t = peekNext(); 2037 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) { 2038 return Key = new (getAllocator()) NullNode(Doc); 2039 } 2040 2041 // We've got a normal key. 2042 return Key = parseBlockNode(); 2043 } 2044 2045 Node *KeyValueNode::getValue() { 2046 if (Value) 2047 return Value; 2048 getKey()->skip(); 2049 if (failed()) 2050 return Value = new (getAllocator()) NullNode(Doc); 2051 2052 // Handle implicit null values. 
// t is scoped to this block: it refers to the token seen before getNext().
  {
    Token &t = peekNext();
    if (   t.Kind == Token::TK_BlockEnd
        || t.Kind == Token::TK_FlowMappingEnd
        || t.Kind == Token::TK_Key
        || t.Kind == Token::TK_FlowEntry
        || t.Kind == Token::TK_Error) {
      return Value = new (getAllocator()) NullNode(Doc);
    }

    if (t.Kind != Token::TK_Value) {
      setError("Unexpected token in Key Value.", t);
      return Value = new (getAllocator()) NullNode(Doc);
    }
    getNext(); // skip TK_Value.
  }

  // Handle explicit null values.
  Token &t = peekNext();
  if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) {
    return Value = new (getAllocator()) NullNode(Doc);
  }

  // We got a normal value.
  return Value = parseBlockNode();
}

/// Advance to the next key/value entry of the mapping.
///
/// The current entry, if any, is skipped first. At the end of the mapping, or
/// on error, IsAtEnd is set and CurrentEntry is cleared.
void MappingNode::increment() {
  if (failed()) {
    IsAtEnd = true;
    CurrentEntry = nullptr;
    return;
  }
  if (CurrentEntry) {
    CurrentEntry->skip();
    // An inline mapping ends after its first entry.
    if (Type == MT_Inline) {
      IsAtEnd = true;
      CurrentEntry = nullptr;
      return;
    }
  }
  Token T = peekNext();
  if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) {
    // KeyValueNode eats the TK_Key. That way it can detect null keys.
    CurrentEntry = new (getAllocator()) KeyValueNode(Doc);
  } else if (Type == MT_Block) {
    switch (T.Kind) {
    case Token::TK_BlockEnd:
      getNext();
      IsAtEnd = true;
      CurrentEntry = nullptr;
      break;
    default:
      setError("Unexpected token. Expected Key or Block End", T);
      // Falls through to the shared end-of-iteration handling.
    case Token::TK_Error:
      IsAtEnd = true;
      CurrentEntry = nullptr;
    }
  } else {
    switch (T.Kind) {
    case Token::TK_FlowEntry:
      // Eat the flow entry and recurse.
      getNext();
      return increment();
    case Token::TK_FlowMappingEnd:
      getNext();
      // Falls through after consuming the closing token.
    case Token::TK_Error:
      // Set this to end iterator.
      IsAtEnd = true;
      CurrentEntry = nullptr;
      break;
    default:
      setError( "Unexpected token. Expected Key, Flow Entry, or Flow "
                "Mapping End."
              , T);
      IsAtEnd = true;
      CurrentEntry = nullptr;
    }
  }
}

/// Advance to the next entry of the sequence.
///
/// The current entry, if any, is skipped first. At the end of the sequence,
/// or on error, IsAtEnd is set and CurrentEntry is cleared.
void SequenceNode::increment() {
  if (failed()) {
    IsAtEnd = true;
    CurrentEntry = nullptr;
    return;
  }
  if (CurrentEntry)
    CurrentEntry->skip();
  Token T = peekNext();
  if (SeqType == ST_Block) {
    switch (T.Kind) {
    case Token::TK_BlockEntry:
      getNext();
      CurrentEntry = parseBlockNode();
      if (!CurrentEntry) { // An error occurred.
        IsAtEnd = true;
        CurrentEntry = nullptr;
      }
      break;
    case Token::TK_BlockEnd:
      getNext();
      IsAtEnd = true;
      CurrentEntry = nullptr;
      break;
    default:
      setError( "Unexpected token. Expected Block Entry or Block End."
              , T);
      // Falls through to the shared end-of-iteration handling.
    case Token::TK_Error:
      IsAtEnd = true;
      CurrentEntry = nullptr;
    }
  } else if (SeqType == ST_Indentless) {
    switch (T.Kind) {
    case Token::TK_BlockEntry:
      getNext();
      CurrentEntry = parseBlockNode();
      if (!CurrentEntry) { // An error occurred.
        IsAtEnd = true;
        CurrentEntry = nullptr;
      }
      break;
    // Any other token ends an indentless sequence without an error.
    default:
    case Token::TK_Error:
      IsAtEnd = true;
      CurrentEntry = nullptr;
    }
  } else if (SeqType == ST_Flow) {
    switch (T.Kind) {
    case Token::TK_FlowEntry:
      // Eat the flow entry and recurse.
      getNext();
      WasPreviousTokenFlowEntry = true;
      return increment();
    case Token::TK_FlowSequenceEnd:
      getNext();
      // Falls through after consuming the closing token.
    case Token::TK_Error:
      // Set this to end iterator.
      IsAtEnd = true;
      CurrentEntry = nullptr;
      break;
    case Token::TK_StreamEnd:
    case Token::TK_DocumentEnd:
    case Token::TK_DocumentStart:
      setError("Could not find closing ]!", T);
      // Set this to end iterator.
2199 IsAtEnd = true; 2200 CurrentEntry = nullptr; 2201 break; 2202 default: 2203 if (!WasPreviousTokenFlowEntry) { 2204 setError("Expected , between entries!", T); 2205 IsAtEnd = true; 2206 CurrentEntry = nullptr; 2207 break; 2208 } 2209 // Otherwise it must be a flow entry. 2210 CurrentEntry = parseBlockNode(); 2211 if (!CurrentEntry) { 2212 IsAtEnd = true; 2213 } 2214 WasPreviousTokenFlowEntry = false; 2215 break; 2216 } 2217 } 2218 } 2219 2220 Document::Document(Stream &S) : stream(S), Root(nullptr) { 2221 // Tag maps starts with two default mappings. 2222 TagMap["!"] = "!"; 2223 TagMap["!!"] = "tag:yaml.org,2002:"; 2224 2225 if (parseDirectives()) 2226 expectToken(Token::TK_DocumentStart); 2227 Token &T = peekNext(); 2228 if (T.Kind == Token::TK_DocumentStart) 2229 getNext(); 2230 } 2231 2232 bool Document::skip() { 2233 if (stream.scanner->failed()) 2234 return false; 2235 if (!Root) 2236 getRoot(); 2237 Root->skip(); 2238 Token &T = peekNext(); 2239 if (T.Kind == Token::TK_StreamEnd) 2240 return false; 2241 if (T.Kind == Token::TK_DocumentEnd) { 2242 getNext(); 2243 return skip(); 2244 } 2245 return true; 2246 } 2247 2248 Token &Document::peekNext() { 2249 return stream.scanner->peekNext(); 2250 } 2251 2252 Token Document::getNext() { 2253 return stream.scanner->getNext(); 2254 } 2255 2256 void Document::setError(const Twine &Message, Token &Location) const { 2257 stream.scanner->setError(Message, Location.Range.begin()); 2258 } 2259 2260 bool Document::failed() const { 2261 return stream.scanner->failed(); 2262 } 2263 2264 Node *Document::parseBlockNode() { 2265 Token T = peekNext(); 2266 // Handle properties. 
2267 Token AnchorInfo; 2268 Token TagInfo; 2269 parse_property: 2270 switch (T.Kind) { 2271 case Token::TK_Alias: 2272 getNext(); 2273 return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1)); 2274 case Token::TK_Anchor: 2275 if (AnchorInfo.Kind == Token::TK_Anchor) { 2276 setError("Already encountered an anchor for this node!", T); 2277 return nullptr; 2278 } 2279 AnchorInfo = getNext(); // Consume TK_Anchor. 2280 T = peekNext(); 2281 goto parse_property; 2282 case Token::TK_Tag: 2283 if (TagInfo.Kind == Token::TK_Tag) { 2284 setError("Already encountered a tag for this node!", T); 2285 return nullptr; 2286 } 2287 TagInfo = getNext(); // Consume TK_Tag. 2288 T = peekNext(); 2289 goto parse_property; 2290 default: 2291 break; 2292 } 2293 2294 switch (T.Kind) { 2295 case Token::TK_BlockEntry: 2296 // We got an unindented BlockEntry sequence. This is not terminated with 2297 // a BlockEnd. 2298 // Don't eat the TK_BlockEntry, SequenceNode needs it. 2299 return new (NodeAllocator) SequenceNode( stream.CurrentDoc 2300 , AnchorInfo.Range.substr(1) 2301 , TagInfo.Range 2302 , SequenceNode::ST_Indentless); 2303 case Token::TK_BlockSequenceStart: 2304 getNext(); 2305 return new (NodeAllocator) 2306 SequenceNode( stream.CurrentDoc 2307 , AnchorInfo.Range.substr(1) 2308 , TagInfo.Range 2309 , SequenceNode::ST_Block); 2310 case Token::TK_BlockMappingStart: 2311 getNext(); 2312 return new (NodeAllocator) 2313 MappingNode( stream.CurrentDoc 2314 , AnchorInfo.Range.substr(1) 2315 , TagInfo.Range 2316 , MappingNode::MT_Block); 2317 case Token::TK_FlowSequenceStart: 2318 getNext(); 2319 return new (NodeAllocator) 2320 SequenceNode( stream.CurrentDoc 2321 , AnchorInfo.Range.substr(1) 2322 , TagInfo.Range 2323 , SequenceNode::ST_Flow); 2324 case Token::TK_FlowMappingStart: 2325 getNext(); 2326 return new (NodeAllocator) 2327 MappingNode( stream.CurrentDoc 2328 , AnchorInfo.Range.substr(1) 2329 , TagInfo.Range 2330 , MappingNode::MT_Flow); 2331 case 
Token::TK_Scalar: 2332 getNext(); 2333 return new (NodeAllocator) 2334 ScalarNode( stream.CurrentDoc 2335 , AnchorInfo.Range.substr(1) 2336 , TagInfo.Range 2337 , T.Range); 2338 case Token::TK_BlockScalar: { 2339 getNext(); 2340 StringRef NullTerminatedStr(T.Value.c_str(), T.Value.length() + 1); 2341 StringRef StrCopy = NullTerminatedStr.copy(NodeAllocator).drop_back(); 2342 return new (NodeAllocator) 2343 BlockScalarNode(stream.CurrentDoc, AnchorInfo.Range.substr(1), 2344 TagInfo.Range, StrCopy, T.Range); 2345 } 2346 case Token::TK_Key: 2347 // Don't eat the TK_Key, KeyValueNode expects it. 2348 return new (NodeAllocator) 2349 MappingNode( stream.CurrentDoc 2350 , AnchorInfo.Range.substr(1) 2351 , TagInfo.Range 2352 , MappingNode::MT_Inline); 2353 case Token::TK_DocumentStart: 2354 case Token::TK_DocumentEnd: 2355 case Token::TK_StreamEnd: 2356 default: 2357 // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not 2358 // !!null null. 2359 return new (NodeAllocator) NullNode(stream.CurrentDoc); 2360 case Token::TK_Error: 2361 return nullptr; 2362 } 2363 llvm_unreachable("Control flow shouldn't reach here."); 2364 return nullptr; 2365 } 2366 2367 bool Document::parseDirectives() { 2368 bool isDirective = false; 2369 while (true) { 2370 Token T = peekNext(); 2371 if (T.Kind == Token::TK_TagDirective) { 2372 parseTAGDirective(); 2373 isDirective = true; 2374 } else if (T.Kind == Token::TK_VersionDirective) { 2375 parseYAMLDirective(); 2376 isDirective = true; 2377 } else 2378 break; 2379 } 2380 return isDirective; 2381 } 2382 2383 void Document::parseYAMLDirective() { 2384 getNext(); // Eat %YAML <version> 2385 } 2386 2387 void Document::parseTAGDirective() { 2388 Token Tag = getNext(); // %TAG <handle> <prefix> 2389 StringRef T = Tag.Range; 2390 // Strip %TAG 2391 T = T.substr(T.find_first_of(" \t")).ltrim(" \t"); 2392 std::size_t HandleEnd = T.find_first_of(" \t"); 2393 StringRef TagHandle = T.substr(0, HandleEnd); 2394 StringRef TagPrefix = 
T.substr(HandleEnd).ltrim(" \t"); 2395 TagMap[TagHandle] = TagPrefix; 2396 } 2397 2398 bool Document::expectToken(int TK) { 2399 Token T = getNext(); 2400 if (T.Kind != TK) { 2401 setError("Unexpected token", T); 2402 return false; 2403 } 2404 return true; 2405 } 2406