//===--- YAMLParser.cpp - Simple YAML parser ------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//  This file implements a YAML parser.
//
//===----------------------------------------------------------------------===//

#include "llvm/Support/YAMLParser.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/ilist.h"
#include "llvm/ADT/ilist_node.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;
using namespace yaml;

/// UnicodeEncodingForm - The Unicode encoding forms that can be reported for
/// an input, plus a marker for input that matches none of them.
enum UnicodeEncodingForm {
  UEF_UTF32_LE, ///< UTF-32 Little Endian
  UEF_UTF32_BE, ///< UTF-32 Big Endian
  UEF_UTF16_LE, ///< UTF-16 Little Endian
  UEF_UTF16_BE, ///< UTF-16 Big Endian
  UEF_UTF8,     ///< UTF-8 or ascii.
  UEF_Unknown   ///< Not a valid Unicode encoding.
};

/// EncodingInfo - Holds the encoding type and length of the byte order mark if
///                it exists. Length is in {0, 2, 3, 4}; 0 means that no byte
///                order mark is present.
typedef std::pair<UnicodeEncodingForm, unsigned> EncodingInfo;

/// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode
///                      encoding form of \a Input.
///
/// @param Input A string of length 0 or more.
/// @returns An EncodingInfo indicating the Unicode encoding form of the input
///          and how long the byte order mark is if one exists.
47 static EncodingInfo getUnicodeEncoding(StringRef Input) { 48 if (Input.size() == 0) 49 return std::make_pair(UEF_Unknown, 0); 50 51 switch (uint8_t(Input[0])) { 52 case 0x00: 53 if (Input.size() >= 4) { 54 if ( Input[1] == 0 55 && uint8_t(Input[2]) == 0xFE 56 && uint8_t(Input[3]) == 0xFF) 57 return std::make_pair(UEF_UTF32_BE, 4); 58 if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0) 59 return std::make_pair(UEF_UTF32_BE, 0); 60 } 61 62 if (Input.size() >= 2 && Input[1] != 0) 63 return std::make_pair(UEF_UTF16_BE, 0); 64 return std::make_pair(UEF_Unknown, 0); 65 case 0xFF: 66 if ( Input.size() >= 4 67 && uint8_t(Input[1]) == 0xFE 68 && Input[2] == 0 69 && Input[3] == 0) 70 return std::make_pair(UEF_UTF32_LE, 4); 71 72 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE) 73 return std::make_pair(UEF_UTF16_LE, 2); 74 return std::make_pair(UEF_Unknown, 0); 75 case 0xFE: 76 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF) 77 return std::make_pair(UEF_UTF16_BE, 2); 78 return std::make_pair(UEF_Unknown, 0); 79 case 0xEF: 80 if ( Input.size() >= 3 81 && uint8_t(Input[1]) == 0xBB 82 && uint8_t(Input[2]) == 0xBF) 83 return std::make_pair(UEF_UTF8, 3); 84 return std::make_pair(UEF_Unknown, 0); 85 } 86 87 // It could still be utf-32 or utf-16. 88 if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0) 89 return std::make_pair(UEF_UTF32_LE, 0); 90 91 if (Input.size() >= 2 && Input[1] == 0) 92 return std::make_pair(UEF_UTF16_LE, 0); 93 94 return std::make_pair(UEF_UTF8, 0); 95 } 96 97 namespace llvm { 98 namespace yaml { 99 /// Token - A single YAML token. 100 struct Token : ilist_node<Token> { 101 enum TokenKind { 102 TK_Error, // Uninitialized token. 
103 TK_StreamStart, 104 TK_StreamEnd, 105 TK_VersionDirective, 106 TK_TagDirective, 107 TK_DocumentStart, 108 TK_DocumentEnd, 109 TK_BlockEntry, 110 TK_BlockEnd, 111 TK_BlockSequenceStart, 112 TK_BlockMappingStart, 113 TK_FlowEntry, 114 TK_FlowSequenceStart, 115 TK_FlowSequenceEnd, 116 TK_FlowMappingStart, 117 TK_FlowMappingEnd, 118 TK_Key, 119 TK_Value, 120 TK_Scalar, 121 TK_Alias, 122 TK_Anchor, 123 TK_Tag 124 } Kind; 125 126 /// A string of length 0 or more whose begin() points to the logical location 127 /// of the token in the input. 128 StringRef Range; 129 130 Token() : Kind(TK_Error) {} 131 }; 132 } 133 } 134 135 namespace llvm { 136 template<> 137 struct ilist_sentinel_traits<Token> { 138 Token *createSentinel() const { 139 return &Sentinel; 140 } 141 static void destroySentinel(Token*) {} 142 143 Token *provideInitialHead() const { return createSentinel(); } 144 Token *ensureHead(Token*) const { return createSentinel(); } 145 static void noteHead(Token*, Token*) {} 146 147 private: 148 mutable Token Sentinel; 149 }; 150 151 template<> 152 struct ilist_node_traits<Token> { 153 Token *createNode(const Token &V) { 154 return new (Alloc.Allocate<Token>()) Token(V); 155 } 156 static void deleteNode(Token *V) {} 157 158 void addNodeToList(Token *) {} 159 void removeNodeFromList(Token *) {} 160 void transferNodesFromList(ilist_node_traits & /*SrcTraits*/, 161 ilist_iterator<Token> /*first*/, 162 ilist_iterator<Token> /*last*/) {} 163 164 BumpPtrAllocator Alloc; 165 }; 166 } 167 168 typedef ilist<Token> TokenQueueT; 169 170 namespace { 171 /// @brief This struct is used to track simple keys. 172 /// 173 /// Simple keys are handled by creating an entry in SimpleKeys for each Token 174 /// which could legally be the start of a simple key. 
When peekNext is called, 175 /// if the Token To be returned is referenced by a SimpleKey, we continue 176 /// tokenizing until that potential simple key has either been found to not be 177 /// a simple key (we moved on to the next line or went further than 1024 chars). 178 /// Or when we run into a Value, and then insert a Key token (and possibly 179 /// others) before the SimpleKey's Tok. 180 struct SimpleKey { 181 TokenQueueT::iterator Tok; 182 unsigned Column; 183 unsigned Line; 184 unsigned FlowLevel; 185 bool IsRequired; 186 187 bool operator ==(const SimpleKey &Other) { 188 return Tok == Other.Tok; 189 } 190 }; 191 } 192 193 /// @brief The Unicode scalar value of a UTF-8 minimal well-formed code unit 194 /// subsequence and the subsequence's length in code units (uint8_t). 195 /// A length of 0 represents an error. 196 typedef std::pair<uint32_t, unsigned> UTF8Decoded; 197 198 static UTF8Decoded decodeUTF8(StringRef Range) { 199 StringRef::iterator Position= Range.begin(); 200 StringRef::iterator End = Range.end(); 201 // 1 byte: [0x00, 0x7f] 202 // Bit pattern: 0xxxxxxx 203 if ((*Position & 0x80) == 0) { 204 return std::make_pair(*Position, 1); 205 } 206 // 2 bytes: [0x80, 0x7ff] 207 // Bit pattern: 110xxxxx 10xxxxxx 208 if (Position + 1 != End && 209 ((*Position & 0xE0) == 0xC0) && 210 ((*(Position + 1) & 0xC0) == 0x80)) { 211 uint32_t codepoint = ((*Position & 0x1F) << 6) | 212 (*(Position + 1) & 0x3F); 213 if (codepoint >= 0x80) 214 return std::make_pair(codepoint, 2); 215 } 216 // 3 bytes: [0x8000, 0xffff] 217 // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx 218 if (Position + 2 != End && 219 ((*Position & 0xF0) == 0xE0) && 220 ((*(Position + 1) & 0xC0) == 0x80) && 221 ((*(Position + 2) & 0xC0) == 0x80)) { 222 uint32_t codepoint = ((*Position & 0x0F) << 12) | 223 ((*(Position + 1) & 0x3F) << 6) | 224 (*(Position + 2) & 0x3F); 225 // Codepoints between 0xD800 and 0xDFFF are invalid, as 226 // they are high / low surrogate halves used by UTF-16. 
227 if (codepoint >= 0x800 && 228 (codepoint < 0xD800 || codepoint > 0xDFFF)) 229 return std::make_pair(codepoint, 3); 230 } 231 // 4 bytes: [0x10000, 0x10FFFF] 232 // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 233 if (Position + 3 != End && 234 ((*Position & 0xF8) == 0xF0) && 235 ((*(Position + 1) & 0xC0) == 0x80) && 236 ((*(Position + 2) & 0xC0) == 0x80) && 237 ((*(Position + 3) & 0xC0) == 0x80)) { 238 uint32_t codepoint = ((*Position & 0x07) << 18) | 239 ((*(Position + 1) & 0x3F) << 12) | 240 ((*(Position + 2) & 0x3F) << 6) | 241 (*(Position + 3) & 0x3F); 242 if (codepoint >= 0x10000 && codepoint <= 0x10FFFF) 243 return std::make_pair(codepoint, 4); 244 } 245 return std::make_pair(0, 0); 246 } 247 248 namespace llvm { 249 namespace yaml { 250 /// @brief Scans YAML tokens from a MemoryBuffer. 251 class Scanner { 252 public: 253 Scanner(const StringRef Input, SourceMgr &SM); 254 Scanner(MemoryBuffer *Buffer, SourceMgr &SM_); 255 256 /// @brief Parse the next token and return it without popping it. 257 Token &peekNext(); 258 259 /// @brief Parse the next token and pop it from the queue. 260 Token getNext(); 261 262 void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message, 263 ArrayRef<SMRange> Ranges = None) { 264 SM.PrintMessage(Loc, Kind, Message, Ranges); 265 } 266 267 void setError(const Twine &Message, StringRef::iterator Position) { 268 if (Current >= End) 269 Current = End - 1; 270 271 // Don't print out more errors after the first one we encounter. The rest 272 // are just the result of the first, and have no meaning. 273 if (!Failed) 274 printError(SMLoc::getFromPointer(Current), SourceMgr::DK_Error, Message); 275 Failed = true; 276 } 277 278 void setError(const Twine &Message) { 279 setError(Message, Current); 280 } 281 282 /// @brief Returns true if an error occurred while parsing. 
283 bool failed() { 284 return Failed; 285 } 286 287 private: 288 StringRef currentInput() { 289 return StringRef(Current, End - Current); 290 } 291 292 /// @brief Decode a UTF-8 minimal well-formed code unit subsequence starting 293 /// at \a Position. 294 /// 295 /// If the UTF-8 code units starting at Position do not form a well-formed 296 /// code unit subsequence, then the Unicode scalar value is 0, and the length 297 /// is 0. 298 UTF8Decoded decodeUTF8(StringRef::iterator Position) { 299 return ::decodeUTF8(StringRef(Position, End - Position)); 300 } 301 302 // The following functions are based on the gramar rules in the YAML spec. The 303 // style of the function names it meant to closely match how they are written 304 // in the spec. The number within the [] is the number of the grammar rule in 305 // the spec. 306 // 307 // See 4.2 [Production Naming Conventions] for the meaning of the prefixes. 308 // 309 // c- 310 // A production starting and ending with a special character. 311 // b- 312 // A production matching a single line break. 313 // nb- 314 // A production starting and ending with a non-break character. 315 // s- 316 // A production starting and ending with a white space character. 317 // ns- 318 // A production starting and ending with a non-space character. 319 // l- 320 // A production matching complete line(s). 321 322 /// @brief Skip a single nb-char[27] starting at Position. 323 /// 324 /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE] 325 /// | [0xFF00-0xFFFD] | [0x10000-0x10FFFF] 326 /// 327 /// @returns The code unit after the nb-char, or Position if it's not an 328 /// nb-char. 329 StringRef::iterator skip_nb_char(StringRef::iterator Position); 330 331 /// @brief Skip a single b-break[28] starting at Position. 332 /// 333 /// A b-break is 0xD 0xA | 0xD | 0xA 334 /// 335 /// @returns The code unit after the b-break, or Position if it's not a 336 /// b-break. 
337 StringRef::iterator skip_b_break(StringRef::iterator Position); 338 339 /// @brief Skip a single s-white[33] starting at Position. 340 /// 341 /// A s-white is 0x20 | 0x9 342 /// 343 /// @returns The code unit after the s-white, or Position if it's not a 344 /// s-white. 345 StringRef::iterator skip_s_white(StringRef::iterator Position); 346 347 /// @brief Skip a single ns-char[34] starting at Position. 348 /// 349 /// A ns-char is nb-char - s-white 350 /// 351 /// @returns The code unit after the ns-char, or Position if it's not a 352 /// ns-char. 353 StringRef::iterator skip_ns_char(StringRef::iterator Position); 354 355 typedef StringRef::iterator (Scanner::*SkipWhileFunc)(StringRef::iterator); 356 /// @brief Skip minimal well-formed code unit subsequences until Func 357 /// returns its input. 358 /// 359 /// @returns The code unit after the last minimal well-formed code unit 360 /// subsequence that Func accepted. 361 StringRef::iterator skip_while( SkipWhileFunc Func 362 , StringRef::iterator Position); 363 364 /// @brief Scan ns-uri-char[39]s starting at Cur. 365 /// 366 /// This updates Cur and Column while scanning. 367 /// 368 /// @returns A StringRef starting at Cur which covers the longest contiguous 369 /// sequence of ns-uri-char. 370 StringRef scan_ns_uri_char(); 371 372 /// @brief Scan ns-plain-one-line[133] starting at \a Cur. 373 StringRef scan_ns_plain_one_line(); 374 375 /// @brief Consume a minimal well-formed code unit subsequence starting at 376 /// \a Cur. Return false if it is not the same Unicode scalar value as 377 /// \a Expected. This updates \a Column. 378 bool consume(uint32_t Expected); 379 380 /// @brief Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column. 
381 void skip(uint32_t Distance); 382 383 /// @brief Return true if the minimal well-formed code unit subsequence at 384 /// Pos is whitespace or a new line 385 bool isBlankOrBreak(StringRef::iterator Position); 386 387 /// @brief If IsSimpleKeyAllowed, create and push_back a new SimpleKey. 388 void saveSimpleKeyCandidate( TokenQueueT::iterator Tok 389 , unsigned AtColumn 390 , bool IsRequired); 391 392 /// @brief Remove simple keys that can no longer be valid simple keys. 393 /// 394 /// Invalid simple keys are not on the current line or are further than 1024 395 /// columns back. 396 void removeStaleSimpleKeyCandidates(); 397 398 /// @brief Remove all simple keys on FlowLevel \a Level. 399 void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level); 400 401 /// @brief Unroll indentation in \a Indents back to \a Col. Creates BlockEnd 402 /// tokens if needed. 403 bool unrollIndent(int ToColumn); 404 405 /// @brief Increase indent to \a Col. Creates \a Kind token at \a InsertPoint 406 /// if needed. 407 bool rollIndent( int ToColumn 408 , Token::TokenKind Kind 409 , TokenQueueT::iterator InsertPoint); 410 411 /// @brief Skip whitespace and comments until the start of the next token. 412 void scanToNextToken(); 413 414 /// @brief Must be the first token generated. 415 bool scanStreamStart(); 416 417 /// @brief Generate tokens needed to close out the stream. 418 bool scanStreamEnd(); 419 420 /// @brief Scan a %BLAH directive. 421 bool scanDirective(); 422 423 /// @brief Scan a ... or ---. 424 bool scanDocumentIndicator(bool IsStart); 425 426 /// @brief Scan a [ or { and generate the proper flow collection start token. 427 bool scanFlowCollectionStart(bool IsSequence); 428 429 /// @brief Scan a ] or } and generate the proper flow collection end token. 430 bool scanFlowCollectionEnd(bool IsSequence); 431 432 /// @brief Scan the , that separates entries in a flow collection. 433 bool scanFlowEntry(); 434 435 /// @brief Scan the - that starts block sequence entries. 
436 bool scanBlockEntry(); 437 438 /// @brief Scan an explicit ? indicating a key. 439 bool scanKey(); 440 441 /// @brief Scan an explicit : indicating a value. 442 bool scanValue(); 443 444 /// @brief Scan a quoted scalar. 445 bool scanFlowScalar(bool IsDoubleQuoted); 446 447 /// @brief Scan an unquoted scalar. 448 bool scanPlainScalar(); 449 450 /// @brief Scan an Alias or Anchor starting with * or &. 451 bool scanAliasOrAnchor(bool IsAlias); 452 453 /// @brief Scan a block scalar starting with | or >. 454 bool scanBlockScalar(bool IsLiteral); 455 456 /// @brief Scan a tag of the form !stuff. 457 bool scanTag(); 458 459 /// @brief Dispatch to the next scanning function based on \a *Cur. 460 bool fetchMoreTokens(); 461 462 /// @brief The SourceMgr used for diagnostics and buffer management. 463 SourceMgr &SM; 464 465 /// @brief The original input. 466 MemoryBuffer *InputBuffer; 467 468 /// @brief The current position of the scanner. 469 StringRef::iterator Current; 470 471 /// @brief The end of the input (one past the last character). 472 StringRef::iterator End; 473 474 /// @brief Current YAML indentation level in spaces. 475 int Indent; 476 477 /// @brief Current column number in Unicode code points. 478 unsigned Column; 479 480 /// @brief Current line number. 481 unsigned Line; 482 483 /// @brief How deep we are in flow style containers. 0 Means at block level. 484 unsigned FlowLevel; 485 486 /// @brief Are we at the start of the stream? 487 bool IsStartOfStream; 488 489 /// @brief Can the next token be the start of a simple key? 490 bool IsSimpleKeyAllowed; 491 492 /// @brief True if an error has occurred. 493 bool Failed; 494 495 /// @brief Queue of tokens. This is required to queue up tokens while looking 496 /// for the end of a simple key. And for cases where a single character 497 /// can produce multiple tokens (e.g. BlockEnd). 498 TokenQueueT TokenQueue; 499 500 /// @brief Indentation levels. 
501 SmallVector<int, 4> Indents; 502 503 /// @brief Potential simple keys. 504 SmallVector<SimpleKey, 4> SimpleKeys; 505 }; 506 507 } // end namespace yaml 508 } // end namespace llvm 509 510 /// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result. 511 static void encodeUTF8( uint32_t UnicodeScalarValue 512 , SmallVectorImpl<char> &Result) { 513 if (UnicodeScalarValue <= 0x7F) { 514 Result.push_back(UnicodeScalarValue & 0x7F); 515 } else if (UnicodeScalarValue <= 0x7FF) { 516 uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6); 517 uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F); 518 Result.push_back(FirstByte); 519 Result.push_back(SecondByte); 520 } else if (UnicodeScalarValue <= 0xFFFF) { 521 uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12); 522 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 523 uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F); 524 Result.push_back(FirstByte); 525 Result.push_back(SecondByte); 526 Result.push_back(ThirdByte); 527 } else if (UnicodeScalarValue <= 0x10FFFF) { 528 uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18); 529 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12); 530 uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 531 uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F); 532 Result.push_back(FirstByte); 533 Result.push_back(SecondByte); 534 Result.push_back(ThirdByte); 535 Result.push_back(FourthByte); 536 } 537 } 538 539 bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) { 540 SourceMgr SM; 541 Scanner scanner(Input, SM); 542 while (true) { 543 Token T = scanner.getNext(); 544 switch (T.Kind) { 545 case Token::TK_StreamStart: 546 OS << "Stream-Start: "; 547 break; 548 case Token::TK_StreamEnd: 549 OS << "Stream-End: "; 550 break; 551 case Token::TK_VersionDirective: 552 OS << "Version-Directive: "; 553 break; 554 case Token::TK_TagDirective: 555 OS << "Tag-Directive: "; 556 break; 557 
case Token::TK_DocumentStart: 558 OS << "Document-Start: "; 559 break; 560 case Token::TK_DocumentEnd: 561 OS << "Document-End: "; 562 break; 563 case Token::TK_BlockEntry: 564 OS << "Block-Entry: "; 565 break; 566 case Token::TK_BlockEnd: 567 OS << "Block-End: "; 568 break; 569 case Token::TK_BlockSequenceStart: 570 OS << "Block-Sequence-Start: "; 571 break; 572 case Token::TK_BlockMappingStart: 573 OS << "Block-Mapping-Start: "; 574 break; 575 case Token::TK_FlowEntry: 576 OS << "Flow-Entry: "; 577 break; 578 case Token::TK_FlowSequenceStart: 579 OS << "Flow-Sequence-Start: "; 580 break; 581 case Token::TK_FlowSequenceEnd: 582 OS << "Flow-Sequence-End: "; 583 break; 584 case Token::TK_FlowMappingStart: 585 OS << "Flow-Mapping-Start: "; 586 break; 587 case Token::TK_FlowMappingEnd: 588 OS << "Flow-Mapping-End: "; 589 break; 590 case Token::TK_Key: 591 OS << "Key: "; 592 break; 593 case Token::TK_Value: 594 OS << "Value: "; 595 break; 596 case Token::TK_Scalar: 597 OS << "Scalar: "; 598 break; 599 case Token::TK_Alias: 600 OS << "Alias: "; 601 break; 602 case Token::TK_Anchor: 603 OS << "Anchor: "; 604 break; 605 case Token::TK_Tag: 606 OS << "Tag: "; 607 break; 608 case Token::TK_Error: 609 break; 610 } 611 OS << T.Range << "\n"; 612 if (T.Kind == Token::TK_StreamEnd) 613 break; 614 else if (T.Kind == Token::TK_Error) 615 return false; 616 } 617 return true; 618 } 619 620 bool yaml::scanTokens(StringRef Input) { 621 llvm::SourceMgr SM; 622 llvm::yaml::Scanner scanner(Input, SM); 623 for (;;) { 624 llvm::yaml::Token T = scanner.getNext(); 625 if (T.Kind == Token::TK_StreamEnd) 626 break; 627 else if (T.Kind == Token::TK_Error) 628 return false; 629 } 630 return true; 631 } 632 633 std::string yaml::escape(StringRef Input) { 634 std::string EscapedInput; 635 for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) { 636 if (*i == '\\') 637 EscapedInput += "\\\\"; 638 else if (*i == '"') 639 EscapedInput += "\\\""; 640 else if (*i == 0) 641 
EscapedInput += "\\0"; 642 else if (*i == 0x07) 643 EscapedInput += "\\a"; 644 else if (*i == 0x08) 645 EscapedInput += "\\b"; 646 else if (*i == 0x09) 647 EscapedInput += "\\t"; 648 else if (*i == 0x0A) 649 EscapedInput += "\\n"; 650 else if (*i == 0x0B) 651 EscapedInput += "\\v"; 652 else if (*i == 0x0C) 653 EscapedInput += "\\f"; 654 else if (*i == 0x0D) 655 EscapedInput += "\\r"; 656 else if (*i == 0x1B) 657 EscapedInput += "\\e"; 658 else if ((unsigned char)*i < 0x20) { // Control characters not handled above. 659 std::string HexStr = utohexstr(*i); 660 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 661 } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence. 662 UTF8Decoded UnicodeScalarValue 663 = decodeUTF8(StringRef(i, Input.end() - i)); 664 if (UnicodeScalarValue.second == 0) { 665 // Found invalid char. 666 SmallString<4> Val; 667 encodeUTF8(0xFFFD, Val); 668 EscapedInput.insert(EscapedInput.end(), Val.begin(), Val.end()); 669 // FIXME: Error reporting. 
670 return EscapedInput; 671 } 672 if (UnicodeScalarValue.first == 0x85) 673 EscapedInput += "\\N"; 674 else if (UnicodeScalarValue.first == 0xA0) 675 EscapedInput += "\\_"; 676 else if (UnicodeScalarValue.first == 0x2028) 677 EscapedInput += "\\L"; 678 else if (UnicodeScalarValue.first == 0x2029) 679 EscapedInput += "\\P"; 680 else { 681 std::string HexStr = utohexstr(UnicodeScalarValue.first); 682 if (HexStr.size() <= 2) 683 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 684 else if (HexStr.size() <= 4) 685 EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr; 686 else if (HexStr.size() <= 8) 687 EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr; 688 } 689 i += UnicodeScalarValue.second - 1; 690 } else 691 EscapedInput.push_back(*i); 692 } 693 return EscapedInput; 694 } 695 696 Scanner::Scanner(StringRef Input, SourceMgr &sm) 697 : SM(sm) 698 , Indent(-1) 699 , Column(0) 700 , Line(0) 701 , FlowLevel(0) 702 , IsStartOfStream(true) 703 , IsSimpleKeyAllowed(true) 704 , Failed(false) { 705 InputBuffer = MemoryBuffer::getMemBuffer(Input, "YAML"); 706 SM.AddNewSourceBuffer(InputBuffer, SMLoc()); 707 Current = InputBuffer->getBufferStart(); 708 End = InputBuffer->getBufferEnd(); 709 } 710 711 Scanner::Scanner(MemoryBuffer *Buffer, SourceMgr &SM_) 712 : SM(SM_) 713 , InputBuffer(Buffer) 714 , Current(InputBuffer->getBufferStart()) 715 , End(InputBuffer->getBufferEnd()) 716 , Indent(-1) 717 , Column(0) 718 , Line(0) 719 , FlowLevel(0) 720 , IsStartOfStream(true) 721 , IsSimpleKeyAllowed(true) 722 , Failed(false) { 723 SM.AddNewSourceBuffer(InputBuffer, SMLoc()); 724 } 725 726 Token &Scanner::peekNext() { 727 // If the current token is a possible simple key, keep parsing until we 728 // can confirm. 
729 bool NeedMore = false; 730 while (true) { 731 if (TokenQueue.empty() || NeedMore) { 732 if (!fetchMoreTokens()) { 733 TokenQueue.clear(); 734 TokenQueue.push_back(Token()); 735 return TokenQueue.front(); 736 } 737 } 738 assert(!TokenQueue.empty() && 739 "fetchMoreTokens lied about getting tokens!"); 740 741 removeStaleSimpleKeyCandidates(); 742 SimpleKey SK; 743 SK.Tok = TokenQueue.front(); 744 if (std::find(SimpleKeys.begin(), SimpleKeys.end(), SK) 745 == SimpleKeys.end()) 746 break; 747 else 748 NeedMore = true; 749 } 750 return TokenQueue.front(); 751 } 752 753 Token Scanner::getNext() { 754 Token Ret = peekNext(); 755 // TokenQueue can be empty if there was an error getting the next token. 756 if (!TokenQueue.empty()) 757 TokenQueue.pop_front(); 758 759 // There cannot be any referenced Token's if the TokenQueue is empty. So do a 760 // quick deallocation of them all. 761 if (TokenQueue.empty()) { 762 TokenQueue.Alloc.Reset(); 763 } 764 765 return Ret; 766 } 767 768 StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) { 769 if (Position == End) 770 return Position; 771 // Check 7 bit c-printable - b-char. 772 if ( *Position == 0x09 773 || (*Position >= 0x20 && *Position <= 0x7E)) 774 return Position + 1; 775 776 // Check for valid UTF-8. 
777 if (uint8_t(*Position) & 0x80) { 778 UTF8Decoded u8d = decodeUTF8(Position); 779 if ( u8d.second != 0 780 && u8d.first != 0xFEFF 781 && ( u8d.first == 0x85 782 || ( u8d.first >= 0xA0 783 && u8d.first <= 0xD7FF) 784 || ( u8d.first >= 0xE000 785 && u8d.first <= 0xFFFD) 786 || ( u8d.first >= 0x10000 787 && u8d.first <= 0x10FFFF))) 788 return Position + u8d.second; 789 } 790 return Position; 791 } 792 793 StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) { 794 if (Position == End) 795 return Position; 796 if (*Position == 0x0D) { 797 if (Position + 1 != End && *(Position + 1) == 0x0A) 798 return Position + 2; 799 return Position + 1; 800 } 801 802 if (*Position == 0x0A) 803 return Position + 1; 804 return Position; 805 } 806 807 808 StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) { 809 if (Position == End) 810 return Position; 811 if (*Position == ' ' || *Position == '\t') 812 return Position + 1; 813 return Position; 814 } 815 816 StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) { 817 if (Position == End) 818 return Position; 819 if (*Position == ' ' || *Position == '\t') 820 return Position; 821 return skip_nb_char(Position); 822 } 823 824 StringRef::iterator Scanner::skip_while( SkipWhileFunc Func 825 , StringRef::iterator Position) { 826 while (true) { 827 StringRef::iterator i = (this->*Func)(Position); 828 if (i == Position) 829 break; 830 Position = i; 831 } 832 return Position; 833 } 834 835 static bool is_ns_hex_digit(const char C) { 836 return (C >= '0' && C <= '9') 837 || (C >= 'a' && C <= 'z') 838 || (C >= 'A' && C <= 'Z'); 839 } 840 841 static bool is_ns_word_char(const char C) { 842 return C == '-' 843 || (C >= 'a' && C <= 'z') 844 || (C >= 'A' && C <= 'Z'); 845 } 846 847 StringRef Scanner::scan_ns_uri_char() { 848 StringRef::iterator Start = Current; 849 while (true) { 850 if (Current == End) 851 break; 852 if (( *Current == '%' 853 && Current + 2 < End 854 && 
is_ns_hex_digit(*(Current + 1)) 855 && is_ns_hex_digit(*(Current + 2))) 856 || is_ns_word_char(*Current) 857 || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]") 858 != StringRef::npos) { 859 ++Current; 860 ++Column; 861 } else 862 break; 863 } 864 return StringRef(Start, Current - Start); 865 } 866 867 StringRef Scanner::scan_ns_plain_one_line() { 868 StringRef::iterator start = Current; 869 // The first character must already be verified. 870 ++Current; 871 while (true) { 872 if (Current == End) { 873 break; 874 } else if (*Current == ':') { 875 // Check if the next character is a ns-char. 876 if (Current + 1 == End) 877 break; 878 StringRef::iterator i = skip_ns_char(Current + 1); 879 if (Current + 1 != i) { 880 Current = i; 881 Column += 2; // Consume both the ':' and ns-char. 882 } else 883 break; 884 } else if (*Current == '#') { 885 // Check if the previous character was a ns-char. 886 // The & 0x80 check is to check for the trailing byte of a utf-8 887 if (*(Current - 1) & 0x80 || skip_ns_char(Current - 1) == Current) { 888 ++Current; 889 ++Column; 890 } else 891 break; 892 } else { 893 StringRef::iterator i = skip_nb_char(Current); 894 if (i == Current) 895 break; 896 Current = i; 897 ++Column; 898 } 899 } 900 return StringRef(start, Current - start); 901 } 902 903 bool Scanner::consume(uint32_t Expected) { 904 if (Expected >= 0x80) 905 report_fatal_error("Not dealing with this yet"); 906 if (Current == End) 907 return false; 908 if (uint8_t(*Current) >= 0x80) 909 report_fatal_error("Not dealing with this yet"); 910 if (uint8_t(*Current) == Expected) { 911 ++Current; 912 ++Column; 913 return true; 914 } 915 return false; 916 } 917 918 void Scanner::skip(uint32_t Distance) { 919 Current += Distance; 920 Column += Distance; 921 assert(Current <= End && "Skipped past the end"); 922 } 923 924 bool Scanner::isBlankOrBreak(StringRef::iterator Position) { 925 if (Position == End) 926 return false; 927 if ( *Position == ' ' || *Position == '\t' 928 || 
*Position == '\r' || *Position == '\n') 929 return true; 930 return false; 931 } 932 933 void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok 934 , unsigned AtColumn 935 , bool IsRequired) { 936 if (IsSimpleKeyAllowed) { 937 SimpleKey SK; 938 SK.Tok = Tok; 939 SK.Line = Line; 940 SK.Column = AtColumn; 941 SK.IsRequired = IsRequired; 942 SK.FlowLevel = FlowLevel; 943 SimpleKeys.push_back(SK); 944 } 945 } 946 947 void Scanner::removeStaleSimpleKeyCandidates() { 948 for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin(); 949 i != SimpleKeys.end();) { 950 if (i->Line != Line || i->Column + 1024 < Column) { 951 if (i->IsRequired) 952 setError( "Could not find expected : for simple key" 953 , i->Tok->Range.begin()); 954 i = SimpleKeys.erase(i); 955 } else 956 ++i; 957 } 958 } 959 960 void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) { 961 if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level) 962 SimpleKeys.pop_back(); 963 } 964 965 bool Scanner::unrollIndent(int ToColumn) { 966 Token T; 967 // Indentation is ignored in flow. 968 if (FlowLevel != 0) 969 return true; 970 971 while (Indent > ToColumn) { 972 T.Kind = Token::TK_BlockEnd; 973 T.Range = StringRef(Current, 1); 974 TokenQueue.push_back(T); 975 Indent = Indents.pop_back_val(); 976 } 977 978 return true; 979 } 980 981 bool Scanner::rollIndent( int ToColumn 982 , Token::TokenKind Kind 983 , TokenQueueT::iterator InsertPoint) { 984 if (FlowLevel) 985 return true; 986 if (Indent < ToColumn) { 987 Indents.push_back(Indent); 988 Indent = ToColumn; 989 990 Token T; 991 T.Kind = Kind; 992 T.Range = StringRef(Current, 0); 993 TokenQueue.insert(InsertPoint, T); 994 } 995 return true; 996 } 997 998 void Scanner::scanToNextToken() { 999 while (true) { 1000 while (*Current == ' ' || *Current == '\t') { 1001 skip(1); 1002 } 1003 1004 // Skip comment. 
1005 if (*Current == '#') { 1006 while (true) { 1007 // This may skip more than one byte, thus Column is only incremented 1008 // for code points. 1009 StringRef::iterator i = skip_nb_char(Current); 1010 if (i == Current) 1011 break; 1012 Current = i; 1013 ++Column; 1014 } 1015 } 1016 1017 // Skip EOL. 1018 StringRef::iterator i = skip_b_break(Current); 1019 if (i == Current) 1020 break; 1021 Current = i; 1022 ++Line; 1023 Column = 0; 1024 // New lines may start a simple key. 1025 if (!FlowLevel) 1026 IsSimpleKeyAllowed = true; 1027 } 1028 } 1029 1030 bool Scanner::scanStreamStart() { 1031 IsStartOfStream = false; 1032 1033 EncodingInfo EI = getUnicodeEncoding(currentInput()); 1034 1035 Token T; 1036 T.Kind = Token::TK_StreamStart; 1037 T.Range = StringRef(Current, EI.second); 1038 TokenQueue.push_back(T); 1039 Current += EI.second; 1040 return true; 1041 } 1042 1043 bool Scanner::scanStreamEnd() { 1044 // Force an ending new line if one isn't present. 1045 if (Column != 0) { 1046 Column = 0; 1047 ++Line; 1048 } 1049 1050 unrollIndent(-1); 1051 SimpleKeys.clear(); 1052 IsSimpleKeyAllowed = false; 1053 1054 Token T; 1055 T.Kind = Token::TK_StreamEnd; 1056 T.Range = StringRef(Current, 0); 1057 TokenQueue.push_back(T); 1058 return true; 1059 } 1060 1061 bool Scanner::scanDirective() { 1062 // Reset the indentation level. 
1063 unrollIndent(-1); 1064 SimpleKeys.clear(); 1065 IsSimpleKeyAllowed = false; 1066 1067 StringRef::iterator Start = Current; 1068 consume('%'); 1069 StringRef::iterator NameStart = Current; 1070 Current = skip_while(&Scanner::skip_ns_char, Current); 1071 StringRef Name(NameStart, Current - NameStart); 1072 Current = skip_while(&Scanner::skip_s_white, Current); 1073 1074 Token T; 1075 if (Name == "YAML") { 1076 Current = skip_while(&Scanner::skip_ns_char, Current); 1077 T.Kind = Token::TK_VersionDirective; 1078 T.Range = StringRef(Start, Current - Start); 1079 TokenQueue.push_back(T); 1080 return true; 1081 } else if(Name == "TAG") { 1082 Current = skip_while(&Scanner::skip_ns_char, Current); 1083 Current = skip_while(&Scanner::skip_s_white, Current); 1084 Current = skip_while(&Scanner::skip_ns_char, Current); 1085 T.Kind = Token::TK_TagDirective; 1086 T.Range = StringRef(Start, Current - Start); 1087 TokenQueue.push_back(T); 1088 return true; 1089 } 1090 return false; 1091 } 1092 1093 bool Scanner::scanDocumentIndicator(bool IsStart) { 1094 unrollIndent(-1); 1095 SimpleKeys.clear(); 1096 IsSimpleKeyAllowed = false; 1097 1098 Token T; 1099 T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd; 1100 T.Range = StringRef(Current, 3); 1101 skip(3); 1102 TokenQueue.push_back(T); 1103 return true; 1104 } 1105 1106 bool Scanner::scanFlowCollectionStart(bool IsSequence) { 1107 Token T; 1108 T.Kind = IsSequence ? Token::TK_FlowSequenceStart 1109 : Token::TK_FlowMappingStart; 1110 T.Range = StringRef(Current, 1); 1111 skip(1); 1112 TokenQueue.push_back(T); 1113 1114 // [ and { may begin a simple key. 1115 saveSimpleKeyCandidate(TokenQueue.back(), Column - 1, false); 1116 1117 // And may also be followed by a simple key. 
1118 IsSimpleKeyAllowed = true; 1119 ++FlowLevel; 1120 return true; 1121 } 1122 1123 bool Scanner::scanFlowCollectionEnd(bool IsSequence) { 1124 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1125 IsSimpleKeyAllowed = false; 1126 Token T; 1127 T.Kind = IsSequence ? Token::TK_FlowSequenceEnd 1128 : Token::TK_FlowMappingEnd; 1129 T.Range = StringRef(Current, 1); 1130 skip(1); 1131 TokenQueue.push_back(T); 1132 if (FlowLevel) 1133 --FlowLevel; 1134 return true; 1135 } 1136 1137 bool Scanner::scanFlowEntry() { 1138 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1139 IsSimpleKeyAllowed = true; 1140 Token T; 1141 T.Kind = Token::TK_FlowEntry; 1142 T.Range = StringRef(Current, 1); 1143 skip(1); 1144 TokenQueue.push_back(T); 1145 return true; 1146 } 1147 1148 bool Scanner::scanBlockEntry() { 1149 rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end()); 1150 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1151 IsSimpleKeyAllowed = true; 1152 Token T; 1153 T.Kind = Token::TK_BlockEntry; 1154 T.Range = StringRef(Current, 1); 1155 skip(1); 1156 TokenQueue.push_back(T); 1157 return true; 1158 } 1159 1160 bool Scanner::scanKey() { 1161 if (!FlowLevel) 1162 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1163 1164 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1165 IsSimpleKeyAllowed = !FlowLevel; 1166 1167 Token T; 1168 T.Kind = Token::TK_Key; 1169 T.Range = StringRef(Current, 1); 1170 skip(1); 1171 TokenQueue.push_back(T); 1172 return true; 1173 } 1174 1175 bool Scanner::scanValue() { 1176 // If the previous token could have been a simple key, insert the key token 1177 // into the token queue. 
1178 if (!SimpleKeys.empty()) { 1179 SimpleKey SK = SimpleKeys.pop_back_val(); 1180 Token T; 1181 T.Kind = Token::TK_Key; 1182 T.Range = SK.Tok->Range; 1183 TokenQueueT::iterator i, e; 1184 for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) { 1185 if (i == SK.Tok) 1186 break; 1187 } 1188 assert(i != e && "SimpleKey not in token queue!"); 1189 i = TokenQueue.insert(i, T); 1190 1191 // We may also need to add a Block-Mapping-Start token. 1192 rollIndent(SK.Column, Token::TK_BlockMappingStart, i); 1193 1194 IsSimpleKeyAllowed = false; 1195 } else { 1196 if (!FlowLevel) 1197 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1198 IsSimpleKeyAllowed = !FlowLevel; 1199 } 1200 1201 Token T; 1202 T.Kind = Token::TK_Value; 1203 T.Range = StringRef(Current, 1); 1204 skip(1); 1205 TokenQueue.push_back(T); 1206 return true; 1207 } 1208 1209 // Forbidding inlining improves performance by roughly 20%. 1210 // FIXME: Remove once llvm optimizes this to the faster version without hints. 1211 LLVM_ATTRIBUTE_NOINLINE static bool 1212 wasEscaped(StringRef::iterator First, StringRef::iterator Position); 1213 1214 // Returns whether a character at 'Position' was escaped with a leading '\'. 1215 // 'First' specifies the position of the first character in the string. 1216 static bool wasEscaped(StringRef::iterator First, 1217 StringRef::iterator Position) { 1218 assert(Position - 1 >= First); 1219 StringRef::iterator I = Position - 1; 1220 // We calculate the number of consecutive '\'s before the current position 1221 // by iterating backwards through our string. 1222 while (I >= First && *I == '\\') --I; 1223 // (Position - 1 - I) now contains the number of '\'s before the current 1224 // position. If it is odd, the character at 'Position' was escaped. 
1225 return (Position - 1 - I) % 2 == 1; 1226 } 1227 1228 bool Scanner::scanFlowScalar(bool IsDoubleQuoted) { 1229 StringRef::iterator Start = Current; 1230 unsigned ColStart = Column; 1231 if (IsDoubleQuoted) { 1232 do { 1233 ++Current; 1234 while (Current != End && *Current != '"') 1235 ++Current; 1236 // Repeat until the previous character was not a '\' or was an escaped 1237 // backslash. 1238 } while ( Current != End 1239 && *(Current - 1) == '\\' 1240 && wasEscaped(Start + 1, Current)); 1241 } else { 1242 skip(1); 1243 while (true) { 1244 // Skip a ' followed by another '. 1245 if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') { 1246 skip(2); 1247 continue; 1248 } else if (*Current == '\'') 1249 break; 1250 StringRef::iterator i = skip_nb_char(Current); 1251 if (i == Current) { 1252 i = skip_b_break(Current); 1253 if (i == Current) 1254 break; 1255 Current = i; 1256 Column = 0; 1257 ++Line; 1258 } else { 1259 if (i == End) 1260 break; 1261 Current = i; 1262 ++Column; 1263 } 1264 } 1265 } 1266 1267 if (Current == End) { 1268 setError("Expected quote at end of scalar", Current); 1269 return false; 1270 } 1271 1272 skip(1); // Skip ending quote. 
1273 Token T; 1274 T.Kind = Token::TK_Scalar; 1275 T.Range = StringRef(Start, Current - Start); 1276 TokenQueue.push_back(T); 1277 1278 saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); 1279 1280 IsSimpleKeyAllowed = false; 1281 1282 return true; 1283 } 1284 1285 bool Scanner::scanPlainScalar() { 1286 StringRef::iterator Start = Current; 1287 unsigned ColStart = Column; 1288 unsigned LeadingBlanks = 0; 1289 assert(Indent >= -1 && "Indent must be >= -1 !"); 1290 unsigned indent = static_cast<unsigned>(Indent + 1); 1291 while (true) { 1292 if (*Current == '#') 1293 break; 1294 1295 while (!isBlankOrBreak(Current)) { 1296 if ( FlowLevel && *Current == ':' 1297 && !(isBlankOrBreak(Current + 1) || *(Current + 1) == ',')) { 1298 setError("Found unexpected ':' while scanning a plain scalar", Current); 1299 return false; 1300 } 1301 1302 // Check for the end of the plain scalar. 1303 if ( (*Current == ':' && isBlankOrBreak(Current + 1)) 1304 || ( FlowLevel 1305 && (StringRef(Current, 1).find_first_of(",:?[]{}") 1306 != StringRef::npos))) 1307 break; 1308 1309 StringRef::iterator i = skip_nb_char(Current); 1310 if (i == Current) 1311 break; 1312 Current = i; 1313 ++Column; 1314 } 1315 1316 // Are we at the end? 1317 if (!isBlankOrBreak(Current)) 1318 break; 1319 1320 // Eat blanks. 
1321 StringRef::iterator Tmp = Current; 1322 while (isBlankOrBreak(Tmp)) { 1323 StringRef::iterator i = skip_s_white(Tmp); 1324 if (i != Tmp) { 1325 if (LeadingBlanks && (Column < indent) && *Tmp == '\t') { 1326 setError("Found invalid tab character in indentation", Tmp); 1327 return false; 1328 } 1329 Tmp = i; 1330 ++Column; 1331 } else { 1332 i = skip_b_break(Tmp); 1333 if (!LeadingBlanks) 1334 LeadingBlanks = 1; 1335 Tmp = i; 1336 Column = 0; 1337 ++Line; 1338 } 1339 } 1340 1341 if (!FlowLevel && Column < indent) 1342 break; 1343 1344 Current = Tmp; 1345 } 1346 if (Start == Current) { 1347 setError("Got empty plain scalar", Start); 1348 return false; 1349 } 1350 Token T; 1351 T.Kind = Token::TK_Scalar; 1352 T.Range = StringRef(Start, Current - Start); 1353 TokenQueue.push_back(T); 1354 1355 // Plain scalars can be simple keys. 1356 saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); 1357 1358 IsSimpleKeyAllowed = false; 1359 1360 return true; 1361 } 1362 1363 bool Scanner::scanAliasOrAnchor(bool IsAlias) { 1364 StringRef::iterator Start = Current; 1365 unsigned ColStart = Column; 1366 skip(1); 1367 while(true) { 1368 if ( *Current == '[' || *Current == ']' 1369 || *Current == '{' || *Current == '}' 1370 || *Current == ',' 1371 || *Current == ':') 1372 break; 1373 StringRef::iterator i = skip_ns_char(Current); 1374 if (i == Current) 1375 break; 1376 Current = i; 1377 ++Column; 1378 } 1379 1380 if (Start == Current) { 1381 setError("Got empty alias or anchor", Start); 1382 return false; 1383 } 1384 1385 Token T; 1386 T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor; 1387 T.Range = StringRef(Start, Current - Start); 1388 TokenQueue.push_back(T); 1389 1390 // Alias and anchors can be simple keys. 
1391 saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); 1392 1393 IsSimpleKeyAllowed = false; 1394 1395 return true; 1396 } 1397 1398 bool Scanner::scanBlockScalar(bool IsLiteral) { 1399 StringRef::iterator Start = Current; 1400 skip(1); // Eat | or > 1401 while(true) { 1402 StringRef::iterator i = skip_nb_char(Current); 1403 if (i == Current) { 1404 if (Column == 0) 1405 break; 1406 i = skip_b_break(Current); 1407 if (i != Current) { 1408 // We got a line break. 1409 Column = 0; 1410 ++Line; 1411 Current = i; 1412 continue; 1413 } else { 1414 // There was an error, which should already have been printed out. 1415 return false; 1416 } 1417 } 1418 Current = i; 1419 ++Column; 1420 } 1421 1422 if (Start == Current) { 1423 setError("Got empty block scalar", Start); 1424 return false; 1425 } 1426 1427 Token T; 1428 T.Kind = Token::TK_Scalar; 1429 T.Range = StringRef(Start, Current - Start); 1430 TokenQueue.push_back(T); 1431 return true; 1432 } 1433 1434 bool Scanner::scanTag() { 1435 StringRef::iterator Start = Current; 1436 unsigned ColStart = Column; 1437 skip(1); // Eat !. 1438 if (Current == End || isBlankOrBreak(Current)); // An empty tag. 1439 else if (*Current == '<') { 1440 skip(1); 1441 scan_ns_uri_char(); 1442 if (!consume('>')) 1443 return false; 1444 } else { 1445 // FIXME: Actually parse the c-ns-shorthand-tag rule. 1446 Current = skip_while(&Scanner::skip_ns_char, Current); 1447 } 1448 1449 Token T; 1450 T.Kind = Token::TK_Tag; 1451 T.Range = StringRef(Start, Current - Start); 1452 TokenQueue.push_back(T); 1453 1454 // Tags can be simple keys. 
1455 saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); 1456 1457 IsSimpleKeyAllowed = false; 1458 1459 return true; 1460 } 1461 1462 bool Scanner::fetchMoreTokens() { 1463 if (IsStartOfStream) 1464 return scanStreamStart(); 1465 1466 scanToNextToken(); 1467 1468 if (Current == End) 1469 return scanStreamEnd(); 1470 1471 removeStaleSimpleKeyCandidates(); 1472 1473 unrollIndent(Column); 1474 1475 if (Column == 0 && *Current == '%') 1476 return scanDirective(); 1477 1478 if (Column == 0 && Current + 4 <= End 1479 && *Current == '-' 1480 && *(Current + 1) == '-' 1481 && *(Current + 2) == '-' 1482 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1483 return scanDocumentIndicator(true); 1484 1485 if (Column == 0 && Current + 4 <= End 1486 && *Current == '.' 1487 && *(Current + 1) == '.' 1488 && *(Current + 2) == '.' 1489 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1490 return scanDocumentIndicator(false); 1491 1492 if (*Current == '[') 1493 return scanFlowCollectionStart(true); 1494 1495 if (*Current == '{') 1496 return scanFlowCollectionStart(false); 1497 1498 if (*Current == ']') 1499 return scanFlowCollectionEnd(true); 1500 1501 if (*Current == '}') 1502 return scanFlowCollectionEnd(false); 1503 1504 if (*Current == ',') 1505 return scanFlowEntry(); 1506 1507 if (*Current == '-' && isBlankOrBreak(Current + 1)) 1508 return scanBlockEntry(); 1509 1510 if (*Current == '?' 
&& (FlowLevel || isBlankOrBreak(Current + 1))) 1511 return scanKey(); 1512 1513 if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1))) 1514 return scanValue(); 1515 1516 if (*Current == '*') 1517 return scanAliasOrAnchor(true); 1518 1519 if (*Current == '&') 1520 return scanAliasOrAnchor(false); 1521 1522 if (*Current == '!') 1523 return scanTag(); 1524 1525 if (*Current == '|' && !FlowLevel) 1526 return scanBlockScalar(true); 1527 1528 if (*Current == '>' && !FlowLevel) 1529 return scanBlockScalar(false); 1530 1531 if (*Current == '\'') 1532 return scanFlowScalar(false); 1533 1534 if (*Current == '"') 1535 return scanFlowScalar(true); 1536 1537 // Get a plain scalar. 1538 StringRef FirstChar(Current, 1); 1539 if (!(isBlankOrBreak(Current) 1540 || FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos) 1541 || (*Current == '-' && !isBlankOrBreak(Current + 1)) 1542 || (!FlowLevel && (*Current == '?' || *Current == ':') 1543 && isBlankOrBreak(Current + 1)) 1544 || (!FlowLevel && *Current == ':' 1545 && Current + 2 < End 1546 && *(Current + 1) == ':' 1547 && !isBlankOrBreak(Current + 2))) 1548 return scanPlainScalar(); 1549 1550 setError("Unrecognized character while tokenizing."); 1551 return false; 1552 } 1553 1554 Stream::Stream(StringRef Input, SourceMgr &SM) 1555 : scanner(new Scanner(Input, SM)) 1556 , CurrentDoc(0) {} 1557 1558 Stream::Stream(MemoryBuffer *InputBuffer, SourceMgr &SM) 1559 : scanner(new Scanner(InputBuffer, SM)) 1560 , CurrentDoc(0) {} 1561 1562 Stream::~Stream() {} 1563 1564 bool Stream::failed() { return scanner->failed(); } 1565 1566 void Stream::printError(Node *N, const Twine &Msg) { 1567 SmallVector<SMRange, 1> Ranges; 1568 Ranges.push_back(N->getSourceRange()); 1569 scanner->printError( N->getSourceRange().Start 1570 , SourceMgr::DK_Error 1571 , Msg 1572 , Ranges); 1573 } 1574 1575 document_iterator Stream::begin() { 1576 if (CurrentDoc) 1577 report_fatal_error("Can only iterate over the stream once"); 1578 1579 // 
Skip Stream-Start. 1580 scanner->getNext(); 1581 1582 CurrentDoc.reset(new Document(*this)); 1583 return document_iterator(CurrentDoc); 1584 } 1585 1586 document_iterator Stream::end() { 1587 return document_iterator(); 1588 } 1589 1590 void Stream::skip() { 1591 for (document_iterator i = begin(), e = end(); i != e; ++i) 1592 i->skip(); 1593 } 1594 1595 Node::Node(unsigned int Type, OwningPtr<Document> &D, StringRef A, StringRef T) 1596 : Doc(D) 1597 , TypeID(Type) 1598 , Anchor(A) 1599 , Tag(T) { 1600 SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin()); 1601 SourceRange = SMRange(Start, Start); 1602 } 1603 1604 std::string Node::getVerbatimTag() const { 1605 StringRef Raw = getRawTag(); 1606 if (!Raw.empty() && Raw != "!") { 1607 std::string Ret; 1608 if (Raw.find_last_of('!') == 0) { 1609 Ret = Doc->getTagMap().find("!")->second; 1610 Ret += Raw.substr(1); 1611 return llvm_move(Ret); 1612 } else if (Raw.startswith("!!")) { 1613 Ret = Doc->getTagMap().find("!!")->second; 1614 Ret += Raw.substr(2); 1615 return llvm_move(Ret); 1616 } else { 1617 StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1); 1618 std::map<StringRef, StringRef>::const_iterator It = 1619 Doc->getTagMap().find(TagHandle); 1620 if (It != Doc->getTagMap().end()) 1621 Ret = It->second; 1622 else { 1623 Token T; 1624 T.Kind = Token::TK_Tag; 1625 T.Range = TagHandle; 1626 setError(Twine("Unknown tag handle ") + TagHandle, T); 1627 } 1628 Ret += Raw.substr(Raw.find_last_of('!') + 1); 1629 return llvm_move(Ret); 1630 } 1631 } 1632 1633 switch (getType()) { 1634 case NK_Null: 1635 return "tag:yaml.org,2002:null"; 1636 case NK_Scalar: 1637 // TODO: Tag resolution. 
1638 return "tag:yaml.org,2002:str"; 1639 case NK_Mapping: 1640 return "tag:yaml.org,2002:map"; 1641 case NK_Sequence: 1642 return "tag:yaml.org,2002:seq"; 1643 } 1644 1645 return ""; 1646 } 1647 1648 Token &Node::peekNext() { 1649 return Doc->peekNext(); 1650 } 1651 1652 Token Node::getNext() { 1653 return Doc->getNext(); 1654 } 1655 1656 Node *Node::parseBlockNode() { 1657 return Doc->parseBlockNode(); 1658 } 1659 1660 BumpPtrAllocator &Node::getAllocator() { 1661 return Doc->NodeAllocator; 1662 } 1663 1664 void Node::setError(const Twine &Msg, Token &Tok) const { 1665 Doc->setError(Msg, Tok); 1666 } 1667 1668 bool Node::failed() const { 1669 return Doc->failed(); 1670 } 1671 1672 1673 1674 StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const { 1675 // TODO: Handle newlines properly. We need to remove leading whitespace. 1676 if (Value[0] == '"') { // Double quoted. 1677 // Pull off the leading and trailing "s. 1678 StringRef UnquotedValue = Value.substr(1, Value.size() - 2); 1679 // Search for characters that would require unescaping the value. 1680 StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n"); 1681 if (i != StringRef::npos) 1682 return unescapeDoubleQuoted(UnquotedValue, i, Storage); 1683 return UnquotedValue; 1684 } else if (Value[0] == '\'') { // Single quoted. 1685 // Pull off the leading and trailing 's. 1686 StringRef UnquotedValue = Value.substr(1, Value.size() - 2); 1687 StringRef::size_type i = UnquotedValue.find('\''); 1688 if (i != StringRef::npos) { 1689 // We're going to need Storage. 
1690 Storage.clear(); 1691 Storage.reserve(UnquotedValue.size()); 1692 for (; i != StringRef::npos; i = UnquotedValue.find('\'')) { 1693 StringRef Valid(UnquotedValue.begin(), i); 1694 Storage.insert(Storage.end(), Valid.begin(), Valid.end()); 1695 Storage.push_back('\''); 1696 UnquotedValue = UnquotedValue.substr(i + 2); 1697 } 1698 Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); 1699 return StringRef(Storage.begin(), Storage.size()); 1700 } 1701 return UnquotedValue; 1702 } 1703 // Plain or block. 1704 return Value.rtrim(" "); 1705 } 1706 1707 StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue 1708 , StringRef::size_type i 1709 , SmallVectorImpl<char> &Storage) 1710 const { 1711 // Use Storage to build proper value. 1712 Storage.clear(); 1713 Storage.reserve(UnquotedValue.size()); 1714 for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) { 1715 // Insert all previous chars into Storage. 1716 StringRef Valid(UnquotedValue.begin(), i); 1717 Storage.insert(Storage.end(), Valid.begin(), Valid.end()); 1718 // Chop off inserted chars. 1719 UnquotedValue = UnquotedValue.substr(i); 1720 1721 assert(!UnquotedValue.empty() && "Can't be empty!"); 1722 1723 // Parse escape or line break. 1724 switch (UnquotedValue[0]) { 1725 case '\r': 1726 case '\n': 1727 Storage.push_back('\n'); 1728 if ( UnquotedValue.size() > 1 1729 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 1730 UnquotedValue = UnquotedValue.substr(1); 1731 UnquotedValue = UnquotedValue.substr(1); 1732 break; 1733 default: 1734 if (UnquotedValue.size() == 1) 1735 // TODO: Report error. 1736 break; 1737 UnquotedValue = UnquotedValue.substr(1); 1738 switch (UnquotedValue[0]) { 1739 default: { 1740 Token T; 1741 T.Range = StringRef(UnquotedValue.begin(), 1); 1742 setError("Unrecognized escape code!", T); 1743 return ""; 1744 } 1745 case '\r': 1746 case '\n': 1747 // Remove the new line. 
1748 if ( UnquotedValue.size() > 1 1749 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 1750 UnquotedValue = UnquotedValue.substr(1); 1751 // If this was just a single byte newline, it will get skipped 1752 // below. 1753 break; 1754 case '0': 1755 Storage.push_back(0x00); 1756 break; 1757 case 'a': 1758 Storage.push_back(0x07); 1759 break; 1760 case 'b': 1761 Storage.push_back(0x08); 1762 break; 1763 case 't': 1764 case 0x09: 1765 Storage.push_back(0x09); 1766 break; 1767 case 'n': 1768 Storage.push_back(0x0A); 1769 break; 1770 case 'v': 1771 Storage.push_back(0x0B); 1772 break; 1773 case 'f': 1774 Storage.push_back(0x0C); 1775 break; 1776 case 'r': 1777 Storage.push_back(0x0D); 1778 break; 1779 case 'e': 1780 Storage.push_back(0x1B); 1781 break; 1782 case ' ': 1783 Storage.push_back(0x20); 1784 break; 1785 case '"': 1786 Storage.push_back(0x22); 1787 break; 1788 case '/': 1789 Storage.push_back(0x2F); 1790 break; 1791 case '\\': 1792 Storage.push_back(0x5C); 1793 break; 1794 case 'N': 1795 encodeUTF8(0x85, Storage); 1796 break; 1797 case '_': 1798 encodeUTF8(0xA0, Storage); 1799 break; 1800 case 'L': 1801 encodeUTF8(0x2028, Storage); 1802 break; 1803 case 'P': 1804 encodeUTF8(0x2029, Storage); 1805 break; 1806 case 'x': { 1807 if (UnquotedValue.size() < 3) 1808 // TODO: Report error. 1809 break; 1810 unsigned int UnicodeScalarValue; 1811 if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue)) 1812 // TODO: Report error. 1813 UnicodeScalarValue = 0xFFFD; 1814 encodeUTF8(UnicodeScalarValue, Storage); 1815 UnquotedValue = UnquotedValue.substr(2); 1816 break; 1817 } 1818 case 'u': { 1819 if (UnquotedValue.size() < 5) 1820 // TODO: Report error. 1821 break; 1822 unsigned int UnicodeScalarValue; 1823 if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue)) 1824 // TODO: Report error. 
1825 UnicodeScalarValue = 0xFFFD; 1826 encodeUTF8(UnicodeScalarValue, Storage); 1827 UnquotedValue = UnquotedValue.substr(4); 1828 break; 1829 } 1830 case 'U': { 1831 if (UnquotedValue.size() < 9) 1832 // TODO: Report error. 1833 break; 1834 unsigned int UnicodeScalarValue; 1835 if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue)) 1836 // TODO: Report error. 1837 UnicodeScalarValue = 0xFFFD; 1838 encodeUTF8(UnicodeScalarValue, Storage); 1839 UnquotedValue = UnquotedValue.substr(8); 1840 break; 1841 } 1842 } 1843 UnquotedValue = UnquotedValue.substr(1); 1844 } 1845 } 1846 Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); 1847 return StringRef(Storage.begin(), Storage.size()); 1848 } 1849 1850 Node *KeyValueNode::getKey() { 1851 if (Key) 1852 return Key; 1853 // Handle implicit null keys. 1854 { 1855 Token &t = peekNext(); 1856 if ( t.Kind == Token::TK_BlockEnd 1857 || t.Kind == Token::TK_Value 1858 || t.Kind == Token::TK_Error) { 1859 return Key = new (getAllocator()) NullNode(Doc); 1860 } 1861 if (t.Kind == Token::TK_Key) 1862 getNext(); // skip TK_Key. 1863 } 1864 1865 // Handle explicit null keys. 1866 Token &t = peekNext(); 1867 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) { 1868 return Key = new (getAllocator()) NullNode(Doc); 1869 } 1870 1871 // We've got a normal key. 1872 return Key = parseBlockNode(); 1873 } 1874 1875 Node *KeyValueNode::getValue() { 1876 if (Value) 1877 return Value; 1878 getKey()->skip(); 1879 if (failed()) 1880 return Value = new (getAllocator()) NullNode(Doc); 1881 1882 // Handle implicit null values. 
1883 { 1884 Token &t = peekNext(); 1885 if ( t.Kind == Token::TK_BlockEnd 1886 || t.Kind == Token::TK_FlowMappingEnd 1887 || t.Kind == Token::TK_Key 1888 || t.Kind == Token::TK_FlowEntry 1889 || t.Kind == Token::TK_Error) { 1890 return Value = new (getAllocator()) NullNode(Doc); 1891 } 1892 1893 if (t.Kind != Token::TK_Value) { 1894 setError("Unexpected token in Key Value.", t); 1895 return Value = new (getAllocator()) NullNode(Doc); 1896 } 1897 getNext(); // skip TK_Value. 1898 } 1899 1900 // Handle explicit null values. 1901 Token &t = peekNext(); 1902 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) { 1903 return Value = new (getAllocator()) NullNode(Doc); 1904 } 1905 1906 // We got a normal value. 1907 return Value = parseBlockNode(); 1908 } 1909 1910 void MappingNode::increment() { 1911 if (failed()) { 1912 IsAtEnd = true; 1913 CurrentEntry = 0; 1914 return; 1915 } 1916 if (CurrentEntry) { 1917 CurrentEntry->skip(); 1918 if (Type == MT_Inline) { 1919 IsAtEnd = true; 1920 CurrentEntry = 0; 1921 return; 1922 } 1923 } 1924 Token T = peekNext(); 1925 if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) { 1926 // KeyValueNode eats the TK_Key. That way it can detect null keys. 1927 CurrentEntry = new (getAllocator()) KeyValueNode(Doc); 1928 } else if (Type == MT_Block) { 1929 switch (T.Kind) { 1930 case Token::TK_BlockEnd: 1931 getNext(); 1932 IsAtEnd = true; 1933 CurrentEntry = 0; 1934 break; 1935 default: 1936 setError("Unexpected token. Expected Key or Block End", T); 1937 case Token::TK_Error: 1938 IsAtEnd = true; 1939 CurrentEntry = 0; 1940 } 1941 } else { 1942 switch (T.Kind) { 1943 case Token::TK_FlowEntry: 1944 // Eat the flow entry and recurse. 1945 getNext(); 1946 return increment(); 1947 case Token::TK_FlowMappingEnd: 1948 getNext(); 1949 case Token::TK_Error: 1950 // Set this to end iterator. 1951 IsAtEnd = true; 1952 CurrentEntry = 0; 1953 break; 1954 default: 1955 setError( "Unexpected token. 
Expected Key, Flow Entry, or Flow " 1956 "Mapping End." 1957 , T); 1958 IsAtEnd = true; 1959 CurrentEntry = 0; 1960 } 1961 } 1962 } 1963 1964 void SequenceNode::increment() { 1965 if (failed()) { 1966 IsAtEnd = true; 1967 CurrentEntry = 0; 1968 return; 1969 } 1970 if (CurrentEntry) 1971 CurrentEntry->skip(); 1972 Token T = peekNext(); 1973 if (SeqType == ST_Block) { 1974 switch (T.Kind) { 1975 case Token::TK_BlockEntry: 1976 getNext(); 1977 CurrentEntry = parseBlockNode(); 1978 if (CurrentEntry == 0) { // An error occurred. 1979 IsAtEnd = true; 1980 CurrentEntry = 0; 1981 } 1982 break; 1983 case Token::TK_BlockEnd: 1984 getNext(); 1985 IsAtEnd = true; 1986 CurrentEntry = 0; 1987 break; 1988 default: 1989 setError( "Unexpected token. Expected Block Entry or Block End." 1990 , T); 1991 case Token::TK_Error: 1992 IsAtEnd = true; 1993 CurrentEntry = 0; 1994 } 1995 } else if (SeqType == ST_Indentless) { 1996 switch (T.Kind) { 1997 case Token::TK_BlockEntry: 1998 getNext(); 1999 CurrentEntry = parseBlockNode(); 2000 if (CurrentEntry == 0) { // An error occurred. 2001 IsAtEnd = true; 2002 CurrentEntry = 0; 2003 } 2004 break; 2005 default: 2006 case Token::TK_Error: 2007 IsAtEnd = true; 2008 CurrentEntry = 0; 2009 } 2010 } else if (SeqType == ST_Flow) { 2011 switch (T.Kind) { 2012 case Token::TK_FlowEntry: 2013 // Eat the flow entry and recurse. 2014 getNext(); 2015 WasPreviousTokenFlowEntry = true; 2016 return increment(); 2017 case Token::TK_FlowSequenceEnd: 2018 getNext(); 2019 case Token::TK_Error: 2020 // Set this to end iterator. 2021 IsAtEnd = true; 2022 CurrentEntry = 0; 2023 break; 2024 case Token::TK_StreamEnd: 2025 case Token::TK_DocumentEnd: 2026 case Token::TK_DocumentStart: 2027 setError("Could not find closing ]!", T); 2028 // Set this to end iterator. 
2029 IsAtEnd = true; 2030 CurrentEntry = 0; 2031 break; 2032 default: 2033 if (!WasPreviousTokenFlowEntry) { 2034 setError("Expected , between entries!", T); 2035 IsAtEnd = true; 2036 CurrentEntry = 0; 2037 break; 2038 } 2039 // Otherwise it must be a flow entry. 2040 CurrentEntry = parseBlockNode(); 2041 if (!CurrentEntry) { 2042 IsAtEnd = true; 2043 } 2044 WasPreviousTokenFlowEntry = false; 2045 break; 2046 } 2047 } 2048 } 2049 2050 Document::Document(Stream &S) : stream(S), Root(0) { 2051 // Tag maps starts with two default mappings. 2052 TagMap["!"] = "!"; 2053 TagMap["!!"] = "tag:yaml.org,2002:"; 2054 2055 if (parseDirectives()) 2056 expectToken(Token::TK_DocumentStart); 2057 Token &T = peekNext(); 2058 if (T.Kind == Token::TK_DocumentStart) 2059 getNext(); 2060 } 2061 2062 bool Document::skip() { 2063 if (stream.scanner->failed()) 2064 return false; 2065 if (!Root) 2066 getRoot(); 2067 Root->skip(); 2068 Token &T = peekNext(); 2069 if (T.Kind == Token::TK_StreamEnd) 2070 return false; 2071 if (T.Kind == Token::TK_DocumentEnd) { 2072 getNext(); 2073 return skip(); 2074 } 2075 return true; 2076 } 2077 2078 Token &Document::peekNext() { 2079 return stream.scanner->peekNext(); 2080 } 2081 2082 Token Document::getNext() { 2083 return stream.scanner->getNext(); 2084 } 2085 2086 void Document::setError(const Twine &Message, Token &Location) const { 2087 stream.scanner->setError(Message, Location.Range.begin()); 2088 } 2089 2090 bool Document::failed() const { 2091 return stream.scanner->failed(); 2092 } 2093 2094 Node *Document::parseBlockNode() { 2095 Token T = peekNext(); 2096 // Handle properties. 
2097 Token AnchorInfo; 2098 Token TagInfo; 2099 parse_property: 2100 switch (T.Kind) { 2101 case Token::TK_Alias: 2102 getNext(); 2103 return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1)); 2104 case Token::TK_Anchor: 2105 if (AnchorInfo.Kind == Token::TK_Anchor) { 2106 setError("Already encountered an anchor for this node!", T); 2107 return 0; 2108 } 2109 AnchorInfo = getNext(); // Consume TK_Anchor. 2110 T = peekNext(); 2111 goto parse_property; 2112 case Token::TK_Tag: 2113 if (TagInfo.Kind == Token::TK_Tag) { 2114 setError("Already encountered a tag for this node!", T); 2115 return 0; 2116 } 2117 TagInfo = getNext(); // Consume TK_Tag. 2118 T = peekNext(); 2119 goto parse_property; 2120 default: 2121 break; 2122 } 2123 2124 switch (T.Kind) { 2125 case Token::TK_BlockEntry: 2126 // We got an unindented BlockEntry sequence. This is not terminated with 2127 // a BlockEnd. 2128 // Don't eat the TK_BlockEntry, SequenceNode needs it. 2129 return new (NodeAllocator) SequenceNode( stream.CurrentDoc 2130 , AnchorInfo.Range.substr(1) 2131 , TagInfo.Range 2132 , SequenceNode::ST_Indentless); 2133 case Token::TK_BlockSequenceStart: 2134 getNext(); 2135 return new (NodeAllocator) 2136 SequenceNode( stream.CurrentDoc 2137 , AnchorInfo.Range.substr(1) 2138 , TagInfo.Range 2139 , SequenceNode::ST_Block); 2140 case Token::TK_BlockMappingStart: 2141 getNext(); 2142 return new (NodeAllocator) 2143 MappingNode( stream.CurrentDoc 2144 , AnchorInfo.Range.substr(1) 2145 , TagInfo.Range 2146 , MappingNode::MT_Block); 2147 case Token::TK_FlowSequenceStart: 2148 getNext(); 2149 return new (NodeAllocator) 2150 SequenceNode( stream.CurrentDoc 2151 , AnchorInfo.Range.substr(1) 2152 , TagInfo.Range 2153 , SequenceNode::ST_Flow); 2154 case Token::TK_FlowMappingStart: 2155 getNext(); 2156 return new (NodeAllocator) 2157 MappingNode( stream.CurrentDoc 2158 , AnchorInfo.Range.substr(1) 2159 , TagInfo.Range 2160 , MappingNode::MT_Flow); 2161 case Token::TK_Scalar: 2162 
getNext(); 2163 return new (NodeAllocator) 2164 ScalarNode( stream.CurrentDoc 2165 , AnchorInfo.Range.substr(1) 2166 , TagInfo.Range 2167 , T.Range); 2168 case Token::TK_Key: 2169 // Don't eat the TK_Key, KeyValueNode expects it. 2170 return new (NodeAllocator) 2171 MappingNode( stream.CurrentDoc 2172 , AnchorInfo.Range.substr(1) 2173 , TagInfo.Range 2174 , MappingNode::MT_Inline); 2175 case Token::TK_DocumentStart: 2176 case Token::TK_DocumentEnd: 2177 case Token::TK_StreamEnd: 2178 default: 2179 // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not 2180 // !!null null. 2181 return new (NodeAllocator) NullNode(stream.CurrentDoc); 2182 case Token::TK_Error: 2183 return 0; 2184 } 2185 llvm_unreachable("Control flow shouldn't reach here."); 2186 return 0; 2187 } 2188 2189 bool Document::parseDirectives() { 2190 bool isDirective = false; 2191 while (true) { 2192 Token T = peekNext(); 2193 if (T.Kind == Token::TK_TagDirective) { 2194 parseTAGDirective(); 2195 isDirective = true; 2196 } else if (T.Kind == Token::TK_VersionDirective) { 2197 parseYAMLDirective(); 2198 isDirective = true; 2199 } else 2200 break; 2201 } 2202 return isDirective; 2203 } 2204 2205 void Document::parseYAMLDirective() { 2206 getNext(); // Eat %YAML <version> 2207 } 2208 2209 void Document::parseTAGDirective() { 2210 Token Tag = getNext(); // %TAG <handle> <prefix> 2211 StringRef T = Tag.Range; 2212 // Strip %TAG 2213 T = T.substr(T.find_first_of(" \t")).ltrim(" \t"); 2214 std::size_t HandleEnd = T.find_first_of(" \t"); 2215 StringRef TagHandle = T.substr(0, HandleEnd); 2216 StringRef TagPrefix = T.substr(HandleEnd).ltrim(" \t"); 2217 TagMap[TagHandle] = TagPrefix; 2218 } 2219 2220 bool Document::expectToken(int TK) { 2221 Token T = getNext(); 2222 if (T.Kind != TK) { 2223 setError("Unexpected token", T); 2224 return false; 2225 } 2226 return true; 2227 } 2228