1 //===--- YAMLParser.cpp - Simple YAML parser ------------------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 //  This file implements a YAML parser.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "llvm/Support/YAMLParser.h"
15 #include "llvm/ADT/STLExtras.h"
16 #include "llvm/ADT/SmallString.h"
17 #include "llvm/ADT/SmallVector.h"
18 #include "llvm/ADT/StringExtras.h"
19 #include "llvm/ADT/Twine.h"
20 #include "llvm/ADT/ilist.h"
21 #include "llvm/ADT/ilist_node.h"
22 #include "llvm/Support/ErrorHandling.h"
23 #include "llvm/Support/MemoryBuffer.h"
24 #include "llvm/Support/SourceMgr.h"
25 #include "llvm/Support/raw_ostream.h"
26 
27 using namespace llvm;
28 using namespace yaml;
29 
30 enum UnicodeEncodingForm {
31   UEF_UTF32_LE, ///< UTF-32 Little Endian
32   UEF_UTF32_BE, ///< UTF-32 Big Endian
33   UEF_UTF16_LE, ///< UTF-16 Little Endian
34   UEF_UTF16_BE, ///< UTF-16 Big Endian
35   UEF_UTF8,     ///< UTF-8 or ascii.
36   UEF_Unknown   ///< Not a valid Unicode encoding.
37 };
38 
39 /// EncodingInfo - Holds the encoding type and length of the byte order mark if
40 ///                it exists. Length is in {0, 2, 3, 4}.
41 typedef std::pair<UnicodeEncodingForm, unsigned> EncodingInfo;
42 
43 /// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode
44 ///                      encoding form of \a Input.
45 ///
46 /// @param Input A string of length 0 or more.
47 /// @returns An EncodingInfo indicating the Unicode encoding form of the input
48 ///          and how long the byte order mark is if one exists.
49 static EncodingInfo getUnicodeEncoding(StringRef Input) {
50   if (Input.size() == 0)
51     return std::make_pair(UEF_Unknown, 0);
52 
53   switch (uint8_t(Input[0])) {
54   case 0x00:
55     if (Input.size() >= 4) {
56       if (  Input[1] == 0
57          && uint8_t(Input[2]) == 0xFE
58          && uint8_t(Input[3]) == 0xFF)
59         return std::make_pair(UEF_UTF32_BE, 4);
60       if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0)
61         return std::make_pair(UEF_UTF32_BE, 0);
62     }
63 
64     if (Input.size() >= 2 && Input[1] != 0)
65       return std::make_pair(UEF_UTF16_BE, 0);
66     return std::make_pair(UEF_Unknown, 0);
67   case 0xFF:
68     if (  Input.size() >= 4
69        && uint8_t(Input[1]) == 0xFE
70        && Input[2] == 0
71        && Input[3] == 0)
72       return std::make_pair(UEF_UTF32_LE, 4);
73 
74     if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE)
75       return std::make_pair(UEF_UTF16_LE, 2);
76     return std::make_pair(UEF_Unknown, 0);
77   case 0xFE:
78     if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF)
79       return std::make_pair(UEF_UTF16_BE, 2);
80     return std::make_pair(UEF_Unknown, 0);
81   case 0xEF:
82     if (  Input.size() >= 3
83        && uint8_t(Input[1]) == 0xBB
84        && uint8_t(Input[2]) == 0xBF)
85       return std::make_pair(UEF_UTF8, 3);
86     return std::make_pair(UEF_Unknown, 0);
87   }
88 
89   // It could still be utf-32 or utf-16.
90   if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0)
91     return std::make_pair(UEF_UTF32_LE, 0);
92 
93   if (Input.size() >= 2 && Input[1] == 0)
94     return std::make_pair(UEF_UTF16_LE, 0);
95 
96   return std::make_pair(UEF_UTF8, 0);
97 }
98 
99 namespace llvm {
100 namespace yaml {
101 /// Pin the vtables to this file.
102 void Node::anchor() {}
103 void NullNode::anchor() {}
104 void ScalarNode::anchor() {}
105 void BlockScalarNode::anchor() {}
106 void KeyValueNode::anchor() {}
107 void MappingNode::anchor() {}
108 void SequenceNode::anchor() {}
109 void AliasNode::anchor() {}
110 
111 /// Token - A single YAML token.
112 struct Token : ilist_node<Token> {
113   enum TokenKind {
114     TK_Error, // Uninitialized token.
115     TK_StreamStart,
116     TK_StreamEnd,
117     TK_VersionDirective,
118     TK_TagDirective,
119     TK_DocumentStart,
120     TK_DocumentEnd,
121     TK_BlockEntry,
122     TK_BlockEnd,
123     TK_BlockSequenceStart,
124     TK_BlockMappingStart,
125     TK_FlowEntry,
126     TK_FlowSequenceStart,
127     TK_FlowSequenceEnd,
128     TK_FlowMappingStart,
129     TK_FlowMappingEnd,
130     TK_Key,
131     TK_Value,
132     TK_Scalar,
133     TK_BlockScalar,
134     TK_Alias,
135     TK_Anchor,
136     TK_Tag
137   } Kind;
138 
139   /// A string of length 0 or more whose begin() points to the logical location
140   /// of the token in the input.
141   StringRef Range;
142 
143   /// The value of a block scalar node.
144   std::string Value;
145 
146   Token() : Kind(TK_Error) {}
147 };
148 }
149 }
150 
151 namespace llvm {
152 template<>
153 struct ilist_node_traits<Token> {
154   Token *createNode(const Token &V) {
155     return new (Alloc.Allocate<Token>()) Token(V);
156   }
157   static void deleteNode(Token *V) { V->~Token(); }
158 
159   void addNodeToList(Token *) {}
160   void removeNodeFromList(Token *) {}
161   void transferNodesFromList(ilist_node_traits &    /*SrcTraits*/,
162                              ilist_iterator<Token> /*first*/,
163                              ilist_iterator<Token> /*last*/) {}
164 
165   BumpPtrAllocator Alloc;
166 };
167 }
168 
169 typedef ilist<Token> TokenQueueT;
170 
171 namespace {
172 /// @brief This struct is used to track simple keys.
173 ///
174 /// Simple keys are handled by creating an entry in SimpleKeys for each Token
175 /// which could legally be the start of a simple key. When peekNext is called,
176 /// if the Token To be returned is referenced by a SimpleKey, we continue
177 /// tokenizing until that potential simple key has either been found to not be
178 /// a simple key (we moved on to the next line or went further than 1024 chars).
179 /// Or when we run into a Value, and then insert a Key token (and possibly
180 /// others) before the SimpleKey's Tok.
181 struct SimpleKey {
182   TokenQueueT::iterator Tok;
183   unsigned Column;
184   unsigned Line;
185   unsigned FlowLevel;
186   bool IsRequired;
187 
188   bool operator ==(const SimpleKey &Other) {
189     return Tok == Other.Tok;
190   }
191 };
192 }
193 
194 /// @brief The Unicode scalar value of a UTF-8 minimal well-formed code unit
195 ///        subsequence and the subsequence's length in code units (uint8_t).
196 ///        A length of 0 represents an error.
197 typedef std::pair<uint32_t, unsigned> UTF8Decoded;
198 
199 static UTF8Decoded decodeUTF8(StringRef Range) {
200   StringRef::iterator Position= Range.begin();
201   StringRef::iterator End = Range.end();
202   // 1 byte: [0x00, 0x7f]
203   // Bit pattern: 0xxxxxxx
204   if ((*Position & 0x80) == 0) {
205      return std::make_pair(*Position, 1);
206   }
207   // 2 bytes: [0x80, 0x7ff]
208   // Bit pattern: 110xxxxx 10xxxxxx
209   if (Position + 1 != End &&
210       ((*Position & 0xE0) == 0xC0) &&
211       ((*(Position + 1) & 0xC0) == 0x80)) {
212     uint32_t codepoint = ((*Position & 0x1F) << 6) |
213                           (*(Position + 1) & 0x3F);
214     if (codepoint >= 0x80)
215       return std::make_pair(codepoint, 2);
216   }
217   // 3 bytes: [0x8000, 0xffff]
218   // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx
219   if (Position + 2 != End &&
220       ((*Position & 0xF0) == 0xE0) &&
221       ((*(Position + 1) & 0xC0) == 0x80) &&
222       ((*(Position + 2) & 0xC0) == 0x80)) {
223     uint32_t codepoint = ((*Position & 0x0F) << 12) |
224                          ((*(Position + 1) & 0x3F) << 6) |
225                           (*(Position + 2) & 0x3F);
226     // Codepoints between 0xD800 and 0xDFFF are invalid, as
227     // they are high / low surrogate halves used by UTF-16.
228     if (codepoint >= 0x800 &&
229         (codepoint < 0xD800 || codepoint > 0xDFFF))
230       return std::make_pair(codepoint, 3);
231   }
232   // 4 bytes: [0x10000, 0x10FFFF]
233   // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
234   if (Position + 3 != End &&
235       ((*Position & 0xF8) == 0xF0) &&
236       ((*(Position + 1) & 0xC0) == 0x80) &&
237       ((*(Position + 2) & 0xC0) == 0x80) &&
238       ((*(Position + 3) & 0xC0) == 0x80)) {
239     uint32_t codepoint = ((*Position & 0x07) << 18) |
240                          ((*(Position + 1) & 0x3F) << 12) |
241                          ((*(Position + 2) & 0x3F) << 6) |
242                           (*(Position + 3) & 0x3F);
243     if (codepoint >= 0x10000 && codepoint <= 0x10FFFF)
244       return std::make_pair(codepoint, 4);
245   }
246   return std::make_pair(0, 0);
247 }
248 
249 namespace llvm {
250 namespace yaml {
251 /// @brief Scans YAML tokens from a MemoryBuffer.
252 class Scanner {
253 public:
254   Scanner(StringRef Input, SourceMgr &SM, bool ShowColors = true);
255   Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors = true);
256 
257   /// @brief Parse the next token and return it without popping it.
258   Token &peekNext();
259 
260   /// @brief Parse the next token and pop it from the queue.
261   Token getNext();
262 
263   void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message,
264                   ArrayRef<SMRange> Ranges = None) {
265     SM.PrintMessage(Loc, Kind, Message, Ranges, /* FixIts= */ None, ShowColors);
266   }
267 
268   void setError(const Twine &Message, StringRef::iterator Position) {
269     if (Current >= End)
270       Current = End - 1;
271 
272     // Don't print out more errors after the first one we encounter. The rest
273     // are just the result of the first, and have no meaning.
274     if (!Failed)
275       printError(SMLoc::getFromPointer(Current), SourceMgr::DK_Error, Message);
276     Failed = true;
277   }
278 
279   void setError(const Twine &Message) {
280     setError(Message, Current);
281   }
282 
283   /// @brief Returns true if an error occurred while parsing.
284   bool failed() {
285     return Failed;
286   }
287 
288 private:
289   void init(MemoryBufferRef Buffer);
290 
291   StringRef currentInput() {
292     return StringRef(Current, End - Current);
293   }
294 
295   /// @brief Decode a UTF-8 minimal well-formed code unit subsequence starting
296   ///        at \a Position.
297   ///
298   /// If the UTF-8 code units starting at Position do not form a well-formed
299   /// code unit subsequence, then the Unicode scalar value is 0, and the length
300   /// is 0.
301   UTF8Decoded decodeUTF8(StringRef::iterator Position) {
302     return ::decodeUTF8(StringRef(Position, End - Position));
303   }
304 
305   // The following functions are based on the gramar rules in the YAML spec. The
306   // style of the function names it meant to closely match how they are written
307   // in the spec. The number within the [] is the number of the grammar rule in
308   // the spec.
309   //
310   // See 4.2 [Production Naming Conventions] for the meaning of the prefixes.
311   //
312   // c-
313   //   A production starting and ending with a special character.
314   // b-
315   //   A production matching a single line break.
316   // nb-
317   //   A production starting and ending with a non-break character.
318   // s-
319   //   A production starting and ending with a white space character.
320   // ns-
321   //   A production starting and ending with a non-space character.
322   // l-
323   //   A production matching complete line(s).
324 
325   /// @brief Skip a single nb-char[27] starting at Position.
326   ///
327   /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE]
328   ///                  | [0xFF00-0xFFFD] | [0x10000-0x10FFFF]
329   ///
330   /// @returns The code unit after the nb-char, or Position if it's not an
331   ///          nb-char.
332   StringRef::iterator skip_nb_char(StringRef::iterator Position);
333 
334   /// @brief Skip a single b-break[28] starting at Position.
335   ///
336   /// A b-break is 0xD 0xA | 0xD | 0xA
337   ///
338   /// @returns The code unit after the b-break, or Position if it's not a
339   ///          b-break.
340   StringRef::iterator skip_b_break(StringRef::iterator Position);
341 
342   /// Skip a single s-space[31] starting at Position.
343   ///
344   /// An s-space is 0x20
345   ///
346   /// @returns The code unit after the s-space, or Position if it's not a
347   ///          s-space.
348   StringRef::iterator skip_s_space(StringRef::iterator Position);
349 
350   /// @brief Skip a single s-white[33] starting at Position.
351   ///
352   /// A s-white is 0x20 | 0x9
353   ///
354   /// @returns The code unit after the s-white, or Position if it's not a
355   ///          s-white.
356   StringRef::iterator skip_s_white(StringRef::iterator Position);
357 
358   /// @brief Skip a single ns-char[34] starting at Position.
359   ///
360   /// A ns-char is nb-char - s-white
361   ///
362   /// @returns The code unit after the ns-char, or Position if it's not a
363   ///          ns-char.
364   StringRef::iterator skip_ns_char(StringRef::iterator Position);
365 
366   typedef StringRef::iterator (Scanner::*SkipWhileFunc)(StringRef::iterator);
367   /// @brief Skip minimal well-formed code unit subsequences until Func
368   ///        returns its input.
369   ///
370   /// @returns The code unit after the last minimal well-formed code unit
371   ///          subsequence that Func accepted.
372   StringRef::iterator skip_while( SkipWhileFunc Func
373                                 , StringRef::iterator Position);
374 
375   /// Skip minimal well-formed code unit subsequences until Func returns its
376   /// input.
377   void advanceWhile(SkipWhileFunc Func);
378 
379   /// @brief Scan ns-uri-char[39]s starting at Cur.
380   ///
381   /// This updates Cur and Column while scanning.
382   ///
383   /// @returns A StringRef starting at Cur which covers the longest contiguous
384   ///          sequence of ns-uri-char.
385   StringRef scan_ns_uri_char();
386 
387   /// @brief Consume a minimal well-formed code unit subsequence starting at
388   ///        \a Cur. Return false if it is not the same Unicode scalar value as
389   ///        \a Expected. This updates \a Column.
390   bool consume(uint32_t Expected);
391 
392   /// @brief Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column.
393   void skip(uint32_t Distance);
394 
395   /// @brief Return true if the minimal well-formed code unit subsequence at
396   ///        Pos is whitespace or a new line
397   bool isBlankOrBreak(StringRef::iterator Position);
398 
399   /// Consume a single b-break[28] if it's present at the current position.
400   ///
401   /// Return false if the code unit at the current position isn't a line break.
402   bool consumeLineBreakIfPresent();
403 
404   /// @brief If IsSimpleKeyAllowed, create and push_back a new SimpleKey.
405   void saveSimpleKeyCandidate( TokenQueueT::iterator Tok
406                              , unsigned AtColumn
407                              , bool IsRequired);
408 
409   /// @brief Remove simple keys that can no longer be valid simple keys.
410   ///
411   /// Invalid simple keys are not on the current line or are further than 1024
412   /// columns back.
413   void removeStaleSimpleKeyCandidates();
414 
415   /// @brief Remove all simple keys on FlowLevel \a Level.
416   void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level);
417 
418   /// @brief Unroll indentation in \a Indents back to \a Col. Creates BlockEnd
419   ///        tokens if needed.
420   bool unrollIndent(int ToColumn);
421 
422   /// @brief Increase indent to \a Col. Creates \a Kind token at \a InsertPoint
423   ///        if needed.
424   bool rollIndent( int ToColumn
425                  , Token::TokenKind Kind
426                  , TokenQueueT::iterator InsertPoint);
427 
428   /// @brief Skip a single-line comment when the comment starts at the current
429   /// position of the scanner.
430   void skipComment();
431 
432   /// @brief Skip whitespace and comments until the start of the next token.
433   void scanToNextToken();
434 
435   /// @brief Must be the first token generated.
436   bool scanStreamStart();
437 
438   /// @brief Generate tokens needed to close out the stream.
439   bool scanStreamEnd();
440 
441   /// @brief Scan a %BLAH directive.
442   bool scanDirective();
443 
444   /// @brief Scan a ... or ---.
445   bool scanDocumentIndicator(bool IsStart);
446 
447   /// @brief Scan a [ or { and generate the proper flow collection start token.
448   bool scanFlowCollectionStart(bool IsSequence);
449 
450   /// @brief Scan a ] or } and generate the proper flow collection end token.
451   bool scanFlowCollectionEnd(bool IsSequence);
452 
453   /// @brief Scan the , that separates entries in a flow collection.
454   bool scanFlowEntry();
455 
456   /// @brief Scan the - that starts block sequence entries.
457   bool scanBlockEntry();
458 
459   /// @brief Scan an explicit ? indicating a key.
460   bool scanKey();
461 
462   /// @brief Scan an explicit : indicating a value.
463   bool scanValue();
464 
465   /// @brief Scan a quoted scalar.
466   bool scanFlowScalar(bool IsDoubleQuoted);
467 
468   /// @brief Scan an unquoted scalar.
469   bool scanPlainScalar();
470 
471   /// @brief Scan an Alias or Anchor starting with * or &.
472   bool scanAliasOrAnchor(bool IsAlias);
473 
474   /// @brief Scan a block scalar starting with | or >.
475   bool scanBlockScalar(bool IsLiteral);
476 
477   /// Scan a chomping indicator in a block scalar header.
478   char scanBlockChompingIndicator();
479 
480   /// Scan an indentation indicator in a block scalar header.
481   unsigned scanBlockIndentationIndicator();
482 
483   /// Scan a block scalar header.
484   ///
485   /// Return false if an error occurred.
486   bool scanBlockScalarHeader(char &ChompingIndicator, unsigned &IndentIndicator,
487                              bool &IsDone);
488 
489   /// Look for the indentation level of a block scalar.
490   ///
491   /// Return false if an error occurred.
492   bool findBlockScalarIndent(unsigned &BlockIndent, unsigned BlockExitIndent,
493                              unsigned &LineBreaks, bool &IsDone);
494 
495   /// Scan the indentation of a text line in a block scalar.
496   ///
497   /// Return false if an error occurred.
498   bool scanBlockScalarIndent(unsigned BlockIndent, unsigned BlockExitIndent,
499                              bool &IsDone);
500 
501   /// @brief Scan a tag of the form !stuff.
502   bool scanTag();
503 
504   /// @brief Dispatch to the next scanning function based on \a *Cur.
505   bool fetchMoreTokens();
506 
507   /// @brief The SourceMgr used for diagnostics and buffer management.
508   SourceMgr &SM;
509 
510   /// @brief The original input.
511   MemoryBufferRef InputBuffer;
512 
513   /// @brief The current position of the scanner.
514   StringRef::iterator Current;
515 
516   /// @brief The end of the input (one past the last character).
517   StringRef::iterator End;
518 
519   /// @brief Current YAML indentation level in spaces.
520   int Indent;
521 
522   /// @brief Current column number in Unicode code points.
523   unsigned Column;
524 
525   /// @brief Current line number.
526   unsigned Line;
527 
528   /// @brief How deep we are in flow style containers. 0 Means at block level.
529   unsigned FlowLevel;
530 
531   /// @brief Are we at the start of the stream?
532   bool IsStartOfStream;
533 
534   /// @brief Can the next token be the start of a simple key?
535   bool IsSimpleKeyAllowed;
536 
537   /// @brief True if an error has occurred.
538   bool Failed;
539 
540   /// @brief Should colors be used when printing out the diagnostic messages?
541   bool ShowColors;
542 
543   /// @brief Queue of tokens. This is required to queue up tokens while looking
544   ///        for the end of a simple key. And for cases where a single character
545   ///        can produce multiple tokens (e.g. BlockEnd).
546   TokenQueueT TokenQueue;
547 
548   /// @brief Indentation levels.
549   SmallVector<int, 4> Indents;
550 
551   /// @brief Potential simple keys.
552   SmallVector<SimpleKey, 4> SimpleKeys;
553 };
554 
555 } // end namespace yaml
556 } // end namespace llvm
557 
558 /// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result.
559 static void encodeUTF8( uint32_t UnicodeScalarValue
560                       , SmallVectorImpl<char> &Result) {
561   if (UnicodeScalarValue <= 0x7F) {
562     Result.push_back(UnicodeScalarValue & 0x7F);
563   } else if (UnicodeScalarValue <= 0x7FF) {
564     uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6);
565     uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F);
566     Result.push_back(FirstByte);
567     Result.push_back(SecondByte);
568   } else if (UnicodeScalarValue <= 0xFFFF) {
569     uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12);
570     uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6);
571     uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F);
572     Result.push_back(FirstByte);
573     Result.push_back(SecondByte);
574     Result.push_back(ThirdByte);
575   } else if (UnicodeScalarValue <= 0x10FFFF) {
576     uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18);
577     uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12);
578     uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6);
579     uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F);
580     Result.push_back(FirstByte);
581     Result.push_back(SecondByte);
582     Result.push_back(ThirdByte);
583     Result.push_back(FourthByte);
584   }
585 }
586 
587 bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) {
588   SourceMgr SM;
589   Scanner scanner(Input, SM);
590   while (true) {
591     Token T = scanner.getNext();
592     switch (T.Kind) {
593     case Token::TK_StreamStart:
594       OS << "Stream-Start: ";
595       break;
596     case Token::TK_StreamEnd:
597       OS << "Stream-End: ";
598       break;
599     case Token::TK_VersionDirective:
600       OS << "Version-Directive: ";
601       break;
602     case Token::TK_TagDirective:
603       OS << "Tag-Directive: ";
604       break;
605     case Token::TK_DocumentStart:
606       OS << "Document-Start: ";
607       break;
608     case Token::TK_DocumentEnd:
609       OS << "Document-End: ";
610       break;
611     case Token::TK_BlockEntry:
612       OS << "Block-Entry: ";
613       break;
614     case Token::TK_BlockEnd:
615       OS << "Block-End: ";
616       break;
617     case Token::TK_BlockSequenceStart:
618       OS << "Block-Sequence-Start: ";
619       break;
620     case Token::TK_BlockMappingStart:
621       OS << "Block-Mapping-Start: ";
622       break;
623     case Token::TK_FlowEntry:
624       OS << "Flow-Entry: ";
625       break;
626     case Token::TK_FlowSequenceStart:
627       OS << "Flow-Sequence-Start: ";
628       break;
629     case Token::TK_FlowSequenceEnd:
630       OS << "Flow-Sequence-End: ";
631       break;
632     case Token::TK_FlowMappingStart:
633       OS << "Flow-Mapping-Start: ";
634       break;
635     case Token::TK_FlowMappingEnd:
636       OS << "Flow-Mapping-End: ";
637       break;
638     case Token::TK_Key:
639       OS << "Key: ";
640       break;
641     case Token::TK_Value:
642       OS << "Value: ";
643       break;
644     case Token::TK_Scalar:
645       OS << "Scalar: ";
646       break;
647     case Token::TK_BlockScalar:
648       OS << "Block Scalar: ";
649       break;
650     case Token::TK_Alias:
651       OS << "Alias: ";
652       break;
653     case Token::TK_Anchor:
654       OS << "Anchor: ";
655       break;
656     case Token::TK_Tag:
657       OS << "Tag: ";
658       break;
659     case Token::TK_Error:
660       break;
661     }
662     OS << T.Range << "\n";
663     if (T.Kind == Token::TK_StreamEnd)
664       break;
665     else if (T.Kind == Token::TK_Error)
666       return false;
667   }
668   return true;
669 }
670 
671 bool yaml::scanTokens(StringRef Input) {
672   llvm::SourceMgr SM;
673   llvm::yaml::Scanner scanner(Input, SM);
674   for (;;) {
675     llvm::yaml::Token T = scanner.getNext();
676     if (T.Kind == Token::TK_StreamEnd)
677       break;
678     else if (T.Kind == Token::TK_Error)
679       return false;
680   }
681   return true;
682 }
683 
684 std::string yaml::escape(StringRef Input) {
685   std::string EscapedInput;
686   for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) {
687     if (*i == '\\')
688       EscapedInput += "\\\\";
689     else if (*i == '"')
690       EscapedInput += "\\\"";
691     else if (*i == 0)
692       EscapedInput += "\\0";
693     else if (*i == 0x07)
694       EscapedInput += "\\a";
695     else if (*i == 0x08)
696       EscapedInput += "\\b";
697     else if (*i == 0x09)
698       EscapedInput += "\\t";
699     else if (*i == 0x0A)
700       EscapedInput += "\\n";
701     else if (*i == 0x0B)
702       EscapedInput += "\\v";
703     else if (*i == 0x0C)
704       EscapedInput += "\\f";
705     else if (*i == 0x0D)
706       EscapedInput += "\\r";
707     else if (*i == 0x1B)
708       EscapedInput += "\\e";
709     else if ((unsigned char)*i < 0x20) { // Control characters not handled above.
710       std::string HexStr = utohexstr(*i);
711       EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr;
712     } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence.
713       UTF8Decoded UnicodeScalarValue
714         = decodeUTF8(StringRef(i, Input.end() - i));
715       if (UnicodeScalarValue.second == 0) {
716         // Found invalid char.
717         SmallString<4> Val;
718         encodeUTF8(0xFFFD, Val);
719         EscapedInput.insert(EscapedInput.end(), Val.begin(), Val.end());
720         // FIXME: Error reporting.
721         return EscapedInput;
722       }
723       if (UnicodeScalarValue.first == 0x85)
724         EscapedInput += "\\N";
725       else if (UnicodeScalarValue.first == 0xA0)
726         EscapedInput += "\\_";
727       else if (UnicodeScalarValue.first == 0x2028)
728         EscapedInput += "\\L";
729       else if (UnicodeScalarValue.first == 0x2029)
730         EscapedInput += "\\P";
731       else {
732         std::string HexStr = utohexstr(UnicodeScalarValue.first);
733         if (HexStr.size() <= 2)
734           EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr;
735         else if (HexStr.size() <= 4)
736           EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr;
737         else if (HexStr.size() <= 8)
738           EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr;
739       }
740       i += UnicodeScalarValue.second - 1;
741     } else
742       EscapedInput.push_back(*i);
743   }
744   return EscapedInput;
745 }
746 
747 Scanner::Scanner(StringRef Input, SourceMgr &sm, bool ShowColors)
748     : SM(sm), ShowColors(ShowColors) {
749   init(MemoryBufferRef(Input, "YAML"));
750 }
751 
752 Scanner::Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors)
753     : SM(SM_), ShowColors(ShowColors) {
754   init(Buffer);
755 }
756 
757 void Scanner::init(MemoryBufferRef Buffer) {
758   InputBuffer = Buffer;
759   Current = InputBuffer.getBufferStart();
760   End = InputBuffer.getBufferEnd();
761   Indent = -1;
762   Column = 0;
763   Line = 0;
764   FlowLevel = 0;
765   IsStartOfStream = true;
766   IsSimpleKeyAllowed = true;
767   Failed = false;
768   std::unique_ptr<MemoryBuffer> InputBufferOwner =
769       MemoryBuffer::getMemBuffer(Buffer);
770   SM.AddNewSourceBuffer(std::move(InputBufferOwner), SMLoc());
771 }
772 
773 Token &Scanner::peekNext() {
774   // If the current token is a possible simple key, keep parsing until we
775   // can confirm.
776   bool NeedMore = false;
777   while (true) {
778     if (TokenQueue.empty() || NeedMore) {
779       if (!fetchMoreTokens()) {
780         TokenQueue.clear();
781         TokenQueue.push_back(Token());
782         return TokenQueue.front();
783       }
784     }
785     assert(!TokenQueue.empty() &&
786             "fetchMoreTokens lied about getting tokens!");
787 
788     removeStaleSimpleKeyCandidates();
789     SimpleKey SK;
790     SK.Tok = TokenQueue.begin();
791     if (!is_contained(SimpleKeys, SK))
792       break;
793     else
794       NeedMore = true;
795   }
796   return TokenQueue.front();
797 }
798 
799 Token Scanner::getNext() {
800   Token Ret = peekNext();
801   // TokenQueue can be empty if there was an error getting the next token.
802   if (!TokenQueue.empty())
803     TokenQueue.pop_front();
804 
805   // There cannot be any referenced Token's if the TokenQueue is empty. So do a
806   // quick deallocation of them all.
807   if (TokenQueue.empty()) {
808     TokenQueue.Alloc.Reset();
809   }
810 
811   return Ret;
812 }
813 
814 StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) {
815   if (Position == End)
816     return Position;
817   // Check 7 bit c-printable - b-char.
818   if (   *Position == 0x09
819       || (*Position >= 0x20 && *Position <= 0x7E))
820     return Position + 1;
821 
822   // Check for valid UTF-8.
823   if (uint8_t(*Position) & 0x80) {
824     UTF8Decoded u8d = decodeUTF8(Position);
825     if (   u8d.second != 0
826         && u8d.first != 0xFEFF
827         && ( u8d.first == 0x85
828           || ( u8d.first >= 0xA0
829             && u8d.first <= 0xD7FF)
830           || ( u8d.first >= 0xE000
831             && u8d.first <= 0xFFFD)
832           || ( u8d.first >= 0x10000
833             && u8d.first <= 0x10FFFF)))
834       return Position + u8d.second;
835   }
836   return Position;
837 }
838 
839 StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) {
840   if (Position == End)
841     return Position;
842   if (*Position == 0x0D) {
843     if (Position + 1 != End && *(Position + 1) == 0x0A)
844       return Position + 2;
845     return Position + 1;
846   }
847 
848   if (*Position == 0x0A)
849     return Position + 1;
850   return Position;
851 }
852 
853 StringRef::iterator Scanner::skip_s_space(StringRef::iterator Position) {
854   if (Position == End)
855     return Position;
856   if (*Position == ' ')
857     return Position + 1;
858   return Position;
859 }
860 
861 StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) {
862   if (Position == End)
863     return Position;
864   if (*Position == ' ' || *Position == '\t')
865     return Position + 1;
866   return Position;
867 }
868 
869 StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) {
870   if (Position == End)
871     return Position;
872   if (*Position == ' ' || *Position == '\t')
873     return Position;
874   return skip_nb_char(Position);
875 }
876 
877 StringRef::iterator Scanner::skip_while( SkipWhileFunc Func
878                                        , StringRef::iterator Position) {
879   while (true) {
880     StringRef::iterator i = (this->*Func)(Position);
881     if (i == Position)
882       break;
883     Position = i;
884   }
885   return Position;
886 }
887 
888 void Scanner::advanceWhile(SkipWhileFunc Func) {
889   auto Final = skip_while(Func, Current);
890   Column += Final - Current;
891   Current = Final;
892 }
893 
894 static bool is_ns_hex_digit(const char C) {
895   return    (C >= '0' && C <= '9')
896          || (C >= 'a' && C <= 'z')
897          || (C >= 'A' && C <= 'Z');
898 }
899 
900 static bool is_ns_word_char(const char C) {
901   return    C == '-'
902          || (C >= 'a' && C <= 'z')
903          || (C >= 'A' && C <= 'Z');
904 }
905 
906 StringRef Scanner::scan_ns_uri_char() {
907   StringRef::iterator Start = Current;
908   while (true) {
909     if (Current == End)
910       break;
911     if ((   *Current == '%'
912           && Current + 2 < End
913           && is_ns_hex_digit(*(Current + 1))
914           && is_ns_hex_digit(*(Current + 2)))
915         || is_ns_word_char(*Current)
916         || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]")
917           != StringRef::npos) {
918       ++Current;
919       ++Column;
920     } else
921       break;
922   }
923   return StringRef(Start, Current - Start);
924 }
925 
926 bool Scanner::consume(uint32_t Expected) {
927   if (Expected >= 0x80)
928     report_fatal_error("Not dealing with this yet");
929   if (Current == End)
930     return false;
931   if (uint8_t(*Current) >= 0x80)
932     report_fatal_error("Not dealing with this yet");
933   if (uint8_t(*Current) == Expected) {
934     ++Current;
935     ++Column;
936     return true;
937   }
938   return false;
939 }
940 
941 void Scanner::skip(uint32_t Distance) {
942   Current += Distance;
943   Column += Distance;
944   assert(Current <= End && "Skipped past the end");
945 }
946 
947 bool Scanner::isBlankOrBreak(StringRef::iterator Position) {
948   if (Position == End)
949     return false;
950   return *Position == ' ' || *Position == '\t' || *Position == '\r' ||
951          *Position == '\n';
952 }
953 
954 bool Scanner::consumeLineBreakIfPresent() {
955   auto Next = skip_b_break(Current);
956   if (Next == Current)
957     return false;
958   Column = 0;
959   ++Line;
960   Current = Next;
961   return true;
962 }
963 
964 void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok
965                                     , unsigned AtColumn
966                                     , bool IsRequired) {
967   if (IsSimpleKeyAllowed) {
968     SimpleKey SK;
969     SK.Tok = Tok;
970     SK.Line = Line;
971     SK.Column = AtColumn;
972     SK.IsRequired = IsRequired;
973     SK.FlowLevel = FlowLevel;
974     SimpleKeys.push_back(SK);
975   }
976 }
977 
978 void Scanner::removeStaleSimpleKeyCandidates() {
979   for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin();
980                                             i != SimpleKeys.end();) {
981     if (i->Line != Line || i->Column + 1024 < Column) {
982       if (i->IsRequired)
983         setError( "Could not find expected : for simple key"
984                 , i->Tok->Range.begin());
985       i = SimpleKeys.erase(i);
986     } else
987       ++i;
988   }
989 }
990 
991 void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) {
992   if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level)
993     SimpleKeys.pop_back();
994 }
995 
996 bool Scanner::unrollIndent(int ToColumn) {
997   Token T;
998   // Indentation is ignored in flow.
999   if (FlowLevel != 0)
1000     return true;
1001 
1002   while (Indent > ToColumn) {
1003     T.Kind = Token::TK_BlockEnd;
1004     T.Range = StringRef(Current, 1);
1005     TokenQueue.push_back(T);
1006     Indent = Indents.pop_back_val();
1007   }
1008 
1009   return true;
1010 }
1011 
1012 bool Scanner::rollIndent( int ToColumn
1013                         , Token::TokenKind Kind
1014                         , TokenQueueT::iterator InsertPoint) {
1015   if (FlowLevel)
1016     return true;
1017   if (Indent < ToColumn) {
1018     Indents.push_back(Indent);
1019     Indent = ToColumn;
1020 
1021     Token T;
1022     T.Kind = Kind;
1023     T.Range = StringRef(Current, 0);
1024     TokenQueue.insert(InsertPoint, T);
1025   }
1026   return true;
1027 }
1028 
1029 void Scanner::skipComment() {
1030   if (*Current != '#')
1031     return;
1032   while (true) {
1033     // This may skip more than one byte, thus Column is only incremented
1034     // for code points.
1035     StringRef::iterator I = skip_nb_char(Current);
1036     if (I == Current)
1037       break;
1038     Current = I;
1039     ++Column;
1040   }
1041 }
1042 
1043 void Scanner::scanToNextToken() {
1044   while (true) {
1045     while (*Current == ' ' || *Current == '\t') {
1046       skip(1);
1047     }
1048 
1049     skipComment();
1050 
1051     // Skip EOL.
1052     StringRef::iterator i = skip_b_break(Current);
1053     if (i == Current)
1054       break;
1055     Current = i;
1056     ++Line;
1057     Column = 0;
1058     // New lines may start a simple key.
1059     if (!FlowLevel)
1060       IsSimpleKeyAllowed = true;
1061   }
1062 }
1063 
1064 bool Scanner::scanStreamStart() {
1065   IsStartOfStream = false;
1066 
1067   EncodingInfo EI = getUnicodeEncoding(currentInput());
1068 
1069   Token T;
1070   T.Kind = Token::TK_StreamStart;
1071   T.Range = StringRef(Current, EI.second);
1072   TokenQueue.push_back(T);
1073   Current += EI.second;
1074   return true;
1075 }
1076 
1077 bool Scanner::scanStreamEnd() {
1078   // Force an ending new line if one isn't present.
1079   if (Column != 0) {
1080     Column = 0;
1081     ++Line;
1082   }
1083 
1084   unrollIndent(-1);
1085   SimpleKeys.clear();
1086   IsSimpleKeyAllowed = false;
1087 
1088   Token T;
1089   T.Kind = Token::TK_StreamEnd;
1090   T.Range = StringRef(Current, 0);
1091   TokenQueue.push_back(T);
1092   return true;
1093 }
1094 
1095 bool Scanner::scanDirective() {
1096   // Reset the indentation level.
1097   unrollIndent(-1);
1098   SimpleKeys.clear();
1099   IsSimpleKeyAllowed = false;
1100 
1101   StringRef::iterator Start = Current;
1102   consume('%');
1103   StringRef::iterator NameStart = Current;
1104   Current = skip_while(&Scanner::skip_ns_char, Current);
1105   StringRef Name(NameStart, Current - NameStart);
1106   Current = skip_while(&Scanner::skip_s_white, Current);
1107 
1108   Token T;
1109   if (Name == "YAML") {
1110     Current = skip_while(&Scanner::skip_ns_char, Current);
1111     T.Kind = Token::TK_VersionDirective;
1112     T.Range = StringRef(Start, Current - Start);
1113     TokenQueue.push_back(T);
1114     return true;
1115   } else if(Name == "TAG") {
1116     Current = skip_while(&Scanner::skip_ns_char, Current);
1117     Current = skip_while(&Scanner::skip_s_white, Current);
1118     Current = skip_while(&Scanner::skip_ns_char, Current);
1119     T.Kind = Token::TK_TagDirective;
1120     T.Range = StringRef(Start, Current - Start);
1121     TokenQueue.push_back(T);
1122     return true;
1123   }
1124   return false;
1125 }
1126 
1127 bool Scanner::scanDocumentIndicator(bool IsStart) {
1128   unrollIndent(-1);
1129   SimpleKeys.clear();
1130   IsSimpleKeyAllowed = false;
1131 
1132   Token T;
1133   T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd;
1134   T.Range = StringRef(Current, 3);
1135   skip(3);
1136   TokenQueue.push_back(T);
1137   return true;
1138 }
1139 
1140 bool Scanner::scanFlowCollectionStart(bool IsSequence) {
1141   Token T;
1142   T.Kind = IsSequence ? Token::TK_FlowSequenceStart
1143                       : Token::TK_FlowMappingStart;
1144   T.Range = StringRef(Current, 1);
1145   skip(1);
1146   TokenQueue.push_back(T);
1147 
1148   // [ and { may begin a simple key.
1149   saveSimpleKeyCandidate(--TokenQueue.end(), Column - 1, false);
1150 
1151   // And may also be followed by a simple key.
1152   IsSimpleKeyAllowed = true;
1153   ++FlowLevel;
1154   return true;
1155 }
1156 
1157 bool Scanner::scanFlowCollectionEnd(bool IsSequence) {
1158   removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
1159   IsSimpleKeyAllowed = false;
1160   Token T;
1161   T.Kind = IsSequence ? Token::TK_FlowSequenceEnd
1162                       : Token::TK_FlowMappingEnd;
1163   T.Range = StringRef(Current, 1);
1164   skip(1);
1165   TokenQueue.push_back(T);
1166   if (FlowLevel)
1167     --FlowLevel;
1168   return true;
1169 }
1170 
1171 bool Scanner::scanFlowEntry() {
1172   removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
1173   IsSimpleKeyAllowed = true;
1174   Token T;
1175   T.Kind = Token::TK_FlowEntry;
1176   T.Range = StringRef(Current, 1);
1177   skip(1);
1178   TokenQueue.push_back(T);
1179   return true;
1180 }
1181 
1182 bool Scanner::scanBlockEntry() {
1183   rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end());
1184   removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
1185   IsSimpleKeyAllowed = true;
1186   Token T;
1187   T.Kind = Token::TK_BlockEntry;
1188   T.Range = StringRef(Current, 1);
1189   skip(1);
1190   TokenQueue.push_back(T);
1191   return true;
1192 }
1193 
1194 bool Scanner::scanKey() {
1195   if (!FlowLevel)
1196     rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end());
1197 
1198   removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
1199   IsSimpleKeyAllowed = !FlowLevel;
1200 
1201   Token T;
1202   T.Kind = Token::TK_Key;
1203   T.Range = StringRef(Current, 1);
1204   skip(1);
1205   TokenQueue.push_back(T);
1206   return true;
1207 }
1208 
1209 bool Scanner::scanValue() {
1210   // If the previous token could have been a simple key, insert the key token
1211   // into the token queue.
1212   if (!SimpleKeys.empty()) {
1213     SimpleKey SK = SimpleKeys.pop_back_val();
1214     Token T;
1215     T.Kind = Token::TK_Key;
1216     T.Range = SK.Tok->Range;
1217     TokenQueueT::iterator i, e;
1218     for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) {
1219       if (i == SK.Tok)
1220         break;
1221     }
1222     assert(i != e && "SimpleKey not in token queue!");
1223     i = TokenQueue.insert(i, T);
1224 
1225     // We may also need to add a Block-Mapping-Start token.
1226     rollIndent(SK.Column, Token::TK_BlockMappingStart, i);
1227 
1228     IsSimpleKeyAllowed = false;
1229   } else {
1230     if (!FlowLevel)
1231       rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end());
1232     IsSimpleKeyAllowed = !FlowLevel;
1233   }
1234 
1235   Token T;
1236   T.Kind = Token::TK_Value;
1237   T.Range = StringRef(Current, 1);
1238   skip(1);
1239   TokenQueue.push_back(T);
1240   return true;
1241 }
1242 
1243 // Forbidding inlining improves performance by roughly 20%.
1244 // FIXME: Remove once llvm optimizes this to the faster version without hints.
1245 LLVM_ATTRIBUTE_NOINLINE static bool
1246 wasEscaped(StringRef::iterator First, StringRef::iterator Position);
1247 
1248 // Returns whether a character at 'Position' was escaped with a leading '\'.
1249 // 'First' specifies the position of the first character in the string.
1250 static bool wasEscaped(StringRef::iterator First,
1251                        StringRef::iterator Position) {
1252   assert(Position - 1 >= First);
1253   StringRef::iterator I = Position - 1;
1254   // We calculate the number of consecutive '\'s before the current position
1255   // by iterating backwards through our string.
1256   while (I >= First && *I == '\\') --I;
1257   // (Position - 1 - I) now contains the number of '\'s before the current
1258   // position. If it is odd, the character at 'Position' was escaped.
1259   return (Position - 1 - I) % 2 == 1;
1260 }
1261 
1262 bool Scanner::scanFlowScalar(bool IsDoubleQuoted) {
1263   StringRef::iterator Start = Current;
1264   unsigned ColStart = Column;
1265   if (IsDoubleQuoted) {
1266     do {
1267       ++Current;
1268       while (Current != End && *Current != '"')
1269         ++Current;
1270       // Repeat until the previous character was not a '\' or was an escaped
1271       // backslash.
1272     } while (   Current != End
1273              && *(Current - 1) == '\\'
1274              && wasEscaped(Start + 1, Current));
1275   } else {
1276     skip(1);
1277     while (true) {
1278       // Skip a ' followed by another '.
1279       if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') {
1280         skip(2);
1281         continue;
1282       } else if (*Current == '\'')
1283         break;
1284       StringRef::iterator i = skip_nb_char(Current);
1285       if (i == Current) {
1286         i = skip_b_break(Current);
1287         if (i == Current)
1288           break;
1289         Current = i;
1290         Column = 0;
1291         ++Line;
1292       } else {
1293         if (i == End)
1294           break;
1295         Current = i;
1296         ++Column;
1297       }
1298     }
1299   }
1300 
1301   if (Current == End) {
1302     setError("Expected quote at end of scalar", Current);
1303     return false;
1304   }
1305 
1306   skip(1); // Skip ending quote.
1307   Token T;
1308   T.Kind = Token::TK_Scalar;
1309   T.Range = StringRef(Start, Current - Start);
1310   TokenQueue.push_back(T);
1311 
1312   saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);
1313 
1314   IsSimpleKeyAllowed = false;
1315 
1316   return true;
1317 }
1318 
1319 bool Scanner::scanPlainScalar() {
1320   StringRef::iterator Start = Current;
1321   unsigned ColStart = Column;
1322   unsigned LeadingBlanks = 0;
1323   assert(Indent >= -1 && "Indent must be >= -1 !");
1324   unsigned indent = static_cast<unsigned>(Indent + 1);
1325   while (true) {
1326     if (*Current == '#')
1327       break;
1328 
1329     while (!isBlankOrBreak(Current)) {
1330       if (  FlowLevel && *Current == ':'
1331           && !(isBlankOrBreak(Current + 1) || *(Current + 1) == ',')) {
1332         setError("Found unexpected ':' while scanning a plain scalar", Current);
1333         return false;
1334       }
1335 
1336       // Check for the end of the plain scalar.
1337       if (  (*Current == ':' && isBlankOrBreak(Current + 1))
1338           || (  FlowLevel
1339           && (StringRef(Current, 1).find_first_of(",:?[]{}")
1340               != StringRef::npos)))
1341         break;
1342 
1343       StringRef::iterator i = skip_nb_char(Current);
1344       if (i == Current)
1345         break;
1346       Current = i;
1347       ++Column;
1348     }
1349 
1350     // Are we at the end?
1351     if (!isBlankOrBreak(Current))
1352       break;
1353 
1354     // Eat blanks.
1355     StringRef::iterator Tmp = Current;
1356     while (isBlankOrBreak(Tmp)) {
1357       StringRef::iterator i = skip_s_white(Tmp);
1358       if (i != Tmp) {
1359         if (LeadingBlanks && (Column < indent) && *Tmp == '\t') {
1360           setError("Found invalid tab character in indentation", Tmp);
1361           return false;
1362         }
1363         Tmp = i;
1364         ++Column;
1365       } else {
1366         i = skip_b_break(Tmp);
1367         if (!LeadingBlanks)
1368           LeadingBlanks = 1;
1369         Tmp = i;
1370         Column = 0;
1371         ++Line;
1372       }
1373     }
1374 
1375     if (!FlowLevel && Column < indent)
1376       break;
1377 
1378     Current = Tmp;
1379   }
1380   if (Start == Current) {
1381     setError("Got empty plain scalar", Start);
1382     return false;
1383   }
1384   Token T;
1385   T.Kind = Token::TK_Scalar;
1386   T.Range = StringRef(Start, Current - Start);
1387   TokenQueue.push_back(T);
1388 
1389   // Plain scalars can be simple keys.
1390   saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);
1391 
1392   IsSimpleKeyAllowed = false;
1393 
1394   return true;
1395 }
1396 
1397 bool Scanner::scanAliasOrAnchor(bool IsAlias) {
1398   StringRef::iterator Start = Current;
1399   unsigned ColStart = Column;
1400   skip(1);
1401   while(true) {
1402     if (   *Current == '[' || *Current == ']'
1403         || *Current == '{' || *Current == '}'
1404         || *Current == ','
1405         || *Current == ':')
1406       break;
1407     StringRef::iterator i = skip_ns_char(Current);
1408     if (i == Current)
1409       break;
1410     Current = i;
1411     ++Column;
1412   }
1413 
1414   if (Start == Current) {
1415     setError("Got empty alias or anchor", Start);
1416     return false;
1417   }
1418 
1419   Token T;
1420   T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor;
1421   T.Range = StringRef(Start, Current - Start);
1422   TokenQueue.push_back(T);
1423 
1424   // Alias and anchors can be simple keys.
1425   saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);
1426 
1427   IsSimpleKeyAllowed = false;
1428 
1429   return true;
1430 }
1431 
1432 char Scanner::scanBlockChompingIndicator() {
1433   char Indicator = ' ';
1434   if (Current != End && (*Current == '+' || *Current == '-')) {
1435     Indicator = *Current;
1436     skip(1);
1437   }
1438   return Indicator;
1439 }
1440 
1441 /// Get the number of line breaks after chomping.
1442 ///
1443 /// Return the number of trailing line breaks to emit, depending on
1444 /// \p ChompingIndicator.
1445 static unsigned getChompedLineBreaks(char ChompingIndicator,
1446                                      unsigned LineBreaks, StringRef Str) {
1447   if (ChompingIndicator == '-') // Strip all line breaks.
1448     return 0;
1449   if (ChompingIndicator == '+') // Keep all line breaks.
1450     return LineBreaks;
1451   // Clip trailing lines.
1452   return Str.empty() ? 0 : 1;
1453 }
1454 
1455 unsigned Scanner::scanBlockIndentationIndicator() {
1456   unsigned Indent = 0;
1457   if (Current != End && (*Current >= '1' && *Current <= '9')) {
1458     Indent = unsigned(*Current - '0');
1459     skip(1);
1460   }
1461   return Indent;
1462 }
1463 
1464 bool Scanner::scanBlockScalarHeader(char &ChompingIndicator,
1465                                     unsigned &IndentIndicator, bool &IsDone) {
1466   auto Start = Current;
1467 
1468   ChompingIndicator = scanBlockChompingIndicator();
1469   IndentIndicator = scanBlockIndentationIndicator();
1470   // Check for the chomping indicator once again.
1471   if (ChompingIndicator == ' ')
1472     ChompingIndicator = scanBlockChompingIndicator();
1473   Current = skip_while(&Scanner::skip_s_white, Current);
1474   skipComment();
1475 
1476   if (Current == End) { // EOF, we have an empty scalar.
1477     Token T;
1478     T.Kind = Token::TK_BlockScalar;
1479     T.Range = StringRef(Start, Current - Start);
1480     TokenQueue.push_back(T);
1481     IsDone = true;
1482     return true;
1483   }
1484 
1485   if (!consumeLineBreakIfPresent()) {
1486     setError("Expected a line break after block scalar header", Current);
1487     return false;
1488   }
1489   return true;
1490 }
1491 
1492 bool Scanner::findBlockScalarIndent(unsigned &BlockIndent,
1493                                     unsigned BlockExitIndent,
1494                                     unsigned &LineBreaks, bool &IsDone) {
1495   unsigned MaxAllSpaceLineCharacters = 0;
1496   StringRef::iterator LongestAllSpaceLine;
1497 
1498   while (true) {
1499     advanceWhile(&Scanner::skip_s_space);
1500     if (skip_nb_char(Current) != Current) {
1501       // This line isn't empty, so try and find the indentation.
1502       if (Column <= BlockExitIndent) { // End of the block literal.
1503         IsDone = true;
1504         return true;
1505       }
1506       // We found the block's indentation.
1507       BlockIndent = Column;
1508       if (MaxAllSpaceLineCharacters > BlockIndent) {
1509         setError(
1510             "Leading all-spaces line must be smaller than the block indent",
1511             LongestAllSpaceLine);
1512         return false;
1513       }
1514       return true;
1515     }
1516     if (skip_b_break(Current) != Current &&
1517         Column > MaxAllSpaceLineCharacters) {
1518       // Record the longest all-space line in case it's longer than the
1519       // discovered block indent.
1520       MaxAllSpaceLineCharacters = Column;
1521       LongestAllSpaceLine = Current;
1522     }
1523 
1524     // Check for EOF.
1525     if (Current == End) {
1526       IsDone = true;
1527       return true;
1528     }
1529 
1530     if (!consumeLineBreakIfPresent()) {
1531       IsDone = true;
1532       return true;
1533     }
1534     ++LineBreaks;
1535   }
1536   return true;
1537 }
1538 
1539 bool Scanner::scanBlockScalarIndent(unsigned BlockIndent,
1540                                     unsigned BlockExitIndent, bool &IsDone) {
1541   // Skip the indentation.
1542   while (Column < BlockIndent) {
1543     auto I = skip_s_space(Current);
1544     if (I == Current)
1545       break;
1546     Current = I;
1547     ++Column;
1548   }
1549 
1550   if (skip_nb_char(Current) == Current)
1551     return true;
1552 
1553   if (Column <= BlockExitIndent) { // End of the block literal.
1554     IsDone = true;
1555     return true;
1556   }
1557 
1558   if (Column < BlockIndent) {
1559     if (Current != End && *Current == '#') { // Trailing comment.
1560       IsDone = true;
1561       return true;
1562     }
1563     setError("A text line is less indented than the block scalar", Current);
1564     return false;
1565   }
1566   return true; // A normal text line.
1567 }
1568 
1569 bool Scanner::scanBlockScalar(bool IsLiteral) {
1570   // Eat '|' or '>'
1571   assert(*Current == '|' || *Current == '>');
1572   skip(1);
1573 
1574   char ChompingIndicator;
1575   unsigned BlockIndent;
1576   bool IsDone = false;
1577   if (!scanBlockScalarHeader(ChompingIndicator, BlockIndent, IsDone))
1578     return false;
1579   if (IsDone)
1580     return true;
1581 
1582   auto Start = Current;
1583   unsigned BlockExitIndent = Indent < 0 ? 0 : (unsigned)Indent;
1584   unsigned LineBreaks = 0;
1585   if (BlockIndent == 0) {
1586     if (!findBlockScalarIndent(BlockIndent, BlockExitIndent, LineBreaks,
1587                                IsDone))
1588       return false;
1589   }
1590 
1591   // Scan the block's scalars body.
1592   SmallString<256> Str;
1593   while (!IsDone) {
1594     if (!scanBlockScalarIndent(BlockIndent, BlockExitIndent, IsDone))
1595       return false;
1596     if (IsDone)
1597       break;
1598 
1599     // Parse the current line.
1600     auto LineStart = Current;
1601     advanceWhile(&Scanner::skip_nb_char);
1602     if (LineStart != Current) {
1603       Str.append(LineBreaks, '\n');
1604       Str.append(StringRef(LineStart, Current - LineStart));
1605       LineBreaks = 0;
1606     }
1607 
1608     // Check for EOF.
1609     if (Current == End)
1610       break;
1611 
1612     if (!consumeLineBreakIfPresent())
1613       break;
1614     ++LineBreaks;
1615   }
1616 
1617   if (Current == End && !LineBreaks)
1618     // Ensure that there is at least one line break before the end of file.
1619     LineBreaks = 1;
1620   Str.append(getChompedLineBreaks(ChompingIndicator, LineBreaks, Str), '\n');
1621 
1622   // New lines may start a simple key.
1623   if (!FlowLevel)
1624     IsSimpleKeyAllowed = true;
1625 
1626   Token T;
1627   T.Kind = Token::TK_BlockScalar;
1628   T.Range = StringRef(Start, Current - Start);
1629   T.Value = Str.str().str();
1630   TokenQueue.push_back(T);
1631   return true;
1632 }
1633 
1634 bool Scanner::scanTag() {
1635   StringRef::iterator Start = Current;
1636   unsigned ColStart = Column;
1637   skip(1); // Eat !.
1638   if (Current == End || isBlankOrBreak(Current)); // An empty tag.
1639   else if (*Current == '<') {
1640     skip(1);
1641     scan_ns_uri_char();
1642     if (!consume('>'))
1643       return false;
1644   } else {
1645     // FIXME: Actually parse the c-ns-shorthand-tag rule.
1646     Current = skip_while(&Scanner::skip_ns_char, Current);
1647   }
1648 
1649   Token T;
1650   T.Kind = Token::TK_Tag;
1651   T.Range = StringRef(Start, Current - Start);
1652   TokenQueue.push_back(T);
1653 
1654   // Tags can be simple keys.
1655   saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);
1656 
1657   IsSimpleKeyAllowed = false;
1658 
1659   return true;
1660 }
1661 
1662 bool Scanner::fetchMoreTokens() {
1663   if (IsStartOfStream)
1664     return scanStreamStart();
1665 
1666   scanToNextToken();
1667 
1668   if (Current == End)
1669     return scanStreamEnd();
1670 
1671   removeStaleSimpleKeyCandidates();
1672 
1673   unrollIndent(Column);
1674 
1675   if (Column == 0 && *Current == '%')
1676     return scanDirective();
1677 
1678   if (Column == 0 && Current + 4 <= End
1679       && *Current == '-'
1680       && *(Current + 1) == '-'
1681       && *(Current + 2) == '-'
1682       && (Current + 3 == End || isBlankOrBreak(Current + 3)))
1683     return scanDocumentIndicator(true);
1684 
1685   if (Column == 0 && Current + 4 <= End
1686       && *Current == '.'
1687       && *(Current + 1) == '.'
1688       && *(Current + 2) == '.'
1689       && (Current + 3 == End || isBlankOrBreak(Current + 3)))
1690     return scanDocumentIndicator(false);
1691 
1692   if (*Current == '[')
1693     return scanFlowCollectionStart(true);
1694 
1695   if (*Current == '{')
1696     return scanFlowCollectionStart(false);
1697 
1698   if (*Current == ']')
1699     return scanFlowCollectionEnd(true);
1700 
1701   if (*Current == '}')
1702     return scanFlowCollectionEnd(false);
1703 
1704   if (*Current == ',')
1705     return scanFlowEntry();
1706 
1707   if (*Current == '-' && isBlankOrBreak(Current + 1))
1708     return scanBlockEntry();
1709 
1710   if (*Current == '?' && (FlowLevel || isBlankOrBreak(Current + 1)))
1711     return scanKey();
1712 
1713   if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1)))
1714     return scanValue();
1715 
1716   if (*Current == '*')
1717     return scanAliasOrAnchor(true);
1718 
1719   if (*Current == '&')
1720     return scanAliasOrAnchor(false);
1721 
1722   if (*Current == '!')
1723     return scanTag();
1724 
1725   if (*Current == '|' && !FlowLevel)
1726     return scanBlockScalar(true);
1727 
1728   if (*Current == '>' && !FlowLevel)
1729     return scanBlockScalar(false);
1730 
1731   if (*Current == '\'')
1732     return scanFlowScalar(false);
1733 
1734   if (*Current == '"')
1735     return scanFlowScalar(true);
1736 
1737   // Get a plain scalar.
1738   StringRef FirstChar(Current, 1);
1739   if (!(isBlankOrBreak(Current)
1740         || FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos)
1741       || (*Current == '-' && !isBlankOrBreak(Current + 1))
1742       || (!FlowLevel && (*Current == '?' || *Current == ':')
1743           && isBlankOrBreak(Current + 1))
1744       || (!FlowLevel && *Current == ':'
1745                       && Current + 2 < End
1746                       && *(Current + 1) == ':'
1747                       && !isBlankOrBreak(Current + 2)))
1748     return scanPlainScalar();
1749 
1750   setError("Unrecognized character while tokenizing.");
1751   return false;
1752 }
1753 
1754 Stream::Stream(StringRef Input, SourceMgr &SM, bool ShowColors)
1755     : scanner(new Scanner(Input, SM, ShowColors)), CurrentDoc() {}
1756 
1757 Stream::Stream(MemoryBufferRef InputBuffer, SourceMgr &SM, bool ShowColors)
1758     : scanner(new Scanner(InputBuffer, SM, ShowColors)), CurrentDoc() {}
1759 
1760 Stream::~Stream() {}
1761 
1762 bool Stream::failed() { return scanner->failed(); }
1763 
1764 void Stream::printError(Node *N, const Twine &Msg) {
1765   scanner->printError( N->getSourceRange().Start
1766                      , SourceMgr::DK_Error
1767                      , Msg
1768                      , N->getSourceRange());
1769 }
1770 
1771 document_iterator Stream::begin() {
1772   if (CurrentDoc)
1773     report_fatal_error("Can only iterate over the stream once");
1774 
1775   // Skip Stream-Start.
1776   scanner->getNext();
1777 
1778   CurrentDoc.reset(new Document(*this));
1779   return document_iterator(CurrentDoc);
1780 }
1781 
1782 document_iterator Stream::end() {
1783   return document_iterator();
1784 }
1785 
1786 void Stream::skip() {
1787   for (document_iterator i = begin(), e = end(); i != e; ++i)
1788     i->skip();
1789 }
1790 
1791 Node::Node(unsigned int Type, std::unique_ptr<Document> &D, StringRef A,
1792            StringRef T)
1793     : Doc(D), TypeID(Type), Anchor(A), Tag(T) {
1794   SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin());
1795   SourceRange = SMRange(Start, Start);
1796 }
1797 
1798 std::string Node::getVerbatimTag() const {
1799   StringRef Raw = getRawTag();
1800   if (!Raw.empty() && Raw != "!") {
1801     std::string Ret;
1802     if (Raw.find_last_of('!') == 0) {
1803       Ret = Doc->getTagMap().find("!")->second;
1804       Ret += Raw.substr(1);
1805       return Ret;
1806     } else if (Raw.startswith("!!")) {
1807       Ret = Doc->getTagMap().find("!!")->second;
1808       Ret += Raw.substr(2);
1809       return Ret;
1810     } else {
1811       StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1);
1812       std::map<StringRef, StringRef>::const_iterator It =
1813           Doc->getTagMap().find(TagHandle);
1814       if (It != Doc->getTagMap().end())
1815         Ret = It->second;
1816       else {
1817         Token T;
1818         T.Kind = Token::TK_Tag;
1819         T.Range = TagHandle;
1820         setError(Twine("Unknown tag handle ") + TagHandle, T);
1821       }
1822       Ret += Raw.substr(Raw.find_last_of('!') + 1);
1823       return Ret;
1824     }
1825   }
1826 
1827   switch (getType()) {
1828   case NK_Null:
1829     return "tag:yaml.org,2002:null";
1830   case NK_Scalar:
1831   case NK_BlockScalar:
1832     // TODO: Tag resolution.
1833     return "tag:yaml.org,2002:str";
1834   case NK_Mapping:
1835     return "tag:yaml.org,2002:map";
1836   case NK_Sequence:
1837     return "tag:yaml.org,2002:seq";
1838   }
1839 
1840   return "";
1841 }
1842 
1843 Token &Node::peekNext() {
1844   return Doc->peekNext();
1845 }
1846 
1847 Token Node::getNext() {
1848   return Doc->getNext();
1849 }
1850 
1851 Node *Node::parseBlockNode() {
1852   return Doc->parseBlockNode();
1853 }
1854 
1855 BumpPtrAllocator &Node::getAllocator() {
1856   return Doc->NodeAllocator;
1857 }
1858 
1859 void Node::setError(const Twine &Msg, Token &Tok) const {
1860   Doc->setError(Msg, Tok);
1861 }
1862 
1863 bool Node::failed() const {
1864   return Doc->failed();
1865 }
1866 
1867 
1868 
1869 StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const {
1870   // TODO: Handle newlines properly. We need to remove leading whitespace.
1871   if (Value[0] == '"') { // Double quoted.
1872     // Pull off the leading and trailing "s.
1873     StringRef UnquotedValue = Value.substr(1, Value.size() - 2);
1874     // Search for characters that would require unescaping the value.
1875     StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n");
1876     if (i != StringRef::npos)
1877       return unescapeDoubleQuoted(UnquotedValue, i, Storage);
1878     return UnquotedValue;
1879   } else if (Value[0] == '\'') { // Single quoted.
1880     // Pull off the leading and trailing 's.
1881     StringRef UnquotedValue = Value.substr(1, Value.size() - 2);
1882     StringRef::size_type i = UnquotedValue.find('\'');
1883     if (i != StringRef::npos) {
1884       // We're going to need Storage.
1885       Storage.clear();
1886       Storage.reserve(UnquotedValue.size());
1887       for (; i != StringRef::npos; i = UnquotedValue.find('\'')) {
1888         StringRef Valid(UnquotedValue.begin(), i);
1889         Storage.insert(Storage.end(), Valid.begin(), Valid.end());
1890         Storage.push_back('\'');
1891         UnquotedValue = UnquotedValue.substr(i + 2);
1892       }
1893       Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end());
1894       return StringRef(Storage.begin(), Storage.size());
1895     }
1896     return UnquotedValue;
1897   }
1898   // Plain or block.
1899   return Value.rtrim(' ');
1900 }
1901 
1902 StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue
1903                                           , StringRef::size_type i
1904                                           , SmallVectorImpl<char> &Storage)
1905                                           const {
1906   // Use Storage to build proper value.
1907   Storage.clear();
1908   Storage.reserve(UnquotedValue.size());
1909   for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) {
1910     // Insert all previous chars into Storage.
1911     StringRef Valid(UnquotedValue.begin(), i);
1912     Storage.insert(Storage.end(), Valid.begin(), Valid.end());
1913     // Chop off inserted chars.
1914     UnquotedValue = UnquotedValue.substr(i);
1915 
1916     assert(!UnquotedValue.empty() && "Can't be empty!");
1917 
1918     // Parse escape or line break.
1919     switch (UnquotedValue[0]) {
1920     case '\r':
1921     case '\n':
1922       Storage.push_back('\n');
1923       if (   UnquotedValue.size() > 1
1924           && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n'))
1925         UnquotedValue = UnquotedValue.substr(1);
1926       UnquotedValue = UnquotedValue.substr(1);
1927       break;
1928     default:
1929       if (UnquotedValue.size() == 1)
1930         // TODO: Report error.
1931         break;
1932       UnquotedValue = UnquotedValue.substr(1);
1933       switch (UnquotedValue[0]) {
1934       default: {
1935           Token T;
1936           T.Range = StringRef(UnquotedValue.begin(), 1);
1937           setError("Unrecognized escape code!", T);
1938           return "";
1939         }
1940       case '\r':
1941       case '\n':
1942         // Remove the new line.
1943         if (   UnquotedValue.size() > 1
1944             && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n'))
1945           UnquotedValue = UnquotedValue.substr(1);
1946         // If this was just a single byte newline, it will get skipped
1947         // below.
1948         break;
1949       case '0':
1950         Storage.push_back(0x00);
1951         break;
1952       case 'a':
1953         Storage.push_back(0x07);
1954         break;
1955       case 'b':
1956         Storage.push_back(0x08);
1957         break;
1958       case 't':
1959       case 0x09:
1960         Storage.push_back(0x09);
1961         break;
1962       case 'n':
1963         Storage.push_back(0x0A);
1964         break;
1965       case 'v':
1966         Storage.push_back(0x0B);
1967         break;
1968       case 'f':
1969         Storage.push_back(0x0C);
1970         break;
1971       case 'r':
1972         Storage.push_back(0x0D);
1973         break;
1974       case 'e':
1975         Storage.push_back(0x1B);
1976         break;
1977       case ' ':
1978         Storage.push_back(0x20);
1979         break;
1980       case '"':
1981         Storage.push_back(0x22);
1982         break;
1983       case '/':
1984         Storage.push_back(0x2F);
1985         break;
1986       case '\\':
1987         Storage.push_back(0x5C);
1988         break;
1989       case 'N':
1990         encodeUTF8(0x85, Storage);
1991         break;
1992       case '_':
1993         encodeUTF8(0xA0, Storage);
1994         break;
1995       case 'L':
1996         encodeUTF8(0x2028, Storage);
1997         break;
1998       case 'P':
1999         encodeUTF8(0x2029, Storage);
2000         break;
2001       case 'x': {
2002           if (UnquotedValue.size() < 3)
2003             // TODO: Report error.
2004             break;
2005           unsigned int UnicodeScalarValue;
2006           if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue))
2007             // TODO: Report error.
2008             UnicodeScalarValue = 0xFFFD;
2009           encodeUTF8(UnicodeScalarValue, Storage);
2010           UnquotedValue = UnquotedValue.substr(2);
2011           break;
2012         }
2013       case 'u': {
2014           if (UnquotedValue.size() < 5)
2015             // TODO: Report error.
2016             break;
2017           unsigned int UnicodeScalarValue;
2018           if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue))
2019             // TODO: Report error.
2020             UnicodeScalarValue = 0xFFFD;
2021           encodeUTF8(UnicodeScalarValue, Storage);
2022           UnquotedValue = UnquotedValue.substr(4);
2023           break;
2024         }
2025       case 'U': {
2026           if (UnquotedValue.size() < 9)
2027             // TODO: Report error.
2028             break;
2029           unsigned int UnicodeScalarValue;
2030           if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue))
2031             // TODO: Report error.
2032             UnicodeScalarValue = 0xFFFD;
2033           encodeUTF8(UnicodeScalarValue, Storage);
2034           UnquotedValue = UnquotedValue.substr(8);
2035           break;
2036         }
2037       }
2038       UnquotedValue = UnquotedValue.substr(1);
2039     }
2040   }
2041   Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end());
2042   return StringRef(Storage.begin(), Storage.size());
2043 }
2044 
2045 Node *KeyValueNode::getKey() {
2046   if (Key)
2047     return Key;
2048   // Handle implicit null keys.
2049   {
2050     Token &t = peekNext();
2051     if (   t.Kind == Token::TK_BlockEnd
2052         || t.Kind == Token::TK_Value
2053         || t.Kind == Token::TK_Error) {
2054       return Key = new (getAllocator()) NullNode(Doc);
2055     }
2056     if (t.Kind == Token::TK_Key)
2057       getNext(); // skip TK_Key.
2058   }
2059 
2060   // Handle explicit null keys.
2061   Token &t = peekNext();
2062   if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) {
2063     return Key = new (getAllocator()) NullNode(Doc);
2064   }
2065 
2066   // We've got a normal key.
2067   return Key = parseBlockNode();
2068 }
2069 
2070 Node *KeyValueNode::getValue() {
2071   if (Value)
2072     return Value;
2073   getKey()->skip();
2074   if (failed())
2075     return Value = new (getAllocator()) NullNode(Doc);
2076 
2077   // Handle implicit null values.
2078   {
2079     Token &t = peekNext();
2080     if (   t.Kind == Token::TK_BlockEnd
2081         || t.Kind == Token::TK_FlowMappingEnd
2082         || t.Kind == Token::TK_Key
2083         || t.Kind == Token::TK_FlowEntry
2084         || t.Kind == Token::TK_Error) {
2085       return Value = new (getAllocator()) NullNode(Doc);
2086     }
2087 
2088     if (t.Kind != Token::TK_Value) {
2089       setError("Unexpected token in Key Value.", t);
2090       return Value = new (getAllocator()) NullNode(Doc);
2091     }
2092     getNext(); // skip TK_Value.
2093   }
2094 
2095   // Handle explicit null values.
2096   Token &t = peekNext();
2097   if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) {
2098     return Value = new (getAllocator()) NullNode(Doc);
2099   }
2100 
2101   // We got a normal value.
2102   return Value = parseBlockNode();
2103 }
2104 
2105 void MappingNode::increment() {
2106   if (failed()) {
2107     IsAtEnd = true;
2108     CurrentEntry = nullptr;
2109     return;
2110   }
2111   if (CurrentEntry) {
2112     CurrentEntry->skip();
2113     if (Type == MT_Inline) {
2114       IsAtEnd = true;
2115       CurrentEntry = nullptr;
2116       return;
2117     }
2118   }
2119   Token T = peekNext();
2120   if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) {
2121     // KeyValueNode eats the TK_Key. That way it can detect null keys.
2122     CurrentEntry = new (getAllocator()) KeyValueNode(Doc);
2123   } else if (Type == MT_Block) {
2124     switch (T.Kind) {
2125     case Token::TK_BlockEnd:
2126       getNext();
2127       IsAtEnd = true;
2128       CurrentEntry = nullptr;
2129       break;
2130     default:
2131       setError("Unexpected token. Expected Key or Block End", T);
2132     case Token::TK_Error:
2133       IsAtEnd = true;
2134       CurrentEntry = nullptr;
2135     }
2136   } else {
2137     switch (T.Kind) {
2138     case Token::TK_FlowEntry:
2139       // Eat the flow entry and recurse.
2140       getNext();
2141       return increment();
2142     case Token::TK_FlowMappingEnd:
2143       getNext();
2144     case Token::TK_Error:
2145       // Set this to end iterator.
2146       IsAtEnd = true;
2147       CurrentEntry = nullptr;
2148       break;
2149     default:
2150       setError( "Unexpected token. Expected Key, Flow Entry, or Flow "
2151                 "Mapping End."
2152               , T);
2153       IsAtEnd = true;
2154       CurrentEntry = nullptr;
2155     }
2156   }
2157 }
2158 
2159 void SequenceNode::increment() {
2160   if (failed()) {
2161     IsAtEnd = true;
2162     CurrentEntry = nullptr;
2163     return;
2164   }
2165   if (CurrentEntry)
2166     CurrentEntry->skip();
2167   Token T = peekNext();
2168   if (SeqType == ST_Block) {
2169     switch (T.Kind) {
2170     case Token::TK_BlockEntry:
2171       getNext();
2172       CurrentEntry = parseBlockNode();
2173       if (!CurrentEntry) { // An error occurred.
2174         IsAtEnd = true;
2175         CurrentEntry = nullptr;
2176       }
2177       break;
2178     case Token::TK_BlockEnd:
2179       getNext();
2180       IsAtEnd = true;
2181       CurrentEntry = nullptr;
2182       break;
2183     default:
2184       setError( "Unexpected token. Expected Block Entry or Block End."
2185               , T);
2186     case Token::TK_Error:
2187       IsAtEnd = true;
2188       CurrentEntry = nullptr;
2189     }
2190   } else if (SeqType == ST_Indentless) {
2191     switch (T.Kind) {
2192     case Token::TK_BlockEntry:
2193       getNext();
2194       CurrentEntry = parseBlockNode();
2195       if (!CurrentEntry) { // An error occurred.
2196         IsAtEnd = true;
2197         CurrentEntry = nullptr;
2198       }
2199       break;
2200     default:
2201     case Token::TK_Error:
2202       IsAtEnd = true;
2203       CurrentEntry = nullptr;
2204     }
2205   } else if (SeqType == ST_Flow) {
2206     switch (T.Kind) {
2207     case Token::TK_FlowEntry:
2208       // Eat the flow entry and recurse.
2209       getNext();
2210       WasPreviousTokenFlowEntry = true;
2211       return increment();
2212     case Token::TK_FlowSequenceEnd:
2213       getNext();
2214     case Token::TK_Error:
2215       // Set this to end iterator.
2216       IsAtEnd = true;
2217       CurrentEntry = nullptr;
2218       break;
2219     case Token::TK_StreamEnd:
2220     case Token::TK_DocumentEnd:
2221     case Token::TK_DocumentStart:
2222       setError("Could not find closing ]!", T);
2223       // Set this to end iterator.
2224       IsAtEnd = true;
2225       CurrentEntry = nullptr;
2226       break;
2227     default:
2228       if (!WasPreviousTokenFlowEntry) {
2229         setError("Expected , between entries!", T);
2230         IsAtEnd = true;
2231         CurrentEntry = nullptr;
2232         break;
2233       }
2234       // Otherwise it must be a flow entry.
2235       CurrentEntry = parseBlockNode();
2236       if (!CurrentEntry) {
2237         IsAtEnd = true;
2238       }
2239       WasPreviousTokenFlowEntry = false;
2240       break;
2241     }
2242   }
2243 }
2244 
2245 Document::Document(Stream &S) : stream(S), Root(nullptr) {
2246   // Tag maps starts with two default mappings.
2247   TagMap["!"] = "!";
2248   TagMap["!!"] = "tag:yaml.org,2002:";
2249 
2250   if (parseDirectives())
2251     expectToken(Token::TK_DocumentStart);
2252   Token &T = peekNext();
2253   if (T.Kind == Token::TK_DocumentStart)
2254     getNext();
2255 }
2256 
2257 bool Document::skip()  {
2258   if (stream.scanner->failed())
2259     return false;
2260   if (!Root)
2261     getRoot();
2262   Root->skip();
2263   Token &T = peekNext();
2264   if (T.Kind == Token::TK_StreamEnd)
2265     return false;
2266   if (T.Kind == Token::TK_DocumentEnd) {
2267     getNext();
2268     return skip();
2269   }
2270   return true;
2271 }
2272 
2273 Token &Document::peekNext() {
2274   return stream.scanner->peekNext();
2275 }
2276 
2277 Token Document::getNext() {
2278   return stream.scanner->getNext();
2279 }
2280 
2281 void Document::setError(const Twine &Message, Token &Location) const {
2282   stream.scanner->setError(Message, Location.Range.begin());
2283 }
2284 
2285 bool Document::failed() const {
2286   return stream.scanner->failed();
2287 }
2288 
2289 Node *Document::parseBlockNode() {
2290   Token T = peekNext();
2291   // Handle properties.
2292   Token AnchorInfo;
2293   Token TagInfo;
2294 parse_property:
2295   switch (T.Kind) {
2296   case Token::TK_Alias:
2297     getNext();
2298     return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1));
2299   case Token::TK_Anchor:
2300     if (AnchorInfo.Kind == Token::TK_Anchor) {
2301       setError("Already encountered an anchor for this node!", T);
2302       return nullptr;
2303     }
2304     AnchorInfo = getNext(); // Consume TK_Anchor.
2305     T = peekNext();
2306     goto parse_property;
2307   case Token::TK_Tag:
2308     if (TagInfo.Kind == Token::TK_Tag) {
2309       setError("Already encountered a tag for this node!", T);
2310       return nullptr;
2311     }
2312     TagInfo = getNext(); // Consume TK_Tag.
2313     T = peekNext();
2314     goto parse_property;
2315   default:
2316     break;
2317   }
2318 
2319   switch (T.Kind) {
2320   case Token::TK_BlockEntry:
2321     // We got an unindented BlockEntry sequence. This is not terminated with
2322     // a BlockEnd.
2323     // Don't eat the TK_BlockEntry, SequenceNode needs it.
2324     return new (NodeAllocator) SequenceNode( stream.CurrentDoc
2325                                            , AnchorInfo.Range.substr(1)
2326                                            , TagInfo.Range
2327                                            , SequenceNode::ST_Indentless);
2328   case Token::TK_BlockSequenceStart:
2329     getNext();
2330     return new (NodeAllocator)
2331       SequenceNode( stream.CurrentDoc
2332                   , AnchorInfo.Range.substr(1)
2333                   , TagInfo.Range
2334                   , SequenceNode::ST_Block);
2335   case Token::TK_BlockMappingStart:
2336     getNext();
2337     return new (NodeAllocator)
2338       MappingNode( stream.CurrentDoc
2339                  , AnchorInfo.Range.substr(1)
2340                  , TagInfo.Range
2341                  , MappingNode::MT_Block);
2342   case Token::TK_FlowSequenceStart:
2343     getNext();
2344     return new (NodeAllocator)
2345       SequenceNode( stream.CurrentDoc
2346                   , AnchorInfo.Range.substr(1)
2347                   , TagInfo.Range
2348                   , SequenceNode::ST_Flow);
2349   case Token::TK_FlowMappingStart:
2350     getNext();
2351     return new (NodeAllocator)
2352       MappingNode( stream.CurrentDoc
2353                  , AnchorInfo.Range.substr(1)
2354                  , TagInfo.Range
2355                  , MappingNode::MT_Flow);
2356   case Token::TK_Scalar:
2357     getNext();
2358     return new (NodeAllocator)
2359       ScalarNode( stream.CurrentDoc
2360                 , AnchorInfo.Range.substr(1)
2361                 , TagInfo.Range
2362                 , T.Range);
2363   case Token::TK_BlockScalar: {
2364     getNext();
2365     StringRef NullTerminatedStr(T.Value.c_str(), T.Value.length() + 1);
2366     StringRef StrCopy = NullTerminatedStr.copy(NodeAllocator).drop_back();
2367     return new (NodeAllocator)
2368         BlockScalarNode(stream.CurrentDoc, AnchorInfo.Range.substr(1),
2369                         TagInfo.Range, StrCopy, T.Range);
2370   }
2371   case Token::TK_Key:
2372     // Don't eat the TK_Key, KeyValueNode expects it.
2373     return new (NodeAllocator)
2374       MappingNode( stream.CurrentDoc
2375                  , AnchorInfo.Range.substr(1)
2376                  , TagInfo.Range
2377                  , MappingNode::MT_Inline);
2378   case Token::TK_DocumentStart:
2379   case Token::TK_DocumentEnd:
2380   case Token::TK_StreamEnd:
2381   default:
2382     // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not
2383     //       !!null null.
2384     return new (NodeAllocator) NullNode(stream.CurrentDoc);
2385   case Token::TK_Error:
2386     return nullptr;
2387   }
2388   llvm_unreachable("Control flow shouldn't reach here.");
2389   return nullptr;
2390 }
2391 
2392 bool Document::parseDirectives() {
2393   bool isDirective = false;
2394   while (true) {
2395     Token T = peekNext();
2396     if (T.Kind == Token::TK_TagDirective) {
2397       parseTAGDirective();
2398       isDirective = true;
2399     } else if (T.Kind == Token::TK_VersionDirective) {
2400       parseYAMLDirective();
2401       isDirective = true;
2402     } else
2403       break;
2404   }
2405   return isDirective;
2406 }
2407 
2408 void Document::parseYAMLDirective() {
2409   getNext(); // Eat %YAML <version>
2410 }
2411 
2412 void Document::parseTAGDirective() {
2413   Token Tag = getNext(); // %TAG <handle> <prefix>
2414   StringRef T = Tag.Range;
2415   // Strip %TAG
2416   T = T.substr(T.find_first_of(" \t")).ltrim(" \t");
2417   std::size_t HandleEnd = T.find_first_of(" \t");
2418   StringRef TagHandle = T.substr(0, HandleEnd);
2419   StringRef TagPrefix = T.substr(HandleEnd).ltrim(" \t");
2420   TagMap[TagHandle] = TagPrefix;
2421 }
2422 
2423 bool Document::expectToken(int TK) {
2424   Token T = getNext();
2425   if (T.Kind != TK) {
2426     setError("Unexpected token", T);
2427     return false;
2428   }
2429   return true;
2430 }
2431