xref: /llvm-project-15.0.7/clang/lib/Lex/Lexer.cpp (revision 80bb52ae)
1 //===--- Lexer.cpp - C Language Family Lexer ------------------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 //  This file implements the Lexer and Token interfaces.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "clang/Lex/Lexer.h"
15 #include "UnicodeCharSets.h"
16 #include "clang/Basic/CharInfo.h"
17 #include "clang/Basic/IdentifierTable.h"
18 #include "clang/Basic/SourceManager.h"
19 #include "clang/Lex/LexDiagnostic.h"
20 #include "clang/Lex/LiteralSupport.h"
21 #include "clang/Lex/Preprocessor.h"
22 #include "clang/Lex/PreprocessorOptions.h"
23 #include "llvm/ADT/StringExtras.h"
24 #include "llvm/ADT/StringSwitch.h"
25 #include "llvm/Support/Compiler.h"
26 #include "llvm/Support/ConvertUTF.h"
27 #include "llvm/Support/MathExtras.h"
28 #include "llvm/Support/MemoryBuffer.h"
29 #include "llvm/Support/UnicodeCharRanges.h"
30 #include <algorithm>
31 #include <cassert>
32 #include <cstddef>
33 #include <cstdint>
34 #include <cstring>
35 #include <string>
36 #include <tuple>
37 #include <utility>
38 
39 using namespace clang;
40 
41 //===----------------------------------------------------------------------===//
42 // Token Class Implementation
43 //===----------------------------------------------------------------------===//
44 
45 /// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
46 bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const {
47   if (isAnnotation())
48     return false;
49   if (IdentifierInfo *II = getIdentifierInfo())
50     return II->getObjCKeywordID() == objcKey;
51   return false;
52 }
53 
54 /// getObjCKeywordID - Return the ObjC keyword kind.
55 tok::ObjCKeywordKind Token::getObjCKeywordID() const {
56   if (isAnnotation())
57     return tok::objc_not_keyword;
58   IdentifierInfo *specId = getIdentifierInfo();
59   return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;
60 }
61 
62 //===----------------------------------------------------------------------===//
63 // Lexer Class Implementation
64 //===----------------------------------------------------------------------===//
65 
66 void Lexer::anchor() { }
67 
68 void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
69                       const char *BufEnd) {
70   BufferStart = BufStart;
71   BufferPtr = BufPtr;
72   BufferEnd = BufEnd;
73 
74   assert(BufEnd[0] == 0 &&
75          "We assume that the input buffer has a null character at the end"
76          " to simplify lexing!");
77 
78   // Check whether we have a BOM in the beginning of the buffer. If yes - act
79   // accordingly. Right now we support only UTF-8 with and without BOM, so, just
80   // skip the UTF-8 BOM if it's present.
81   if (BufferStart == BufferPtr) {
82     // Determine the size of the BOM.
83     StringRef Buf(BufferStart, BufferEnd - BufferStart);
84     size_t BOMLength = llvm::StringSwitch<size_t>(Buf)
85       .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM
86       .Default(0);
87 
88     // Skip the BOM.
89     BufferPtr += BOMLength;
90   }
91 
92   Is_PragmaLexer = false;
93   CurrentConflictMarkerState = CMK_None;
94 
95   // Start of the file is a start of line.
96   IsAtStartOfLine = true;
97   IsAtPhysicalStartOfLine = true;
98 
99   HasLeadingSpace = false;
100   HasLeadingEmptyMacro = false;
101 
102   // We are not after parsing a #.
103   ParsingPreprocessorDirective = false;
104 
105   // We are not after parsing #include.
106   ParsingFilename = false;
107 
108   // We are not in raw mode.  Raw mode disables diagnostics and interpretation
109   // of tokens (e.g. identifiers, thus disabling macro expansion).  It is used
110   // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
111   // or otherwise skipping over tokens.
112   LexingRawMode = false;
113 
114   // Default to not keeping comments.
115   ExtendedTokenMode = 0;
116 }
117 
118 /// Lexer constructor - Create a new lexer object for the specified buffer
119 /// with the specified preprocessor managing the lexing process.  This lexer
120 /// assumes that the associated file buffer and Preprocessor objects will
121 /// outlive it, so it doesn't take ownership of either of them.
122 Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *InputFile, Preprocessor &PP)
123   : PreprocessorLexer(&PP, FID),
124     FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
125     LangOpts(PP.getLangOpts()) {
126 
127   InitLexer(InputFile->getBufferStart(), InputFile->getBufferStart(),
128             InputFile->getBufferEnd());
129 
130   resetExtendedTokenMode();
131 }
132 
133 void Lexer::resetExtendedTokenMode() {
134   assert(PP && "Cannot reset token mode without a preprocessor");
135   if (LangOpts.TraditionalCPP)
136     SetKeepWhitespaceMode(true);
137   else
138     SetCommentRetentionState(PP->getCommentRetentionState());
139 }
140 
141 /// Lexer constructor - Create a new raw lexer object.  This object is only
142 /// suitable for calls to 'LexFromRawLexer'.  This lexer assumes that the text
143 /// range will outlive it, so it doesn't take ownership of it.
144 Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts,
145              const char *BufStart, const char *BufPtr, const char *BufEnd)
146   : FileLoc(fileloc), LangOpts(langOpts) {
147 
148   InitLexer(BufStart, BufPtr, BufEnd);
149 
150   // We *are* in raw mode.
151   LexingRawMode = true;
152 }
153 
154 /// Lexer constructor - Create a new raw lexer object.  This object is only
155 /// suitable for calls to 'LexFromRawLexer'.  This lexer assumes that the text
156 /// range will outlive it, so it doesn't take ownership of it.
157 Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *FromFile,
158              const SourceManager &SM, const LangOptions &langOpts)
159     : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile->getBufferStart(),
160             FromFile->getBufferStart(), FromFile->getBufferEnd()) {}
161 
162 /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
163 /// _Pragma expansion.  This has a variety of magic semantics that this method
164 /// sets up.  It returns a new'd Lexer that must be delete'd when done.
165 ///
166 /// On entrance to this routine, TokStartLoc is a macro location which has a
167 /// spelling loc that indicates the bytes to be lexed for the token and an
168 /// expansion location that indicates where all lexed tokens should be
169 /// "expanded from".
170 ///
171 /// TODO: It would really be nice to make _Pragma just be a wrapper around a
172 /// normal lexer that remaps tokens as they fly by.  This would require making
173 /// Preprocessor::Lex virtual.  Given that, we could just dump in a magic lexer
174 /// interface that could handle this stuff.  This would pull GetMappedTokenLoc
175 /// out of the critical path of the lexer!
176 ///
177 Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc,
178                                  SourceLocation ExpansionLocStart,
179                                  SourceLocation ExpansionLocEnd,
180                                  unsigned TokLen, Preprocessor &PP) {
181   SourceManager &SM = PP.getSourceManager();
182 
183   // Create the lexer as if we were going to lex the file normally.
184   FileID SpellingFID = SM.getFileID(SpellingLoc);
185   const llvm::MemoryBuffer *InputFile = SM.getBuffer(SpellingFID);
186   Lexer *L = new Lexer(SpellingFID, InputFile, PP);
187 
188   // Now that the lexer is created, change the start/end locations so that we
189   // just lex the subsection of the file that we want.  This is lexing from a
190   // scratch buffer.
191   const char *StrData = SM.getCharacterData(SpellingLoc);
192 
193   L->BufferPtr = StrData;
194   L->BufferEnd = StrData+TokLen;
195   assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!");
196 
197   // Set the SourceLocation with the remapping information.  This ensures that
198   // GetMappedTokenLoc will remap the tokens as they are lexed.
199   L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID),
200                                      ExpansionLocStart,
201                                      ExpansionLocEnd, TokLen);
202 
203   // Ensure that the lexer thinks it is inside a directive, so that end \n will
204   // return an EOD token.
205   L->ParsingPreprocessorDirective = true;
206 
207   // This lexer really is for _Pragma.
208   L->Is_PragmaLexer = true;
209   return L;
210 }
211 
212 /// Stringify - Convert the specified string into a C string, with surrounding
213 /// ""'s, and with escaped \ and " characters.
214 std::string Lexer::Stringify(StringRef Str, bool Charify) {
215   std::string Result = Str;
216   char Quote = Charify ? '\'' : '"';
217   for (unsigned i = 0, e = Result.size(); i != e; ++i) {
218     if (Result[i] == '\\' || Result[i] == Quote) {
219       Result.insert(Result.begin()+i, '\\');
220       ++i; ++e;
221     }
222   }
223   return Result;
224 }
225 
226 /// Stringify - Convert the specified string into a C string by escaping '\'
227 /// and " characters.  This does not add surrounding ""'s to the string.
228 void Lexer::Stringify(SmallVectorImpl<char> &Str) {
229   for (unsigned i = 0, e = Str.size(); i != e; ++i) {
230     if (Str[i] == '\\' || Str[i] == '"') {
231       Str.insert(Str.begin()+i, '\\');
232       ++i; ++e;
233     }
234   }
235 }
236 
237 //===----------------------------------------------------------------------===//
238 // Token Spelling
239 //===----------------------------------------------------------------------===//
240 
241 /// \brief Slow case of getSpelling. Extract the characters comprising the
242 /// spelling of this token from the provided input buffer.
243 static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
244                               const LangOptions &LangOpts, char *Spelling) {
245   assert(Tok.needsCleaning() && "getSpellingSlow called on simple token");
246 
247   size_t Length = 0;
248   const char *BufEnd = BufPtr + Tok.getLength();
249 
250   if (tok::isStringLiteral(Tok.getKind())) {
251     // Munch the encoding-prefix and opening double-quote.
252     while (BufPtr < BufEnd) {
253       unsigned Size;
254       Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
255       BufPtr += Size;
256 
257       if (Spelling[Length - 1] == '"')
258         break;
259     }
260 
261     // Raw string literals need special handling; trigraph expansion and line
262     // splicing do not occur within their d-char-sequence nor within their
263     // r-char-sequence.
264     if (Length >= 2 &&
265         Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') {
266       // Search backwards from the end of the token to find the matching closing
267       // quote.
268       const char *RawEnd = BufEnd;
269       do --RawEnd; while (*RawEnd != '"');
270       size_t RawLength = RawEnd - BufPtr + 1;
271 
272       // Everything between the quotes is included verbatim in the spelling.
273       memcpy(Spelling + Length, BufPtr, RawLength);
274       Length += RawLength;
275       BufPtr += RawLength;
276 
277       // The rest of the token is lexed normally.
278     }
279   }
280 
281   while (BufPtr < BufEnd) {
282     unsigned Size;
283     Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
284     BufPtr += Size;
285   }
286 
287   assert(Length < Tok.getLength() &&
288          "NeedsCleaning flag set on token that didn't need cleaning!");
289   return Length;
290 }
291 
292 /// getSpelling() - Return the 'spelling' of this token.  The spelling of a
293 /// token are the characters used to represent the token in the source file
294 /// after trigraph expansion and escaped-newline folding.  In particular, this
295 /// wants to get the true, uncanonicalized, spelling of things like digraphs
296 /// UCNs, etc.
297 StringRef Lexer::getSpelling(SourceLocation loc,
298                              SmallVectorImpl<char> &buffer,
299                              const SourceManager &SM,
300                              const LangOptions &options,
301                              bool *invalid) {
302   // Break down the source location.
303   std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc);
304 
305   // Try to the load the file buffer.
306   bool invalidTemp = false;
307   StringRef file = SM.getBufferData(locInfo.first, &invalidTemp);
308   if (invalidTemp) {
309     if (invalid) *invalid = true;
310     return StringRef();
311   }
312 
313   const char *tokenBegin = file.data() + locInfo.second;
314 
315   // Lex from the start of the given location.
316   Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options,
317               file.begin(), tokenBegin, file.end());
318   Token token;
319   lexer.LexFromRawLexer(token);
320 
321   unsigned length = token.getLength();
322 
323   // Common case:  no need for cleaning.
324   if (!token.needsCleaning())
325     return StringRef(tokenBegin, length);
326 
327   // Hard case, we need to relex the characters into the string.
328   buffer.resize(length);
329   buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data()));
330   return StringRef(buffer.data(), buffer.size());
331 }
332 
333 /// getSpelling() - Return the 'spelling' of this token.  The spelling of a
334 /// token are the characters used to represent the token in the source file
335 /// after trigraph expansion and escaped-newline folding.  In particular, this
336 /// wants to get the true, uncanonicalized, spelling of things like digraphs
337 /// UCNs, etc.
338 std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
339                                const LangOptions &LangOpts, bool *Invalid) {
340   assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
341 
342   bool CharDataInvalid = false;
343   const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(),
344                                                     &CharDataInvalid);
345   if (Invalid)
346     *Invalid = CharDataInvalid;
347   if (CharDataInvalid)
348     return std::string();
349 
350   // If this token contains nothing interesting, return it directly.
351   if (!Tok.needsCleaning())
352     return std::string(TokStart, TokStart + Tok.getLength());
353 
354   std::string Result;
355   Result.resize(Tok.getLength());
356   Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin()));
357   return Result;
358 }
359 
360 /// getSpelling - This method is used to get the spelling of a token into a
361 /// preallocated buffer, instead of as an std::string.  The caller is required
362 /// to allocate enough space for the token, which is guaranteed to be at least
363 /// Tok.getLength() bytes long.  The actual length of the token is returned.
364 ///
365 /// Note that this method may do two possible things: it may either fill in
366 /// the buffer specified with characters, or it may *change the input pointer*
367 /// to point to a constant buffer with the data already in it (avoiding a
368 /// copy).  The caller is not allowed to modify the returned buffer pointer
369 /// if an internal buffer is returned.
370 unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
371                             const SourceManager &SourceMgr,
372                             const LangOptions &LangOpts, bool *Invalid) {
373   assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
374 
375   const char *TokStart = nullptr;
376   // NOTE: this has to be checked *before* testing for an IdentifierInfo.
377   if (Tok.is(tok::raw_identifier))
378     TokStart = Tok.getRawIdentifier().data();
379   else if (!Tok.hasUCN()) {
380     if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
381       // Just return the string from the identifier table, which is very quick.
382       Buffer = II->getNameStart();
383       return II->getLength();
384     }
385   }
386 
387   // NOTE: this can be checked even after testing for an IdentifierInfo.
388   if (Tok.isLiteral())
389     TokStart = Tok.getLiteralData();
390 
391   if (!TokStart) {
392     // Compute the start of the token in the input lexer buffer.
393     bool CharDataInvalid = false;
394     TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid);
395     if (Invalid)
396       *Invalid = CharDataInvalid;
397     if (CharDataInvalid) {
398       Buffer = "";
399       return 0;
400     }
401   }
402 
403   // If this token contains nothing interesting, return it directly.
404   if (!Tok.needsCleaning()) {
405     Buffer = TokStart;
406     return Tok.getLength();
407   }
408 
409   // Otherwise, hard case, relex the characters into the string.
410   return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer));
411 }
412 
413 /// MeasureTokenLength - Relex the token at the specified location and return
414 /// its length in bytes in the input file.  If the token needs cleaning (e.g.
415 /// includes a trigraph or an escaped newline) then this count includes bytes
416 /// that are part of that.
417 unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
418                                    const SourceManager &SM,
419                                    const LangOptions &LangOpts) {
420   Token TheTok;
421   if (getRawToken(Loc, TheTok, SM, LangOpts))
422     return 0;
423   return TheTok.getLength();
424 }
425 
426 /// \brief Relex the token at the specified location.
427 /// \returns true if there was a failure, false on success.
428 bool Lexer::getRawToken(SourceLocation Loc, Token &Result,
429                         const SourceManager &SM,
430                         const LangOptions &LangOpts,
431                         bool IgnoreWhiteSpace) {
432   // TODO: this could be special cased for common tokens like identifiers, ')',
433   // etc to make this faster, if it mattered.  Just look at StrData[0] to handle
434   // all obviously single-char tokens.  This could use
435   // Lexer::isObviouslySimpleCharacter for example to handle identifiers or
436   // something.
437 
438   // If this comes from a macro expansion, we really do want the macro name, not
439   // the token this macro expanded to.
440   Loc = SM.getExpansionLoc(Loc);
441   std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
442   bool Invalid = false;
443   StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
444   if (Invalid)
445     return true;
446 
447   const char *StrData = Buffer.data()+LocInfo.second;
448 
449   if (!IgnoreWhiteSpace && isWhitespace(StrData[0]))
450     return true;
451 
452   // Create a lexer starting at the beginning of this token.
453   Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts,
454                  Buffer.begin(), StrData, Buffer.end());
455   TheLexer.SetCommentRetentionState(true);
456   TheLexer.LexFromRawLexer(Result);
457   return false;
458 }
459 
460 /// Returns the pointer that points to the beginning of line that contains
461 /// the given offset, or null if the offset if invalid.
462 static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) {
463   const char *BufStart = Buffer.data();
464   if (Offset >= Buffer.size())
465     return nullptr;
466 
467   const char *LexStart = BufStart + Offset;
468   for (; LexStart != BufStart; --LexStart) {
469     if (isVerticalWhitespace(LexStart[0]) &&
470         !Lexer::isNewLineEscaped(BufStart, LexStart)) {
471       // LexStart should point at first character of logical line.
472       ++LexStart;
473       break;
474     }
475   }
476   return LexStart;
477 }
478 
479 static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
480                                               const SourceManager &SM,
481                                               const LangOptions &LangOpts) {
482   assert(Loc.isFileID());
483   std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
484   if (LocInfo.first.isInvalid())
485     return Loc;
486 
487   bool Invalid = false;
488   StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
489   if (Invalid)
490     return Loc;
491 
492   // Back up from the current location until we hit the beginning of a line
493   // (or the buffer). We'll relex from that point.
494   const char *StrData = Buffer.data() + LocInfo.second;
495   const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second);
496   if (!LexStart || LexStart == StrData)
497     return Loc;
498 
499   // Create a lexer starting at the beginning of this token.
500   SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);
501   Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart,
502                  Buffer.end());
503   TheLexer.SetCommentRetentionState(true);
504 
505   // Lex tokens until we find the token that contains the source location.
506   Token TheTok;
507   do {
508     TheLexer.LexFromRawLexer(TheTok);
509 
510     if (TheLexer.getBufferLocation() > StrData) {
511       // Lexing this token has taken the lexer past the source location we're
512       // looking for. If the current token encompasses our source location,
513       // return the beginning of that token.
514       if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
515         return TheTok.getLocation();
516 
517       // We ended up skipping over the source location entirely, which means
518       // that it points into whitespace. We're done here.
519       break;
520     }
521   } while (TheTok.getKind() != tok::eof);
522 
523   // We've passed our source location; just return the original source location.
524   return Loc;
525 }
526 
527 SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,
528                                           const SourceManager &SM,
529                                           const LangOptions &LangOpts) {
530   if (Loc.isFileID())
531     return getBeginningOfFileToken(Loc, SM, LangOpts);
532 
533   if (!SM.isMacroArgExpansion(Loc))
534     return Loc;
535 
536   SourceLocation FileLoc = SM.getSpellingLoc(Loc);
537   SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
538   std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
539   std::pair<FileID, unsigned> BeginFileLocInfo =
540       SM.getDecomposedLoc(BeginFileLoc);
541   assert(FileLocInfo.first == BeginFileLocInfo.first &&
542          FileLocInfo.second >= BeginFileLocInfo.second);
543   return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
544 }
545 
namespace {

/// Classification of the directive name that follows a '#', as used by
/// Lexer::ComputePreamble.
enum PreambleDirectiveKind {
  /// A recognized directive; its tokens are skipped over.
  PDK_Skipped,
  /// An unrecognized directive.
  PDK_Unknown
};

} // end anonymous namespace
554 
555 PreambleBounds Lexer::ComputePreamble(StringRef Buffer,
556                                       const LangOptions &LangOpts,
557                                       unsigned MaxLines) {
558   // Create a lexer starting at the beginning of the file. Note that we use a
559   // "fake" file source location at offset 1 so that the lexer will track our
560   // position within the file.
561   const unsigned StartOffset = 1;
562   SourceLocation FileLoc = SourceLocation::getFromRawEncoding(StartOffset);
563   Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(),
564                  Buffer.end());
565   TheLexer.SetCommentRetentionState(true);
566 
567   bool InPreprocessorDirective = false;
568   Token TheTok;
569   SourceLocation ActiveCommentLoc;
570 
571   unsigned MaxLineOffset = 0;
572   if (MaxLines) {
573     const char *CurPtr = Buffer.begin();
574     unsigned CurLine = 0;
575     while (CurPtr != Buffer.end()) {
576       char ch = *CurPtr++;
577       if (ch == '\n') {
578         ++CurLine;
579         if (CurLine == MaxLines)
580           break;
581       }
582     }
583     if (CurPtr != Buffer.end())
584       MaxLineOffset = CurPtr - Buffer.begin();
585   }
586 
587   do {
588     TheLexer.LexFromRawLexer(TheTok);
589 
590     if (InPreprocessorDirective) {
591       // If we've hit the end of the file, we're done.
592       if (TheTok.getKind() == tok::eof) {
593         break;
594       }
595 
596       // If we haven't hit the end of the preprocessor directive, skip this
597       // token.
598       if (!TheTok.isAtStartOfLine())
599         continue;
600 
601       // We've passed the end of the preprocessor directive, and will look
602       // at this token again below.
603       InPreprocessorDirective = false;
604     }
605 
606     // Keep track of the # of lines in the preamble.
607     if (TheTok.isAtStartOfLine()) {
608       unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset;
609 
610       // If we were asked to limit the number of lines in the preamble,
611       // and we're about to exceed that limit, we're done.
612       if (MaxLineOffset && TokOffset >= MaxLineOffset)
613         break;
614     }
615 
616     // Comments are okay; skip over them.
617     if (TheTok.getKind() == tok::comment) {
618       if (ActiveCommentLoc.isInvalid())
619         ActiveCommentLoc = TheTok.getLocation();
620       continue;
621     }
622 
623     if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) {
624       // This is the start of a preprocessor directive.
625       Token HashTok = TheTok;
626       InPreprocessorDirective = true;
627       ActiveCommentLoc = SourceLocation();
628 
629       // Figure out which directive this is. Since we're lexing raw tokens,
630       // we don't have an identifier table available. Instead, just look at
631       // the raw identifier to recognize and categorize preprocessor directives.
632       TheLexer.LexFromRawLexer(TheTok);
633       if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) {
634         StringRef Keyword = TheTok.getRawIdentifier();
635         PreambleDirectiveKind PDK
636           = llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
637               .Case("include", PDK_Skipped)
638               .Case("__include_macros", PDK_Skipped)
639               .Case("define", PDK_Skipped)
640               .Case("undef", PDK_Skipped)
641               .Case("line", PDK_Skipped)
642               .Case("error", PDK_Skipped)
643               .Case("pragma", PDK_Skipped)
644               .Case("import", PDK_Skipped)
645               .Case("include_next", PDK_Skipped)
646               .Case("warning", PDK_Skipped)
647               .Case("ident", PDK_Skipped)
648               .Case("sccs", PDK_Skipped)
649               .Case("assert", PDK_Skipped)
650               .Case("unassert", PDK_Skipped)
651               .Case("if", PDK_Skipped)
652               .Case("ifdef", PDK_Skipped)
653               .Case("ifndef", PDK_Skipped)
654               .Case("elif", PDK_Skipped)
655               .Case("else", PDK_Skipped)
656               .Case("endif", PDK_Skipped)
657               .Default(PDK_Unknown);
658 
659         switch (PDK) {
660         case PDK_Skipped:
661           continue;
662 
663         case PDK_Unknown:
664           // We don't know what this directive is; stop at the '#'.
665           break;
666         }
667       }
668 
669       // We only end up here if we didn't recognize the preprocessor
670       // directive or it was one that can't occur in the preamble at this
671       // point. Roll back the current token to the location of the '#'.
672       InPreprocessorDirective = false;
673       TheTok = HashTok;
674     }
675 
676     // We hit a token that we don't recognize as being in the
677     // "preprocessing only" part of the file, so we're no longer in
678     // the preamble.
679     break;
680   } while (true);
681 
682   SourceLocation End;
683   if (ActiveCommentLoc.isValid())
684     End = ActiveCommentLoc; // don't truncate a decl comment.
685   else
686     End = TheTok.getLocation();
687 
688   return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(),
689                         TheTok.isAtStartOfLine());
690 }
691 
692 /// AdvanceToTokenCharacter - Given a location that specifies the start of a
693 /// token, return a new location that specifies a character within the token.
694 SourceLocation Lexer::AdvanceToTokenCharacter(SourceLocation TokStart,
695                                               unsigned CharNo,
696                                               const SourceManager &SM,
697                                               const LangOptions &LangOpts) {
698   // Figure out how many physical characters away the specified expansion
699   // character is.  This needs to take into consideration newlines and
700   // trigraphs.
701   bool Invalid = false;
702   const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);
703 
704   // If they request the first char of the token, we're trivially done.
705   if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr)))
706     return TokStart;
707 
708   unsigned PhysOffset = 0;
709 
710   // The usual case is that tokens don't contain anything interesting.  Skip
711   // over the uninteresting characters.  If a token only consists of simple
712   // chars, this method is extremely fast.
713   while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {
714     if (CharNo == 0)
715       return TokStart.getLocWithOffset(PhysOffset);
716     ++TokPtr;
717     --CharNo;
718     ++PhysOffset;
719   }
720 
721   // If we have a character that may be a trigraph or escaped newline, use a
722   // lexer to parse it correctly.
723   for (; CharNo; --CharNo) {
724     unsigned Size;
725     Lexer::getCharAndSizeNoWarn(TokPtr, Size, LangOpts);
726     TokPtr += Size;
727     PhysOffset += Size;
728   }
729 
730   // Final detail: if we end up on an escaped newline, we want to return the
731   // location of the actual byte of the token.  For example foo\<newline>bar
732   // advanced by 3 should return the location of b, not of \\.  One compounding
733   // detail of this is that the escape may be made by a trigraph.
734   if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
735     PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;
736 
737   return TokStart.getLocWithOffset(PhysOffset);
738 }
739 
740 /// \brief Computes the source location just past the end of the
741 /// token at this source location.
742 ///
743 /// This routine can be used to produce a source location that
744 /// points just past the end of the token referenced by \p Loc, and
745 /// is generally used when a diagnostic needs to point just after a
746 /// token where it expected something different that it received. If
747 /// the returned source location would not be meaningful (e.g., if
748 /// it points into a macro), this routine returns an invalid
749 /// source location.
750 ///
751 /// \param Offset an offset from the end of the token, where the source
752 /// location should refer to. The default offset (0) produces a source
753 /// location pointing just past the end of the token; an offset of 1 produces
754 /// a source location pointing to the last character in the token, etc.
755 SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
756                                           const SourceManager &SM,
757                                           const LangOptions &LangOpts) {
758   if (Loc.isInvalid())
759     return SourceLocation();
760 
761   if (Loc.isMacroID()) {
762     if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
763       return SourceLocation(); // Points inside the macro expansion.
764   }
765 
766   unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
767   if (Len > Offset)
768     Len = Len - Offset;
769   else
770     return Loc;
771 
772   return Loc.getLocWithOffset(Len);
773 }
774 
775 /// \brief Returns true if the given MacroID location points at the first
776 /// token of the macro expansion.
777 bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc,
778                                       const SourceManager &SM,
779                                       const LangOptions &LangOpts,
780                                       SourceLocation *MacroBegin) {
781   assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
782 
783   SourceLocation expansionLoc;
784   if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc))
785     return false;
786 
787   if (expansionLoc.isFileID()) {
788     // No other macro expansions, this is the first.
789     if (MacroBegin)
790       *MacroBegin = expansionLoc;
791     return true;
792   }
793 
794   return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin);
795 }
796 
797 /// \brief Returns true if the given MacroID location points at the last
798 /// token of the macro expansion.
799 bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc,
800                                     const SourceManager &SM,
801                                     const LangOptions &LangOpts,
802                                     SourceLocation *MacroEnd) {
803   assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
804 
805   SourceLocation spellLoc = SM.getSpellingLoc(loc);
806   unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts);
807   if (tokLen == 0)
808     return false;
809 
810   SourceLocation afterLoc = loc.getLocWithOffset(tokLen);
811   SourceLocation expansionLoc;
812   if (!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc))
813     return false;
814 
815   if (expansionLoc.isFileID()) {
816     // No other macro expansions.
817     if (MacroEnd)
818       *MacroEnd = expansionLoc;
819     return true;
820   }
821 
822   return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd);
823 }
824 
825 static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range,
826                                              const SourceManager &SM,
827                                              const LangOptions &LangOpts) {
828   SourceLocation Begin = Range.getBegin();
829   SourceLocation End = Range.getEnd();
830   assert(Begin.isFileID() && End.isFileID());
831   if (Range.isTokenRange()) {
832     End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts);
833     if (End.isInvalid())
834       return CharSourceRange();
835   }
836 
837   // Break down the source locations.
838   FileID FID;
839   unsigned BeginOffs;
840   std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin);
841   if (FID.isInvalid())
842     return CharSourceRange();
843 
844   unsigned EndOffs;
845   if (!SM.isInFileID(End, FID, &EndOffs) ||
846       BeginOffs > EndOffs)
847     return CharSourceRange();
848 
849   return CharSourceRange::getCharRange(Begin, End);
850 }
851 
852 CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,
853                                          const SourceManager &SM,
854                                          const LangOptions &LangOpts) {
855   SourceLocation Begin = Range.getBegin();
856   SourceLocation End = Range.getEnd();
857   if (Begin.isInvalid() || End.isInvalid())
858     return CharSourceRange();
859 
860   if (Begin.isFileID() && End.isFileID())
861     return makeRangeFromFileLocs(Range, SM, LangOpts);
862 
863   if (Begin.isMacroID() && End.isFileID()) {
864     if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin))
865       return CharSourceRange();
866     Range.setBegin(Begin);
867     return makeRangeFromFileLocs(Range, SM, LangOpts);
868   }
869 
870   if (Begin.isFileID() && End.isMacroID()) {
871     if ((Range.isTokenRange() && !isAtEndOfMacroExpansion(End, SM, LangOpts,
872                                                           &End)) ||
873         (Range.isCharRange() && !isAtStartOfMacroExpansion(End, SM, LangOpts,
874                                                            &End)))
875       return CharSourceRange();
876     Range.setEnd(End);
877     return makeRangeFromFileLocs(Range, SM, LangOpts);
878   }
879 
880   assert(Begin.isMacroID() && End.isMacroID());
881   SourceLocation MacroBegin, MacroEnd;
882   if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) &&
883       ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts,
884                                                         &MacroEnd)) ||
885        (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts,
886                                                          &MacroEnd)))) {
887     Range.setBegin(MacroBegin);
888     Range.setEnd(MacroEnd);
889     return makeRangeFromFileLocs(Range, SM, LangOpts);
890   }
891 
892   bool Invalid = false;
893   const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin),
894                                                         &Invalid);
895   if (Invalid)
896     return CharSourceRange();
897 
898   if (BeginEntry.getExpansion().isMacroArgExpansion()) {
899     const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End),
900                                                         &Invalid);
901     if (Invalid)
902       return CharSourceRange();
903 
904     if (EndEntry.getExpansion().isMacroArgExpansion() &&
905         BeginEntry.getExpansion().getExpansionLocStart() ==
906             EndEntry.getExpansion().getExpansionLocStart()) {
907       Range.setBegin(SM.getImmediateSpellingLoc(Begin));
908       Range.setEnd(SM.getImmediateSpellingLoc(End));
909       return makeFileCharRange(Range, SM, LangOpts);
910     }
911   }
912 
913   return CharSourceRange();
914 }
915 
916 StringRef Lexer::getSourceText(CharSourceRange Range,
917                                const SourceManager &SM,
918                                const LangOptions &LangOpts,
919                                bool *Invalid) {
920   Range = makeFileCharRange(Range, SM, LangOpts);
921   if (Range.isInvalid()) {
922     if (Invalid) *Invalid = true;
923     return StringRef();
924   }
925 
926   // Break down the source location.
927   std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin());
928   if (beginInfo.first.isInvalid()) {
929     if (Invalid) *Invalid = true;
930     return StringRef();
931   }
932 
933   unsigned EndOffs;
934   if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) ||
935       beginInfo.second > EndOffs) {
936     if (Invalid) *Invalid = true;
937     return StringRef();
938   }
939 
940   // Try to the load the file buffer.
941   bool invalidTemp = false;
942   StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);
943   if (invalidTemp) {
944     if (Invalid) *Invalid = true;
945     return StringRef();
946   }
947 
948   if (Invalid) *Invalid = false;
949   return file.substr(beginInfo.second, EndOffs - beginInfo.second);
950 }
951 
952 StringRef Lexer::getImmediateMacroName(SourceLocation Loc,
953                                        const SourceManager &SM,
954                                        const LangOptions &LangOpts) {
955   assert(Loc.isMacroID() && "Only reasonble to call this on macros");
956 
957   // Find the location of the immediate macro expansion.
958   while (true) {
959     FileID FID = SM.getFileID(Loc);
960     const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID);
961     const SrcMgr::ExpansionInfo &Expansion = E->getExpansion();
962     Loc = Expansion.getExpansionLocStart();
963     if (!Expansion.isMacroArgExpansion())
964       break;
965 
966     // For macro arguments we need to check that the argument did not come
967     // from an inner macro, e.g: "MAC1( MAC2(foo) )"
968 
969     // Loc points to the argument id of the macro definition, move to the
970     // macro expansion.
971     Loc = SM.getImmediateExpansionRange(Loc).first;
972     SourceLocation SpellLoc = Expansion.getSpellingLoc();
973     if (SpellLoc.isFileID())
974       break; // No inner macro.
975 
976     // If spelling location resides in the same FileID as macro expansion
977     // location, it means there is no inner macro.
978     FileID MacroFID = SM.getFileID(Loc);
979     if (SM.isInFileID(SpellLoc, MacroFID))
980       break;
981 
982     // Argument came from inner macro.
983     Loc = SpellLoc;
984   }
985 
986   // Find the spelling location of the start of the non-argument expansion
987   // range. This is where the macro name was spelled in order to begin
988   // expanding this macro.
989   Loc = SM.getSpellingLoc(Loc);
990 
991   // Dig out the buffer where the macro name was spelled and the extents of the
992   // name so that we can render it into the expansion note.
993   std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
994   unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
995   StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
996   return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
997 }
998 
999 StringRef Lexer::getImmediateMacroNameForDiagnostics(
1000     SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) {
1001   assert(Loc.isMacroID() && "Only reasonble to call this on macros");
1002   // Walk past macro argument expanions.
1003   while (SM.isMacroArgExpansion(Loc))
1004     Loc = SM.getImmediateExpansionRange(Loc).first;
1005 
1006   // If the macro's spelling has no FileID, then it's actually a token paste
1007   // or stringization (or similar) and not a macro at all.
1008   if (!SM.getFileEntryForID(SM.getFileID(SM.getSpellingLoc(Loc))))
1009     return StringRef();
1010 
1011   // Find the spelling location of the start of the non-argument expansion
1012   // range. This is where the macro name was spelled in order to begin
1013   // expanding this macro.
1014   Loc = SM.getSpellingLoc(SM.getImmediateExpansionRange(Loc).first);
1015 
1016   // Dig out the buffer where the macro name was spelled and the extents of the
1017   // name so that we can render it into the expansion note.
1018   std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
1019   unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
1020   StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
1021   return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
1022 }
1023 
1024 bool Lexer::isIdentifierBodyChar(char c, const LangOptions &LangOpts) {
1025   return isIdentifierBody(c, LangOpts.DollarIdents);
1026 }
1027 
1028 bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) {
1029   assert(isVerticalWhitespace(Str[0]));
1030   if (Str - 1 < BufferStart)
1031     return false;
1032 
1033   if ((Str[0] == '\n' && Str[-1] == '\r') ||
1034       (Str[0] == '\r' && Str[-1] == '\n')) {
1035     if (Str - 2 < BufferStart)
1036       return false;
1037     --Str;
1038   }
1039   --Str;
1040 
1041   // Rewind to first non-space character:
1042   while (Str > BufferStart && isHorizontalWhitespace(*Str))
1043     --Str;
1044 
1045   return *Str == '\\';
1046 }
1047 
1048 StringRef Lexer::getIndentationForLine(SourceLocation Loc,
1049                                        const SourceManager &SM) {
1050   if (Loc.isInvalid() || Loc.isMacroID())
1051     return "";
1052   std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1053   if (LocInfo.first.isInvalid())
1054     return "";
1055   bool Invalid = false;
1056   StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
1057   if (Invalid)
1058     return "";
1059   const char *Line = findBeginningOfLine(Buffer, LocInfo.second);
1060   if (!Line)
1061     return "";
1062   StringRef Rest = Buffer.substr(Line - Buffer.data());
1063   size_t NumWhitespaceChars = Rest.find_first_not_of(" \t");
1064   return NumWhitespaceChars == StringRef::npos
1065              ? ""
1066              : Rest.take_front(NumWhitespaceChars);
1067 }
1068 
1069 //===----------------------------------------------------------------------===//
1070 // Diagnostics forwarding code.
1071 //===----------------------------------------------------------------------===//
1072 
1073 /// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the
1074 /// lexer buffer was all expanded at a single point, perform the mapping.
1075 /// This is currently only used for _Pragma implementation, so it is the slow
1076 /// path of the hot getSourceLocation method.  Do not allow it to be inlined.
1077 static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(
1078     Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen);
1079 static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
1080                                         SourceLocation FileLoc,
1081                                         unsigned CharNo, unsigned TokLen) {
1082   assert(FileLoc.isMacroID() && "Must be a macro expansion");
1083 
1084   // Otherwise, we're lexing "mapped tokens".  This is used for things like
1085   // _Pragma handling.  Combine the expansion location of FileLoc with the
1086   // spelling location.
1087   SourceManager &SM = PP.getSourceManager();
1088 
1089   // Create a new SLoc which is expanded from Expansion(FileLoc) but whose
1090   // characters come from spelling(FileLoc)+Offset.
1091   SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc);
1092   SpellingLoc = SpellingLoc.getLocWithOffset(CharNo);
1093 
1094   // Figure out the expansion loc range, which is the range covered by the
1095   // original _Pragma(...) sequence.
1096   std::pair<SourceLocation,SourceLocation> II =
1097     SM.getImmediateExpansionRange(FileLoc);
1098 
1099   return SM.createExpansionLoc(SpellingLoc, II.first, II.second, TokLen);
1100 }
1101 
1102 /// getSourceLocation - Return a source location identifier for the specified
1103 /// offset in the current file.
1104 SourceLocation Lexer::getSourceLocation(const char *Loc,
1105                                         unsigned TokLen) const {
1106   assert(Loc >= BufferStart && Loc <= BufferEnd &&
1107          "Location out of range for this buffer!");
1108 
1109   // In the normal case, we're just lexing from a simple file buffer, return
1110   // the file id from FileLoc with the offset specified.
1111   unsigned CharNo = Loc-BufferStart;
1112   if (FileLoc.isFileID())
1113     return FileLoc.getLocWithOffset(CharNo);
1114 
1115   // Otherwise, this is the _Pragma lexer case, which pretends that all of the
1116   // tokens are lexed from where the _Pragma was defined.
1117   assert(PP && "This doesn't work on raw lexers");
1118   return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen);
1119 }
1120 
1121 /// Diag - Forwarding function for diagnostics.  This translate a source
1122 /// position in the current buffer into a SourceLocation object for rendering.
1123 DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
1124   return PP->Diag(getSourceLocation(Loc), DiagID);
1125 }
1126 
1127 //===----------------------------------------------------------------------===//
1128 // Trigraph and Escaped Newline Handling Code.
1129 //===----------------------------------------------------------------------===//
1130 
/// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair,
/// return the decoded trigraph letter it corresponds to, or '\0' if nothing.
static char GetTrigraphCharForLetter(char Letter) {
  // Parallel tables: Suffixes[I] is the third character of a trigraph and
  // Replacements[I] is the character that trigraph stands for.
  static const char Suffixes[] = {'=', ')', '(', '!', '\'', '>', '/', '<', '-'};
  static const char Replacements[] = {'#', ']', '[',  '|', '^',
                                      '}', '\\', '{', '~'};

  for (unsigned I = 0; I != sizeof(Suffixes); ++I)
    if (Suffixes[I] == Letter)
      return Replacements[I];

  return 0;
}
1147 
1148 /// DecodeTrigraphChar - If the specified character is a legal trigraph when
1149 /// prefixed with ??, emit a trigraph warning.  If trigraphs are enabled,
1150 /// return the result character.  Finally, emit a warning about trigraph use
1151 /// whether trigraphs are enabled or not.
1152 static char DecodeTrigraphChar(const char *CP, Lexer *L) {
1153   char Res = GetTrigraphCharForLetter(*CP);
1154   if (!Res || !L) return Res;
1155 
1156   if (!L->getLangOpts().Trigraphs) {
1157     if (!L->isLexingRawMode())
1158       L->Diag(CP-2, diag::trigraph_ignored);
1159     return 0;
1160   }
1161 
1162   if (!L->isLexingRawMode())
1163     L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);
1164   return Res;
1165 }
1166 
1167 /// getEscapedNewLineSize - Return the size of the specified escaped newline,
1168 /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a
1169 /// trigraph equivalent on entry to this function.
1170 unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
1171   unsigned Size = 0;
1172   while (isWhitespace(Ptr[Size])) {
1173     ++Size;
1174 
1175     if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r')
1176       continue;
1177 
1178     // If this is a \r\n or \n\r, skip the other half.
1179     if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') &&
1180         Ptr[Size-1] != Ptr[Size])
1181       ++Size;
1182 
1183     return Size;
1184   }
1185 
1186   // Not an escaped newline, must be a \t or something else.
1187   return 0;
1188 }
1189 
1190 /// SkipEscapedNewLines - If P points to an escaped newline (or a series of
1191 /// them), skip over them and return the first non-escaped-newline found,
1192 /// otherwise return P.
1193 const char *Lexer::SkipEscapedNewLines(const char *P) {
1194   while (true) {
1195     const char *AfterEscape;
1196     if (*P == '\\') {
1197       AfterEscape = P+1;
1198     } else if (*P == '?') {
1199       // If not a trigraph for escape, bail out.
1200       if (P[1] != '?' || P[2] != '/')
1201         return P;
1202       // FIXME: Take LangOpts into account; the language might not
1203       // support trigraphs.
1204       AfterEscape = P+3;
1205     } else {
1206       return P;
1207     }
1208 
1209     unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
1210     if (NewLineSize == 0) return P;
1211     P = AfterEscape+NewLineSize;
1212   }
1213 }
1214 
1215 /// \brief Checks that the given token is the first token that occurs after the
1216 /// given location (this excludes comments and whitespace). Returns the location
1217 /// immediately after the specified token. If the token is not found or the
1218 /// location is inside a macro, the returned source location will be invalid.
1219 SourceLocation Lexer::findLocationAfterToken(SourceLocation Loc,
1220                                         tok::TokenKind TKind,
1221                                         const SourceManager &SM,
1222                                         const LangOptions &LangOpts,
1223                                         bool SkipTrailingWhitespaceAndNewLine) {
1224   if (Loc.isMacroID()) {
1225     if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
1226       return SourceLocation();
1227   }
1228   Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts);
1229 
1230   // Break down the source location.
1231   std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1232 
1233   // Try to load the file buffer.
1234   bool InvalidTemp = false;
1235   StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);
1236   if (InvalidTemp)
1237     return SourceLocation();
1238 
1239   const char *TokenBegin = File.data() + LocInfo.second;
1240 
1241   // Lex from the start of the given location.
1242   Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),
1243                                       TokenBegin, File.end());
1244   // Find the token.
1245   Token Tok;
1246   lexer.LexFromRawLexer(Tok);
1247   if (Tok.isNot(TKind))
1248     return SourceLocation();
1249   SourceLocation TokenLoc = Tok.getLocation();
1250 
1251   // Calculate how much whitespace needs to be skipped if any.
1252   unsigned NumWhitespaceChars = 0;
1253   if (SkipTrailingWhitespaceAndNewLine) {
1254     const char *TokenEnd = SM.getCharacterData(TokenLoc) +
1255                            Tok.getLength();
1256     unsigned char C = *TokenEnd;
1257     while (isHorizontalWhitespace(C)) {
1258       C = *(++TokenEnd);
1259       NumWhitespaceChars++;
1260     }
1261 
1262     // Skip \r, \n, \r\n, or \n\r
1263     if (C == '\n' || C == '\r') {
1264       char PrevC = C;
1265       C = *(++TokenEnd);
1266       NumWhitespaceChars++;
1267       if ((C == '\n' || C == '\r') && C != PrevC)
1268         NumWhitespaceChars++;
1269     }
1270   }
1271 
1272   return TokenLoc.getLocWithOffset(Tok.getLength() + NumWhitespaceChars);
1273 }
1274 
1275 /// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
1276 /// get its size, and return it.  This is tricky in several cases:
1277 ///   1. If currently at the start of a trigraph, we warn about the trigraph,
1278 ///      then either return the trigraph (skipping 3 chars) or the '?',
1279 ///      depending on whether trigraphs are enabled or not.
1280 ///   2. If this is an escaped newline (potentially with whitespace between
1281 ///      the backslash and newline), implicitly skip the newline and return
1282 ///      the char after it.
1283 ///
1284 /// This handles the slow/uncommon case of the getCharAndSize method.  Here we
1285 /// know that we can accumulate into Size, and that we have already incremented
1286 /// Ptr by Size bytes.
1287 ///
1288 /// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
1289 /// be updated to match.
1290 ///
1291 char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
1292                                Token *Tok) {
1293   // If we have a slash, look for an escaped newline.
1294   if (Ptr[0] == '\\') {
1295     ++Size;
1296     ++Ptr;
1297 Slash:
1298     // Common case, backslash-char where the char is not whitespace.
1299     if (!isWhitespace(Ptr[0])) return '\\';
1300 
1301     // See if we have optional whitespace characters between the slash and
1302     // newline.
1303     if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
1304       // Remember that this token needs to be cleaned.
1305       if (Tok) Tok->setFlag(Token::NeedsCleaning);
1306 
1307       // Warn if there was whitespace between the backslash and newline.
1308       if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode())
1309         Diag(Ptr, diag::backslash_newline_space);
1310 
1311       // Found backslash<whitespace><newline>.  Parse the char after it.
1312       Size += EscapedNewLineSize;
1313       Ptr  += EscapedNewLineSize;
1314 
1315       // Use slow version to accumulate a correct size field.
1316       return getCharAndSizeSlow(Ptr, Size, Tok);
1317     }
1318 
1319     // Otherwise, this is not an escaped newline, just return the slash.
1320     return '\\';
1321   }
1322 
1323   // If this is a trigraph, process it.
1324   if (Ptr[0] == '?' && Ptr[1] == '?') {
1325     // If this is actually a legal trigraph (not something like "??x"), emit
1326     // a trigraph warning.  If so, and if trigraphs are enabled, return it.
1327     if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : nullptr)) {
1328       // Remember that this token needs to be cleaned.
1329       if (Tok) Tok->setFlag(Token::NeedsCleaning);
1330 
1331       Ptr += 3;
1332       Size += 3;
1333       if (C == '\\') goto Slash;
1334       return C;
1335     }
1336   }
1337 
1338   // If this is neither, return a single character.
1339   ++Size;
1340   return *Ptr;
1341 }
1342 
1343 /// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
1344 /// getCharAndSizeNoWarn method.  Here we know that we can accumulate into Size,
1345 /// and that we have already incremented Ptr by Size bytes.
1346 ///
1347 /// NOTE: When this method is updated, getCharAndSizeSlow (above) should
1348 /// be updated to match.
1349 char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
1350                                      const LangOptions &LangOpts) {
1351   // If we have a slash, look for an escaped newline.
1352   if (Ptr[0] == '\\') {
1353     ++Size;
1354     ++Ptr;
1355 Slash:
1356     // Common case, backslash-char where the char is not whitespace.
1357     if (!isWhitespace(Ptr[0])) return '\\';
1358 
1359     // See if we have optional whitespace characters followed by a newline.
1360     if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
1361       // Found backslash<whitespace><newline>.  Parse the char after it.
1362       Size += EscapedNewLineSize;
1363       Ptr  += EscapedNewLineSize;
1364 
1365       // Use slow version to accumulate a correct size field.
1366       return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
1367     }
1368 
1369     // Otherwise, this is not an escaped newline, just return the slash.
1370     return '\\';
1371   }
1372 
1373   // If this is a trigraph, process it.
1374   if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
1375     // If this is actually a legal trigraph (not something like "??x"), return
1376     // it.
1377     if (char C = GetTrigraphCharForLetter(Ptr[2])) {
1378       Ptr += 3;
1379       Size += 3;
1380       if (C == '\\') goto Slash;
1381       return C;
1382     }
1383   }
1384 
1385   // If this is neither, return a single character.
1386   ++Size;
1387   return *Ptr;
1388 }
1389 
1390 //===----------------------------------------------------------------------===//
1391 // Helper methods for lexing.
1392 //===----------------------------------------------------------------------===//
1393 
1394 /// \brief Routine that indiscriminately sets the offset into the source file.
1395 void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) {
1396   BufferPtr = BufferStart + Offset;
1397   if (BufferPtr > BufferEnd)
1398     BufferPtr = BufferEnd;
1399   // FIXME: What exactly does the StartOfLine bit mean?  There are two
1400   // possible meanings for the "start" of the line: the first token on the
1401   // unexpanded line, or the first token on the expanded line.
1402   IsAtStartOfLine = StartOfLine;
1403   IsAtPhysicalStartOfLine = StartOfLine;
1404 }
1405 
1406 static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) {
1407   if (LangOpts.AsmPreprocessor) {
1408     return false;
1409   } else if (LangOpts.CPlusPlus11 || LangOpts.C11) {
1410     static const llvm::sys::UnicodeCharSet C11AllowedIDChars(
1411         C11AllowedIDCharRanges);
1412     return C11AllowedIDChars.contains(C);
1413   } else if (LangOpts.CPlusPlus) {
1414     static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars(
1415         CXX03AllowedIDCharRanges);
1416     return CXX03AllowedIDChars.contains(C);
1417   } else {
1418     static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1419         C99AllowedIDCharRanges);
1420     return C99AllowedIDChars.contains(C);
1421   }
1422 }
1423 
1424 static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts) {
1425   assert(isAllowedIDChar(C, LangOpts));
1426   if (LangOpts.AsmPreprocessor) {
1427     return false;
1428   } else if (LangOpts.CPlusPlus11 || LangOpts.C11) {
1429     static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars(
1430         C11DisallowedInitialIDCharRanges);
1431     return !C11DisallowedInitialIDChars.contains(C);
1432   } else if (LangOpts.CPlusPlus) {
1433     return true;
1434   } else {
1435     static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1436         C99DisallowedInitialIDCharRanges);
1437     return !C99DisallowedInitialIDChars.contains(C);
1438   }
1439 }
1440 
1441 static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin,
1442                                             const char *End) {
1443   return CharSourceRange::getCharRange(L.getSourceLocation(Begin),
1444                                        L.getSourceLocation(End));
1445 }
1446 
1447 static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
1448                                       CharSourceRange Range, bool IsFirst) {
1449   // Check C99 compatibility.
1450   if (!Diags.isIgnored(diag::warn_c99_compat_unicode_id, Range.getBegin())) {
1451     enum {
1452       CannotAppearInIdentifier = 0,
1453       CannotStartIdentifier
1454     };
1455 
1456     static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1457         C99AllowedIDCharRanges);
1458     static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1459         C99DisallowedInitialIDCharRanges);
1460     if (!C99AllowedIDChars.contains(C)) {
1461       Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
1462         << Range
1463         << CannotAppearInIdentifier;
1464     } else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) {
1465       Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
1466         << Range
1467         << CannotStartIdentifier;
1468     }
1469   }
1470 
1471   // Check C++98 compatibility.
1472   if (!Diags.isIgnored(diag::warn_cxx98_compat_unicode_id, Range.getBegin())) {
1473     static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars(
1474         CXX03AllowedIDCharRanges);
1475     if (!CXX03AllowedIDChars.contains(C)) {
1476       Diags.Report(Range.getBegin(), diag::warn_cxx98_compat_unicode_id)
1477         << Range;
1478     }
1479   }
1480 }
1481 
1482 bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
1483                                     Token &Result) {
1484   const char *UCNPtr = CurPtr + Size;
1485   uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/nullptr);
1486   if (CodePoint == 0 || !isAllowedIDChar(CodePoint, LangOpts))
1487     return false;
1488 
1489   if (!isLexingRawMode())
1490     maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
1491                               makeCharRange(*this, CurPtr, UCNPtr),
1492                               /*IsFirst=*/false);
1493 
1494   Result.setFlag(Token::HasUCN);
1495   if ((UCNPtr - CurPtr ==  6 && CurPtr[1] == 'u') ||
1496       (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
1497     CurPtr = UCNPtr;
1498   else
1499     while (CurPtr != UCNPtr)
1500       (void)getAndAdvanceChar(CurPtr, Result);
1501   return true;
1502 }
1503 
1504 bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
1505   const char *UnicodePtr = CurPtr;
1506   llvm::UTF32 CodePoint;
1507   llvm::ConversionResult Result =
1508       llvm::convertUTF8Sequence((const llvm::UTF8 **)&UnicodePtr,
1509                                 (const llvm::UTF8 *)BufferEnd,
1510                                 &CodePoint,
1511                                 llvm::strictConversion);
1512   if (Result != llvm::conversionOK ||
1513       !isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts))
1514     return false;
1515 
1516   if (!isLexingRawMode())
1517     maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
1518                               makeCharRange(*this, CurPtr, UnicodePtr),
1519                               /*IsFirst=*/false);
1520 
1521   CurPtr = UnicodePtr;
1522   return true;
1523 }
1524 
1525 bool Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
1526   // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
1527   unsigned Size;
1528   unsigned char C = *CurPtr++;
1529   while (isIdentifierBody(C))
1530     C = *CurPtr++;
1531 
1532   --CurPtr;   // Back up over the skipped character.
1533 
1534   // Fast path, no $,\,? in identifier found.  '\' might be an escaped newline
1535   // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
1536   //
1537   // TODO: Could merge these checks into an InfoTable flag to make the
1538   // comparison cheaper
1539   if (isASCII(C) && C != '\\' && C != '?' &&
1540       (C != '$' || !LangOpts.DollarIdents)) {
1541 FinishIdentifier:
1542     const char *IdStart = BufferPtr;
1543     FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
1544     Result.setRawIdentifierData(IdStart);
1545 
1546     // If we are in raw mode, return this identifier raw.  There is no need to
1547     // look up identifier information or attempt to macro expand it.
1548     if (LexingRawMode)
1549       return true;
1550 
1551     // Fill in Result.IdentifierInfo and update the token kind,
1552     // looking up the identifier in the identifier table.
1553     IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);
1554 
1555     // Finally, now that we know we have an identifier, pass this off to the
1556     // preprocessor, which may macro expand it or something.
1557     if (II->isHandleIdentifierCase())
1558       return PP->HandleIdentifier(Result);
1559 
1560     if (II->getTokenID() == tok::identifier && isCodeCompletionPoint(CurPtr)
1561         && II->getPPKeywordID() == tok::pp_not_keyword
1562         && II->getObjCKeywordID() == tok::objc_not_keyword) {
1563       // Return the code-completion token.
1564       Result.setKind(tok::code_completion);
1565       cutOffLexing();
1566       return true;
1567     }
1568     return true;
1569   }
1570 
1571   // Otherwise, $,\,? in identifier found.  Enter slower path.
1572 
1573   C = getCharAndSize(CurPtr, Size);
1574   while (true) {
1575     if (C == '$') {
1576       // If we hit a $ and they are not supported in identifiers, we are done.
1577       if (!LangOpts.DollarIdents) goto FinishIdentifier;
1578 
1579       // Otherwise, emit a diagnostic and continue.
1580       if (!isLexingRawMode())
1581         Diag(CurPtr, diag::ext_dollar_in_identifier);
1582       CurPtr = ConsumeChar(CurPtr, Size, Result);
1583       C = getCharAndSize(CurPtr, Size);
1584       continue;
1585 
1586     } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
1587       C = getCharAndSize(CurPtr, Size);
1588       continue;
1589     } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {
1590       C = getCharAndSize(CurPtr, Size);
1591       continue;
1592     } else if (!isIdentifierBody(C)) {
1593       goto FinishIdentifier;
1594     }
1595 
1596     // Otherwise, this character is good, consume it.
1597     CurPtr = ConsumeChar(CurPtr, Size, Result);
1598 
1599     C = getCharAndSize(CurPtr, Size);
1600     while (isIdentifierBody(C)) {
1601       CurPtr = ConsumeChar(CurPtr, Size, Result);
1602       C = getCharAndSize(CurPtr, Size);
1603     }
1604   }
1605 }
1606 
1607 /// isHexaLiteral - Return true if Start points to a hex constant.
1608 /// in microsoft mode (where this is supposed to be several different tokens).
1609 bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {
1610   unsigned Size;
1611   char C1 = Lexer::getCharAndSizeNoWarn(Start, Size, LangOpts);
1612   if (C1 != '0')
1613     return false;
1614   char C2 = Lexer::getCharAndSizeNoWarn(Start + Size, Size, LangOpts);
1615   return (C2 == 'x' || C2 == 'X');
1616 }
1617 
1618 /// LexNumericConstant - Lex the remainder of a integer or floating point
1619 /// constant. From[-1] is the first character lexed.  Return the end of the
1620 /// constant.
1621 bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
1622   unsigned Size;
1623   char C = getCharAndSize(CurPtr, Size);
1624   char PrevCh = 0;
1625   while (isPreprocessingNumberBody(C)) {
1626     CurPtr = ConsumeChar(CurPtr, Size, Result);
1627     PrevCh = C;
1628     C = getCharAndSize(CurPtr, Size);
1629   }
1630 
1631   // If we fell out, check for a sign, due to 1e+12.  If we have one, continue.
1632   if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) {
1633     // If we are in Microsoft mode, don't continue if the constant is hex.
1634     // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1
1635     if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts))
1636       return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
1637   }
1638 
1639   // If we have a hex FP constant, continue.
1640   if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) {
1641     // Outside C99 and C++17, we accept hexadecimal floating point numbers as a
1642     // not-quite-conforming extension. Only do so if this looks like it's
1643     // actually meant to be a hexfloat, and not if it has a ud-suffix.
1644     bool IsHexFloat = true;
1645     if (!LangOpts.C99) {
1646       if (!isHexaLiteral(BufferPtr, LangOpts))
1647         IsHexFloat = false;
1648       else if (!getLangOpts().CPlusPlus1z &&
1649                std::find(BufferPtr, CurPtr, '_') != CurPtr)
1650         IsHexFloat = false;
1651     }
1652     if (IsHexFloat)
1653       return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
1654   }
1655 
1656   // If we have a digit separator, continue.
1657   if (C == '\'' && getLangOpts().CPlusPlus14) {
1658     unsigned NextSize;
1659     char Next = getCharAndSizeNoWarn(CurPtr + Size, NextSize, getLangOpts());
1660     if (isIdentifierBody(Next)) {
1661       if (!isLexingRawMode())
1662         Diag(CurPtr, diag::warn_cxx11_compat_digit_separator);
1663       CurPtr = ConsumeChar(CurPtr, Size, Result);
1664       CurPtr = ConsumeChar(CurPtr, NextSize, Result);
1665       return LexNumericConstant(Result, CurPtr);
1666     }
1667   }
1668 
1669   // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue.
1670   if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
1671     return LexNumericConstant(Result, CurPtr);
1672   if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
1673     return LexNumericConstant(Result, CurPtr);
1674 
1675   // Update the location of token as well as BufferPtr.
1676   const char *TokStart = BufferPtr;
1677   FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
1678   Result.setLiteralData(TokStart);
1679   return true;
1680 }
1681 
1682 /// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes
1683 /// in C++11, or warn on a ud-suffix in C++98.
1684 const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
1685                                bool IsStringLiteral) {
1686   assert(getLangOpts().CPlusPlus);
1687 
1688   // Maximally munch an identifier.
1689   unsigned Size;
1690   char C = getCharAndSize(CurPtr, Size);
1691   bool Consumed = false;
1692 
1693   if (!isIdentifierHead(C)) {
1694     if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
1695       Consumed = true;
1696     else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
1697       Consumed = true;
1698     else
1699       return CurPtr;
1700   }
1701 
1702   if (!getLangOpts().CPlusPlus11) {
1703     if (!isLexingRawMode())
1704       Diag(CurPtr,
1705            C == '_' ? diag::warn_cxx11_compat_user_defined_literal
1706                     : diag::warn_cxx11_compat_reserved_user_defined_literal)
1707         << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
1708     return CurPtr;
1709   }
1710 
1711   // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
1712   // that does not start with an underscore is ill-formed. As a conforming
1713   // extension, we treat all such suffixes as if they had whitespace before
1714   // them. We assume a suffix beginning with a UCN or UTF-8 character is more
1715   // likely to be a ud-suffix than a macro, however, and accept that.
1716   if (!Consumed) {
1717     bool IsUDSuffix = false;
1718     if (C == '_')
1719       IsUDSuffix = true;
1720     else if (IsStringLiteral && getLangOpts().CPlusPlus14) {
1721       // In C++1y, we need to look ahead a few characters to see if this is a
1722       // valid suffix for a string literal or a numeric literal (this could be
1723       // the 'operator""if' defining a numeric literal operator).
1724       const unsigned MaxStandardSuffixLength = 3;
1725       char Buffer[MaxStandardSuffixLength] = { C };
1726       unsigned Consumed = Size;
1727       unsigned Chars = 1;
1728       while (true) {
1729         unsigned NextSize;
1730         char Next = getCharAndSizeNoWarn(CurPtr + Consumed, NextSize,
1731                                          getLangOpts());
1732         if (!isIdentifierBody(Next)) {
1733           // End of suffix. Check whether this is on the whitelist.
1734           const StringRef CompleteSuffix(Buffer, Chars);
1735           IsUDSuffix = StringLiteralParser::isValidUDSuffix(getLangOpts(),
1736                                                             CompleteSuffix);
1737           break;
1738         }
1739 
1740         if (Chars == MaxStandardSuffixLength)
1741           // Too long: can't be a standard suffix.
1742           break;
1743 
1744         Buffer[Chars++] = Next;
1745         Consumed += NextSize;
1746       }
1747     }
1748 
1749     if (!IsUDSuffix) {
1750       if (!isLexingRawMode())
1751         Diag(CurPtr, getLangOpts().MSVCCompat
1752                          ? diag::ext_ms_reserved_user_defined_literal
1753                          : diag::ext_reserved_user_defined_literal)
1754           << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
1755       return CurPtr;
1756     }
1757 
1758     CurPtr = ConsumeChar(CurPtr, Size, Result);
1759   }
1760 
1761   Result.setFlag(Token::HasUDSuffix);
1762   while (true) {
1763     C = getCharAndSize(CurPtr, Size);
1764     if (isIdentifierBody(C)) { CurPtr = ConsumeChar(CurPtr, Size, Result); }
1765     else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {}
1766     else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {}
1767     else break;
1768   }
1769 
1770   return CurPtr;
1771 }
1772 
1773 /// LexStringLiteral - Lex the remainder of a string literal, after having lexed
1774 /// either " or L" or u8" or u" or U".
1775 bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
1776                              tok::TokenKind Kind) {
1777   // Does this string contain the \0 character?
1778   const char *NulCharacter = nullptr;
1779 
1780   if (!isLexingRawMode() &&
1781       (Kind == tok::utf8_string_literal ||
1782        Kind == tok::utf16_string_literal ||
1783        Kind == tok::utf32_string_literal))
1784     Diag(BufferPtr, getLangOpts().CPlusPlus
1785            ? diag::warn_cxx98_compat_unicode_literal
1786            : diag::warn_c99_compat_unicode_literal);
1787 
1788   char C = getAndAdvanceChar(CurPtr, Result);
1789   while (C != '"') {
1790     // Skip escaped characters.  Escaped newlines will already be processed by
1791     // getAndAdvanceChar.
1792     if (C == '\\')
1793       C = getAndAdvanceChar(CurPtr, Result);
1794 
1795     if (C == '\n' || C == '\r' ||             // Newline.
1796         (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
1797       if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
1798         Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1;
1799       FormTokenWithChars(Result, CurPtr-1, tok::unknown);
1800       return true;
1801     }
1802 
1803     if (C == 0) {
1804       if (isCodeCompletionPoint(CurPtr-1)) {
1805         PP->CodeCompleteNaturalLanguage();
1806         FormTokenWithChars(Result, CurPtr-1, tok::unknown);
1807         cutOffLexing();
1808         return true;
1809       }
1810 
1811       NulCharacter = CurPtr-1;
1812     }
1813     C = getAndAdvanceChar(CurPtr, Result);
1814   }
1815 
1816   // If we are in C++11, lex the optional ud-suffix.
1817   if (getLangOpts().CPlusPlus)
1818     CurPtr = LexUDSuffix(Result, CurPtr, true);
1819 
1820   // If a nul character existed in the string, warn about it.
1821   if (NulCharacter && !isLexingRawMode())
1822     Diag(NulCharacter, diag::null_in_char_or_string) << 1;
1823 
1824   // Update the location of the token as well as the BufferPtr instance var.
1825   const char *TokStart = BufferPtr;
1826   FormTokenWithChars(Result, CurPtr, Kind);
1827   Result.setLiteralData(TokStart);
1828   return true;
1829 }
1830 
1831 /// LexRawStringLiteral - Lex the remainder of a raw string literal, after
1832 /// having lexed R", LR", u8R", uR", or UR".
1833 bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
1834                                 tok::TokenKind Kind) {
1835   // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
1836   //  Between the initial and final double quote characters of the raw string,
1837   //  any transformations performed in phases 1 and 2 (trigraphs,
1838   //  universal-character-names, and line splicing) are reverted.
1839 
1840   if (!isLexingRawMode())
1841     Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);
1842 
1843   unsigned PrefixLen = 0;
1844 
1845   while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen]))
1846     ++PrefixLen;
1847 
1848   // If the last character was not a '(', then we didn't lex a valid delimiter.
1849   if (CurPtr[PrefixLen] != '(') {
1850     if (!isLexingRawMode()) {
1851       const char *PrefixEnd = &CurPtr[PrefixLen];
1852       if (PrefixLen == 16) {
1853         Diag(PrefixEnd, diag::err_raw_delim_too_long);
1854       } else {
1855         Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
1856           << StringRef(PrefixEnd, 1);
1857       }
1858     }
1859 
1860     // Search for the next '"' in hopes of salvaging the lexer. Unfortunately,
1861     // it's possible the '"' was intended to be part of the raw string, but
1862     // there's not much we can do about that.
1863     while (true) {
1864       char C = *CurPtr++;
1865 
1866       if (C == '"')
1867         break;
1868       if (C == 0 && CurPtr-1 == BufferEnd) {
1869         --CurPtr;
1870         break;
1871       }
1872     }
1873 
1874     FormTokenWithChars(Result, CurPtr, tok::unknown);
1875     return true;
1876   }
1877 
1878   // Save prefix and move CurPtr past it
1879   const char *Prefix = CurPtr;
1880   CurPtr += PrefixLen + 1; // skip over prefix and '('
1881 
1882   while (true) {
1883     char C = *CurPtr++;
1884 
1885     if (C == ')') {
1886       // Check for prefix match and closing quote.
1887       if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') {
1888         CurPtr += PrefixLen + 1; // skip over prefix and '"'
1889         break;
1890       }
1891     } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file.
1892       if (!isLexingRawMode())
1893         Diag(BufferPtr, diag::err_unterminated_raw_string)
1894           << StringRef(Prefix, PrefixLen);
1895       FormTokenWithChars(Result, CurPtr-1, tok::unknown);
1896       return true;
1897     }
1898   }
1899 
1900   // If we are in C++11, lex the optional ud-suffix.
1901   if (getLangOpts().CPlusPlus)
1902     CurPtr = LexUDSuffix(Result, CurPtr, true);
1903 
1904   // Update the location of token as well as BufferPtr.
1905   const char *TokStart = BufferPtr;
1906   FormTokenWithChars(Result, CurPtr, Kind);
1907   Result.setLiteralData(TokStart);
1908   return true;
1909 }
1910 
1911 /// LexAngledStringLiteral - Lex the remainder of an angled string literal,
1912 /// after having lexed the '<' character.  This is used for #include filenames.
1913 bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
1914   // Does this string contain the \0 character?
1915   const char *NulCharacter = nullptr;
1916   const char *AfterLessPos = CurPtr;
1917   char C = getAndAdvanceChar(CurPtr, Result);
1918   while (C != '>') {
1919     // Skip escaped characters.
1920     if (C == '\\' && CurPtr < BufferEnd) {
1921       // Skip the escaped character.
1922       getAndAdvanceChar(CurPtr, Result);
1923     } else if (C == '\n' || C == '\r' ||             // Newline.
1924                (C == 0 && (CurPtr-1 == BufferEnd ||  // End of file.
1925                            isCodeCompletionPoint(CurPtr-1)))) {
1926       // If the filename is unterminated, then it must just be a lone <
1927       // character.  Return this as such.
1928       FormTokenWithChars(Result, AfterLessPos, tok::less);
1929       return true;
1930     } else if (C == 0) {
1931       NulCharacter = CurPtr-1;
1932     }
1933     C = getAndAdvanceChar(CurPtr, Result);
1934   }
1935 
1936   // If a nul character existed in the string, warn about it.
1937   if (NulCharacter && !isLexingRawMode())
1938     Diag(NulCharacter, diag::null_in_char_or_string) << 1;
1939 
1940   // Update the location of token as well as BufferPtr.
1941   const char *TokStart = BufferPtr;
1942   FormTokenWithChars(Result, CurPtr, tok::angle_string_literal);
1943   Result.setLiteralData(TokStart);
1944   return true;
1945 }
1946 
1947 /// LexCharConstant - Lex the remainder of a character constant, after having
1948 /// lexed either ' or L' or u8' or u' or U'.
1949 bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,
1950                             tok::TokenKind Kind) {
1951   // Does this character contain the \0 character?
1952   const char *NulCharacter = nullptr;
1953 
1954   if (!isLexingRawMode()) {
1955     if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant)
1956       Diag(BufferPtr, getLangOpts().CPlusPlus
1957                           ? diag::warn_cxx98_compat_unicode_literal
1958                           : diag::warn_c99_compat_unicode_literal);
1959     else if (Kind == tok::utf8_char_constant)
1960       Diag(BufferPtr, diag::warn_cxx14_compat_u8_character_literal);
1961   }
1962 
1963   char C = getAndAdvanceChar(CurPtr, Result);
1964   if (C == '\'') {
1965     if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
1966       Diag(BufferPtr, diag::ext_empty_character);
1967     FormTokenWithChars(Result, CurPtr, tok::unknown);
1968     return true;
1969   }
1970 
1971   while (C != '\'') {
1972     // Skip escaped characters.
1973     if (C == '\\')
1974       C = getAndAdvanceChar(CurPtr, Result);
1975 
1976     if (C == '\n' || C == '\r' ||             // Newline.
1977         (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
1978       if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
1979         Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 0;
1980       FormTokenWithChars(Result, CurPtr-1, tok::unknown);
1981       return true;
1982     }
1983 
1984     if (C == 0) {
1985       if (isCodeCompletionPoint(CurPtr-1)) {
1986         PP->CodeCompleteNaturalLanguage();
1987         FormTokenWithChars(Result, CurPtr-1, tok::unknown);
1988         cutOffLexing();
1989         return true;
1990       }
1991 
1992       NulCharacter = CurPtr-1;
1993     }
1994     C = getAndAdvanceChar(CurPtr, Result);
1995   }
1996 
1997   // If we are in C++11, lex the optional ud-suffix.
1998   if (getLangOpts().CPlusPlus)
1999     CurPtr = LexUDSuffix(Result, CurPtr, false);
2000 
2001   // If a nul character existed in the character, warn about it.
2002   if (NulCharacter && !isLexingRawMode())
2003     Diag(NulCharacter, diag::null_in_char_or_string) << 0;
2004 
2005   // Update the location of token as well as BufferPtr.
2006   const char *TokStart = BufferPtr;
2007   FormTokenWithChars(Result, CurPtr, Kind);
2008   Result.setLiteralData(TokStart);
2009   return true;
2010 }
2011 
2012 /// SkipWhitespace - Efficiently skip over a series of whitespace characters.
2013 /// Update BufferPtr to point to the next non-whitespace character and return.
2014 ///
2015 /// This method forms a token and returns true if KeepWhitespaceMode is enabled.
2016 ///
2017 bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,
2018                            bool &TokAtPhysicalStartOfLine) {
2019   // Whitespace - Skip it, then return the token after the whitespace.
2020   bool SawNewline = isVerticalWhitespace(CurPtr[-1]);
2021 
2022   unsigned char Char = *CurPtr;
2023 
2024   // Skip consecutive spaces efficiently.
2025   while (true) {
2026     // Skip horizontal whitespace very aggressively.
2027     while (isHorizontalWhitespace(Char))
2028       Char = *++CurPtr;
2029 
2030     // Otherwise if we have something other than whitespace, we're done.
2031     if (!isVerticalWhitespace(Char))
2032       break;
2033 
2034     if (ParsingPreprocessorDirective) {
2035       // End of preprocessor directive line, let LexTokenInternal handle this.
2036       BufferPtr = CurPtr;
2037       return false;
2038     }
2039 
2040     // OK, but handle newline.
2041     SawNewline = true;
2042     Char = *++CurPtr;
2043   }
2044 
2045   // If the client wants us to return whitespace, return it now.
2046   if (isKeepWhitespaceMode()) {
2047     FormTokenWithChars(Result, CurPtr, tok::unknown);
2048     if (SawNewline) {
2049       IsAtStartOfLine = true;
2050       IsAtPhysicalStartOfLine = true;
2051     }
2052     // FIXME: The next token will not have LeadingSpace set.
2053     return true;
2054   }
2055 
2056   // If this isn't immediately after a newline, there is leading space.
2057   char PrevChar = CurPtr[-1];
2058   bool HasLeadingSpace = !isVerticalWhitespace(PrevChar);
2059 
2060   Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace);
2061   if (SawNewline) {
2062     Result.setFlag(Token::StartOfLine);
2063     TokAtPhysicalStartOfLine = true;
2064   }
2065 
2066   BufferPtr = CurPtr;
2067   return false;
2068 }
2069 
/// We have just read the // characters from input.  Skip until we find the
/// newline character that terminates the comment.  Then update BufferPtr and
/// return.
///
/// If we're in KeepCommentMode or any CommentHandler has inserted
/// some tokens, this will store the first token and return true.
/// Otherwise it returns false; in that case, unless we are inside a
/// preprocessor directive or at the end of the buffer, the terminating newline
/// is consumed as well, \p Result is flagged as being at the start of a line
/// and \p TokAtPhysicalStartOfLine is set to true.
bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
                            bool &TokAtPhysicalStartOfLine) {
  // If Line comments aren't explicitly enabled for this language, emit an
  // extension warning.
  if (!LangOpts.LineComment && !isLexingRawMode()) {
    Diag(BufferPtr, diag::ext_line_comment);

    // Mark them enabled so we only emit one warning for this translation
    // unit.
    LangOpts.LineComment = true;
  }

  // Scan over the body of the comment.  The common case, when scanning, is that
  // the comment contains normal ascii characters with nothing interesting in
  // them.  As such, optimize for this case with the inner loop.
  //
  // This loop terminates with CurPtr pointing at the newline (or end of buffer)
  // character that ends the line comment.
  char C;
  while (true) {
    C = *CurPtr;
    // Skip over characters in the fast loop.
    while (C != 0 &&                // Potentially EOF.
           C != '\n' && C != '\r')  // Newline or DOS-style newline.
      C = *++CurPtr;

    // Remember where the fast loop stopped; if the newline turns out not to be
    // escaped in any special way we jump straight back here.
    const char *NextLine = CurPtr;
    if (C != 0) {
      // We found a newline, see if it's escaped.
      const char *EscapePtr = CurPtr-1;
      bool HasSpace = false;
      while (isHorizontalWhitespace(*EscapePtr)) { // Skip whitespace.
        --EscapePtr;
        HasSpace = true;
      }

      if (*EscapePtr == '\\')
        // Escaped newline.
        CurPtr = EscapePtr;
      else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
               EscapePtr[-2] == '?' && LangOpts.Trigraphs)
        // Trigraph-escaped newline.
        CurPtr = EscapePtr-2;
      else
        break; // This is a newline, we're done.

      // If there was space between the backslash and newline, warn about it.
      if (HasSpace && !isLexingRawMode())
        Diag(EscapePtr, diag::backslash_newline_space);
    }

    // Otherwise, this is a hard case.  Fall back on getAndAdvanceChar to
    // properly decode the character.  Read it in raw mode to avoid emitting
    // diagnostics about things like trigraphs.  If we see an escaped newline,
    // we'll handle it below.
    const char *OldPtr = CurPtr;
    bool OldRawMode = isLexingRawMode();
    LexingRawMode = true;
    C = getAndAdvanceChar(CurPtr, Result);
    LexingRawMode = OldRawMode;

    // If we read only one character, then no special handling is needed.
    // We're done and can skip forward to the newline.
    if (C != 0 && CurPtr == OldPtr+1) {
      CurPtr = NextLine;
      break;
    }

    // If we read multiple characters, and one of those characters was a \r or
    // \n, then we had an escaped newline within the comment.  Emit diagnostic
    // unless the next line is also a // comment.
    if (CurPtr != OldPtr+1 && C != '/' && CurPtr[0] != '/') {
      for (; OldPtr != CurPtr; ++OldPtr)
        if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
          // Okay, we found a // comment that ends in a newline, if the next
          // line is also a // comment, but has spaces, don't emit a diagnostic.
          if (isWhitespace(C)) {
            const char *ForwardPtr = CurPtr;
            while (isWhitespace(*ForwardPtr))  // Skip whitespace.
              ++ForwardPtr;
            if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
              break;
          }

          if (!isLexingRawMode())
            Diag(OldPtr-1, diag::ext_multi_line_line_comment);
          break;
        }
    }

    // The decoded character was a newline, or we ran one past the end of the
    // buffer: back up so CurPtr points at the terminating character.
    if (C == '\r' || C == '\n' || CurPtr == BufferEnd + 1) {
      --CurPtr;
      break;
    }

    // A nul at the code-completion point: report it and stop lexing.
    if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
      PP->CodeCompleteNaturalLanguage();
      cutOffLexing();
      return false;
    }
  }

  // Found but did not consume the newline.  Notify comment handlers about the
  // comment unless we're in a #if 0 block.
  if (PP && !isLexingRawMode() &&
      PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
                                            getSourceLocation(CurPtr)))) {
    BufferPtr = CurPtr;
    return true; // A token has to be returned.
  }

  // If we are returning comments as tokens, return this comment as a token.
  if (inKeepCommentMode())
    return SaveLineComment(Result, CurPtr);

  // If we are inside a preprocessor directive and we see the end of line,
  // return immediately, so that the lexer can return this as an EOD token.
  if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
    BufferPtr = CurPtr;
    return false;
  }

  // Otherwise, eat the \n character.  We don't care if this is a \n\r or
  // \r\n sequence.  This is an efficiency hack (because we know the \n can't
  // contribute to another token), it isn't needed for correctness.  Note that
  // this is ok even in KeepWhitespaceMode, because we would have returned the
  // comment above in that mode.
  ++CurPtr;

  // The next returned token is at the start of the line.
  Result.setFlag(Token::StartOfLine);
  TokAtPhysicalStartOfLine = true;
  // No leading whitespace seen so far.
  Result.clearFlag(Token::LeadingSpace);
  BufferPtr = CurPtr;
  return false;
}
2213 
2214 /// If in save-comment mode, package up this Line comment in an appropriate
2215 /// way and return it.
2216 bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {
2217   // If we're not in a preprocessor directive, just return the // comment
2218   // directly.
2219   FormTokenWithChars(Result, CurPtr, tok::comment);
2220 
2221   if (!ParsingPreprocessorDirective || LexingRawMode)
2222     return true;
2223 
2224   // If this Line-style comment is in a macro definition, transmogrify it into
2225   // a C-style block comment.
2226   bool Invalid = false;
2227   std::string Spelling = PP->getSpelling(Result, &Invalid);
2228   if (Invalid)
2229     return true;
2230 
2231   assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?");
2232   Spelling[1] = '*';   // Change prefix to "/*".
2233   Spelling += "*/";    // add suffix.
2234 
2235   Result.setKind(tok::comment);
2236   PP->CreateString(Spelling, Result,
2237                    Result.getLocation(), Result.getLocation());
2238   return true;
2239 }
2240 
2241 /// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline
2242 /// character (either \\n or \\r) is part of an escaped newline sequence.  Issue
2243 /// a diagnostic if so.  We know that the newline is inside of a block comment.
2244 static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr,
2245                                                   Lexer *L) {
2246   assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');
2247 
2248   // Back up off the newline.
2249   --CurPtr;
2250 
2251   // If this is a two-character newline sequence, skip the other character.
2252   if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
2253     // \n\n or \r\r -> not escaped newline.
2254     if (CurPtr[0] == CurPtr[1])
2255       return false;
2256     // \n\r or \r\n -> skip the newline.
2257     --CurPtr;
2258   }
2259 
2260   // If we have horizontal whitespace, skip over it.  We allow whitespace
2261   // between the slash and newline.
2262   bool HasSpace = false;
2263   while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
2264     --CurPtr;
2265     HasSpace = true;
2266   }
2267 
2268   // If we have a slash, we know this is an escaped newline.
2269   if (*CurPtr == '\\') {
2270     if (CurPtr[-1] != '*') return false;
2271   } else {
2272     // It isn't a slash, is it the ?? / trigraph?
2273     if (CurPtr[0] != '/' || CurPtr[-1] != '?' || CurPtr[-2] != '?' ||
2274         CurPtr[-3] != '*')
2275       return false;
2276 
2277     // This is the trigraph ending the comment.  Emit a stern warning!
2278     CurPtr -= 2;
2279 
2280     // If no trigraphs are enabled, warn that we ignored this trigraph and
2281     // ignore this * character.
2282     if (!L->getLangOpts().Trigraphs) {
2283       if (!L->isLexingRawMode())
2284         L->Diag(CurPtr, diag::trigraph_ignored_block_comment);
2285       return false;
2286     }
2287     if (!L->isLexingRawMode())
2288       L->Diag(CurPtr, diag::trigraph_ends_block_comment);
2289   }
2290 
2291   // Warn about having an escaped newline between the */ characters.
2292   if (!L->isLexingRawMode())
2293     L->Diag(CurPtr, diag::escaped_newline_block_comment_end);
2294 
2295   // If there was space between the backslash and newline, warn about it.
2296   if (HasSpace && !L->isLexingRawMode())
2297     L->Diag(CurPtr, diag::backslash_newline_space);
2298 
2299   return true;
2300 }
2301 
2302 #ifdef __SSE2__
2303 #include <emmintrin.h>
2304 #elif __ALTIVEC__
2305 #include <altivec.h>
2306 #undef bool
2307 #endif
2308 
2309 /// We have just read from input the / and * characters that started a comment.
2310 /// Read until we find the * and / characters that terminate the comment.
2311 /// Note that we don't bother decoding trigraphs or escaped newlines in block
2312 /// comments, because they cannot cause the comment to end.  The only thing
2313 /// that can happen is the comment could end with an escaped newline between
2314 /// the terminating * and /.
2315 ///
2316 /// If we're in KeepCommentMode or any CommentHandler has inserted
2317 /// some tokens, this will store the first token and return true.
2318 bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
2319                              bool &TokAtPhysicalStartOfLine) {
2320   // Scan one character past where we should, looking for a '/' character.  Once
2321   // we find it, check to see if it was preceded by a *.  This common
2322   // optimization helps people who like to put a lot of * characters in their
2323   // comments.
2324 
2325   // The first character we get with newlines and trigraphs skipped to handle
2326   // the degenerate /*/ case below correctly if the * has an escaped newline
2327   // after it.
2328   unsigned CharSize;
2329   unsigned char C = getCharAndSize(CurPtr, CharSize);
2330   CurPtr += CharSize;
2331   if (C == 0 && CurPtr == BufferEnd+1) {
2332     if (!isLexingRawMode())
2333       Diag(BufferPtr, diag::err_unterminated_block_comment);
2334     --CurPtr;
2335 
2336     // KeepWhitespaceMode should return this broken comment as a token.  Since
2337     // it isn't a well formed comment, just return it as an 'unknown' token.
2338     if (isKeepWhitespaceMode()) {
2339       FormTokenWithChars(Result, CurPtr, tok::unknown);
2340       return true;
2341     }
2342 
2343     BufferPtr = CurPtr;
2344     return false;
2345   }
2346 
2347   // Check to see if the first character after the '/*' is another /.  If so,
2348   // then this slash does not end the block comment, it is part of it.
2349   if (C == '/')
2350     C = *CurPtr++;
2351 
2352   while (true) {
2353     // Skip over all non-interesting characters until we find end of buffer or a
2354     // (probably ending) '/' character.
2355     if (CurPtr + 24 < BufferEnd &&
2356         // If there is a code-completion point avoid the fast scan because it
2357         // doesn't check for '\0'.
2358         !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
2359       // While not aligned to a 16-byte boundary.
2360       while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0)
2361         C = *CurPtr++;
2362 
2363       if (C == '/') goto FoundSlash;
2364 
2365 #ifdef __SSE2__
2366       __m128i Slashes = _mm_set1_epi8('/');
2367       while (CurPtr+16 <= BufferEnd) {
2368         int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
2369                                     Slashes));
2370         if (cmp != 0) {
2371           // Adjust the pointer to point directly after the first slash. It's
2372           // not necessary to set C here, it will be overwritten at the end of
2373           // the outer loop.
2374           CurPtr += llvm::countTrailingZeros<unsigned>(cmp) + 1;
2375           goto FoundSlash;
2376         }
2377         CurPtr += 16;
2378       }
2379 #elif __ALTIVEC__
2380       __vector unsigned char Slashes = {
2381         '/', '/', '/', '/',  '/', '/', '/', '/',
2382         '/', '/', '/', '/',  '/', '/', '/', '/'
2383       };
2384       while (CurPtr+16 <= BufferEnd &&
2385              !vec_any_eq(*(const vector unsigned char*)CurPtr, Slashes))
2386         CurPtr += 16;
2387 #else
2388       // Scan for '/' quickly.  Many block comments are very large.
2389       while (CurPtr[0] != '/' &&
2390              CurPtr[1] != '/' &&
2391              CurPtr[2] != '/' &&
2392              CurPtr[3] != '/' &&
2393              CurPtr+4 < BufferEnd) {
2394         CurPtr += 4;
2395       }
2396 #endif
2397 
2398       // It has to be one of the bytes scanned, increment to it and read one.
2399       C = *CurPtr++;
2400     }
2401 
2402     // Loop to scan the remainder.
2403     while (C != '/' && C != '\0')
2404       C = *CurPtr++;
2405 
2406     if (C == '/') {
2407   FoundSlash:
2408       if (CurPtr[-2] == '*')  // We found the final */.  We're done!
2409         break;
2410 
2411       if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
2412         if (isEndOfBlockCommentWithEscapedNewLine(CurPtr-2, this)) {
2413           // We found the final */, though it had an escaped newline between the
2414           // * and /.  We're done!
2415           break;
2416         }
2417       }
2418       if (CurPtr[0] == '*' && CurPtr[1] != '/') {
2419         // If this is a /* inside of the comment, emit a warning.  Don't do this
2420         // if this is a /*/, which will end the comment.  This misses cases with
2421         // embedded escaped newlines, but oh well.
2422         if (!isLexingRawMode())
2423           Diag(CurPtr-1, diag::warn_nested_block_comment);
2424       }
2425     } else if (C == 0 && CurPtr == BufferEnd+1) {
2426       if (!isLexingRawMode())
2427         Diag(BufferPtr, diag::err_unterminated_block_comment);
2428       // Note: the user probably forgot a */.  We could continue immediately
2429       // after the /*, but this would involve lexing a lot of what really is the
2430       // comment, which surely would confuse the parser.
2431       --CurPtr;
2432 
2433       // KeepWhitespaceMode should return this broken comment as a token.  Since
2434       // it isn't a well formed comment, just return it as an 'unknown' token.
2435       if (isKeepWhitespaceMode()) {
2436         FormTokenWithChars(Result, CurPtr, tok::unknown);
2437         return true;
2438       }
2439 
2440       BufferPtr = CurPtr;
2441       return false;
2442     } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
2443       PP->CodeCompleteNaturalLanguage();
2444       cutOffLexing();
2445       return false;
2446     }
2447 
2448     C = *CurPtr++;
2449   }
2450 
2451   // Notify comment handlers about the comment unless we're in a #if 0 block.
2452   if (PP && !isLexingRawMode() &&
2453       PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
2454                                             getSourceLocation(CurPtr)))) {
2455     BufferPtr = CurPtr;
2456     return true; // A token has to be returned.
2457   }
2458 
2459   // If we are returning comments as tokens, return this comment as a token.
2460   if (inKeepCommentMode()) {
2461     FormTokenWithChars(Result, CurPtr, tok::comment);
2462     return true;
2463   }
2464 
2465   // It is common for the tokens immediately after a /**/ comment to be
2466   // whitespace.  Instead of going through the big switch, handle it
2467   // efficiently now.  This is safe even in KeepWhitespaceMode because we would
2468   // have already returned above with the comment as a token.
2469   if (isHorizontalWhitespace(*CurPtr)) {
2470     SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine);
2471     return false;
2472   }
2473 
2474   // Otherwise, just return so that the next character will be lexed as a token.
2475   BufferPtr = CurPtr;
2476   Result.setFlag(Token::LeadingSpace);
2477   return false;
2478 }
2479 
2480 //===----------------------------------------------------------------------===//
2481 // Primary Lexing Entry Points
2482 //===----------------------------------------------------------------------===//
2483 
2484 /// ReadToEndOfLine - Read the rest of the current preprocessor line as an
2485 /// uninterpreted string.  This switches the lexer out of directive mode.
2486 void Lexer::ReadToEndOfLine(SmallVectorImpl<char> *Result) {
2487   assert(ParsingPreprocessorDirective && ParsingFilename == false &&
2488          "Must be in a preprocessing directive!");
2489   Token Tmp;
2490 
2491   // CurPtr - Cache BufferPtr in an automatic variable.
2492   const char *CurPtr = BufferPtr;
2493   while (true) {
2494     char Char = getAndAdvanceChar(CurPtr, Tmp);
2495     switch (Char) {
2496     default:
2497       if (Result)
2498         Result->push_back(Char);
2499       break;
2500     case 0:  // Null.
2501       // Found end of file?
2502       if (CurPtr-1 != BufferEnd) {
2503         if (isCodeCompletionPoint(CurPtr-1)) {
2504           PP->CodeCompleteNaturalLanguage();
2505           cutOffLexing();
2506           return;
2507         }
2508 
2509         // Nope, normal character, continue.
2510         if (Result)
2511           Result->push_back(Char);
2512         break;
2513       }
2514       // FALL THROUGH.
2515       LLVM_FALLTHROUGH;
2516     case '\r':
2517     case '\n':
2518       // Okay, we found the end of the line. First, back up past the \0, \r, \n.
2519       assert(CurPtr[-1] == Char && "Trigraphs for newline?");
2520       BufferPtr = CurPtr-1;
2521 
2522       // Next, lex the character, which should handle the EOD transition.
2523       Lex(Tmp);
2524       if (Tmp.is(tok::code_completion)) {
2525         if (PP)
2526           PP->CodeCompleteNaturalLanguage();
2527         Lex(Tmp);
2528       }
2529       assert(Tmp.is(tok::eod) && "Unexpected token!");
2530 
2531       // Finally, we're done;
2532       return;
2533     }
2534   }
2535 }
2536 
2537 /// LexEndOfFile - CurPtr points to the end of this file.  Handle this
2538 /// condition, reporting diagnostics and handling other edge cases as required.
2539 /// This returns true if Result contains a token, false if PP.Lex should be
2540 /// called again.
2541 bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
2542   // If we hit the end of the file while parsing a preprocessor directive,
2543   // end the preprocessor directive first.  The next token returned will
2544   // then be the end of file.
2545   if (ParsingPreprocessorDirective) {
2546     // Done parsing the "line".
2547     ParsingPreprocessorDirective = false;
2548     // Update the location of token as well as BufferPtr.
2549     FormTokenWithChars(Result, CurPtr, tok::eod);
2550 
2551     // Restore comment saving mode, in case it was disabled for directive.
2552     if (PP)
2553       resetExtendedTokenMode();
2554     return true;  // Have a token.
2555   }
2556 
2557   // If we are in raw mode, return this event as an EOF token.  Let the caller
2558   // that put us in raw mode handle the event.
2559   if (isLexingRawMode()) {
2560     Result.startToken();
2561     BufferPtr = BufferEnd;
2562     FormTokenWithChars(Result, BufferEnd, tok::eof);
2563     return true;
2564   }
2565 
2566   if (PP->isRecordingPreamble() && PP->isInPrimaryFile()) {
2567     PP->setRecordedPreambleConditionalStack(ConditionalStack);
2568     ConditionalStack.clear();
2569   }
2570 
2571   // Issue diagnostics for unterminated #if and missing newline.
2572 
2573   // If we are in a #if directive, emit an error.
2574   while (!ConditionalStack.empty()) {
2575     if (PP->getCodeCompletionFileLoc() != FileLoc)
2576       PP->Diag(ConditionalStack.back().IfLoc,
2577                diag::err_pp_unterminated_conditional);
2578     ConditionalStack.pop_back();
2579   }
2580 
2581   // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
2582   // a pedwarn.
2583   if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) {
2584     DiagnosticsEngine &Diags = PP->getDiagnostics();
2585     SourceLocation EndLoc = getSourceLocation(BufferEnd);
2586     unsigned DiagID;
2587 
2588     if (LangOpts.CPlusPlus11) {
2589       // C++11 [lex.phases] 2.2 p2
2590       // Prefer the C++98 pedantic compatibility warning over the generic,
2591       // non-extension, user-requested "missing newline at EOF" warning.
2592       if (!Diags.isIgnored(diag::warn_cxx98_compat_no_newline_eof, EndLoc)) {
2593         DiagID = diag::warn_cxx98_compat_no_newline_eof;
2594       } else {
2595         DiagID = diag::warn_no_newline_eof;
2596       }
2597     } else {
2598       DiagID = diag::ext_no_newline_eof;
2599     }
2600 
2601     Diag(BufferEnd, DiagID)
2602       << FixItHint::CreateInsertion(EndLoc, "\n");
2603   }
2604 
2605   BufferPtr = CurPtr;
2606 
2607   // Finally, let the preprocessor handle this.
2608   return PP->HandleEndOfFile(Result, isPragmaLexer());
2609 }
2610 
2611 /// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
2612 /// the specified lexer will return a tok::l_paren token, 0 if it is something
2613 /// else and 2 if there are no more tokens in the buffer controlled by the
2614 /// lexer.
2615 unsigned Lexer::isNextPPTokenLParen() {
2616   assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?");
2617 
2618   // Switch to 'skipping' mode.  This will ensure that we can lex a token
2619   // without emitting diagnostics, disables macro expansion, and will cause EOF
2620   // to return an EOF token instead of popping the include stack.
2621   LexingRawMode = true;
2622 
2623   // Save state that can be changed while lexing so that we can restore it.
2624   const char *TmpBufferPtr = BufferPtr;
2625   bool inPPDirectiveMode = ParsingPreprocessorDirective;
2626   bool atStartOfLine = IsAtStartOfLine;
2627   bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
2628   bool leadingSpace = HasLeadingSpace;
2629 
2630   Token Tok;
2631   Lex(Tok);
2632 
2633   // Restore state that may have changed.
2634   BufferPtr = TmpBufferPtr;
2635   ParsingPreprocessorDirective = inPPDirectiveMode;
2636   HasLeadingSpace = leadingSpace;
2637   IsAtStartOfLine = atStartOfLine;
2638   IsAtPhysicalStartOfLine = atPhysicalStartOfLine;
2639 
2640   // Restore the lexer back to non-skipping mode.
2641   LexingRawMode = false;
2642 
2643   if (Tok.is(tok::eof))
2644     return 2;
2645   return Tok.is(tok::l_paren);
2646 }
2647 
2648 /// \brief Find the end of a version control conflict marker.
2649 static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd,
2650                                    ConflictMarkerKind CMK) {
2651   const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>";
2652   size_t TermLen = CMK == CMK_Perforce ? 5 : 7;
2653   auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen);
2654   size_t Pos = RestOfBuffer.find(Terminator);
2655   while (Pos != StringRef::npos) {
2656     // Must occur at start of line.
2657     if (Pos == 0 ||
2658         (RestOfBuffer[Pos - 1] != '\r' && RestOfBuffer[Pos - 1] != '\n')) {
2659       RestOfBuffer = RestOfBuffer.substr(Pos+TermLen);
2660       Pos = RestOfBuffer.find(Terminator);
2661       continue;
2662     }
2663     return RestOfBuffer.data()+Pos;
2664   }
2665   return nullptr;
2666 }
2667 
2668 /// IsStartOfConflictMarker - If the specified pointer is the start of a version
2669 /// control conflict marker like '<<<<<<<', recognize it as such, emit an error
2670 /// and recover nicely.  This returns true if it is a conflict marker and false
2671 /// if not.
2672 bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
2673   // Only a conflict marker if it starts at the beginning of a line.
2674   if (CurPtr != BufferStart &&
2675       CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
2676     return false;
2677 
2678   // Check to see if we have <<<<<<< or >>>>.
2679   if (!StringRef(CurPtr, BufferEnd - CurPtr).startswith("<<<<<<<") &&
2680       !StringRef(CurPtr, BufferEnd - CurPtr).startswith(">>>> "))
2681     return false;
2682 
2683   // If we have a situation where we don't care about conflict markers, ignore
2684   // it.
2685   if (CurrentConflictMarkerState || isLexingRawMode())
2686     return false;
2687 
2688   ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce;
2689 
2690   // Check to see if there is an ending marker somewhere in the buffer at the
2691   // start of a line to terminate this conflict marker.
2692   if (FindConflictEnd(CurPtr, BufferEnd, Kind)) {
2693     // We found a match.  We are really in a conflict marker.
2694     // Diagnose this, and ignore to the end of line.
2695     Diag(CurPtr, diag::err_conflict_marker);
2696     CurrentConflictMarkerState = Kind;
2697 
2698     // Skip ahead to the end of line.  We know this exists because the
2699     // end-of-conflict marker starts with \r or \n.
2700     while (*CurPtr != '\r' && *CurPtr != '\n') {
2701       assert(CurPtr != BufferEnd && "Didn't find end of line");
2702       ++CurPtr;
2703     }
2704     BufferPtr = CurPtr;
2705     return true;
2706   }
2707 
2708   // No end of conflict marker found.
2709   return false;
2710 }
2711 
2712 /// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if
2713 /// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it
2714 /// is the end of a conflict marker.  Handle it by ignoring up until the end of
2715 /// the line.  This returns true if it is a conflict marker and false if not.
2716 bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
2717   // Only a conflict marker if it starts at the beginning of a line.
2718   if (CurPtr != BufferStart &&
2719       CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
2720     return false;
2721 
2722   // If we have a situation where we don't care about conflict markers, ignore
2723   // it.
2724   if (!CurrentConflictMarkerState || isLexingRawMode())
2725     return false;
2726 
2727   // Check to see if we have the marker (4 characters in a row).
2728   for (unsigned i = 1; i != 4; ++i)
2729     if (CurPtr[i] != CurPtr[0])
2730       return false;
2731 
2732   // If we do have it, search for the end of the conflict marker.  This could
2733   // fail if it got skipped with a '#if 0' or something.  Note that CurPtr might
2734   // be the end of conflict marker.
2735   if (const char *End = FindConflictEnd(CurPtr, BufferEnd,
2736                                         CurrentConflictMarkerState)) {
2737     CurPtr = End;
2738 
2739     // Skip ahead to the end of line.
2740     while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')
2741       ++CurPtr;
2742 
2743     BufferPtr = CurPtr;
2744 
2745     // No longer in the conflict marker.
2746     CurrentConflictMarkerState = CMK_None;
2747     return true;
2748   }
2749 
2750   return false;
2751 }
2752 
/// Scan [\p CurPtr, \p BufferEnd) for the two-character sequence "#>".
///
/// \returns a pointer just past the first "#>" found, or nullptr if the range
/// is empty or contains no such sequence.
static const char *findPlaceholderEnd(const char *CurPtr,
                                      const char *BufferEnd) {
  if (CurPtr == BufferEnd)
    return nullptr;

  // A match needs two characters, so the last position worth testing is the
  // second-to-last character of the range.
  const char *const LastStart = BufferEnd - 1;
  while (CurPtr != LastStart) {
    if (CurPtr[0] == '#' && CurPtr[1] == '>')
      return CurPtr + 2;
    ++CurPtr;
  }
  return nullptr;
}
2764 
2765 bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) {
2766   assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!");
2767   if (!PP || !PP->getPreprocessorOpts().LexEditorPlaceholders || LexingRawMode)
2768     return false;
2769   const char *End = findPlaceholderEnd(CurPtr + 1, BufferEnd);
2770   if (!End)
2771     return false;
2772   const char *Start = CurPtr - 1;
2773   if (!LangOpts.AllowEditorPlaceholders)
2774     Diag(Start, diag::err_placeholder_in_source);
2775   Result.startToken();
2776   FormTokenWithChars(Result, End, tok::raw_identifier);
2777   Result.setRawIdentifierData(Start);
2778   PP->LookUpIdentifierInfo(Result);
2779   Result.setFlag(Token::IsEditorPlaceholder);
2780   BufferPtr = End;
2781   return true;
2782 }
2783 
2784 bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
2785   if (PP && PP->isCodeCompletionEnabled()) {
2786     SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart);
2787     return Loc == PP->getCodeCompletionLoc();
2788   }
2789 
2790   return false;
2791 }
2792 
2793 uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
2794                            Token *Result) {
2795   unsigned CharSize;
2796   char Kind = getCharAndSize(StartPtr, CharSize);
2797 
2798   unsigned NumHexDigits;
2799   if (Kind == 'u')
2800     NumHexDigits = 4;
2801   else if (Kind == 'U')
2802     NumHexDigits = 8;
2803   else
2804     return 0;
2805 
2806   if (!LangOpts.CPlusPlus && !LangOpts.C99) {
2807     if (Result && !isLexingRawMode())
2808       Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);
2809     return 0;
2810   }
2811 
2812   const char *CurPtr = StartPtr + CharSize;
2813   const char *KindLoc = &CurPtr[-1];
2814 
2815   uint32_t CodePoint = 0;
2816   for (unsigned i = 0; i < NumHexDigits; ++i) {
2817     char C = getCharAndSize(CurPtr, CharSize);
2818 
2819     unsigned Value = llvm::hexDigitValue(C);
2820     if (Value == -1U) {
2821       if (Result && !isLexingRawMode()) {
2822         if (i == 0) {
2823           Diag(BufferPtr, diag::warn_ucn_escape_no_digits)
2824             << StringRef(KindLoc, 1);
2825         } else {
2826           Diag(BufferPtr, diag::warn_ucn_escape_incomplete);
2827 
2828           // If the user wrote \U1234, suggest a fixit to \u.
2829           if (i == 4 && NumHexDigits == 8) {
2830             CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1);
2831             Diag(KindLoc, diag::note_ucn_four_not_eight)
2832               << FixItHint::CreateReplacement(URange, "u");
2833           }
2834         }
2835       }
2836 
2837       return 0;
2838     }
2839 
2840     CodePoint <<= 4;
2841     CodePoint += Value;
2842 
2843     CurPtr += CharSize;
2844   }
2845 
2846   if (Result) {
2847     Result->setFlag(Token::HasUCN);
2848     if (CurPtr - StartPtr == (ptrdiff_t)NumHexDigits + 2)
2849       StartPtr = CurPtr;
2850     else
2851       while (StartPtr != CurPtr)
2852         (void)getAndAdvanceChar(StartPtr, *Result);
2853   } else {
2854     StartPtr = CurPtr;
2855   }
2856 
2857   // Don't apply C family restrictions to UCNs in assembly mode
2858   if (LangOpts.AsmPreprocessor)
2859     return CodePoint;
2860 
2861   // C99 6.4.3p2: A universal character name shall not specify a character whose
2862   //   short identifier is less than 00A0 other than 0024 ($), 0040 (@), or
2863   //   0060 (`), nor one in the range D800 through DFFF inclusive.)
2864   // C++11 [lex.charset]p2: If the hexadecimal value for a
2865   //   universal-character-name corresponds to a surrogate code point (in the
2866   //   range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally,
2867   //   if the hexadecimal value for a universal-character-name outside the
2868   //   c-char-sequence, s-char-sequence, or r-char-sequence of a character or
2869   //   string literal corresponds to a control character (in either of the
2870   //   ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the
2871   //   basic source character set, the program is ill-formed.
2872   if (CodePoint < 0xA0) {
2873     if (CodePoint == 0x24 || CodePoint == 0x40 || CodePoint == 0x60)
2874       return CodePoint;
2875 
2876     // We don't use isLexingRawMode() here because we need to warn about bad
2877     // UCNs even when skipping preprocessing tokens in a #if block.
2878     if (Result && PP) {
2879       if (CodePoint < 0x20 || CodePoint >= 0x7F)
2880         Diag(BufferPtr, diag::err_ucn_control_character);
2881       else {
2882         char C = static_cast<char>(CodePoint);
2883         Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1);
2884       }
2885     }
2886 
2887     return 0;
2888 
2889   } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) {
2890     // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't.
2891     // We don't use isLexingRawMode() here because we need to diagnose bad
2892     // UCNs even when skipping preprocessing tokens in a #if block.
2893     if (Result && PP) {
2894       if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11)
2895         Diag(BufferPtr, diag::warn_ucn_escape_surrogate);
2896       else
2897         Diag(BufferPtr, diag::err_ucn_escape_invalid);
2898     }
2899     return 0;
2900   }
2901 
2902   return CodePoint;
2903 }
2904 
2905 bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C,
2906                                    const char *CurPtr) {
2907   static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
2908       UnicodeWhitespaceCharRanges);
2909   if (!isLexingRawMode() && !PP->isPreprocessedOutput() &&
2910       UnicodeWhitespaceChars.contains(C)) {
2911     Diag(BufferPtr, diag::ext_unicode_whitespace)
2912       << makeCharRange(*this, BufferPtr, CurPtr);
2913 
2914     Result.setFlag(Token::LeadingSpace);
2915     return true;
2916   }
2917   return false;
2918 }
2919 
2920 bool Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) {
2921   if (isAllowedIDChar(C, LangOpts) && isAllowedInitiallyIDChar(C, LangOpts)) {
2922     if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
2923         !PP->isPreprocessedOutput()) {
2924       maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C,
2925                                 makeCharRange(*this, BufferPtr, CurPtr),
2926                                 /*IsFirst=*/true);
2927     }
2928 
2929     MIOpt.ReadToken();
2930     return LexIdentifier(Result, CurPtr);
2931   }
2932 
2933   if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
2934       !PP->isPreprocessedOutput() &&
2935       !isASCII(*BufferPtr) && !isAllowedIDChar(C, LangOpts)) {
2936     // Non-ASCII characters tend to creep into source code unintentionally.
2937     // Instead of letting the parser complain about the unknown token,
2938     // just drop the character.
2939     // Note that we can /only/ do this when the non-ASCII character is actually
2940     // spelled as Unicode, not written as a UCN. The standard requires that
2941     // we not throw away any possible preprocessor tokens, but there's a
2942     // loophole in the mapping of Unicode characters to basic character set
2943     // characters that allows us to map these particular characters to, say,
2944     // whitespace.
2945     Diag(BufferPtr, diag::err_non_ascii)
2946       << FixItHint::CreateRemoval(makeCharRange(*this, BufferPtr, CurPtr));
2947 
2948     BufferPtr = CurPtr;
2949     return false;
2950   }
2951 
2952   // Otherwise, we have an explicit UCN or a character that's unlikely to show
2953   // up by accident.
2954   MIOpt.ReadToken();
2955   FormTokenWithChars(Result, CurPtr, tok::unknown);
2956   return true;
2957 }
2958 
2959 void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) {
2960   IsAtStartOfLine = Result.isAtStartOfLine();
2961   HasLeadingSpace = Result.hasLeadingSpace();
2962   HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro();
2963   // Note that this doesn't affect IsAtPhysicalStartOfLine.
2964 }
2965 
2966 bool Lexer::Lex(Token &Result) {
2967   // Start a new token.
2968   Result.startToken();
2969 
2970   // Set up misc whitespace flags for LexTokenInternal.
2971   if (IsAtStartOfLine) {
2972     Result.setFlag(Token::StartOfLine);
2973     IsAtStartOfLine = false;
2974   }
2975 
2976   if (HasLeadingSpace) {
2977     Result.setFlag(Token::LeadingSpace);
2978     HasLeadingSpace = false;
2979   }
2980 
2981   if (HasLeadingEmptyMacro) {
2982     Result.setFlag(Token::LeadingEmptyMacro);
2983     HasLeadingEmptyMacro = false;
2984   }
2985 
2986   bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
2987   IsAtPhysicalStartOfLine = false;
2988   bool isRawLex = isLexingRawMode();
2989   (void) isRawLex;
2990   bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine);
2991   // (After the LexTokenInternal call, the lexer might be destroyed.)
2992   assert((returnedToken || !isRawLex) && "Raw lex must succeed");
2993   return returnedToken;
2994 }
2995 
2996 /// LexTokenInternal - This implements a simple C family lexer.  It is an
2997 /// extremely performance critical piece of code.  This assumes that the buffer
2998 /// has a null character at the end of the file.  This returns a preprocessing
2999 /// token, not a normal token, as such, it is an internal interface.  It assumes
3000 /// that the Flags of result have been cleared before calling this.
3001 bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
3002 LexNextToken:
3003   // New token, can't need cleaning yet.
3004   Result.clearFlag(Token::NeedsCleaning);
3005   Result.setIdentifierInfo(nullptr);
3006 
3007   // CurPtr - Cache BufferPtr in an automatic variable.
3008   const char *CurPtr = BufferPtr;
3009 
3010   // Small amounts of horizontal whitespace is very common between tokens.
3011   if ((*CurPtr == ' ') || (*CurPtr == '\t')) {
3012     ++CurPtr;
3013     while ((*CurPtr == ' ') || (*CurPtr == '\t'))
3014       ++CurPtr;
3015 
3016     // If we are keeping whitespace and other tokens, just return what we just
3017     // skipped.  The next lexer invocation will return the token after the
3018     // whitespace.
3019     if (isKeepWhitespaceMode()) {
3020       FormTokenWithChars(Result, CurPtr, tok::unknown);
3021       // FIXME: The next token will not have LeadingSpace set.
3022       return true;
3023     }
3024 
3025     BufferPtr = CurPtr;
3026     Result.setFlag(Token::LeadingSpace);
3027   }
3028 
3029   unsigned SizeTmp, SizeTmp2;   // Temporaries for use in cases below.
3030 
3031   // Read a character, advancing over it.
3032   char Char = getAndAdvanceChar(CurPtr, Result);
3033   tok::TokenKind Kind;
3034 
3035   switch (Char) {
3036   case 0:  // Null.
3037     // Found end of file?
3038     if (CurPtr-1 == BufferEnd)
3039       return LexEndOfFile(Result, CurPtr-1);
3040 
3041     // Check if we are performing code completion.
3042     if (isCodeCompletionPoint(CurPtr-1)) {
3043       // Return the code-completion token.
3044       Result.startToken();
3045       FormTokenWithChars(Result, CurPtr, tok::code_completion);
3046       return true;
3047     }
3048 
3049     if (!isLexingRawMode())
3050       Diag(CurPtr-1, diag::null_in_file);
3051     Result.setFlag(Token::LeadingSpace);
3052     if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3053       return true; // KeepWhitespaceMode
3054 
3055     // We know the lexer hasn't changed, so just try again with this lexer.
3056     // (We manually eliminate the tail call to avoid recursion.)
3057     goto LexNextToken;
3058 
3059   case 26:  // DOS & CP/M EOF: "^Z".
3060     // If we're in Microsoft extensions mode, treat this as end of file.
3061     if (LangOpts.MicrosoftExt) {
3062       if (!isLexingRawMode())
3063         Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft);
3064       return LexEndOfFile(Result, CurPtr-1);
3065     }
3066 
3067     // If Microsoft extensions are disabled, this is just random garbage.
3068     Kind = tok::unknown;
3069     break;
3070 
3071   case '\r':
3072     if (CurPtr[0] == '\n')
3073       Char = getAndAdvanceChar(CurPtr, Result);
3074     LLVM_FALLTHROUGH;
3075   case '\n':
3076     // If we are inside a preprocessor directive and we see the end of line,
3077     // we know we are done with the directive, so return an EOD token.
3078     if (ParsingPreprocessorDirective) {
3079       // Done parsing the "line".
3080       ParsingPreprocessorDirective = false;
3081 
3082       // Restore comment saving mode, in case it was disabled for directive.
3083       if (PP)
3084         resetExtendedTokenMode();
3085 
3086       // Since we consumed a newline, we are back at the start of a line.
3087       IsAtStartOfLine = true;
3088       IsAtPhysicalStartOfLine = true;
3089 
3090       Kind = tok::eod;
3091       break;
3092     }
3093 
3094     // No leading whitespace seen so far.
3095     Result.clearFlag(Token::LeadingSpace);
3096 
3097     if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3098       return true; // KeepWhitespaceMode
3099 
3100     // We only saw whitespace, so just try again with this lexer.
3101     // (We manually eliminate the tail call to avoid recursion.)
3102     goto LexNextToken;
3103   case ' ':
3104   case '\t':
3105   case '\f':
3106   case '\v':
3107   SkipHorizontalWhitespace:
3108     Result.setFlag(Token::LeadingSpace);
3109     if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3110       return true; // KeepWhitespaceMode
3111 
3112   SkipIgnoredUnits:
3113     CurPtr = BufferPtr;
3114 
3115     // If the next token is obviously a // or /* */ comment, skip it efficiently
3116     // too (without going through the big switch stmt).
3117     if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
3118         LangOpts.LineComment &&
3119         (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) {
3120       if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
3121         return true; // There is a token to return.
3122       goto SkipIgnoredUnits;
3123     } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) {
3124       if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
3125         return true; // There is a token to return.
3126       goto SkipIgnoredUnits;
3127     } else if (isHorizontalWhitespace(*CurPtr)) {
3128       goto SkipHorizontalWhitespace;
3129     }
3130     // We only saw whitespace, so just try again with this lexer.
3131     // (We manually eliminate the tail call to avoid recursion.)
3132     goto LexNextToken;
3133 
3134   // C99 6.4.4.1: Integer Constants.
3135   // C99 6.4.4.2: Floating Constants.
3136   case '0': case '1': case '2': case '3': case '4':
3137   case '5': case '6': case '7': case '8': case '9':
3138     // Notify MIOpt that we read a non-whitespace/non-comment token.
3139     MIOpt.ReadToken();
3140     return LexNumericConstant(Result, CurPtr);
3141 
3142   case 'u':   // Identifier (uber) or C11/C++11 UTF-8 or UTF-16 string literal
3143     // Notify MIOpt that we read a non-whitespace/non-comment token.
3144     MIOpt.ReadToken();
3145 
3146     if (LangOpts.CPlusPlus11 || LangOpts.C11) {
3147       Char = getCharAndSize(CurPtr, SizeTmp);
3148 
3149       // UTF-16 string literal
3150       if (Char == '"')
3151         return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3152                                 tok::utf16_string_literal);
3153 
3154       // UTF-16 character constant
3155       if (Char == '\'')
3156         return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3157                                tok::utf16_char_constant);
3158 
3159       // UTF-16 raw string literal
3160       if (Char == 'R' && LangOpts.CPlusPlus11 &&
3161           getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3162         return LexRawStringLiteral(Result,
3163                                ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3164                                            SizeTmp2, Result),
3165                                tok::utf16_string_literal);
3166 
3167       if (Char == '8') {
3168         char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);
3169 
3170         // UTF-8 string literal
3171         if (Char2 == '"')
3172           return LexStringLiteral(Result,
3173                                ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3174                                            SizeTmp2, Result),
3175                                tok::utf8_string_literal);
3176         if (Char2 == '\'' && LangOpts.CPlusPlus1z)
3177           return LexCharConstant(
3178               Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3179                                   SizeTmp2, Result),
3180               tok::utf8_char_constant);
3181 
3182         if (Char2 == 'R' && LangOpts.CPlusPlus11) {
3183           unsigned SizeTmp3;
3184           char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
3185           // UTF-8 raw string literal
3186           if (Char3 == '"') {
3187             return LexRawStringLiteral(Result,
3188                    ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3189                                            SizeTmp2, Result),
3190                                SizeTmp3, Result),
3191                    tok::utf8_string_literal);
3192           }
3193         }
3194       }
3195     }
3196 
3197     // treat u like the start of an identifier.
3198     return LexIdentifier(Result, CurPtr);
3199 
3200   case 'U':   // Identifier (Uber) or C11/C++11 UTF-32 string literal
3201     // Notify MIOpt that we read a non-whitespace/non-comment token.
3202     MIOpt.ReadToken();
3203 
3204     if (LangOpts.CPlusPlus11 || LangOpts.C11) {
3205       Char = getCharAndSize(CurPtr, SizeTmp);
3206 
3207       // UTF-32 string literal
3208       if (Char == '"')
3209         return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3210                                 tok::utf32_string_literal);
3211 
3212       // UTF-32 character constant
3213       if (Char == '\'')
3214         return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3215                                tok::utf32_char_constant);
3216 
3217       // UTF-32 raw string literal
3218       if (Char == 'R' && LangOpts.CPlusPlus11 &&
3219           getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3220         return LexRawStringLiteral(Result,
3221                                ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3222                                            SizeTmp2, Result),
3223                                tok::utf32_string_literal);
3224     }
3225 
3226     // treat U like the start of an identifier.
3227     return LexIdentifier(Result, CurPtr);
3228 
3229   case 'R': // Identifier or C++0x raw string literal
3230     // Notify MIOpt that we read a non-whitespace/non-comment token.
3231     MIOpt.ReadToken();
3232 
3233     if (LangOpts.CPlusPlus11) {
3234       Char = getCharAndSize(CurPtr, SizeTmp);
3235 
3236       if (Char == '"')
3237         return LexRawStringLiteral(Result,
3238                                    ConsumeChar(CurPtr, SizeTmp, Result),
3239                                    tok::string_literal);
3240     }
3241 
3242     // treat R like the start of an identifier.
3243     return LexIdentifier(Result, CurPtr);
3244 
3245   case 'L':   // Identifier (Loony) or wide literal (L'x' or L"xyz").
3246     // Notify MIOpt that we read a non-whitespace/non-comment token.
3247     MIOpt.ReadToken();
3248     Char = getCharAndSize(CurPtr, SizeTmp);
3249 
3250     // Wide string literal.
3251     if (Char == '"')
3252       return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3253                               tok::wide_string_literal);
3254 
3255     // Wide raw string literal.
3256     if (LangOpts.CPlusPlus11 && Char == 'R' &&
3257         getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3258       return LexRawStringLiteral(Result,
3259                                ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3260                                            SizeTmp2, Result),
3261                                tok::wide_string_literal);
3262 
3263     // Wide character constant.
3264     if (Char == '\'')
3265       return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3266                              tok::wide_char_constant);
3267     // FALL THROUGH, treating L like the start of an identifier.
3268     LLVM_FALLTHROUGH;
3269 
3270   // C99 6.4.2: Identifiers.
3271   case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
3272   case 'H': case 'I': case 'J': case 'K':    /*'L'*/case 'M': case 'N':
3273   case 'O': case 'P': case 'Q':    /*'R'*/case 'S': case 'T':    /*'U'*/
3274   case 'V': case 'W': case 'X': case 'Y': case 'Z':
3275   case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
3276   case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
3277   case 'o': case 'p': case 'q': case 'r': case 's': case 't':    /*'u'*/
3278   case 'v': case 'w': case 'x': case 'y': case 'z':
3279   case '_':
3280     // Notify MIOpt that we read a non-whitespace/non-comment token.
3281     MIOpt.ReadToken();
3282     return LexIdentifier(Result, CurPtr);
3283 
3284   case '$':   // $ in identifiers.
3285     if (LangOpts.DollarIdents) {
3286       if (!isLexingRawMode())
3287         Diag(CurPtr-1, diag::ext_dollar_in_identifier);
3288       // Notify MIOpt that we read a non-whitespace/non-comment token.
3289       MIOpt.ReadToken();
3290       return LexIdentifier(Result, CurPtr);
3291     }
3292 
3293     Kind = tok::unknown;
3294     break;
3295 
3296   // C99 6.4.4: Character Constants.
3297   case '\'':
3298     // Notify MIOpt that we read a non-whitespace/non-comment token.
3299     MIOpt.ReadToken();
3300     return LexCharConstant(Result, CurPtr, tok::char_constant);
3301 
3302   // C99 6.4.5: String Literals.
3303   case '"':
3304     // Notify MIOpt that we read a non-whitespace/non-comment token.
3305     MIOpt.ReadToken();
3306     return LexStringLiteral(Result, CurPtr, tok::string_literal);
3307 
3308   // C99 6.4.6: Punctuators.
3309   case '?':
3310     Kind = tok::question;
3311     break;
3312   case '[':
3313     Kind = tok::l_square;
3314     break;
3315   case ']':
3316     Kind = tok::r_square;
3317     break;
3318   case '(':
3319     Kind = tok::l_paren;
3320     break;
3321   case ')':
3322     Kind = tok::r_paren;
3323     break;
3324   case '{':
3325     Kind = tok::l_brace;
3326     break;
3327   case '}':
3328     Kind = tok::r_brace;
3329     break;
3330   case '.':
3331     Char = getCharAndSize(CurPtr, SizeTmp);
3332     if (Char >= '0' && Char <= '9') {
3333       // Notify MIOpt that we read a non-whitespace/non-comment token.
3334       MIOpt.ReadToken();
3335 
3336       return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
3337     } else if (LangOpts.CPlusPlus && Char == '*') {
3338       Kind = tok::periodstar;
3339       CurPtr += SizeTmp;
3340     } else if (Char == '.' &&
3341                getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {
3342       Kind = tok::ellipsis;
3343       CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3344                            SizeTmp2, Result);
3345     } else {
3346       Kind = tok::period;
3347     }
3348     break;
3349   case '&':
3350     Char = getCharAndSize(CurPtr, SizeTmp);
3351     if (Char == '&') {
3352       Kind = tok::ampamp;
3353       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3354     } else if (Char == '=') {
3355       Kind = tok::ampequal;
3356       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3357     } else {
3358       Kind = tok::amp;
3359     }
3360     break;
3361   case '*':
3362     if (getCharAndSize(CurPtr, SizeTmp) == '=') {
3363       Kind = tok::starequal;
3364       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3365     } else {
3366       Kind = tok::star;
3367     }
3368     break;
3369   case '+':
3370     Char = getCharAndSize(CurPtr, SizeTmp);
3371     if (Char == '+') {
3372       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3373       Kind = tok::plusplus;
3374     } else if (Char == '=') {
3375       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3376       Kind = tok::plusequal;
3377     } else {
3378       Kind = tok::plus;
3379     }
3380     break;
3381   case '-':
3382     Char = getCharAndSize(CurPtr, SizeTmp);
3383     if (Char == '-') {      // --
3384       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3385       Kind = tok::minusminus;
3386     } else if (Char == '>' && LangOpts.CPlusPlus &&
3387                getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') {  // C++ ->*
3388       CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3389                            SizeTmp2, Result);
3390       Kind = tok::arrowstar;
3391     } else if (Char == '>') {   // ->
3392       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3393       Kind = tok::arrow;
3394     } else if (Char == '=') {   // -=
3395       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3396       Kind = tok::minusequal;
3397     } else {
3398       Kind = tok::minus;
3399     }
3400     break;
3401   case '~':
3402     Kind = tok::tilde;
3403     break;
3404   case '!':
3405     if (getCharAndSize(CurPtr, SizeTmp) == '=') {
3406       Kind = tok::exclaimequal;
3407       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3408     } else {
3409       Kind = tok::exclaim;
3410     }
3411     break;
3412   case '/':
3413     // 6.4.9: Comments
3414     Char = getCharAndSize(CurPtr, SizeTmp);
3415     if (Char == '/') {         // Line comment.
3416       // Even if Line comments are disabled (e.g. in C89 mode), we generally
3417       // want to lex this as a comment.  There is one problem with this though,
3418       // that in one particular corner case, this can change the behavior of the
3419       // resultant program.  For example, In  "foo //**/ bar", C89 would lex
3420       // this as "foo / bar" and languages with Line comments would lex it as
3421       // "foo".  Check to see if the character after the second slash is a '*'.
3422       // If so, we will lex that as a "/" instead of the start of a comment.
3423       // However, we never do this if we are just preprocessing.
3424       bool TreatAsComment = LangOpts.LineComment &&
3425                             (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP);
3426       if (!TreatAsComment)
3427         if (!(PP && PP->isPreprocessedOutput()))
3428           TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*';
3429 
3430       if (TreatAsComment) {
3431         if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3432                             TokAtPhysicalStartOfLine))
3433           return true; // There is a token to return.
3434 
3435         // It is common for the tokens immediately after a // comment to be
3436         // whitespace (indentation for the next line).  Instead of going through
3437         // the big switch, handle it efficiently now.
3438         goto SkipIgnoredUnits;
3439       }
3440     }
3441 
3442     if (Char == '*') {  // /**/ comment.
3443       if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3444                            TokAtPhysicalStartOfLine))
3445         return true; // There is a token to return.
3446 
3447       // We only saw whitespace, so just try again with this lexer.
3448       // (We manually eliminate the tail call to avoid recursion.)
3449       goto LexNextToken;
3450     }
3451 
3452     if (Char == '=') {
3453       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3454       Kind = tok::slashequal;
3455     } else {
3456       Kind = tok::slash;
3457     }
3458     break;
3459   case '%':
3460     Char = getCharAndSize(CurPtr, SizeTmp);
3461     if (Char == '=') {
3462       Kind = tok::percentequal;
3463       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3464     } else if (LangOpts.Digraphs && Char == '>') {
3465       Kind = tok::r_brace;                             // '%>' -> '}'
3466       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3467     } else if (LangOpts.Digraphs && Char == ':') {
3468       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3469       Char = getCharAndSize(CurPtr, SizeTmp);
3470       if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {
3471         Kind = tok::hashhash;                          // '%:%:' -> '##'
3472         CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3473                              SizeTmp2, Result);
3474       } else if (Char == '@' && LangOpts.MicrosoftExt) {// %:@ -> #@ -> Charize
3475         CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3476         if (!isLexingRawMode())
3477           Diag(BufferPtr, diag::ext_charize_microsoft);
3478         Kind = tok::hashat;
3479       } else {                                         // '%:' -> '#'
3480         // We parsed a # character.  If this occurs at the start of the line,
3481         // it's actually the start of a preprocessing directive.  Callback to
3482         // the preprocessor to handle it.
3483         // TODO: -fpreprocessed mode??
3484         if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
3485           goto HandleDirective;
3486 
3487         Kind = tok::hash;
3488       }
3489     } else {
3490       Kind = tok::percent;
3491     }
3492     break;
3493   case '<':
3494     Char = getCharAndSize(CurPtr, SizeTmp);
3495     if (ParsingFilename) {
3496       return LexAngledStringLiteral(Result, CurPtr);
3497     } else if (Char == '<') {
3498       char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
3499       if (After == '=') {
3500         Kind = tok::lesslessequal;
3501         CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3502                              SizeTmp2, Result);
3503       } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) {
3504         // If this is actually a '<<<<<<<' version control conflict marker,
3505         // recognize it as such and recover nicely.
3506         goto LexNextToken;
3507       } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) {
3508         // If this is '<<<<' and we're in a Perforce-style conflict marker,
3509         // ignore it.
3510         goto LexNextToken;
3511       } else if (LangOpts.CUDA && After == '<') {
3512         Kind = tok::lesslessless;
3513         CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3514                              SizeTmp2, Result);
3515       } else {
3516         CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3517         Kind = tok::lessless;
3518       }
3519     } else if (Char == '=') {
3520       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3521       Kind = tok::lessequal;
3522     } else if (LangOpts.Digraphs && Char == ':') {     // '<:' -> '['
3523       if (LangOpts.CPlusPlus11 &&
3524           getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') {
3525         // C++0x [lex.pptoken]p3:
3526         //  Otherwise, if the next three characters are <:: and the subsequent
3527         //  character is neither : nor >, the < is treated as a preprocessor
3528         //  token by itself and not as the first character of the alternative
3529         //  token <:.
3530         unsigned SizeTmp3;
3531         char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
3532         if (After != ':' && After != '>') {
3533           Kind = tok::less;
3534           if (!isLexingRawMode())
3535             Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon);
3536           break;
3537         }
3538       }
3539 
3540       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3541       Kind = tok::l_square;
3542     } else if (LangOpts.Digraphs && Char == '%') {     // '<%' -> '{'
3543       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3544       Kind = tok::l_brace;
3545     } else if (Char == '#' && lexEditorPlaceholder(Result, CurPtr)) {
3546       return true;
3547     } else {
3548       Kind = tok::less;
3549     }
3550     break;
3551   case '>':
3552     Char = getCharAndSize(CurPtr, SizeTmp);
3553     if (Char == '=') {
3554       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3555       Kind = tok::greaterequal;
3556     } else if (Char == '>') {
3557       char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
3558       if (After == '=') {
3559         CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3560                              SizeTmp2, Result);
3561         Kind = tok::greatergreaterequal;
3562       } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) {
3563         // If this is actually a '>>>>' conflict marker, recognize it as such
3564         // and recover nicely.
3565         goto LexNextToken;
3566       } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) {
3567         // If this is '>>>>>>>' and we're in a conflict marker, ignore it.
3568         goto LexNextToken;
3569       } else if (LangOpts.CUDA && After == '>') {
3570         Kind = tok::greatergreatergreater;
3571         CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3572                              SizeTmp2, Result);
3573       } else {
3574         CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3575         Kind = tok::greatergreater;
3576       }
3577     } else {
3578       Kind = tok::greater;
3579     }
3580     break;
3581   case '^':
3582     Char = getCharAndSize(CurPtr, SizeTmp);
3583     if (Char == '=') {
3584       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3585       Kind = tok::caretequal;
3586     } else if (LangOpts.OpenCL && Char == '^') {
3587       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3588       Kind = tok::caretcaret;
3589     } else {
3590       Kind = tok::caret;
3591     }
3592     break;
3593   case '|':
3594     Char = getCharAndSize(CurPtr, SizeTmp);
3595     if (Char == '=') {
3596       Kind = tok::pipeequal;
3597       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3598     } else if (Char == '|') {
3599       // If this is '|||||||' and we're in a conflict marker, ignore it.
3600       if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1))
3601         goto LexNextToken;
3602       Kind = tok::pipepipe;
3603       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3604     } else {
3605       Kind = tok::pipe;
3606     }
3607     break;
3608   case ':':
3609     Char = getCharAndSize(CurPtr, SizeTmp);
3610     if (LangOpts.Digraphs && Char == '>') {
3611       Kind = tok::r_square; // ':>' -> ']'
3612       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3613     } else if (LangOpts.CPlusPlus && Char == ':') {
3614       Kind = tok::coloncolon;
3615       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3616     } else {
3617       Kind = tok::colon;
3618     }
3619     break;
3620   case ';':
3621     Kind = tok::semi;
3622     break;
3623   case '=':
3624     Char = getCharAndSize(CurPtr, SizeTmp);
3625     if (Char == '=') {
3626       // If this is '====' and we're in a conflict marker, ignore it.
3627       if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))
3628         goto LexNextToken;
3629 
3630       Kind = tok::equalequal;
3631       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3632     } else {
3633       Kind = tok::equal;
3634     }
3635     break;
3636   case ',':
3637     Kind = tok::comma;
3638     break;
3639   case '#':
3640     Char = getCharAndSize(CurPtr, SizeTmp);
3641     if (Char == '#') {
3642       Kind = tok::hashhash;
3643       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3644     } else if (Char == '@' && LangOpts.MicrosoftExt) {  // #@ -> Charize
3645       Kind = tok::hashat;
3646       if (!isLexingRawMode())
3647         Diag(BufferPtr, diag::ext_charize_microsoft);
3648       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3649     } else {
3650       // We parsed a # character.  If this occurs at the start of the line,
3651       // it's actually the start of a preprocessing directive.  Callback to
3652       // the preprocessor to handle it.
3653       // TODO: -fpreprocessed mode??
3654       if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
3655         goto HandleDirective;
3656 
3657       Kind = tok::hash;
3658     }
3659     break;
3660 
3661   case '@':
3662     // Objective C support.
3663     if (CurPtr[-1] == '@' && LangOpts.ObjC1)
3664       Kind = tok::at;
3665     else
3666       Kind = tok::unknown;
3667     break;
3668 
3669   // UCNs (C99 6.4.3, C++11 [lex.charset]p2)
3670   case '\\':
3671     if (!LangOpts.AsmPreprocessor) {
3672       if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) {
3673         if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
3674           if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3675             return true; // KeepWhitespaceMode
3676 
3677           // We only saw whitespace, so just try again with this lexer.
3678           // (We manually eliminate the tail call to avoid recursion.)
3679           goto LexNextToken;
3680         }
3681 
3682         return LexUnicode(Result, CodePoint, CurPtr);
3683       }
3684     }
3685 
3686     Kind = tok::unknown;
3687     break;
3688 
3689   default: {
3690     if (isASCII(Char)) {
3691       Kind = tok::unknown;
3692       break;
3693     }
3694 
3695     llvm::UTF32 CodePoint;
3696 
3697     // We can't just reset CurPtr to BufferPtr because BufferPtr may point to
3698     // an escaped newline.
3699     --CurPtr;
3700     llvm::ConversionResult Status =
3701         llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr,
3702                                   (const llvm::UTF8 *)BufferEnd,
3703                                   &CodePoint,
3704                                   llvm::strictConversion);
3705     if (Status == llvm::conversionOK) {
3706       if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
3707         if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3708           return true; // KeepWhitespaceMode
3709 
3710         // We only saw whitespace, so just try again with this lexer.
3711         // (We manually eliminate the tail call to avoid recursion.)
3712         goto LexNextToken;
3713       }
3714       return LexUnicode(Result, CodePoint, CurPtr);
3715     }
3716 
3717     if (isLexingRawMode() || ParsingPreprocessorDirective ||
3718         PP->isPreprocessedOutput()) {
3719       ++CurPtr;
3720       Kind = tok::unknown;
3721       break;
3722     }
3723 
3724     // Non-ASCII characters tend to creep into source code unintentionally.
3725     // Instead of letting the parser complain about the unknown token,
3726     // just diagnose the invalid UTF-8, then drop the character.
3727     Diag(CurPtr, diag::err_invalid_utf8);
3728 
3729     BufferPtr = CurPtr+1;
3730     // We're pretending the character didn't exist, so just try again with
3731     // this lexer.
3732     // (We manually eliminate the tail call to avoid recursion.)
3733     goto LexNextToken;
3734   }
3735   }
3736 
3737   // Notify MIOpt that we read a non-whitespace/non-comment token.
3738   MIOpt.ReadToken();
3739 
3740   // Update the location of token as well as BufferPtr.
3741   FormTokenWithChars(Result, CurPtr, Kind);
3742   return true;
3743 
3744 HandleDirective:
3745   // We parsed a # character and it's the start of a preprocessing directive.
3746 
3747   FormTokenWithChars(Result, CurPtr, tok::hash);
3748   PP->HandleDirective(Result);
3749 
3750   if (PP->hadModuleLoaderFatalFailure()) {
3751     // With a fatal failure in the module loader, we abort parsing.
3752     assert(Result.is(tok::eof) && "Preprocessor did not set tok:eof");
3753     return true;
3754   }
3755 
3756   // We parsed the directive; lex a token with the new state.
3757   return false;
3758 }
3759