1 //===- Lexer.cpp - C Language Family Lexer --------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements the Lexer and Token interfaces. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "clang/Lex/Lexer.h" 14 #include "UnicodeCharSets.h" 15 #include "clang/Basic/CharInfo.h" 16 #include "clang/Basic/IdentifierTable.h" 17 #include "clang/Basic/LangOptions.h" 18 #include "clang/Basic/SourceLocation.h" 19 #include "clang/Basic/SourceManager.h" 20 #include "clang/Basic/TokenKinds.h" 21 #include "clang/Lex/LexDiagnostic.h" 22 #include "clang/Lex/LiteralSupport.h" 23 #include "clang/Lex/MultipleIncludeOpt.h" 24 #include "clang/Lex/Preprocessor.h" 25 #include "clang/Lex/PreprocessorOptions.h" 26 #include "clang/Lex/Token.h" 27 #include "clang/Basic/Diagnostic.h" 28 #include "clang/Basic/LLVM.h" 29 #include "clang/Basic/TokenKinds.h" 30 #include "llvm/ADT/None.h" 31 #include "llvm/ADT/Optional.h" 32 #include "llvm/ADT/StringExtras.h" 33 #include "llvm/ADT/StringSwitch.h" 34 #include "llvm/ADT/StringRef.h" 35 #include "llvm/Support/Compiler.h" 36 #include "llvm/Support/ConvertUTF.h" 37 #include "llvm/Support/MathExtras.h" 38 #include "llvm/Support/MemoryBuffer.h" 39 #include "llvm/Support/NativeFormatting.h" 40 #include "llvm/Support/UnicodeCharRanges.h" 41 #include <algorithm> 42 #include <cassert> 43 #include <cstddef> 44 #include <cstdint> 45 #include <cstring> 46 #include <string> 47 #include <tuple> 48 #include <utility> 49 50 using namespace clang; 51 52 //===----------------------------------------------------------------------===// 53 // Token Class Implementation 54 
//===----------------------------------------------------------------------===// 55 56 /// isObjCAtKeyword - Return true if we have an ObjC keyword identifier. 57 bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const { 58 if (isAnnotation()) 59 return false; 60 if (IdentifierInfo *II = getIdentifierInfo()) 61 return II->getObjCKeywordID() == objcKey; 62 return false; 63 } 64 65 /// getObjCKeywordID - Return the ObjC keyword kind. 66 tok::ObjCKeywordKind Token::getObjCKeywordID() const { 67 if (isAnnotation()) 68 return tok::objc_not_keyword; 69 IdentifierInfo *specId = getIdentifierInfo(); 70 return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword; 71 } 72 73 //===----------------------------------------------------------------------===// 74 // Lexer Class Implementation 75 //===----------------------------------------------------------------------===// 76 77 void Lexer::anchor() {} 78 79 void Lexer::InitLexer(const char *BufStart, const char *BufPtr, 80 const char *BufEnd) { 81 BufferStart = BufStart; 82 BufferPtr = BufPtr; 83 BufferEnd = BufEnd; 84 85 assert(BufEnd[0] == 0 && 86 "We assume that the input buffer has a null character at the end" 87 " to simplify lexing!"); 88 89 // Check whether we have a BOM in the beginning of the buffer. If yes - act 90 // accordingly. Right now we support only UTF-8 with and without BOM, so, just 91 // skip the UTF-8 BOM if it's present. 92 if (BufferStart == BufferPtr) { 93 // Determine the size of the BOM. 94 StringRef Buf(BufferStart, BufferEnd - BufferStart); 95 size_t BOMLength = llvm::StringSwitch<size_t>(Buf) 96 .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM 97 .Default(0); 98 99 // Skip the BOM. 100 BufferPtr += BOMLength; 101 } 102 103 Is_PragmaLexer = false; 104 CurrentConflictMarkerState = CMK_None; 105 106 // Start of the file is a start of line. 
107 IsAtStartOfLine = true; 108 IsAtPhysicalStartOfLine = true; 109 110 HasLeadingSpace = false; 111 HasLeadingEmptyMacro = false; 112 113 // We are not after parsing a #. 114 ParsingPreprocessorDirective = false; 115 116 // We are not after parsing #include. 117 ParsingFilename = false; 118 119 // We are not in raw mode. Raw mode disables diagnostics and interpretation 120 // of tokens (e.g. identifiers, thus disabling macro expansion). It is used 121 // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block 122 // or otherwise skipping over tokens. 123 LexingRawMode = false; 124 125 // Default to not keeping comments. 126 ExtendedTokenMode = 0; 127 } 128 129 /// Lexer constructor - Create a new lexer object for the specified buffer 130 /// with the specified preprocessor managing the lexing process. This lexer 131 /// assumes that the associated file buffer and Preprocessor objects will 132 /// outlive it, so it doesn't take ownership of either of them. 133 Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *InputFile, Preprocessor &PP) 134 : PreprocessorLexer(&PP, FID), 135 FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)), 136 LangOpts(PP.getLangOpts()) { 137 InitLexer(InputFile->getBufferStart(), InputFile->getBufferStart(), 138 InputFile->getBufferEnd()); 139 140 resetExtendedTokenMode(); 141 } 142 143 /// Lexer constructor - Create a new raw lexer object. This object is only 144 /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text 145 /// range will outlive it, so it doesn't take ownership of it. 146 Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts, 147 const char *BufStart, const char *BufPtr, const char *BufEnd) 148 : FileLoc(fileloc), LangOpts(langOpts) { 149 InitLexer(BufStart, BufPtr, BufEnd); 150 151 // We *are* in raw mode. 152 LexingRawMode = true; 153 } 154 155 /// Lexer constructor - Create a new raw lexer object. This object is only 156 /// suitable for calls to 'LexFromRawLexer'. 
This lexer assumes that the text 157 /// range will outlive it, so it doesn't take ownership of it. 158 Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *FromFile, 159 const SourceManager &SM, const LangOptions &langOpts) 160 : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile->getBufferStart(), 161 FromFile->getBufferStart(), FromFile->getBufferEnd()) {} 162 163 void Lexer::resetExtendedTokenMode() { 164 assert(PP && "Cannot reset token mode without a preprocessor"); 165 if (LangOpts.TraditionalCPP) 166 SetKeepWhitespaceMode(true); 167 else 168 SetCommentRetentionState(PP->getCommentRetentionState()); 169 } 170 171 /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for 172 /// _Pragma expansion. This has a variety of magic semantics that this method 173 /// sets up. It returns a new'd Lexer that must be delete'd when done. 174 /// 175 /// On entrance to this routine, TokStartLoc is a macro location which has a 176 /// spelling loc that indicates the bytes to be lexed for the token and an 177 /// expansion location that indicates where all lexed tokens should be 178 /// "expanded from". 179 /// 180 /// TODO: It would really be nice to make _Pragma just be a wrapper around a 181 /// normal lexer that remaps tokens as they fly by. This would require making 182 /// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer 183 /// interface that could handle this stuff. This would pull GetMappedTokenLoc 184 /// out of the critical path of the lexer! 185 /// 186 Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc, 187 SourceLocation ExpansionLocStart, 188 SourceLocation ExpansionLocEnd, 189 unsigned TokLen, Preprocessor &PP) { 190 SourceManager &SM = PP.getSourceManager(); 191 192 // Create the lexer as if we were going to lex the file normally. 
193 FileID SpellingFID = SM.getFileID(SpellingLoc); 194 const llvm::MemoryBuffer *InputFile = SM.getBuffer(SpellingFID); 195 Lexer *L = new Lexer(SpellingFID, InputFile, PP); 196 197 // Now that the lexer is created, change the start/end locations so that we 198 // just lex the subsection of the file that we want. This is lexing from a 199 // scratch buffer. 200 const char *StrData = SM.getCharacterData(SpellingLoc); 201 202 L->BufferPtr = StrData; 203 L->BufferEnd = StrData+TokLen; 204 assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!"); 205 206 // Set the SourceLocation with the remapping information. This ensures that 207 // GetMappedTokenLoc will remap the tokens as they are lexed. 208 L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID), 209 ExpansionLocStart, 210 ExpansionLocEnd, TokLen); 211 212 // Ensure that the lexer thinks it is inside a directive, so that end \n will 213 // return an EOD token. 214 L->ParsingPreprocessorDirective = true; 215 216 // This lexer really is for _Pragma. 217 L->Is_PragmaLexer = true; 218 return L; 219 } 220 221 template <typename T> static void StringifyImpl(T &Str, char Quote) { 222 typename T::size_type i = 0, e = Str.size(); 223 while (i < e) { 224 if (Str[i] == '\\' || Str[i] == Quote) { 225 Str.insert(Str.begin() + i, '\\'); 226 i += 2; 227 ++e; 228 } else if (Str[i] == '\n' || Str[i] == '\r') { 229 // Replace '\r\n' and '\n\r' to '\\' followed by 'n'. 230 if ((i < e - 1) && (Str[i + 1] == '\n' || Str[i + 1] == '\r') && 231 Str[i] != Str[i + 1]) { 232 Str[i] = '\\'; 233 Str[i + 1] = 'n'; 234 } else { 235 // Replace '\n' and '\r' to '\\' followed by 'n'. 236 Str[i] = '\\'; 237 Str.insert(Str.begin() + i + 1, 'n'); 238 ++e; 239 } 240 i += 2; 241 } else 242 ++i; 243 } 244 } 245 246 std::string Lexer::Stringify(StringRef Str, bool Charify) { 247 std::string Result = Str; 248 char Quote = Charify ? 
'\'' : '"'; 249 StringifyImpl(Result, Quote); 250 return Result; 251 } 252 253 void Lexer::Stringify(SmallVectorImpl<char> &Str) { StringifyImpl(Str, '"'); } 254 255 //===----------------------------------------------------------------------===// 256 // Token Spelling 257 //===----------------------------------------------------------------------===// 258 259 /// Slow case of getSpelling. Extract the characters comprising the 260 /// spelling of this token from the provided input buffer. 261 static size_t getSpellingSlow(const Token &Tok, const char *BufPtr, 262 const LangOptions &LangOpts, char *Spelling) { 263 assert(Tok.needsCleaning() && "getSpellingSlow called on simple token"); 264 265 size_t Length = 0; 266 const char *BufEnd = BufPtr + Tok.getLength(); 267 268 if (tok::isStringLiteral(Tok.getKind())) { 269 // Munch the encoding-prefix and opening double-quote. 270 while (BufPtr < BufEnd) { 271 unsigned Size; 272 Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts); 273 BufPtr += Size; 274 275 if (Spelling[Length - 1] == '"') 276 break; 277 } 278 279 // Raw string literals need special handling; trigraph expansion and line 280 // splicing do not occur within their d-char-sequence nor within their 281 // r-char-sequence. 282 if (Length >= 2 && 283 Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') { 284 // Search backwards from the end of the token to find the matching closing 285 // quote. 286 const char *RawEnd = BufEnd; 287 do --RawEnd; while (*RawEnd != '"'); 288 size_t RawLength = RawEnd - BufPtr + 1; 289 290 // Everything between the quotes is included verbatim in the spelling. 291 memcpy(Spelling + Length, BufPtr, RawLength); 292 Length += RawLength; 293 BufPtr += RawLength; 294 295 // The rest of the token is lexed normally. 
296 } 297 } 298 299 while (BufPtr < BufEnd) { 300 unsigned Size; 301 Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts); 302 BufPtr += Size; 303 } 304 305 assert(Length < Tok.getLength() && 306 "NeedsCleaning flag set on token that didn't need cleaning!"); 307 return Length; 308 } 309 310 /// getSpelling() - Return the 'spelling' of this token. The spelling of a 311 /// token are the characters used to represent the token in the source file 312 /// after trigraph expansion and escaped-newline folding. In particular, this 313 /// wants to get the true, uncanonicalized, spelling of things like digraphs 314 /// UCNs, etc. 315 StringRef Lexer::getSpelling(SourceLocation loc, 316 SmallVectorImpl<char> &buffer, 317 const SourceManager &SM, 318 const LangOptions &options, 319 bool *invalid) { 320 // Break down the source location. 321 std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc); 322 323 // Try to the load the file buffer. 324 bool invalidTemp = false; 325 StringRef file = SM.getBufferData(locInfo.first, &invalidTemp); 326 if (invalidTemp) { 327 if (invalid) *invalid = true; 328 return {}; 329 } 330 331 const char *tokenBegin = file.data() + locInfo.second; 332 333 // Lex from the start of the given location. 334 Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options, 335 file.begin(), tokenBegin, file.end()); 336 Token token; 337 lexer.LexFromRawLexer(token); 338 339 unsigned length = token.getLength(); 340 341 // Common case: no need for cleaning. 342 if (!token.needsCleaning()) 343 return StringRef(tokenBegin, length); 344 345 // Hard case, we need to relex the characters into the string. 346 buffer.resize(length); 347 buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data())); 348 return StringRef(buffer.data(), buffer.size()); 349 } 350 351 /// getSpelling() - Return the 'spelling' of this token. 
The spelling of a 352 /// token are the characters used to represent the token in the source file 353 /// after trigraph expansion and escaped-newline folding. In particular, this 354 /// wants to get the true, uncanonicalized, spelling of things like digraphs 355 /// UCNs, etc. 356 std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr, 357 const LangOptions &LangOpts, bool *Invalid) { 358 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!"); 359 360 bool CharDataInvalid = false; 361 const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(), 362 &CharDataInvalid); 363 if (Invalid) 364 *Invalid = CharDataInvalid; 365 if (CharDataInvalid) 366 return {}; 367 368 // If this token contains nothing interesting, return it directly. 369 if (!Tok.needsCleaning()) 370 return std::string(TokStart, TokStart + Tok.getLength()); 371 372 std::string Result; 373 Result.resize(Tok.getLength()); 374 Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin())); 375 return Result; 376 } 377 378 /// getSpelling - This method is used to get the spelling of a token into a 379 /// preallocated buffer, instead of as an std::string. The caller is required 380 /// to allocate enough space for the token, which is guaranteed to be at least 381 /// Tok.getLength() bytes long. The actual length of the token is returned. 382 /// 383 /// Note that this method may do two possible things: it may either fill in 384 /// the buffer specified with characters, or it may *change the input pointer* 385 /// to point to a constant buffer with the data already in it (avoiding a 386 /// copy). The caller is not allowed to modify the returned buffer pointer 387 /// if an internal buffer is returned. 
388 unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer, 389 const SourceManager &SourceMgr, 390 const LangOptions &LangOpts, bool *Invalid) { 391 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!"); 392 393 const char *TokStart = nullptr; 394 // NOTE: this has to be checked *before* testing for an IdentifierInfo. 395 if (Tok.is(tok::raw_identifier)) 396 TokStart = Tok.getRawIdentifier().data(); 397 else if (!Tok.hasUCN()) { 398 if (const IdentifierInfo *II = Tok.getIdentifierInfo()) { 399 // Just return the string from the identifier table, which is very quick. 400 Buffer = II->getNameStart(); 401 return II->getLength(); 402 } 403 } 404 405 // NOTE: this can be checked even after testing for an IdentifierInfo. 406 if (Tok.isLiteral()) 407 TokStart = Tok.getLiteralData(); 408 409 if (!TokStart) { 410 // Compute the start of the token in the input lexer buffer. 411 bool CharDataInvalid = false; 412 TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid); 413 if (Invalid) 414 *Invalid = CharDataInvalid; 415 if (CharDataInvalid) { 416 Buffer = ""; 417 return 0; 418 } 419 } 420 421 // If this token contains nothing interesting, return it directly. 422 if (!Tok.needsCleaning()) { 423 Buffer = TokStart; 424 return Tok.getLength(); 425 } 426 427 // Otherwise, hard case, relex the characters into the string. 428 return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer)); 429 } 430 431 /// MeasureTokenLength - Relex the token at the specified location and return 432 /// its length in bytes in the input file. If the token needs cleaning (e.g. 433 /// includes a trigraph or an escaped newline) then this count includes bytes 434 /// that are part of that. 
435 unsigned Lexer::MeasureTokenLength(SourceLocation Loc, 436 const SourceManager &SM, 437 const LangOptions &LangOpts) { 438 Token TheTok; 439 if (getRawToken(Loc, TheTok, SM, LangOpts)) 440 return 0; 441 return TheTok.getLength(); 442 } 443 444 /// Relex the token at the specified location. 445 /// \returns true if there was a failure, false on success. 446 bool Lexer::getRawToken(SourceLocation Loc, Token &Result, 447 const SourceManager &SM, 448 const LangOptions &LangOpts, 449 bool IgnoreWhiteSpace) { 450 // TODO: this could be special cased for common tokens like identifiers, ')', 451 // etc to make this faster, if it mattered. Just look at StrData[0] to handle 452 // all obviously single-char tokens. This could use 453 // Lexer::isObviouslySimpleCharacter for example to handle identifiers or 454 // something. 455 456 // If this comes from a macro expansion, we really do want the macro name, not 457 // the token this macro expanded to. 458 Loc = SM.getExpansionLoc(Loc); 459 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 460 bool Invalid = false; 461 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); 462 if (Invalid) 463 return true; 464 465 const char *StrData = Buffer.data()+LocInfo.second; 466 467 if (!IgnoreWhiteSpace && isWhitespace(StrData[0])) 468 return true; 469 470 // Create a lexer starting at the beginning of this token. 471 Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, 472 Buffer.begin(), StrData, Buffer.end()); 473 TheLexer.SetCommentRetentionState(true); 474 TheLexer.LexFromRawLexer(Result); 475 return false; 476 } 477 478 /// Returns the pointer that points to the beginning of line that contains 479 /// the given offset, or null if the offset if invalid. 
480 static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) { 481 const char *BufStart = Buffer.data(); 482 if (Offset >= Buffer.size()) 483 return nullptr; 484 485 const char *LexStart = BufStart + Offset; 486 for (; LexStart != BufStart; --LexStart) { 487 if (isVerticalWhitespace(LexStart[0]) && 488 !Lexer::isNewLineEscaped(BufStart, LexStart)) { 489 // LexStart should point at first character of logical line. 490 ++LexStart; 491 break; 492 } 493 } 494 return LexStart; 495 } 496 497 static SourceLocation getBeginningOfFileToken(SourceLocation Loc, 498 const SourceManager &SM, 499 const LangOptions &LangOpts) { 500 assert(Loc.isFileID()); 501 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 502 if (LocInfo.first.isInvalid()) 503 return Loc; 504 505 bool Invalid = false; 506 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); 507 if (Invalid) 508 return Loc; 509 510 // Back up from the current location until we hit the beginning of a line 511 // (or the buffer). We'll relex from that point. 512 const char *StrData = Buffer.data() + LocInfo.second; 513 const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second); 514 if (!LexStart || LexStart == StrData) 515 return Loc; 516 517 // Create a lexer starting at the beginning of this token. 518 SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second); 519 Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart, 520 Buffer.end()); 521 TheLexer.SetCommentRetentionState(true); 522 523 // Lex tokens until we find the token that contains the source location. 524 Token TheTok; 525 do { 526 TheLexer.LexFromRawLexer(TheTok); 527 528 if (TheLexer.getBufferLocation() > StrData) { 529 // Lexing this token has taken the lexer past the source location we're 530 // looking for. If the current token encompasses our source location, 531 // return the beginning of that token. 
532 if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData) 533 return TheTok.getLocation(); 534 535 // We ended up skipping over the source location entirely, which means 536 // that it points into whitespace. We're done here. 537 break; 538 } 539 } while (TheTok.getKind() != tok::eof); 540 541 // We've passed our source location; just return the original source location. 542 return Loc; 543 } 544 545 SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc, 546 const SourceManager &SM, 547 const LangOptions &LangOpts) { 548 if (Loc.isFileID()) 549 return getBeginningOfFileToken(Loc, SM, LangOpts); 550 551 if (!SM.isMacroArgExpansion(Loc)) 552 return Loc; 553 554 SourceLocation FileLoc = SM.getSpellingLoc(Loc); 555 SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts); 556 std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc); 557 std::pair<FileID, unsigned> BeginFileLocInfo = 558 SM.getDecomposedLoc(BeginFileLoc); 559 assert(FileLocInfo.first == BeginFileLocInfo.first && 560 FileLocInfo.second >= BeginFileLocInfo.second); 561 return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second); 562 } 563 564 namespace { 565 566 enum PreambleDirectiveKind { 567 PDK_Skipped, 568 PDK_Unknown 569 }; 570 571 } // namespace 572 573 PreambleBounds Lexer::ComputePreamble(StringRef Buffer, 574 const LangOptions &LangOpts, 575 unsigned MaxLines) { 576 // Create a lexer starting at the beginning of the file. Note that we use a 577 // "fake" file source location at offset 1 so that the lexer will track our 578 // position within the file. 
579 const unsigned StartOffset = 1; 580 SourceLocation FileLoc = SourceLocation::getFromRawEncoding(StartOffset); 581 Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(), 582 Buffer.end()); 583 TheLexer.SetCommentRetentionState(true); 584 585 bool InPreprocessorDirective = false; 586 Token TheTok; 587 SourceLocation ActiveCommentLoc; 588 589 unsigned MaxLineOffset = 0; 590 if (MaxLines) { 591 const char *CurPtr = Buffer.begin(); 592 unsigned CurLine = 0; 593 while (CurPtr != Buffer.end()) { 594 char ch = *CurPtr++; 595 if (ch == '\n') { 596 ++CurLine; 597 if (CurLine == MaxLines) 598 break; 599 } 600 } 601 if (CurPtr != Buffer.end()) 602 MaxLineOffset = CurPtr - Buffer.begin(); 603 } 604 605 do { 606 TheLexer.LexFromRawLexer(TheTok); 607 608 if (InPreprocessorDirective) { 609 // If we've hit the end of the file, we're done. 610 if (TheTok.getKind() == tok::eof) { 611 break; 612 } 613 614 // If we haven't hit the end of the preprocessor directive, skip this 615 // token. 616 if (!TheTok.isAtStartOfLine()) 617 continue; 618 619 // We've passed the end of the preprocessor directive, and will look 620 // at this token again below. 621 InPreprocessorDirective = false; 622 } 623 624 // Keep track of the # of lines in the preamble. 625 if (TheTok.isAtStartOfLine()) { 626 unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset; 627 628 // If we were asked to limit the number of lines in the preamble, 629 // and we're about to exceed that limit, we're done. 630 if (MaxLineOffset && TokOffset >= MaxLineOffset) 631 break; 632 } 633 634 // Comments are okay; skip over them. 635 if (TheTok.getKind() == tok::comment) { 636 if (ActiveCommentLoc.isInvalid()) 637 ActiveCommentLoc = TheTok.getLocation(); 638 continue; 639 } 640 641 if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) { 642 // This is the start of a preprocessor directive. 
643 Token HashTok = TheTok; 644 InPreprocessorDirective = true; 645 ActiveCommentLoc = SourceLocation(); 646 647 // Figure out which directive this is. Since we're lexing raw tokens, 648 // we don't have an identifier table available. Instead, just look at 649 // the raw identifier to recognize and categorize preprocessor directives. 650 TheLexer.LexFromRawLexer(TheTok); 651 if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) { 652 StringRef Keyword = TheTok.getRawIdentifier(); 653 PreambleDirectiveKind PDK 654 = llvm::StringSwitch<PreambleDirectiveKind>(Keyword) 655 .Case("include", PDK_Skipped) 656 .Case("__include_macros", PDK_Skipped) 657 .Case("define", PDK_Skipped) 658 .Case("undef", PDK_Skipped) 659 .Case("line", PDK_Skipped) 660 .Case("error", PDK_Skipped) 661 .Case("pragma", PDK_Skipped) 662 .Case("import", PDK_Skipped) 663 .Case("include_next", PDK_Skipped) 664 .Case("warning", PDK_Skipped) 665 .Case("ident", PDK_Skipped) 666 .Case("sccs", PDK_Skipped) 667 .Case("assert", PDK_Skipped) 668 .Case("unassert", PDK_Skipped) 669 .Case("if", PDK_Skipped) 670 .Case("ifdef", PDK_Skipped) 671 .Case("ifndef", PDK_Skipped) 672 .Case("elif", PDK_Skipped) 673 .Case("else", PDK_Skipped) 674 .Case("endif", PDK_Skipped) 675 .Default(PDK_Unknown); 676 677 switch (PDK) { 678 case PDK_Skipped: 679 continue; 680 681 case PDK_Unknown: 682 // We don't know what this directive is; stop at the '#'. 683 break; 684 } 685 } 686 687 // We only end up here if we didn't recognize the preprocessor 688 // directive or it was one that can't occur in the preamble at this 689 // point. Roll back the current token to the location of the '#'. 690 InPreprocessorDirective = false; 691 TheTok = HashTok; 692 } 693 694 // We hit a token that we don't recognize as being in the 695 // "preprocessing only" part of the file, so we're no longer in 696 // the preamble. 
697 break; 698 } while (true); 699 700 SourceLocation End; 701 if (ActiveCommentLoc.isValid()) 702 End = ActiveCommentLoc; // don't truncate a decl comment. 703 else 704 End = TheTok.getLocation(); 705 706 return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(), 707 TheTok.isAtStartOfLine()); 708 } 709 710 unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo, 711 const SourceManager &SM, 712 const LangOptions &LangOpts) { 713 // Figure out how many physical characters away the specified expansion 714 // character is. This needs to take into consideration newlines and 715 // trigraphs. 716 bool Invalid = false; 717 const char *TokPtr = SM.getCharacterData(TokStart, &Invalid); 718 719 // If they request the first char of the token, we're trivially done. 720 if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr))) 721 return 0; 722 723 unsigned PhysOffset = 0; 724 725 // The usual case is that tokens don't contain anything interesting. Skip 726 // over the uninteresting characters. If a token only consists of simple 727 // chars, this method is extremely fast. 728 while (Lexer::isObviouslySimpleCharacter(*TokPtr)) { 729 if (CharNo == 0) 730 return PhysOffset; 731 ++TokPtr; 732 --CharNo; 733 ++PhysOffset; 734 } 735 736 // If we have a character that may be a trigraph or escaped newline, use a 737 // lexer to parse it correctly. 738 for (; CharNo; --CharNo) { 739 unsigned Size; 740 Lexer::getCharAndSizeNoWarn(TokPtr, Size, LangOpts); 741 TokPtr += Size; 742 PhysOffset += Size; 743 } 744 745 // Final detail: if we end up on an escaped newline, we want to return the 746 // location of the actual byte of the token. For example foo\<newline>bar 747 // advanced by 3 should return the location of b, not of \\. One compounding 748 // detail of this is that the escape may be made by a trigraph. 
749 if (!Lexer::isObviouslySimpleCharacter(*TokPtr)) 750 PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr; 751 752 return PhysOffset; 753 } 754 755 /// Computes the source location just past the end of the 756 /// token at this source location. 757 /// 758 /// This routine can be used to produce a source location that 759 /// points just past the end of the token referenced by \p Loc, and 760 /// is generally used when a diagnostic needs to point just after a 761 /// token where it expected something different that it received. If 762 /// the returned source location would not be meaningful (e.g., if 763 /// it points into a macro), this routine returns an invalid 764 /// source location. 765 /// 766 /// \param Offset an offset from the end of the token, where the source 767 /// location should refer to. The default offset (0) produces a source 768 /// location pointing just past the end of the token; an offset of 1 produces 769 /// a source location pointing to the last character in the token, etc. 770 SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset, 771 const SourceManager &SM, 772 const LangOptions &LangOpts) { 773 if (Loc.isInvalid()) 774 return {}; 775 776 if (Loc.isMacroID()) { 777 if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc)) 778 return {}; // Points inside the macro expansion. 779 } 780 781 unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts); 782 if (Len > Offset) 783 Len = Len - Offset; 784 else 785 return Loc; 786 787 return Loc.getLocWithOffset(Len); 788 } 789 790 /// Returns true if the given MacroID location points at the first 791 /// token of the macro expansion. 
792 bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc, 793 const SourceManager &SM, 794 const LangOptions &LangOpts, 795 SourceLocation *MacroBegin) { 796 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc"); 797 798 SourceLocation expansionLoc; 799 if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc)) 800 return false; 801 802 if (expansionLoc.isFileID()) { 803 // No other macro expansions, this is the first. 804 if (MacroBegin) 805 *MacroBegin = expansionLoc; 806 return true; 807 } 808 809 return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin); 810 } 811 812 /// Returns true if the given MacroID location points at the last 813 /// token of the macro expansion. 814 bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc, 815 const SourceManager &SM, 816 const LangOptions &LangOpts, 817 SourceLocation *MacroEnd) { 818 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc"); 819 820 SourceLocation spellLoc = SM.getSpellingLoc(loc); 821 unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts); 822 if (tokLen == 0) 823 return false; 824 825 SourceLocation afterLoc = loc.getLocWithOffset(tokLen); 826 SourceLocation expansionLoc; 827 if (!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc)) 828 return false; 829 830 if (expansionLoc.isFileID()) { 831 // No other macro expansions. 832 if (MacroEnd) 833 *MacroEnd = expansionLoc; 834 return true; 835 } 836 837 return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd); 838 } 839 840 static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range, 841 const SourceManager &SM, 842 const LangOptions &LangOpts) { 843 SourceLocation Begin = Range.getBegin(); 844 SourceLocation End = Range.getEnd(); 845 assert(Begin.isFileID() && End.isFileID()); 846 if (Range.isTokenRange()) { 847 End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts); 848 if (End.isInvalid()) 849 return {}; 850 } 851 852 // Break down the source locations. 
853 FileID FID; 854 unsigned BeginOffs; 855 std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin); 856 if (FID.isInvalid()) 857 return {}; 858 859 unsigned EndOffs; 860 if (!SM.isInFileID(End, FID, &EndOffs) || 861 BeginOffs > EndOffs) 862 return {}; 863 864 return CharSourceRange::getCharRange(Begin, End); 865 } 866 867 CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range, 868 const SourceManager &SM, 869 const LangOptions &LangOpts) { 870 SourceLocation Begin = Range.getBegin(); 871 SourceLocation End = Range.getEnd(); 872 if (Begin.isInvalid() || End.isInvalid()) 873 return {}; 874 875 if (Begin.isFileID() && End.isFileID()) 876 return makeRangeFromFileLocs(Range, SM, LangOpts); 877 878 if (Begin.isMacroID() && End.isFileID()) { 879 if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin)) 880 return {}; 881 Range.setBegin(Begin); 882 return makeRangeFromFileLocs(Range, SM, LangOpts); 883 } 884 885 if (Begin.isFileID() && End.isMacroID()) { 886 if ((Range.isTokenRange() && !isAtEndOfMacroExpansion(End, SM, LangOpts, 887 &End)) || 888 (Range.isCharRange() && !isAtStartOfMacroExpansion(End, SM, LangOpts, 889 &End))) 890 return {}; 891 Range.setEnd(End); 892 return makeRangeFromFileLocs(Range, SM, LangOpts); 893 } 894 895 assert(Begin.isMacroID() && End.isMacroID()); 896 SourceLocation MacroBegin, MacroEnd; 897 if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) && 898 ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts, 899 &MacroEnd)) || 900 (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts, 901 &MacroEnd)))) { 902 Range.setBegin(MacroBegin); 903 Range.setEnd(MacroEnd); 904 return makeRangeFromFileLocs(Range, SM, LangOpts); 905 } 906 907 bool Invalid = false; 908 const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin), 909 &Invalid); 910 if (Invalid) 911 return {}; 912 913 if (BeginEntry.getExpansion().isMacroArgExpansion()) { 914 const SrcMgr::SLocEntry &EndEntry = 
SM.getSLocEntry(SM.getFileID(End), 915 &Invalid); 916 if (Invalid) 917 return {}; 918 919 if (EndEntry.getExpansion().isMacroArgExpansion() && 920 BeginEntry.getExpansion().getExpansionLocStart() == 921 EndEntry.getExpansion().getExpansionLocStart()) { 922 Range.setBegin(SM.getImmediateSpellingLoc(Begin)); 923 Range.setEnd(SM.getImmediateSpellingLoc(End)); 924 return makeFileCharRange(Range, SM, LangOpts); 925 } 926 } 927 928 return {}; 929 } 930 931 StringRef Lexer::getSourceText(CharSourceRange Range, 932 const SourceManager &SM, 933 const LangOptions &LangOpts, 934 bool *Invalid) { 935 Range = makeFileCharRange(Range, SM, LangOpts); 936 if (Range.isInvalid()) { 937 if (Invalid) *Invalid = true; 938 return {}; 939 } 940 941 // Break down the source location. 942 std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin()); 943 if (beginInfo.first.isInvalid()) { 944 if (Invalid) *Invalid = true; 945 return {}; 946 } 947 948 unsigned EndOffs; 949 if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) || 950 beginInfo.second > EndOffs) { 951 if (Invalid) *Invalid = true; 952 return {}; 953 } 954 955 // Try to the load the file buffer. 956 bool invalidTemp = false; 957 StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp); 958 if (invalidTemp) { 959 if (Invalid) *Invalid = true; 960 return {}; 961 } 962 963 if (Invalid) *Invalid = false; 964 return file.substr(beginInfo.second, EndOffs - beginInfo.second); 965 } 966 967 StringRef Lexer::getImmediateMacroName(SourceLocation Loc, 968 const SourceManager &SM, 969 const LangOptions &LangOpts) { 970 assert(Loc.isMacroID() && "Only reasonable to call this on macros"); 971 972 // Find the location of the immediate macro expansion. 
973 while (true) { 974 FileID FID = SM.getFileID(Loc); 975 const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID); 976 const SrcMgr::ExpansionInfo &Expansion = E->getExpansion(); 977 Loc = Expansion.getExpansionLocStart(); 978 if (!Expansion.isMacroArgExpansion()) 979 break; 980 981 // For macro arguments we need to check that the argument did not come 982 // from an inner macro, e.g: "MAC1( MAC2(foo) )" 983 984 // Loc points to the argument id of the macro definition, move to the 985 // macro expansion. 986 Loc = SM.getImmediateExpansionRange(Loc).getBegin(); 987 SourceLocation SpellLoc = Expansion.getSpellingLoc(); 988 if (SpellLoc.isFileID()) 989 break; // No inner macro. 990 991 // If spelling location resides in the same FileID as macro expansion 992 // location, it means there is no inner macro. 993 FileID MacroFID = SM.getFileID(Loc); 994 if (SM.isInFileID(SpellLoc, MacroFID)) 995 break; 996 997 // Argument came from inner macro. 998 Loc = SpellLoc; 999 } 1000 1001 // Find the spelling location of the start of the non-argument expansion 1002 // range. This is where the macro name was spelled in order to begin 1003 // expanding this macro. 1004 Loc = SM.getSpellingLoc(Loc); 1005 1006 // Dig out the buffer where the macro name was spelled and the extents of the 1007 // name so that we can render it into the expansion note. 1008 std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc); 1009 unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts); 1010 StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first); 1011 return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength); 1012 } 1013 1014 StringRef Lexer::getImmediateMacroNameForDiagnostics( 1015 SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) { 1016 assert(Loc.isMacroID() && "Only reasonable to call this on macros"); 1017 // Walk past macro argument expansions. 
1018 while (SM.isMacroArgExpansion(Loc)) 1019 Loc = SM.getImmediateExpansionRange(Loc).getBegin(); 1020 1021 // If the macro's spelling has no FileID, then it's actually a token paste 1022 // or stringization (or similar) and not a macro at all. 1023 if (!SM.getFileEntryForID(SM.getFileID(SM.getSpellingLoc(Loc)))) 1024 return {}; 1025 1026 // Find the spelling location of the start of the non-argument expansion 1027 // range. This is where the macro name was spelled in order to begin 1028 // expanding this macro. 1029 Loc = SM.getSpellingLoc(SM.getImmediateExpansionRange(Loc).getBegin()); 1030 1031 // Dig out the buffer where the macro name was spelled and the extents of the 1032 // name so that we can render it into the expansion note. 1033 std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc); 1034 unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts); 1035 StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first); 1036 return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength); 1037 } 1038 1039 bool Lexer::isIdentifierBodyChar(char c, const LangOptions &LangOpts) { 1040 return isIdentifierBody(c, LangOpts.DollarIdents); 1041 } 1042 1043 bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) { 1044 assert(isVerticalWhitespace(Str[0])); 1045 if (Str - 1 < BufferStart) 1046 return false; 1047 1048 if ((Str[0] == '\n' && Str[-1] == '\r') || 1049 (Str[0] == '\r' && Str[-1] == '\n')) { 1050 if (Str - 2 < BufferStart) 1051 return false; 1052 --Str; 1053 } 1054 --Str; 1055 1056 // Rewind to first non-space character: 1057 while (Str > BufferStart && isHorizontalWhitespace(*Str)) 1058 --Str; 1059 1060 return *Str == '\\'; 1061 } 1062 1063 StringRef Lexer::getIndentationForLine(SourceLocation Loc, 1064 const SourceManager &SM) { 1065 if (Loc.isInvalid() || Loc.isMacroID()) 1066 return {}; 1067 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 1068 if (LocInfo.first.isInvalid()) 1069 return 
{}; 1070 bool Invalid = false; 1071 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); 1072 if (Invalid) 1073 return {}; 1074 const char *Line = findBeginningOfLine(Buffer, LocInfo.second); 1075 if (!Line) 1076 return {}; 1077 StringRef Rest = Buffer.substr(Line - Buffer.data()); 1078 size_t NumWhitespaceChars = Rest.find_first_not_of(" \t"); 1079 return NumWhitespaceChars == StringRef::npos 1080 ? "" 1081 : Rest.take_front(NumWhitespaceChars); 1082 } 1083 1084 //===----------------------------------------------------------------------===// 1085 // Diagnostics forwarding code. 1086 //===----------------------------------------------------------------------===// 1087 1088 /// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the 1089 /// lexer buffer was all expanded at a single point, perform the mapping. 1090 /// This is currently only used for _Pragma implementation, so it is the slow 1091 /// path of the hot getSourceLocation method. Do not allow it to be inlined. 1092 static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc( 1093 Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen); 1094 static SourceLocation GetMappedTokenLoc(Preprocessor &PP, 1095 SourceLocation FileLoc, 1096 unsigned CharNo, unsigned TokLen) { 1097 assert(FileLoc.isMacroID() && "Must be a macro expansion"); 1098 1099 // Otherwise, we're lexing "mapped tokens". This is used for things like 1100 // _Pragma handling. Combine the expansion location of FileLoc with the 1101 // spelling location. 1102 SourceManager &SM = PP.getSourceManager(); 1103 1104 // Create a new SLoc which is expanded from Expansion(FileLoc) but whose 1105 // characters come from spelling(FileLoc)+Offset. 1106 SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc); 1107 SpellingLoc = SpellingLoc.getLocWithOffset(CharNo); 1108 1109 // Figure out the expansion loc range, which is the range covered by the 1110 // original _Pragma(...) sequence. 
1111 CharSourceRange II = SM.getImmediateExpansionRange(FileLoc); 1112 1113 return SM.createExpansionLoc(SpellingLoc, II.getBegin(), II.getEnd(), TokLen); 1114 } 1115 1116 /// getSourceLocation - Return a source location identifier for the specified 1117 /// offset in the current file. 1118 SourceLocation Lexer::getSourceLocation(const char *Loc, 1119 unsigned TokLen) const { 1120 assert(Loc >= BufferStart && Loc <= BufferEnd && 1121 "Location out of range for this buffer!"); 1122 1123 // In the normal case, we're just lexing from a simple file buffer, return 1124 // the file id from FileLoc with the offset specified. 1125 unsigned CharNo = Loc-BufferStart; 1126 if (FileLoc.isFileID()) 1127 return FileLoc.getLocWithOffset(CharNo); 1128 1129 // Otherwise, this is the _Pragma lexer case, which pretends that all of the 1130 // tokens are lexed from where the _Pragma was defined. 1131 assert(PP && "This doesn't work on raw lexers"); 1132 return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen); 1133 } 1134 1135 /// Diag - Forwarding function for diagnostics. This translate a source 1136 /// position in the current buffer into a SourceLocation object for rendering. 1137 DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const { 1138 return PP->Diag(getSourceLocation(Loc), DiagID); 1139 } 1140 1141 //===----------------------------------------------------------------------===// 1142 // Trigraph and Escaped Newline Handling Code. 1143 //===----------------------------------------------------------------------===// 1144 1145 /// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair, 1146 /// return the decoded trigraph letter it corresponds to, or '\0' if nothing. 
1147 static char GetTrigraphCharForLetter(char Letter) { 1148 switch (Letter) { 1149 default: return 0; 1150 case '=': return '#'; 1151 case ')': return ']'; 1152 case '(': return '['; 1153 case '!': return '|'; 1154 case '\'': return '^'; 1155 case '>': return '}'; 1156 case '/': return '\\'; 1157 case '<': return '{'; 1158 case '-': return '~'; 1159 } 1160 } 1161 1162 /// DecodeTrigraphChar - If the specified character is a legal trigraph when 1163 /// prefixed with ??, emit a trigraph warning. If trigraphs are enabled, 1164 /// return the result character. Finally, emit a warning about trigraph use 1165 /// whether trigraphs are enabled or not. 1166 static char DecodeTrigraphChar(const char *CP, Lexer *L) { 1167 char Res = GetTrigraphCharForLetter(*CP); 1168 if (!Res || !L) return Res; 1169 1170 if (!L->getLangOpts().Trigraphs) { 1171 if (!L->isLexingRawMode()) 1172 L->Diag(CP-2, diag::trigraph_ignored); 1173 return 0; 1174 } 1175 1176 if (!L->isLexingRawMode()) 1177 L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1); 1178 return Res; 1179 } 1180 1181 /// getEscapedNewLineSize - Return the size of the specified escaped newline, 1182 /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a 1183 /// trigraph equivalent on entry to this function. 1184 unsigned Lexer::getEscapedNewLineSize(const char *Ptr) { 1185 unsigned Size = 0; 1186 while (isWhitespace(Ptr[Size])) { 1187 ++Size; 1188 1189 if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r') 1190 continue; 1191 1192 // If this is a \r\n or \n\r, skip the other half. 1193 if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') && 1194 Ptr[Size-1] != Ptr[Size]) 1195 ++Size; 1196 1197 return Size; 1198 } 1199 1200 // Not an escaped newline, must be a \t or something else. 1201 return 0; 1202 } 1203 1204 /// SkipEscapedNewLines - If P points to an escaped newline (or a series of 1205 /// them), skip over them and return the first non-escaped-newline found, 1206 /// otherwise return P. 
1207 const char *Lexer::SkipEscapedNewLines(const char *P) { 1208 while (true) { 1209 const char *AfterEscape; 1210 if (*P == '\\') { 1211 AfterEscape = P+1; 1212 } else if (*P == '?') { 1213 // If not a trigraph for escape, bail out. 1214 if (P[1] != '?' || P[2] != '/') 1215 return P; 1216 // FIXME: Take LangOpts into account; the language might not 1217 // support trigraphs. 1218 AfterEscape = P+3; 1219 } else { 1220 return P; 1221 } 1222 1223 unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape); 1224 if (NewLineSize == 0) return P; 1225 P = AfterEscape+NewLineSize; 1226 } 1227 } 1228 1229 Optional<Token> Lexer::findNextToken(SourceLocation Loc, 1230 const SourceManager &SM, 1231 const LangOptions &LangOpts) { 1232 if (Loc.isMacroID()) { 1233 if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc)) 1234 return None; 1235 } 1236 Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts); 1237 1238 // Break down the source location. 1239 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 1240 1241 // Try to load the file buffer. 1242 bool InvalidTemp = false; 1243 StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp); 1244 if (InvalidTemp) 1245 return None; 1246 1247 const char *TokenBegin = File.data() + LocInfo.second; 1248 1249 // Lex from the start of the given location. 1250 Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(), 1251 TokenBegin, File.end()); 1252 // Find the token. 1253 Token Tok; 1254 lexer.LexFromRawLexer(Tok); 1255 return Tok; 1256 } 1257 1258 /// Checks that the given token is the first token that occurs after the 1259 /// given location (this excludes comments and whitespace). Returns the location 1260 /// immediately after the specified token. If the token is not found or the 1261 /// location is inside a macro, the returned source location will be invalid. 
1262 SourceLocation Lexer::findLocationAfterToken( 1263 SourceLocation Loc, tok::TokenKind TKind, const SourceManager &SM, 1264 const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) { 1265 Optional<Token> Tok = findNextToken(Loc, SM, LangOpts); 1266 if (!Tok || Tok->isNot(TKind)) 1267 return {}; 1268 SourceLocation TokenLoc = Tok->getLocation(); 1269 1270 // Calculate how much whitespace needs to be skipped if any. 1271 unsigned NumWhitespaceChars = 0; 1272 if (SkipTrailingWhitespaceAndNewLine) { 1273 const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength(); 1274 unsigned char C = *TokenEnd; 1275 while (isHorizontalWhitespace(C)) { 1276 C = *(++TokenEnd); 1277 NumWhitespaceChars++; 1278 } 1279 1280 // Skip \r, \n, \r\n, or \n\r 1281 if (C == '\n' || C == '\r') { 1282 char PrevC = C; 1283 C = *(++TokenEnd); 1284 NumWhitespaceChars++; 1285 if ((C == '\n' || C == '\r') && C != PrevC) 1286 NumWhitespaceChars++; 1287 } 1288 } 1289 1290 return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars); 1291 } 1292 1293 /// getCharAndSizeSlow - Peek a single 'character' from the specified buffer, 1294 /// get its size, and return it. This is tricky in several cases: 1295 /// 1. If currently at the start of a trigraph, we warn about the trigraph, 1296 /// then either return the trigraph (skipping 3 chars) or the '?', 1297 /// depending on whether trigraphs are enabled or not. 1298 /// 2. If this is an escaped newline (potentially with whitespace between 1299 /// the backslash and newline), implicitly skip the newline and return 1300 /// the char after it. 1301 /// 1302 /// This handles the slow/uncommon case of the getCharAndSize method. Here we 1303 /// know that we can accumulate into Size, and that we have already incremented 1304 /// Ptr by Size bytes. 1305 /// 1306 /// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should 1307 /// be updated to match. 
1308 char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size, 1309 Token *Tok) { 1310 // If we have a slash, look for an escaped newline. 1311 if (Ptr[0] == '\\') { 1312 ++Size; 1313 ++Ptr; 1314 Slash: 1315 // Common case, backslash-char where the char is not whitespace. 1316 if (!isWhitespace(Ptr[0])) return '\\'; 1317 1318 // See if we have optional whitespace characters between the slash and 1319 // newline. 1320 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) { 1321 // Remember that this token needs to be cleaned. 1322 if (Tok) Tok->setFlag(Token::NeedsCleaning); 1323 1324 // Warn if there was whitespace between the backslash and newline. 1325 if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode()) 1326 Diag(Ptr, diag::backslash_newline_space); 1327 1328 // Found backslash<whitespace><newline>. Parse the char after it. 1329 Size += EscapedNewLineSize; 1330 Ptr += EscapedNewLineSize; 1331 1332 // Use slow version to accumulate a correct size field. 1333 return getCharAndSizeSlow(Ptr, Size, Tok); 1334 } 1335 1336 // Otherwise, this is not an escaped newline, just return the slash. 1337 return '\\'; 1338 } 1339 1340 // If this is a trigraph, process it. 1341 if (Ptr[0] == '?' && Ptr[1] == '?') { 1342 // If this is actually a legal trigraph (not something like "??x"), emit 1343 // a trigraph warning. If so, and if trigraphs are enabled, return it. 1344 if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : nullptr)) { 1345 // Remember that this token needs to be cleaned. 1346 if (Tok) Tok->setFlag(Token::NeedsCleaning); 1347 1348 Ptr += 3; 1349 Size += 3; 1350 if (C == '\\') goto Slash; 1351 return C; 1352 } 1353 } 1354 1355 // If this is neither, return a single character. 1356 ++Size; 1357 return *Ptr; 1358 } 1359 1360 /// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the 1361 /// getCharAndSizeNoWarn method. Here we know that we can accumulate into Size, 1362 /// and that we have already incremented Ptr by Size bytes. 
1363 /// 1364 /// NOTE: When this method is updated, getCharAndSizeSlow (above) should 1365 /// be updated to match. 1366 char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size, 1367 const LangOptions &LangOpts) { 1368 // If we have a slash, look for an escaped newline. 1369 if (Ptr[0] == '\\') { 1370 ++Size; 1371 ++Ptr; 1372 Slash: 1373 // Common case, backslash-char where the char is not whitespace. 1374 if (!isWhitespace(Ptr[0])) return '\\'; 1375 1376 // See if we have optional whitespace characters followed by a newline. 1377 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) { 1378 // Found backslash<whitespace><newline>. Parse the char after it. 1379 Size += EscapedNewLineSize; 1380 Ptr += EscapedNewLineSize; 1381 1382 // Use slow version to accumulate a correct size field. 1383 return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts); 1384 } 1385 1386 // Otherwise, this is not an escaped newline, just return the slash. 1387 return '\\'; 1388 } 1389 1390 // If this is a trigraph, process it. 1391 if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') { 1392 // If this is actually a legal trigraph (not something like "??x"), return 1393 // it. 1394 if (char C = GetTrigraphCharForLetter(Ptr[2])) { 1395 Ptr += 3; 1396 Size += 3; 1397 if (C == '\\') goto Slash; 1398 return C; 1399 } 1400 } 1401 1402 // If this is neither, return a single character. 1403 ++Size; 1404 return *Ptr; 1405 } 1406 1407 //===----------------------------------------------------------------------===// 1408 // Helper methods for lexing. 1409 //===----------------------------------------------------------------------===// 1410 1411 /// Routine that indiscriminately sets the offset into the source file. 1412 void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) { 1413 BufferPtr = BufferStart + Offset; 1414 if (BufferPtr > BufferEnd) 1415 BufferPtr = BufferEnd; 1416 // FIXME: What exactly does the StartOfLine bit mean? 
There are two 1417 // possible meanings for the "start" of the line: the first token on the 1418 // unexpanded line, or the first token on the expanded line. 1419 IsAtStartOfLine = StartOfLine; 1420 IsAtPhysicalStartOfLine = StartOfLine; 1421 } 1422 1423 static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) { 1424 if (LangOpts.AsmPreprocessor) { 1425 return false; 1426 } else if (LangOpts.CPlusPlus11 || LangOpts.C11) { 1427 static const llvm::sys::UnicodeCharSet C11AllowedIDChars( 1428 C11AllowedIDCharRanges); 1429 return C11AllowedIDChars.contains(C); 1430 } else if (LangOpts.CPlusPlus) { 1431 static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars( 1432 CXX03AllowedIDCharRanges); 1433 return CXX03AllowedIDChars.contains(C); 1434 } else { 1435 static const llvm::sys::UnicodeCharSet C99AllowedIDChars( 1436 C99AllowedIDCharRanges); 1437 return C99AllowedIDChars.contains(C); 1438 } 1439 } 1440 1441 static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts) { 1442 assert(isAllowedIDChar(C, LangOpts)); 1443 if (LangOpts.AsmPreprocessor) { 1444 return false; 1445 } else if (LangOpts.CPlusPlus11 || LangOpts.C11) { 1446 static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars( 1447 C11DisallowedInitialIDCharRanges); 1448 return !C11DisallowedInitialIDChars.contains(C); 1449 } else if (LangOpts.CPlusPlus) { 1450 return true; 1451 } else { 1452 static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars( 1453 C99DisallowedInitialIDCharRanges); 1454 return !C99DisallowedInitialIDChars.contains(C); 1455 } 1456 } 1457 1458 static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin, 1459 const char *End) { 1460 return CharSourceRange::getCharRange(L.getSourceLocation(Begin), 1461 L.getSourceLocation(End)); 1462 } 1463 1464 static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C, 1465 CharSourceRange Range, bool IsFirst) { 1466 // Check C99 compatibility. 
1467 if (!Diags.isIgnored(diag::warn_c99_compat_unicode_id, Range.getBegin())) { 1468 enum { 1469 CannotAppearInIdentifier = 0, 1470 CannotStartIdentifier 1471 }; 1472 1473 static const llvm::sys::UnicodeCharSet C99AllowedIDChars( 1474 C99AllowedIDCharRanges); 1475 static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars( 1476 C99DisallowedInitialIDCharRanges); 1477 if (!C99AllowedIDChars.contains(C)) { 1478 Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id) 1479 << Range 1480 << CannotAppearInIdentifier; 1481 } else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) { 1482 Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id) 1483 << Range 1484 << CannotStartIdentifier; 1485 } 1486 } 1487 1488 // Check C++98 compatibility. 1489 if (!Diags.isIgnored(diag::warn_cxx98_compat_unicode_id, Range.getBegin())) { 1490 static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars( 1491 CXX03AllowedIDCharRanges); 1492 if (!CXX03AllowedIDChars.contains(C)) { 1493 Diags.Report(Range.getBegin(), diag::warn_cxx98_compat_unicode_id) 1494 << Range; 1495 } 1496 } 1497 } 1498 1499 /// After encountering UTF-8 character C and interpreting it as an identifier 1500 /// character, check whether it's a homoglyph for a common non-identifier 1501 /// source character that is unlikely to be an intentional identifier 1502 /// character and warn if so. 1503 static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C, 1504 CharSourceRange Range) { 1505 // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes). 
struct HomoglyphPair {
    uint32_t Character;
    char LooksLike;
    bool operator<(HomoglyphPair R) const { return Character < R.Character; }
  };
  static constexpr HomoglyphPair SortedHomoglyphs[] = {
    {U'\u00ad', 0},   // SOFT HYPHEN
    {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK
    {U'\u037e', ';'}, // GREEK QUESTION MARK
    {U'\u200b', 0},   // ZERO WIDTH SPACE
    {U'\u200c', 0},   // ZERO WIDTH NON-JOINER
    {U'\u200d', 0},   // ZERO WIDTH JOINER
    {U'\u2060', 0},   // WORD JOINER
    {U'\u2061', 0},   // FUNCTION APPLICATION
    {U'\u2062', 0},   // INVISIBLE TIMES
    {U'\u2063', 0},   // INVISIBLE SEPARATOR
    {U'\u2064', 0},   // INVISIBLE PLUS
    {U'\u2212', '-'}, // MINUS SIGN
    {U'\u2215', '/'}, // DIVISION SLASH
    {U'\u2216', '\\'}, // SET MINUS
    {U'\u2217', '*'}, // ASTERISK OPERATOR
    {U'\u2223', '|'}, // DIVIDES
    {U'\u2227', '^'}, // LOGICAL AND
    {U'\u2236', ':'}, // RATIO
    {U'\u223c', '~'}, // TILDE OPERATOR
    {U'\ua789', ':'}, // MODIFIER LETTER COLON
    {U'\ufeff', 0},   // ZERO WIDTH NO-BREAK SPACE
    {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK
    {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN
    {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN
    {U'\uff05', '%'}, // FULLWIDTH PERCENT SIGN
    {U'\uff06', '&'}, // FULLWIDTH AMPERSAND
    {U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS
    {U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS
    {U'\uff0a', '*'}, // FULLWIDTH ASTERISK
    {U'\uff0b', '+'}, // FULLWIDTH PLUS SIGN
    {U'\uff0c', ','}, // FULLWIDTH COMMA
    {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS
    {U'\uff0e', '.'}, // FULLWIDTH FULL STOP
    {U'\uff0f', '/'}, // FULLWIDTH SOLIDUS
    {U'\uff1a', ':'}, // FULLWIDTH COLON
    {U'\uff1b', ';'}, // FULLWIDTH SEMICOLON
    {U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN
    {U'\uff1d', '='}, // FULLWIDTH EQUALS SIGN
    {U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN
    {U'\uff1f', '?'}, // FULLWIDTH QUESTION MARK
    {U'\uff20', '@'}, // FULLWIDTH COMMERCIAL AT
    {U'\uff3b', '['}, // FULLWIDTH LEFT SQUARE BRACKET
    {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS
    {U'\uff3d', ']'}, // FULLWIDTH RIGHT SQUARE BRACKET
    {U'\uff3e', '^'}, // FULLWIDTH CIRCUMFLEX ACCENT
    {U'\uff5b', '{'}, // FULLWIDTH LEFT CURLY BRACKET
    {U'\uff5c', '|'}, // FULLWIDTH VERTICAL LINE
    {U'\uff5d', '}'}, // FULLWIDTH RIGHT CURLY BRACKET
    {U'\uff5e', '~'}, // FULLWIDTH TILDE
    {0, 0}
  };
  // The search range stops one entry short of the array end, so the trailing
  // {0, 0} entry is never a candidate but is still safe to dereference when
  // no table entry is >= C.
  auto Homoglyph =
      std::lower_bound(std::begin(SortedHomoglyphs),
                       std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'});
  if (Homoglyph->Character == C) {
    llvm::SmallString<5> CharBuf;
    {
      llvm::raw_svector_ostream CharOS(CharBuf);
      llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
    }
    if (Homoglyph->LooksLike) {
      const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
      Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
          << Range << CharBuf << LooksLikeStr;
    } else {
      Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width)
          << Range << CharBuf;
    }
  }
}

/// Try to consume a universal-character-name that continues an identifier.
///
/// \param CurPtr position of the candidate UCN (callers in this file invoke
///        this when the character read at CurPtr is a backslash); advanced
///        past the UCN on success.
/// \param Size   size of the character at CurPtr as computed by the caller;
///        tryReadUCN starts reading at CurPtr + Size.
/// \param Result token being formed; receives the HasUCN flag on success.
/// \returns true if a UCN naming a code point accepted by isAllowedIDChar was
///          consumed, false if tryReadUCN yielded 0 or the code point is not
///          allowed in an identifier.
bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
                                    Token &Result) {
  const char *UCNPtr = CurPtr + Size;
  uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/nullptr);
  if (CodePoint == 0 || !isAllowedIDChar(CodePoint, LangOpts))
    return false;

  if (!isLexingRawMode())
    maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
                              makeCharRange(*this, CurPtr, UCNPtr),
                              /*IsFirst=*/false);

  Result.setFlag(Token::HasUCN);
  // If the spelling has exactly the length of a plain \uXXXX (6 chars) or
  // \UXXXXXXXX (10 chars), jump straight to its end. Otherwise walk it one
  // character at a time with getAndAdvanceChar.
  // NOTE(review): presumably the longer spellings contain escaped newlines or
  // trigraphs that getAndAdvanceChar must see -- confirm.
  if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') ||
      (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
    CurPtr = UCNPtr;
  else
    while (CurPtr != UCNPtr)
      (void)getAndAdvanceChar(CurPtr, Result);
  return true;
}

/// Try to consume a UTF-8 encoded character that continues an identifier.
///
/// Decodes one UTF-8 sequence starting at CurPtr (strict conversion, bounded
/// by BufferEnd).
///
/// \param CurPtr start of the candidate sequence; advanced past it on success
///        and left unchanged on failure.
/// \returns true if the sequence decoded successfully and the code point is
///          accepted by isAllowedIDChar, false otherwise.
bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
  const char *UnicodePtr = CurPtr;
  llvm::UTF32 CodePoint;
  llvm::ConversionResult Result =
      llvm::convertUTF8Sequence((const llvm::UTF8 **)&UnicodePtr,
                                (const llvm::UTF8 *)BufferEnd,
                                &CodePoint,
                                llvm::strictConversion);
  if (Result != llvm::conversionOK ||
      !isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts))
    return false;

  // Diagnostics are only produced when not lexing in raw mode.
  if (!isLexingRawMode()) {
    maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
                              makeCharRange(*this, CurPtr, UnicodePtr),
                              /*IsFirst=*/false);
    maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint,
                               makeCharRange(*this, CurPtr, UnicodePtr));
  }

  CurPtr = UnicodePtr;
  return true;
}

/// LexIdentifier - Lex the remainder of an identifier whose first character
/// has already been matched. Forms a tok::raw_identifier token in Result. In
/// raw mode that token is returned as-is; otherwise the identifier is looked
/// up through the preprocessor, which may turn it into a code-completion
/// token or pass it on to Preprocessor::HandleIdentifier.
bool Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
  // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
  unsigned Size;
  unsigned char C = *CurPtr++;
  while (isIdentifierBody(C))
    C = *CurPtr++;

  --CurPtr; // Back up over the skipped character.

  // Fast path, no $,\,? in identifier found.  '\' might be an escaped newline
  // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
  //
  // TODO: Could merge these checks into an InfoTable flag to make the
  // comparison cheaper
  if (isASCII(C) && C != '\\' && C != '?' &&
      (C != '$' || !LangOpts.DollarIdents)) {
FinishIdentifier:
    const char *IdStart = BufferPtr;
    FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
    Result.setRawIdentifierData(IdStart);

    // If we are in raw mode, return this identifier raw.  There is no need to
    // look up identifier information or attempt to macro expand it.
    if (LexingRawMode)
      return true;

    // Fill in Result.IdentifierInfo and update the token kind,
    // looking up the identifier in the identifier table.
    IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);
    // Note that we have to call PP->LookUpIdentifierInfo() even for code
    // completion, it writes IdentifierInfo into Result, and callers rely on it.

    // If the completion point is at the end of an identifier, we want to treat
    // the identifier as incomplete even if it resolves to a macro or a keyword.
    // This allows e.g. 'class^' to complete to 'classifier'.
    if (isCodeCompletionPoint(CurPtr)) {
      // Return the code-completion token.
      Result.setKind(tok::code_completion);
      // Skip the code-completion char and all immediate identifier characters.
      // This ensures we get consistent behavior when completing at any point in
      // an identifier (i.e. at the start, in the middle, at the end). Note that
      // only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code
      // simpler.
      assert(*CurPtr == 0 && "Completion character must be 0");
      ++CurPtr;
      // Note that code completion token is not added as a separate character
      // when the completion point is at the end of the buffer. Therefore, we need
      // to check if the buffer has ended.
      if (CurPtr < BufferEnd) {
        while (isIdentifierBody(*CurPtr))
          ++CurPtr;
      }
      BufferPtr = CurPtr;
      return true;
    }

    // Finally, now that we know we have an identifier, pass this off to the
    // preprocessor, which may macro expand it or something.
    if (II->isHandleIdentifierCase())
      return PP->HandleIdentifier(Result);

    return true;
  }

  // Otherwise, $,\,? in identifier found.  Enter slower path.

  C = getCharAndSize(CurPtr, Size);
  while (true) {
    if (C == '$') {
      // If we hit a $ and they are not supported in identifiers, we are done.
      if (!LangOpts.DollarIdents) goto FinishIdentifier;

      // Otherwise, emit a diagnostic and continue.
      if (!isLexingRawMode())
        Diag(CurPtr, diag::ext_dollar_in_identifier);
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      C = getCharAndSize(CurPtr, Size);
      continue;
    } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
      C = getCharAndSize(CurPtr, Size);
      continue;
    } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {
      C = getCharAndSize(CurPtr, Size);
      continue;
    } else if (!isIdentifierBody(C)) {
      goto FinishIdentifier;
    }

    // Otherwise, this character is good, consume it.
    CurPtr = ConsumeChar(CurPtr, Size, Result);

    C = getCharAndSize(CurPtr, Size);
    while (isIdentifierBody(C)) {
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      C = getCharAndSize(CurPtr, Size);
    }
  }
}

/// isHexaLiteral - Return true if Start points to the beginning of a hex
/// constant, i.e. the first two characters read via getCharAndSizeNoWarn are
/// '0' followed by 'x' or 'X'. This is consulted in Microsoft mode (where
/// something like 0x1234567e+1 is supposed to be several different tokens)
/// and when deciding whether a 'p' exponent belongs to a hex float.
bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {
  unsigned Size;
  char C1 = Lexer::getCharAndSizeNoWarn(Start, Size, LangOpts);
  if (C1 != '0')
    return false;
  char C2 = Lexer::getCharAndSizeNoWarn(Start + Size, Size, LangOpts);
  return (C2 == 'x' || C2 == 'X');
}

/// LexNumericConstant - Lex the remainder of an integer or floating point
/// constant. The token starts at BufferPtr and CurPtr is the position at
/// which to continue lexing. Forms a tok::numeric_constant token in Result
/// and returns true.
bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
  unsigned Size;
  char C = getCharAndSize(CurPtr, Size);
  char PrevCh = 0;
  while (isPreprocessingNumberBody(C)) {
    CurPtr = ConsumeChar(CurPtr, Size, Result);
    PrevCh = C;
    C = getCharAndSize(CurPtr, Size);
  }

  // If we fell out, check for a sign, due to 1e+12.  If we have one, continue.
  if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) {
    // If we are in Microsoft mode, don't continue if the constant is hex.
    // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1
    if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts))
      return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
  }

  // If we have a hex FP constant, continue.
  if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) {
    // Outside C99 and C++17, we accept hexadecimal floating point numbers as a
    // not-quite-conforming extension. Only do so if this looks like it's
    // actually meant to be a hexfloat, and not if it has a ud-suffix.
    bool IsHexFloat = true;
    if (!LangOpts.C99) {
      if (!isHexaLiteral(BufferPtr, LangOpts))
        IsHexFloat = false;
      else if (!getLangOpts().CPlusPlus17 &&
               std::find(BufferPtr, CurPtr, '_') != CurPtr)
        IsHexFloat = false;
    }
    if (IsHexFloat)
      return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
  }

  // If we have a digit separator, continue.
  if (C == '\'' && getLangOpts().CPlusPlus14) {
    unsigned NextSize;
    char Next = getCharAndSizeNoWarn(CurPtr + Size, NextSize, getLangOpts());
    if (isIdentifierBody(Next)) {
      if (!isLexingRawMode())
        Diag(CurPtr, diag::warn_cxx11_compat_digit_separator);
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      CurPtr = ConsumeChar(CurPtr, NextSize, Result);
      return LexNumericConstant(Result, CurPtr);
    }
  }

  // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue.
  if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
    return LexNumericConstant(Result, CurPtr);
  if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
    return LexNumericConstant(Result, CurPtr);

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
  Result.setLiteralData(TokStart);
  return true;
}

/// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes
/// in C++11, or warn on a ud-suffix in C++98. Returns the position after the
/// suffix if one was consumed, or CurPtr unchanged if no suffix was consumed.
const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
                               bool IsStringLiteral) {
  assert(getLangOpts().CPlusPlus);

  // Maximally munch an identifier.
  unsigned Size;
  char C = getCharAndSize(CurPtr, Size);
  bool Consumed = false;

  if (!isIdentifierHead(C)) {
    if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
      Consumed = true;
    else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
      Consumed = true;
    else
      return CurPtr;
  }

  if (!getLangOpts().CPlusPlus11) {
    if (!isLexingRawMode())
      Diag(CurPtr,
           C == '_' ? diag::warn_cxx11_compat_user_defined_literal
                    : diag::warn_cxx11_compat_reserved_user_defined_literal)
        << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
    return CurPtr;
  }

  // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
  // that does not start with an underscore is ill-formed. As a conforming
  // extension, we treat all such suffixes as if they had whitespace before
  // them. We assume a suffix beginning with a UCN or UTF-8 character is more
  // likely to be a ud-suffix than a macro, however, and accept that.
  if (!Consumed) {
    bool IsUDSuffix = false;
    if (C == '_')
      IsUDSuffix = true;
    else if (IsStringLiteral && getLangOpts().CPlusPlus14) {
      // In C++1y, we need to look ahead a few characters to see if this is a
      // valid suffix for a string literal or a numeric literal (this could be
      // the 'operator""if' defining a numeric literal operator).
      const unsigned MaxStandardSuffixLength = 3;
      char Buffer[MaxStandardSuffixLength] = { C };
      // Note: this local shadows the outer 'bool Consumed'; here it counts
      // the bytes looked at so far.
      unsigned Consumed = Size;
      unsigned Chars = 1;
      while (true) {
        unsigned NextSize;
        char Next = getCharAndSizeNoWarn(CurPtr + Consumed, NextSize,
                                         getLangOpts());
        if (!isIdentifierBody(Next)) {
          // End of suffix. Check whether this is on the whitelist.
          const StringRef CompleteSuffix(Buffer, Chars);
          IsUDSuffix = StringLiteralParser::isValidUDSuffix(getLangOpts(),
                                                            CompleteSuffix);
          break;
        }

        if (Chars == MaxStandardSuffixLength)
          // Too long: can't be a standard suffix.
          break;

        Buffer[Chars++] = Next;
        Consumed += NextSize;
      }
    }

    if (!IsUDSuffix) {
      if (!isLexingRawMode())
        Diag(CurPtr, getLangOpts().MSVCCompat
                         ? diag::ext_ms_reserved_user_defined_literal
                         : diag::ext_reserved_user_defined_literal)
          << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
      return CurPtr;
    }

    CurPtr = ConsumeChar(CurPtr, Size, Result);
  }

  Result.setFlag(Token::HasUDSuffix);
  while (true) {
    C = getCharAndSize(CurPtr, Size);
    if (isIdentifierBody(C)) { CurPtr = ConsumeChar(CurPtr, Size, Result); }
    else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {}
    else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {}
    else break;
  }

  return CurPtr;
}

/// LexStringLiteral - Lex the remainder of a string literal, after having lexed
/// either " or L" or u8" or u" or U".
bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
                             tok::TokenKind Kind) {
  const char *AfterQuote = CurPtr;
  // Does this string contain the \0 character?
  const char *NulCharacter = nullptr;

  if (!isLexingRawMode() &&
      (Kind == tok::utf8_string_literal ||
       Kind == tok::utf16_string_literal ||
       Kind == tok::utf32_string_literal))
    Diag(BufferPtr, getLangOpts().CPlusPlus
           ? diag::warn_cxx98_compat_unicode_literal
           : diag::warn_c99_compat_unicode_literal);

  char C = getAndAdvanceChar(CurPtr, Result);
  while (C != '"') {
    // Skip escaped characters.  Escaped newlines will already be processed by
    // getAndAdvanceChar.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (C == '\n' || C == '\r' ||             // Newline.
        (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
      if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
        Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1;
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }

    if (C == 0) {
      if (isCodeCompletionPoint(CurPtr-1)) {
        if (ParsingFilename)
          codeCompleteIncludedFile(AfterQuote, CurPtr - 1, /*IsAngled=*/false);
        else
          PP->CodeCompleteNaturalLanguage();
        FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
        cutOffLexing();
        return true;
      }

      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If we are in C++, lex the optional ud-suffix. (LexUDSuffix itself only
  // warns and consumes nothing further when C++11 is not enabled.)
  if (getLangOpts().CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, true);

  // If a nul character existed in the string, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 1;

  // Update the location of the token as well as the BufferPtr instance var.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}

/// LexRawStringLiteral - Lex the remainder of a raw string literal, after
/// having lexed R", LR", u8R", uR", or UR".
bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
                                tok::TokenKind Kind) {
  // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
  //  Between the initial and final double quote characters of the raw string,
  //  any transformations performed in phases 1 and 2 (trigraphs,
  //  universal-character-names, and line splicing) are reverted.

  if (!isLexingRawMode())
    Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);

  unsigned PrefixLen = 0;

  // Scan the delimiter; at most 16 delimiter characters are accepted.
  while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen]))
    ++PrefixLen;

  // If the last character was not a '(', then we didn't lex a valid delimiter.
  if (CurPtr[PrefixLen] != '(') {
    if (!isLexingRawMode()) {
      const char *PrefixEnd = &CurPtr[PrefixLen];
      if (PrefixLen == 16) {
        Diag(PrefixEnd, diag::err_raw_delim_too_long);
      } else {
        Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
          << StringRef(PrefixEnd, 1);
      }
    }

    // Search for the next '"' in hopes of salvaging the lexer. Unfortunately,
    // it's possible the '"' was intended to be part of the raw string, but
    // there's not much we can do about that.
    while (true) {
      char C = *CurPtr++;

      if (C == '"')
        break;
      if (C == 0 && CurPtr-1 == BufferEnd) {
        --CurPtr;
        break;
      }
    }

    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return true;
  }

  // Save prefix and move CurPtr past it
  const char *Prefix = CurPtr;
  CurPtr += PrefixLen + 1; // skip over prefix and '('

  while (true) {
    char C = *CurPtr++;

    if (C == ')') {
      // Check for prefix match and closing quote.
      if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') {
        CurPtr += PrefixLen + 1; // skip over prefix and '"'
        break;
      }
    } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file.
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::err_unterminated_raw_string)
          << StringRef(Prefix, PrefixLen);
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }
  }

  // If we are in C++, lex the optional ud-suffix. (LexUDSuffix itself only
  // warns and consumes nothing further when C++11 is not enabled.)
  if (getLangOpts().CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, true);

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}

/// LexAngledStringLiteral - Lex the remainder of an angled string literal,
/// after having lexed the '<' character.  This is used for #include filenames.
bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
  // Does this string contain the \0 character?
  const char *NulCharacter = nullptr;
  const char *AfterLessPos = CurPtr;
  char C = getAndAdvanceChar(CurPtr, Result);
  while (C != '>') {
    // Skip escaped characters.  Escaped newlines will already be processed by
    // getAndAdvanceChar.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (C == '\n' || C == '\r' ||                // Newline.
        (C == 0 && (CurPtr - 1 == BufferEnd))) { // End of file.
      // If the filename is unterminated, then it must just be a lone <
      // character.  Return this as such.
      FormTokenWithChars(Result, AfterLessPos, tok::less);
      return true;
    }

    if (C == 0) {
      if (isCodeCompletionPoint(CurPtr - 1)) {
        codeCompleteIncludedFile(AfterLessPos, CurPtr - 1, /*IsAngled=*/true);
        cutOffLexing();
        FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
        return true;
      }
      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If a nul character existed in the string, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 1;

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::header_name);
  Result.setLiteralData(TokStart);
  return true;
}

/// Handle a code-completion point found inside an #include filename.
///
/// \param PathStart       first character of the path (the callers in this
///                        file pass the position just after the opening '"'
///                        or '<').
/// \param CompletionPoint position of the code-completion character.
/// \param IsAngled        true for a <...> filename, false for a "..." one.
void Lexer::codeCompleteIncludedFile(const char *PathStart,
                                     const char *CompletionPoint,
                                     bool IsAngled) {
  // Completion only applies to the filename, after the last slash.
  StringRef PartialPath(PathStart, CompletionPoint - PathStart);
  auto Slash = PartialPath.find_last_of(LangOpts.MSVCCompat ? "/\\" : "/");
  StringRef Dir =
      (Slash == StringRef::npos) ? "" : PartialPath.take_front(Slash);
  const char *StartOfFilename =
      (Slash == StringRef::npos) ? PathStart : PathStart + Slash + 1;
  // Code completion filter range is the filename only, up to completion point.
  PP->setCodeCompletionIdentifierInfo(&PP->getIdentifierTable().get(
      StringRef(StartOfFilename, CompletionPoint - StartOfFilename)));
  // We should replace the characters up to the closing quote, if any.
  while (CompletionPoint < BufferEnd) {
    char Next = *(CompletionPoint + 1);
    if (Next == 0 || Next == '\r' || Next == '\n')
      break;
    ++CompletionPoint;
    if (Next == (IsAngled ? '>' : '"'))
      break;
  }
  PP->setCodeCompletionTokenRange(
      FileLoc.getLocWithOffset(StartOfFilename - BufferStart),
      FileLoc.getLocWithOffset(CompletionPoint - BufferStart));
  PP->CodeCompleteIncludedFile(Dir, IsAngled);
}

/// LexCharConstant - Lex the remainder of a character constant, after having
/// lexed either ' or L' or u8' or u' or U'.
bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,
                            tok::TokenKind Kind) {
  // Does this character contain the \0 character?
  const char *NulCharacter = nullptr;

  if (!isLexingRawMode()) {
    if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant)
      Diag(BufferPtr, getLangOpts().CPlusPlus
                          ? diag::warn_cxx98_compat_unicode_literal
                          : diag::warn_c99_compat_unicode_literal);
    else if (Kind == tok::utf8_char_constant)
      Diag(BufferPtr, diag::warn_cxx14_compat_u8_character_literal);
  }

  char C = getAndAdvanceChar(CurPtr, Result);
  if (C == '\'') {
    if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
      Diag(BufferPtr, diag::ext_empty_character);
    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return true;
  }

  while (C != '\'') {
    // Skip escaped characters.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (C == '\n' || C == '\r' ||             // Newline.
        (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
      if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
        Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 0;
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }

    if (C == 0) {
      if (isCodeCompletionPoint(CurPtr-1)) {
        PP->CodeCompleteNaturalLanguage();
        FormTokenWithChars(Result, CurPtr-1, tok::unknown);
        cutOffLexing();
        return true;
      }

      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If we are in C++, lex the optional ud-suffix. (LexUDSuffix itself only
  // warns and consumes nothing further when C++11 is not enabled.)
  if (getLangOpts().CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, false);

  // If a nul character existed in the character, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 0;

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}

/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
/// Update BufferPtr to point to the next non-whitespace character and return.
///
/// This method forms a token and returns true if KeepWhitespaceMode is enabled.
bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,
                           bool &TokAtPhysicalStartOfLine) {
  // Whitespace - Skip it, then return the token after the whitespace.
  bool SawNewline = isVerticalWhitespace(CurPtr[-1]);

  unsigned char Char = *CurPtr;

  // Skip consecutive spaces efficiently.
  while (true) {
    // Skip horizontal whitespace very aggressively.
    while (isHorizontalWhitespace(Char))
      Char = *++CurPtr;

    // Otherwise if we have something other than whitespace, we're done.
    if (!isVerticalWhitespace(Char))
      break;

    if (ParsingPreprocessorDirective) {
      // End of preprocessor directive line, let LexTokenInternal handle this.
      BufferPtr = CurPtr;
      return false;
    }

    // OK, but handle newline.
    SawNewline = true;
    Char = *++CurPtr;
  }

  // If the client wants us to return whitespace, return it now.
  if (isKeepWhitespaceMode()) {
    FormTokenWithChars(Result, CurPtr, tok::unknown);
    if (SawNewline) {
      IsAtStartOfLine = true;
      IsAtPhysicalStartOfLine = true;
    }
    // FIXME: The next token will not have LeadingSpace set.
    return true;
  }

  // If this isn't immediately after a newline, there is leading space.
  char PrevChar = CurPtr[-1];
  bool HasLeadingSpace = !isVerticalWhitespace(PrevChar);

  Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace);
  if (SawNewline) {
    Result.setFlag(Token::StartOfLine);
    TokAtPhysicalStartOfLine = true;
  }

  BufferPtr = CurPtr;
  return false;
}

/// We have just read the // characters from input.  Skip until we find the
/// newline character that terminates the comment.  Then update BufferPtr and
/// return.
///
/// If we're in KeepCommentMode or any CommentHandler has inserted
/// some tokens, this will store the first token and return true.
bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
                            bool &TokAtPhysicalStartOfLine) {
  // If Line comments aren't explicitly enabled for this language, emit an
  // extension warning.
  if (!LangOpts.LineComment && !isLexingRawMode()) {
    Diag(BufferPtr, diag::ext_line_comment);

    // Mark them enabled so we only emit one warning for this translation
    // unit.
    LangOpts.LineComment = true;
  }

  // Scan over the body of the comment.  The common case, when scanning, is that
  // the comment contains normal ascii characters with nothing interesting in
  // them.  As such, optimize for this case with the inner loop.
  //
  // This loop terminates with CurPtr pointing at the newline (or end of buffer)
  // character that ends the line comment.
  char C;
  while (true) {
    C = *CurPtr;
    // Skip over characters in the fast loop.
    while (C != 0 &&                // Potentially EOF.
           C != '\n' && C != '\r')  // Newline or DOS-style newline.
      C = *++CurPtr;

    const char *NextLine = CurPtr;
    if (C != 0) {
      // We found a newline, see if it's escaped.
      const char *EscapePtr = CurPtr-1;
      bool HasSpace = false;
      while (isHorizontalWhitespace(*EscapePtr)) { // Skip whitespace.
        --EscapePtr;
        HasSpace = true;
      }

      if (*EscapePtr == '\\')
        // Escaped newline.
        CurPtr = EscapePtr;
      else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
               EscapePtr[-2] == '?' && LangOpts.Trigraphs)
        // Trigraph-escaped newline.
        CurPtr = EscapePtr-2;
      else
        break; // This is a newline, we're done.

      // If there was space between the backslash and newline, warn about it.
      if (HasSpace && !isLexingRawMode())
        Diag(EscapePtr, diag::backslash_newline_space);
    }

    // Otherwise, this is a hard case.  Fall back on getAndAdvanceChar to
    // properly decode the character.  Read it in raw mode to avoid emitting
    // diagnostics about things like trigraphs.  If we see an escaped newline,
    // we'll handle it below.
    const char *OldPtr = CurPtr;
    bool OldRawMode = isLexingRawMode();
    LexingRawMode = true;
    C = getAndAdvanceChar(CurPtr, Result);
    LexingRawMode = OldRawMode;

    // If we read only one character, then no special handling is needed.
    // We're done and can skip forward to the newline.
    if (C != 0 && CurPtr == OldPtr+1) {
      CurPtr = NextLine;
      break;
    }

    // If we read multiple characters, and one of those characters was a \r or
    // \n, then we had an escaped newline within the comment.  Emit diagnostic
    // unless the next line is also a // comment.
    if (CurPtr != OldPtr + 1 && C != '/' &&
        (CurPtr == BufferEnd + 1 || CurPtr[0] != '/')) {
      for (; OldPtr != CurPtr; ++OldPtr)
        if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
          // Okay, we found a // comment that ends in a newline, if the next
          // line is also a // comment, but has spaces, don't emit a diagnostic.
          if (isWhitespace(C)) {
            const char *ForwardPtr = CurPtr;
            while (isWhitespace(*ForwardPtr))  // Skip whitespace.
              ++ForwardPtr;
            if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
              break;
          }

          if (!isLexingRawMode())
            Diag(OldPtr-1, diag::ext_multi_line_line_comment);
          break;
        }
    }

    if (C == '\r' || C == '\n' || CurPtr == BufferEnd + 1) {
      --CurPtr;
      break;
    }

    if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
      PP->CodeCompleteNaturalLanguage();
      cutOffLexing();
      return false;
    }
  }

  // Found but did not consume the newline.  Notify comment handlers about the
  // comment unless we're in a #if 0 block.
  if (PP && !isLexingRawMode() &&
      PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
                                            getSourceLocation(CurPtr)))) {
    BufferPtr = CurPtr;
    return true; // A token has to be returned.
  }

  // If we are returning comments as tokens, return this comment as a token.
  if (inKeepCommentMode())
    return SaveLineComment(Result, CurPtr);

  // If we are inside a preprocessor directive and we see the end of line,
  // return immediately, so that the lexer can return this as an EOD token.
  if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
    BufferPtr = CurPtr;
    return false;
  }

  // Otherwise, eat the \n character.  We don't care if this is a \n\r or
  // \r\n sequence.  This is an efficiency hack (because we know the \n can't
  // contribute to another token), it isn't needed for correctness.  Note that
  // this is ok even in KeepWhitespaceMode, because we would have returned the
  // comment above in that mode.
  ++CurPtr;

  // The next returned token is at the start of the line.
  Result.setFlag(Token::StartOfLine);
  TokAtPhysicalStartOfLine = true;
  // No leading whitespace seen so far.
  Result.clearFlag(Token::LeadingSpace);
  BufferPtr = CurPtr;
  return false;
}

/// If in save-comment mode, package up this Line comment in an appropriate
/// way and return it.
bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {
  // If we're not in a preprocessor directive, just return the // comment
  // directly.
  FormTokenWithChars(Result, CurPtr, tok::comment);

  if (!ParsingPreprocessorDirective || LexingRawMode)
    return true;

  // If this Line-style comment is in a macro definition, transmogrify it into
  // a C-style block comment.
  bool Invalid = false;
  std::string Spelling = PP->getSpelling(Result, &Invalid);
  if (Invalid)
    return true;

  assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?");
  Spelling[1] = '*';   // Change prefix to "/*".
  Spelling += "*/";    // add suffix.

  Result.setKind(tok::comment);
  PP->CreateString(Spelling, Result,
                   Result.getLocation(), Result.getLocation());
  return true;
}

/// isEndOfBlockCommentWithEscapedNewLine - Return true if the specified newline
/// character (either \\n or \\r) is part of an escaped newline sequence.  Issue
/// a diagnostic if so.  We know that the newline is inside of a block comment.
static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr,
                                                  Lexer *L) {
  assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');

  // Back up off the newline.
  --CurPtr;

  // If this is a two-character newline sequence, skip the other character.
  if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
    // \n\n or \r\r -> not escaped newline.
    if (CurPtr[0] == CurPtr[1])
      return false;
    // \n\r or \r\n -> skip the newline.
    --CurPtr;
  }

  // If we have horizontal whitespace, skip over it.  We allow whitespace
  // between the slash and newline.
  bool HasSpace = false;
  while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
    --CurPtr;
    HasSpace = true;
  }

  // If we have a slash, we know this is an escaped newline.
  if (*CurPtr == '\\') {
    if (CurPtr[-1] != '*') return false;
  } else {
    // It isn't a slash, is it the ?? / trigraph?
    if (CurPtr[0] != '/' || CurPtr[-1] != '?' || CurPtr[-2] != '?' ||
        CurPtr[-3] != '*')
      return false;

    // This is the trigraph ending the comment.  Emit a stern warning!
    CurPtr -= 2;

    // If no trigraphs are enabled, warn that we ignored this trigraph and
    // ignore this * character.
    if (!L->getLangOpts().Trigraphs) {
      if (!L->isLexingRawMode())
        L->Diag(CurPtr, diag::trigraph_ignored_block_comment);
      return false;
    }
    if (!L->isLexingRawMode())
      L->Diag(CurPtr, diag::trigraph_ends_block_comment);
  }

  // Warn about having an escaped newline between the */ characters.
  if (!L->isLexingRawMode())
    L->Diag(CurPtr, diag::escaped_newline_block_comment_end);

  // If there was space between the backslash and newline, warn about it.
  if (HasSpace && !L->isLexingRawMode())
    L->Diag(CurPtr, diag::backslash_newline_space);

  return true;
}

#ifdef __SSE2__
#include <emmintrin.h>
#elif __ALTIVEC__
#include <altivec.h>
#undef bool
#endif

/// We have just read from input the / and * characters that started a comment.
/// Read until we find the * and / characters that terminate the comment.
/// Note that we don't bother decoding trigraphs or escaped newlines in block
/// comments, because they cannot cause the comment to end.  The only thing
/// that can happen is the comment could end with an escaped newline between
/// the terminating * and /.
///
/// If we're in KeepCommentMode or any CommentHandler has inserted
/// some tokens, this will store the first token and return true.
bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
                             bool &TokAtPhysicalStartOfLine) {
  // Scan one character past where we should, looking for a '/' character.  Once
  // we find it, check to see if it was preceded by a *.  This common
  // optimization helps people who like to put a lot of * characters in their
  // comments.

  // The first character we get with newlines and trigraphs skipped to handle
  // the degenerate /*/ case below correctly if the * has an escaped newline
  // after it.
  unsigned CharSize;
  unsigned char C = getCharAndSize(CurPtr, CharSize);
  CurPtr += CharSize;
  if (C == 0 && CurPtr == BufferEnd+1) {
    if (!isLexingRawMode())
      Diag(BufferPtr, diag::err_unterminated_block_comment);
    --CurPtr;

    // KeepWhitespaceMode should return this broken comment as a token.  Since
    // it isn't a well formed comment, just return it as an 'unknown' token.
    if (isKeepWhitespaceMode()) {
      FormTokenWithChars(Result, CurPtr, tok::unknown);
      return true;
    }

    BufferPtr = CurPtr;
    return false;
  }

  // Check to see if the first character after the '/*' is another /.  If so,
  // then this slash does not end the block comment, it is part of it.
  if (C == '/')
    C = *CurPtr++;

  while (true) {
    // Skip over all non-interesting characters until we find end of buffer or a
    // (probably ending) '/' character.
    if (CurPtr + 24 < BufferEnd &&
        // If there is a code-completion point avoid the fast scan because it
        // doesn't check for '\0'.
        !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
      // While not aligned to a 16-byte boundary.
      while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0)
        C = *CurPtr++;

      if (C == '/') goto FoundSlash;

#ifdef __SSE2__
      __m128i Slashes = _mm_set1_epi8('/');
      while (CurPtr+16 <= BufferEnd) {
        int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
                                    Slashes));
        if (cmp != 0) {
          // Adjust the pointer to point directly after the first slash. It's
          // not necessary to set C here, it will be overwritten at the end of
          // the outer loop.
          CurPtr += llvm::countTrailingZeros<unsigned>(cmp) + 1;
          goto FoundSlash;
        }
        CurPtr += 16;
      }
#elif __ALTIVEC__
      __vector unsigned char Slashes = {
        '/', '/', '/', '/',  '/', '/', '/', '/',
        '/', '/', '/', '/',  '/', '/', '/', '/'
      };
      while (CurPtr+16 <= BufferEnd &&
             !vec_any_eq(*(const vector unsigned char*)CurPtr, Slashes))
        CurPtr += 16;
#else
      // Scan for '/' quickly.  Many block comments are very large.
      while (CurPtr[0] != '/' &&
             CurPtr[1] != '/' &&
             CurPtr[2] != '/' &&
             CurPtr[3] != '/' &&
             CurPtr+4 < BufferEnd) {
        CurPtr += 4;
      }
#endif

      // It has to be one of the bytes scanned, increment to it and read one.
      C = *CurPtr++;
    }

    // Loop to scan the remainder.
    while (C != '/' && C != '\0')
      C = *CurPtr++;

    if (C == '/') {
  FoundSlash:
      if (CurPtr[-2] == '*')  // We found the final */.  We're done!
        break;

      if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
        if (isEndOfBlockCommentWithEscapedNewLine(CurPtr-2, this)) {
          // We found the final */, though it had an escaped newline between the
          // * and /.  We're done!
          break;
        }
      }
      if (CurPtr[0] == '*' && CurPtr[1] != '/') {
        // If this is a /* inside of the comment, emit a warning.  Don't do this
        // if this is a /*/, which will end the comment.  This misses cases with
        // embedded escaped newlines, but oh well.
        if (!isLexingRawMode())
          Diag(CurPtr-1, diag::warn_nested_block_comment);
      }
    } else if (C == 0 && CurPtr == BufferEnd+1) {
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::err_unterminated_block_comment);
      // Note: the user probably forgot a */.  We could continue immediately
      // after the /*, but this would involve lexing a lot of what really is the
      // comment, which surely would confuse the parser.
      --CurPtr;

      // KeepWhitespaceMode should return this broken comment as a token.  Since
      // it isn't a well formed comment, just return it as an 'unknown' token.
2596 if (isKeepWhitespaceMode()) { 2597 FormTokenWithChars(Result, CurPtr, tok::unknown); 2598 return true; 2599 } 2600 2601 BufferPtr = CurPtr; 2602 return false; 2603 } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) { 2604 PP->CodeCompleteNaturalLanguage(); 2605 cutOffLexing(); 2606 return false; 2607 } 2608 2609 C = *CurPtr++; 2610 } 2611 2612 // Notify comment handlers about the comment unless we're in a #if 0 block. 2613 if (PP && !isLexingRawMode() && 2614 PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr), 2615 getSourceLocation(CurPtr)))) { 2616 BufferPtr = CurPtr; 2617 return true; // A token has to be returned. 2618 } 2619 2620 // If we are returning comments as tokens, return this comment as a token. 2621 if (inKeepCommentMode()) { 2622 FormTokenWithChars(Result, CurPtr, tok::comment); 2623 return true; 2624 } 2625 2626 // It is common for the tokens immediately after a /**/ comment to be 2627 // whitespace. Instead of going through the big switch, handle it 2628 // efficiently now. This is safe even in KeepWhitespaceMode because we would 2629 // have already returned above with the comment as a token. 2630 if (isHorizontalWhitespace(*CurPtr)) { 2631 SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine); 2632 return false; 2633 } 2634 2635 // Otherwise, just return so that the next character will be lexed as a token. 2636 BufferPtr = CurPtr; 2637 Result.setFlag(Token::LeadingSpace); 2638 return false; 2639 } 2640 2641 //===----------------------------------------------------------------------===// 2642 // Primary Lexing Entry Points 2643 //===----------------------------------------------------------------------===// 2644 2645 /// ReadToEndOfLine - Read the rest of the current preprocessor line as an 2646 /// uninterpreted string. This switches the lexer out of directive mode. 
2647 void Lexer::ReadToEndOfLine(SmallVectorImpl<char> *Result) { 2648 assert(ParsingPreprocessorDirective && ParsingFilename == false && 2649 "Must be in a preprocessing directive!"); 2650 Token Tmp; 2651 2652 // CurPtr - Cache BufferPtr in an automatic variable. 2653 const char *CurPtr = BufferPtr; 2654 while (true) { 2655 char Char = getAndAdvanceChar(CurPtr, Tmp); 2656 switch (Char) { 2657 default: 2658 if (Result) 2659 Result->push_back(Char); 2660 break; 2661 case 0: // Null. 2662 // Found end of file? 2663 if (CurPtr-1 != BufferEnd) { 2664 if (isCodeCompletionPoint(CurPtr-1)) { 2665 PP->CodeCompleteNaturalLanguage(); 2666 cutOffLexing(); 2667 return; 2668 } 2669 2670 // Nope, normal character, continue. 2671 if (Result) 2672 Result->push_back(Char); 2673 break; 2674 } 2675 // FALL THROUGH. 2676 LLVM_FALLTHROUGH; 2677 case '\r': 2678 case '\n': 2679 // Okay, we found the end of the line. First, back up past the \0, \r, \n. 2680 assert(CurPtr[-1] == Char && "Trigraphs for newline?"); 2681 BufferPtr = CurPtr-1; 2682 2683 // Next, lex the character, which should handle the EOD transition. 2684 Lex(Tmp); 2685 if (Tmp.is(tok::code_completion)) { 2686 if (PP) 2687 PP->CodeCompleteNaturalLanguage(); 2688 Lex(Tmp); 2689 } 2690 assert(Tmp.is(tok::eod) && "Unexpected token!"); 2691 2692 // Finally, we're done; 2693 return; 2694 } 2695 } 2696 } 2697 2698 /// LexEndOfFile - CurPtr points to the end of this file. Handle this 2699 /// condition, reporting diagnostics and handling other edge cases as required. 2700 /// This returns true if Result contains a token, false if PP.Lex should be 2701 /// called again. 2702 bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) { 2703 // If we hit the end of the file while parsing a preprocessor directive, 2704 // end the preprocessor directive first. The next token returned will 2705 // then be the end of file. 2706 if (ParsingPreprocessorDirective) { 2707 // Done parsing the "line". 
2708 ParsingPreprocessorDirective = false; 2709 // Update the location of token as well as BufferPtr. 2710 FormTokenWithChars(Result, CurPtr, tok::eod); 2711 2712 // Restore comment saving mode, in case it was disabled for directive. 2713 if (PP) 2714 resetExtendedTokenMode(); 2715 return true; // Have a token. 2716 } 2717 2718 // If we are in raw mode, return this event as an EOF token. Let the caller 2719 // that put us in raw mode handle the event. 2720 if (isLexingRawMode()) { 2721 Result.startToken(); 2722 BufferPtr = BufferEnd; 2723 FormTokenWithChars(Result, BufferEnd, tok::eof); 2724 return true; 2725 } 2726 2727 if (PP->isRecordingPreamble() && PP->isInPrimaryFile()) { 2728 PP->setRecordedPreambleConditionalStack(ConditionalStack); 2729 ConditionalStack.clear(); 2730 } 2731 2732 // Issue diagnostics for unterminated #if and missing newline. 2733 2734 // If we are in a #if directive, emit an error. 2735 while (!ConditionalStack.empty()) { 2736 if (PP->getCodeCompletionFileLoc() != FileLoc) 2737 PP->Diag(ConditionalStack.back().IfLoc, 2738 diag::err_pp_unterminated_conditional); 2739 ConditionalStack.pop_back(); 2740 } 2741 2742 // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue 2743 // a pedwarn. 2744 if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) { 2745 DiagnosticsEngine &Diags = PP->getDiagnostics(); 2746 SourceLocation EndLoc = getSourceLocation(BufferEnd); 2747 unsigned DiagID; 2748 2749 if (LangOpts.CPlusPlus11) { 2750 // C++11 [lex.phases] 2.2 p2 2751 // Prefer the C++98 pedantic compatibility warning over the generic, 2752 // non-extension, user-requested "missing newline at EOF" warning. 
2753 if (!Diags.isIgnored(diag::warn_cxx98_compat_no_newline_eof, EndLoc)) { 2754 DiagID = diag::warn_cxx98_compat_no_newline_eof; 2755 } else { 2756 DiagID = diag::warn_no_newline_eof; 2757 } 2758 } else { 2759 DiagID = diag::ext_no_newline_eof; 2760 } 2761 2762 Diag(BufferEnd, DiagID) 2763 << FixItHint::CreateInsertion(EndLoc, "\n"); 2764 } 2765 2766 BufferPtr = CurPtr; 2767 2768 // Finally, let the preprocessor handle this. 2769 return PP->HandleEndOfFile(Result, isPragmaLexer()); 2770 } 2771 2772 /// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from 2773 /// the specified lexer will return a tok::l_paren token, 0 if it is something 2774 /// else and 2 if there are no more tokens in the buffer controlled by the 2775 /// lexer. 2776 unsigned Lexer::isNextPPTokenLParen() { 2777 assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?"); 2778 2779 // Switch to 'skipping' mode. This will ensure that we can lex a token 2780 // without emitting diagnostics, disables macro expansion, and will cause EOF 2781 // to return an EOF token instead of popping the include stack. 2782 LexingRawMode = true; 2783 2784 // Save state that can be changed while lexing so that we can restore it. 2785 const char *TmpBufferPtr = BufferPtr; 2786 bool inPPDirectiveMode = ParsingPreprocessorDirective; 2787 bool atStartOfLine = IsAtStartOfLine; 2788 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine; 2789 bool leadingSpace = HasLeadingSpace; 2790 2791 Token Tok; 2792 Lex(Tok); 2793 2794 // Restore state that may have changed. 2795 BufferPtr = TmpBufferPtr; 2796 ParsingPreprocessorDirective = inPPDirectiveMode; 2797 HasLeadingSpace = leadingSpace; 2798 IsAtStartOfLine = atStartOfLine; 2799 IsAtPhysicalStartOfLine = atPhysicalStartOfLine; 2800 2801 // Restore the lexer back to non-skipping mode. 
2802 LexingRawMode = false; 2803 2804 if (Tok.is(tok::eof)) 2805 return 2; 2806 return Tok.is(tok::l_paren); 2807 } 2808 2809 /// Find the end of a version control conflict marker. 2810 static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd, 2811 ConflictMarkerKind CMK) { 2812 const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>"; 2813 size_t TermLen = CMK == CMK_Perforce ? 5 : 7; 2814 auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen); 2815 size_t Pos = RestOfBuffer.find(Terminator); 2816 while (Pos != StringRef::npos) { 2817 // Must occur at start of line. 2818 if (Pos == 0 || 2819 (RestOfBuffer[Pos - 1] != '\r' && RestOfBuffer[Pos - 1] != '\n')) { 2820 RestOfBuffer = RestOfBuffer.substr(Pos+TermLen); 2821 Pos = RestOfBuffer.find(Terminator); 2822 continue; 2823 } 2824 return RestOfBuffer.data()+Pos; 2825 } 2826 return nullptr; 2827 } 2828 2829 /// IsStartOfConflictMarker - If the specified pointer is the start of a version 2830 /// control conflict marker like '<<<<<<<', recognize it as such, emit an error 2831 /// and recover nicely. This returns true if it is a conflict marker and false 2832 /// if not. 2833 bool Lexer::IsStartOfConflictMarker(const char *CurPtr) { 2834 // Only a conflict marker if it starts at the beginning of a line. 2835 if (CurPtr != BufferStart && 2836 CurPtr[-1] != '\n' && CurPtr[-1] != '\r') 2837 return false; 2838 2839 // Check to see if we have <<<<<<< or >>>>. 2840 if (!StringRef(CurPtr, BufferEnd - CurPtr).startswith("<<<<<<<") && 2841 !StringRef(CurPtr, BufferEnd - CurPtr).startswith(">>>> ")) 2842 return false; 2843 2844 // If we have a situation where we don't care about conflict markers, ignore 2845 // it. 2846 if (CurrentConflictMarkerState || isLexingRawMode()) 2847 return false; 2848 2849 ConflictMarkerKind Kind = *CurPtr == '<' ? 
CMK_Normal : CMK_Perforce; 2850 2851 // Check to see if there is an ending marker somewhere in the buffer at the 2852 // start of a line to terminate this conflict marker. 2853 if (FindConflictEnd(CurPtr, BufferEnd, Kind)) { 2854 // We found a match. We are really in a conflict marker. 2855 // Diagnose this, and ignore to the end of line. 2856 Diag(CurPtr, diag::err_conflict_marker); 2857 CurrentConflictMarkerState = Kind; 2858 2859 // Skip ahead to the end of line. We know this exists because the 2860 // end-of-conflict marker starts with \r or \n. 2861 while (*CurPtr != '\r' && *CurPtr != '\n') { 2862 assert(CurPtr != BufferEnd && "Didn't find end of line"); 2863 ++CurPtr; 2864 } 2865 BufferPtr = CurPtr; 2866 return true; 2867 } 2868 2869 // No end of conflict marker found. 2870 return false; 2871 } 2872 2873 /// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if 2874 /// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it 2875 /// is the end of a conflict marker. Handle it by ignoring up until the end of 2876 /// the line. This returns true if it is a conflict marker and false if not. 2877 bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) { 2878 // Only a conflict marker if it starts at the beginning of a line. 2879 if (CurPtr != BufferStart && 2880 CurPtr[-1] != '\n' && CurPtr[-1] != '\r') 2881 return false; 2882 2883 // If we have a situation where we don't care about conflict markers, ignore 2884 // it. 2885 if (!CurrentConflictMarkerState || isLexingRawMode()) 2886 return false; 2887 2888 // Check to see if we have the marker (4 characters in a row). 2889 for (unsigned i = 1; i != 4; ++i) 2890 if (CurPtr[i] != CurPtr[0]) 2891 return false; 2892 2893 // If we do have it, search for the end of the conflict marker. This could 2894 // fail if it got skipped with a '#if 0' or something. Note that CurPtr might 2895 // be the end of conflict marker. 
2896 if (const char *End = FindConflictEnd(CurPtr, BufferEnd, 2897 CurrentConflictMarkerState)) { 2898 CurPtr = End; 2899 2900 // Skip ahead to the end of line. 2901 while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n') 2902 ++CurPtr; 2903 2904 BufferPtr = CurPtr; 2905 2906 // No longer in the conflict marker. 2907 CurrentConflictMarkerState = CMK_None; 2908 return true; 2909 } 2910 2911 return false; 2912 } 2913 2914 static const char *findPlaceholderEnd(const char *CurPtr, 2915 const char *BufferEnd) { 2916 if (CurPtr == BufferEnd) 2917 return nullptr; 2918 BufferEnd -= 1; // Scan until the second last character. 2919 for (; CurPtr != BufferEnd; ++CurPtr) { 2920 if (CurPtr[0] == '#' && CurPtr[1] == '>') 2921 return CurPtr + 2; 2922 } 2923 return nullptr; 2924 } 2925 2926 bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) { 2927 assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!"); 2928 if (!PP || !PP->getPreprocessorOpts().LexEditorPlaceholders || LexingRawMode) 2929 return false; 2930 const char *End = findPlaceholderEnd(CurPtr + 1, BufferEnd); 2931 if (!End) 2932 return false; 2933 const char *Start = CurPtr - 1; 2934 if (!LangOpts.AllowEditorPlaceholders) 2935 Diag(Start, diag::err_placeholder_in_source); 2936 Result.startToken(); 2937 FormTokenWithChars(Result, End, tok::raw_identifier); 2938 Result.setRawIdentifierData(Start); 2939 PP->LookUpIdentifierInfo(Result); 2940 Result.setFlag(Token::IsEditorPlaceholder); 2941 BufferPtr = End; 2942 return true; 2943 } 2944 2945 bool Lexer::isCodeCompletionPoint(const char *CurPtr) const { 2946 if (PP && PP->isCodeCompletionEnabled()) { 2947 SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart); 2948 return Loc == PP->getCodeCompletionLoc(); 2949 } 2950 2951 return false; 2952 } 2953 2954 uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc, 2955 Token *Result) { 2956 unsigned CharSize; 2957 char Kind = getCharAndSize(StartPtr, CharSize); 2958 
2959 unsigned NumHexDigits; 2960 if (Kind == 'u') 2961 NumHexDigits = 4; 2962 else if (Kind == 'U') 2963 NumHexDigits = 8; 2964 else 2965 return 0; 2966 2967 if (!LangOpts.CPlusPlus && !LangOpts.C99) { 2968 if (Result && !isLexingRawMode()) 2969 Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89); 2970 return 0; 2971 } 2972 2973 const char *CurPtr = StartPtr + CharSize; 2974 const char *KindLoc = &CurPtr[-1]; 2975 2976 uint32_t CodePoint = 0; 2977 for (unsigned i = 0; i < NumHexDigits; ++i) { 2978 char C = getCharAndSize(CurPtr, CharSize); 2979 2980 unsigned Value = llvm::hexDigitValue(C); 2981 if (Value == -1U) { 2982 if (Result && !isLexingRawMode()) { 2983 if (i == 0) { 2984 Diag(BufferPtr, diag::warn_ucn_escape_no_digits) 2985 << StringRef(KindLoc, 1); 2986 } else { 2987 Diag(BufferPtr, diag::warn_ucn_escape_incomplete); 2988 2989 // If the user wrote \U1234, suggest a fixit to \u. 2990 if (i == 4 && NumHexDigits == 8) { 2991 CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1); 2992 Diag(KindLoc, diag::note_ucn_four_not_eight) 2993 << FixItHint::CreateReplacement(URange, "u"); 2994 } 2995 } 2996 } 2997 2998 return 0; 2999 } 3000 3001 CodePoint <<= 4; 3002 CodePoint += Value; 3003 3004 CurPtr += CharSize; 3005 } 3006 3007 if (Result) { 3008 Result->setFlag(Token::HasUCN); 3009 if (CurPtr - StartPtr == (ptrdiff_t)NumHexDigits + 2) 3010 StartPtr = CurPtr; 3011 else 3012 while (StartPtr != CurPtr) 3013 (void)getAndAdvanceChar(StartPtr, *Result); 3014 } else { 3015 StartPtr = CurPtr; 3016 } 3017 3018 // Don't apply C family restrictions to UCNs in assembly mode 3019 if (LangOpts.AsmPreprocessor) 3020 return CodePoint; 3021 3022 // C99 6.4.3p2: A universal character name shall not specify a character whose 3023 // short identifier is less than 00A0 other than 0024 ($), 0040 (@), or 3024 // 0060 (`), nor one in the range D800 through DFFF inclusive.) 
3025 // C++11 [lex.charset]p2: If the hexadecimal value for a 3026 // universal-character-name corresponds to a surrogate code point (in the 3027 // range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally, 3028 // if the hexadecimal value for a universal-character-name outside the 3029 // c-char-sequence, s-char-sequence, or r-char-sequence of a character or 3030 // string literal corresponds to a control character (in either of the 3031 // ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the 3032 // basic source character set, the program is ill-formed. 3033 if (CodePoint < 0xA0) { 3034 if (CodePoint == 0x24 || CodePoint == 0x40 || CodePoint == 0x60) 3035 return CodePoint; 3036 3037 // We don't use isLexingRawMode() here because we need to warn about bad 3038 // UCNs even when skipping preprocessing tokens in a #if block. 3039 if (Result && PP) { 3040 if (CodePoint < 0x20 || CodePoint >= 0x7F) 3041 Diag(BufferPtr, diag::err_ucn_control_character); 3042 else { 3043 char C = static_cast<char>(CodePoint); 3044 Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1); 3045 } 3046 } 3047 3048 return 0; 3049 } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) { 3050 // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't. 3051 // We don't use isLexingRawMode() here because we need to diagnose bad 3052 // UCNs even when skipping preprocessing tokens in a #if block. 
3053 if (Result && PP) { 3054 if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11) 3055 Diag(BufferPtr, diag::warn_ucn_escape_surrogate); 3056 else 3057 Diag(BufferPtr, diag::err_ucn_escape_invalid); 3058 } 3059 return 0; 3060 } 3061 3062 return CodePoint; 3063 } 3064 3065 bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C, 3066 const char *CurPtr) { 3067 static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars( 3068 UnicodeWhitespaceCharRanges); 3069 if (!isLexingRawMode() && !PP->isPreprocessedOutput() && 3070 UnicodeWhitespaceChars.contains(C)) { 3071 Diag(BufferPtr, diag::ext_unicode_whitespace) 3072 << makeCharRange(*this, BufferPtr, CurPtr); 3073 3074 Result.setFlag(Token::LeadingSpace); 3075 return true; 3076 } 3077 return false; 3078 } 3079 3080 bool Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) { 3081 if (isAllowedIDChar(C, LangOpts) && isAllowedInitiallyIDChar(C, LangOpts)) { 3082 if (!isLexingRawMode() && !ParsingPreprocessorDirective && 3083 !PP->isPreprocessedOutput()) { 3084 maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C, 3085 makeCharRange(*this, BufferPtr, CurPtr), 3086 /*IsFirst=*/true); 3087 maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), C, 3088 makeCharRange(*this, BufferPtr, CurPtr)); 3089 } 3090 3091 MIOpt.ReadToken(); 3092 return LexIdentifier(Result, CurPtr); 3093 } 3094 3095 if (!isLexingRawMode() && !ParsingPreprocessorDirective && 3096 !PP->isPreprocessedOutput() && 3097 !isASCII(*BufferPtr) && !isAllowedIDChar(C, LangOpts)) { 3098 // Non-ASCII characters tend to creep into source code unintentionally. 3099 // Instead of letting the parser complain about the unknown token, 3100 // just drop the character. 3101 // Note that we can /only/ do this when the non-ASCII character is actually 3102 // spelled as Unicode, not written as a UCN. 
The standard requires that 3103 // we not throw away any possible preprocessor tokens, but there's a 3104 // loophole in the mapping of Unicode characters to basic character set 3105 // characters that allows us to map these particular characters to, say, 3106 // whitespace. 3107 Diag(BufferPtr, diag::err_non_ascii) 3108 << FixItHint::CreateRemoval(makeCharRange(*this, BufferPtr, CurPtr)); 3109 3110 BufferPtr = CurPtr; 3111 return false; 3112 } 3113 3114 // Otherwise, we have an explicit UCN or a character that's unlikely to show 3115 // up by accident. 3116 MIOpt.ReadToken(); 3117 FormTokenWithChars(Result, CurPtr, tok::unknown); 3118 return true; 3119 } 3120 3121 void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) { 3122 IsAtStartOfLine = Result.isAtStartOfLine(); 3123 HasLeadingSpace = Result.hasLeadingSpace(); 3124 HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro(); 3125 // Note that this doesn't affect IsAtPhysicalStartOfLine. 3126 } 3127 3128 bool Lexer::Lex(Token &Result) { 3129 // Start a new token. 3130 Result.startToken(); 3131 3132 // Set up misc whitespace flags for LexTokenInternal. 3133 if (IsAtStartOfLine) { 3134 Result.setFlag(Token::StartOfLine); 3135 IsAtStartOfLine = false; 3136 } 3137 3138 if (HasLeadingSpace) { 3139 Result.setFlag(Token::LeadingSpace); 3140 HasLeadingSpace = false; 3141 } 3142 3143 if (HasLeadingEmptyMacro) { 3144 Result.setFlag(Token::LeadingEmptyMacro); 3145 HasLeadingEmptyMacro = false; 3146 } 3147 3148 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine; 3149 IsAtPhysicalStartOfLine = false; 3150 bool isRawLex = isLexingRawMode(); 3151 (void) isRawLex; 3152 bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine); 3153 // (After the LexTokenInternal call, the lexer might be destroyed.) 3154 assert((returnedToken || !isRawLex) && "Raw lex must succeed"); 3155 return returnedToken; 3156 } 3157 3158 /// LexTokenInternal - This implements a simple C family lexer. 
It is an 3159 /// extremely performance critical piece of code. This assumes that the buffer 3160 /// has a null character at the end of the file. This returns a preprocessing 3161 /// token, not a normal token, as such, it is an internal interface. It assumes 3162 /// that the Flags of result have been cleared before calling this. 3163 bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) { 3164 LexNextToken: 3165 // New token, can't need cleaning yet. 3166 Result.clearFlag(Token::NeedsCleaning); 3167 Result.setIdentifierInfo(nullptr); 3168 3169 // CurPtr - Cache BufferPtr in an automatic variable. 3170 const char *CurPtr = BufferPtr; 3171 3172 // Small amounts of horizontal whitespace is very common between tokens. 3173 if ((*CurPtr == ' ') || (*CurPtr == '\t')) { 3174 ++CurPtr; 3175 while ((*CurPtr == ' ') || (*CurPtr == '\t')) 3176 ++CurPtr; 3177 3178 // If we are keeping whitespace and other tokens, just return what we just 3179 // skipped. The next lexer invocation will return the token after the 3180 // whitespace. 3181 if (isKeepWhitespaceMode()) { 3182 FormTokenWithChars(Result, CurPtr, tok::unknown); 3183 // FIXME: The next token will not have LeadingSpace set. 3184 return true; 3185 } 3186 3187 BufferPtr = CurPtr; 3188 Result.setFlag(Token::LeadingSpace); 3189 } 3190 3191 unsigned SizeTmp, SizeTmp2; // Temporaries for use in cases below. 3192 3193 // Read a character, advancing over it. 3194 char Char = getAndAdvanceChar(CurPtr, Result); 3195 tok::TokenKind Kind; 3196 3197 switch (Char) { 3198 case 0: // Null. 3199 // Found end of file? 3200 if (CurPtr-1 == BufferEnd) 3201 return LexEndOfFile(Result, CurPtr-1); 3202 3203 // Check if we are performing code completion. 3204 if (isCodeCompletionPoint(CurPtr-1)) { 3205 // Return the code-completion token. 
3206 Result.startToken(); 3207 FormTokenWithChars(Result, CurPtr, tok::code_completion); 3208 return true; 3209 } 3210 3211 if (!isLexingRawMode()) 3212 Diag(CurPtr-1, diag::null_in_file); 3213 Result.setFlag(Token::LeadingSpace); 3214 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 3215 return true; // KeepWhitespaceMode 3216 3217 // We know the lexer hasn't changed, so just try again with this lexer. 3218 // (We manually eliminate the tail call to avoid recursion.) 3219 goto LexNextToken; 3220 3221 case 26: // DOS & CP/M EOF: "^Z". 3222 // If we're in Microsoft extensions mode, treat this as end of file. 3223 if (LangOpts.MicrosoftExt) { 3224 if (!isLexingRawMode()) 3225 Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft); 3226 return LexEndOfFile(Result, CurPtr-1); 3227 } 3228 3229 // If Microsoft extensions are disabled, this is just random garbage. 3230 Kind = tok::unknown; 3231 break; 3232 3233 case '\r': 3234 if (CurPtr[0] == '\n') 3235 Char = getAndAdvanceChar(CurPtr, Result); 3236 LLVM_FALLTHROUGH; 3237 case '\n': 3238 // If we are inside a preprocessor directive and we see the end of line, 3239 // we know we are done with the directive, so return an EOD token. 3240 if (ParsingPreprocessorDirective) { 3241 // Done parsing the "line". 3242 ParsingPreprocessorDirective = false; 3243 3244 // Restore comment saving mode, in case it was disabled for directive. 3245 if (PP) 3246 resetExtendedTokenMode(); 3247 3248 // Since we consumed a newline, we are back at the start of a line. 3249 IsAtStartOfLine = true; 3250 IsAtPhysicalStartOfLine = true; 3251 3252 Kind = tok::eod; 3253 break; 3254 } 3255 3256 // No leading whitespace seen so far. 3257 Result.clearFlag(Token::LeadingSpace); 3258 3259 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 3260 return true; // KeepWhitespaceMode 3261 3262 // We only saw whitespace, so just try again with this lexer. 3263 // (We manually eliminate the tail call to avoid recursion.) 
3264 goto LexNextToken; 3265 case ' ': 3266 case '\t': 3267 case '\f': 3268 case '\v': 3269 SkipHorizontalWhitespace: 3270 Result.setFlag(Token::LeadingSpace); 3271 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 3272 return true; // KeepWhitespaceMode 3273 3274 SkipIgnoredUnits: 3275 CurPtr = BufferPtr; 3276 3277 // If the next token is obviously a // or /* */ comment, skip it efficiently 3278 // too (without going through the big switch stmt). 3279 if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() && 3280 LangOpts.LineComment && 3281 (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) { 3282 if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine)) 3283 return true; // There is a token to return. 3284 goto SkipIgnoredUnits; 3285 } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) { 3286 if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine)) 3287 return true; // There is a token to return. 3288 goto SkipIgnoredUnits; 3289 } else if (isHorizontalWhitespace(*CurPtr)) { 3290 goto SkipHorizontalWhitespace; 3291 } 3292 // We only saw whitespace, so just try again with this lexer. 3293 // (We manually eliminate the tail call to avoid recursion.) 3294 goto LexNextToken; 3295 3296 // C99 6.4.4.1: Integer Constants. 3297 // C99 6.4.4.2: Floating Constants. 3298 case '0': case '1': case '2': case '3': case '4': 3299 case '5': case '6': case '7': case '8': case '9': 3300 // Notify MIOpt that we read a non-whitespace/non-comment token. 3301 MIOpt.ReadToken(); 3302 return LexNumericConstant(Result, CurPtr); 3303 3304 case 'u': // Identifier (uber) or C11/C++11 UTF-8 or UTF-16 string literal 3305 // Notify MIOpt that we read a non-whitespace/non-comment token. 
3306 MIOpt.ReadToken(); 3307 3308 if (LangOpts.CPlusPlus11 || LangOpts.C11) { 3309 Char = getCharAndSize(CurPtr, SizeTmp); 3310 3311 // UTF-16 string literal 3312 if (Char == '"') 3313 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 3314 tok::utf16_string_literal); 3315 3316 // UTF-16 character constant 3317 if (Char == '\'') 3318 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), 3319 tok::utf16_char_constant); 3320 3321 // UTF-16 raw string literal 3322 if (Char == 'R' && LangOpts.CPlusPlus11 && 3323 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') 3324 return LexRawStringLiteral(Result, 3325 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3326 SizeTmp2, Result), 3327 tok::utf16_string_literal); 3328 3329 if (Char == '8') { 3330 char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2); 3331 3332 // UTF-8 string literal 3333 if (Char2 == '"') 3334 return LexStringLiteral(Result, 3335 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3336 SizeTmp2, Result), 3337 tok::utf8_string_literal); 3338 if (Char2 == '\'' && LangOpts.CPlusPlus17) 3339 return LexCharConstant( 3340 Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3341 SizeTmp2, Result), 3342 tok::utf8_char_constant); 3343 3344 if (Char2 == 'R' && LangOpts.CPlusPlus11) { 3345 unsigned SizeTmp3; 3346 char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3); 3347 // UTF-8 raw string literal 3348 if (Char3 == '"') { 3349 return LexRawStringLiteral(Result, 3350 ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3351 SizeTmp2, Result), 3352 SizeTmp3, Result), 3353 tok::utf8_string_literal); 3354 } 3355 } 3356 } 3357 } 3358 3359 // treat u like the start of an identifier. 3360 return LexIdentifier(Result, CurPtr); 3361 3362 case 'U': // Identifier (Uber) or C11/C++11 UTF-32 string literal 3363 // Notify MIOpt that we read a non-whitespace/non-comment token. 
3364 MIOpt.ReadToken(); 3365 3366 if (LangOpts.CPlusPlus11 || LangOpts.C11) { 3367 Char = getCharAndSize(CurPtr, SizeTmp); 3368 3369 // UTF-32 string literal 3370 if (Char == '"') 3371 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 3372 tok::utf32_string_literal); 3373 3374 // UTF-32 character constant 3375 if (Char == '\'') 3376 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), 3377 tok::utf32_char_constant); 3378 3379 // UTF-32 raw string literal 3380 if (Char == 'R' && LangOpts.CPlusPlus11 && 3381 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') 3382 return LexRawStringLiteral(Result, 3383 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3384 SizeTmp2, Result), 3385 tok::utf32_string_literal); 3386 } 3387 3388 // treat U like the start of an identifier. 3389 return LexIdentifier(Result, CurPtr); 3390 3391 case 'R': // Identifier or C++0x raw string literal 3392 // Notify MIOpt that we read a non-whitespace/non-comment token. 3393 MIOpt.ReadToken(); 3394 3395 if (LangOpts.CPlusPlus11) { 3396 Char = getCharAndSize(CurPtr, SizeTmp); 3397 3398 if (Char == '"') 3399 return LexRawStringLiteral(Result, 3400 ConsumeChar(CurPtr, SizeTmp, Result), 3401 tok::string_literal); 3402 } 3403 3404 // treat R like the start of an identifier. 3405 return LexIdentifier(Result, CurPtr); 3406 3407 case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz"). 3408 // Notify MIOpt that we read a non-whitespace/non-comment token. 3409 MIOpt.ReadToken(); 3410 Char = getCharAndSize(CurPtr, SizeTmp); 3411 3412 // Wide string literal. 3413 if (Char == '"') 3414 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 3415 tok::wide_string_literal); 3416 3417 // Wide raw string literal. 
3418 if (LangOpts.CPlusPlus11 && Char == 'R' && 3419 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') 3420 return LexRawStringLiteral(Result, 3421 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3422 SizeTmp2, Result), 3423 tok::wide_string_literal); 3424 3425 // Wide character constant. 3426 if (Char == '\'') 3427 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), 3428 tok::wide_char_constant); 3429 // FALL THROUGH, treating L like the start of an identifier. 3430 LLVM_FALLTHROUGH; 3431 3432 // C99 6.4.2: Identifiers. 3433 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': 3434 case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N': 3435 case 'O': case 'P': case 'Q': /*'R'*/case 'S': case 'T': /*'U'*/ 3436 case 'V': case 'W': case 'X': case 'Y': case 'Z': 3437 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': 3438 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': 3439 case 'o': case 'p': case 'q': case 'r': case 's': case 't': /*'u'*/ 3440 case 'v': case 'w': case 'x': case 'y': case 'z': 3441 case '_': 3442 // Notify MIOpt that we read a non-whitespace/non-comment token. 3443 MIOpt.ReadToken(); 3444 return LexIdentifier(Result, CurPtr); 3445 3446 case '$': // $ in identifiers. 3447 if (LangOpts.DollarIdents) { 3448 if (!isLexingRawMode()) 3449 Diag(CurPtr-1, diag::ext_dollar_in_identifier); 3450 // Notify MIOpt that we read a non-whitespace/non-comment token. 3451 MIOpt.ReadToken(); 3452 return LexIdentifier(Result, CurPtr); 3453 } 3454 3455 Kind = tok::unknown; 3456 break; 3457 3458 // C99 6.4.4: Character Constants. 3459 case '\'': 3460 // Notify MIOpt that we read a non-whitespace/non-comment token. 3461 MIOpt.ReadToken(); 3462 return LexCharConstant(Result, CurPtr, tok::char_constant); 3463 3464 // C99 6.4.5: String Literals. 3465 case '"': 3466 // Notify MIOpt that we read a non-whitespace/non-comment token. 
3467 MIOpt.ReadToken(); 3468 return LexStringLiteral(Result, CurPtr, 3469 ParsingFilename ? tok::header_name 3470 : tok::string_literal); 3471 3472 // C99 6.4.6: Punctuators. 3473 case '?': 3474 Kind = tok::question; 3475 break; 3476 case '[': 3477 Kind = tok::l_square; 3478 break; 3479 case ']': 3480 Kind = tok::r_square; 3481 break; 3482 case '(': 3483 Kind = tok::l_paren; 3484 break; 3485 case ')': 3486 Kind = tok::r_paren; 3487 break; 3488 case '{': 3489 Kind = tok::l_brace; 3490 break; 3491 case '}': 3492 Kind = tok::r_brace; 3493 break; 3494 case '.': 3495 Char = getCharAndSize(CurPtr, SizeTmp); 3496 if (Char >= '0' && Char <= '9') { 3497 // Notify MIOpt that we read a non-whitespace/non-comment token. 3498 MIOpt.ReadToken(); 3499 3500 return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result)); 3501 } else if (LangOpts.CPlusPlus && Char == '*') { 3502 Kind = tok::periodstar; 3503 CurPtr += SizeTmp; 3504 } else if (Char == '.' && 3505 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') { 3506 Kind = tok::ellipsis; 3507 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3508 SizeTmp2, Result); 3509 } else { 3510 Kind = tok::period; 3511 } 3512 break; 3513 case '&': 3514 Char = getCharAndSize(CurPtr, SizeTmp); 3515 if (Char == '&') { 3516 Kind = tok::ampamp; 3517 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3518 } else if (Char == '=') { 3519 Kind = tok::ampequal; 3520 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3521 } else { 3522 Kind = tok::amp; 3523 } 3524 break; 3525 case '*': 3526 if (getCharAndSize(CurPtr, SizeTmp) == '=') { 3527 Kind = tok::starequal; 3528 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3529 } else { 3530 Kind = tok::star; 3531 } 3532 break; 3533 case '+': 3534 Char = getCharAndSize(CurPtr, SizeTmp); 3535 if (Char == '+') { 3536 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3537 Kind = tok::plusplus; 3538 } else if (Char == '=') { 3539 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3540 Kind = tok::plusequal; 3541 } 
else { 3542 Kind = tok::plus; 3543 } 3544 break; 3545 case '-': 3546 Char = getCharAndSize(CurPtr, SizeTmp); 3547 if (Char == '-') { // -- 3548 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3549 Kind = tok::minusminus; 3550 } else if (Char == '>' && LangOpts.CPlusPlus && 3551 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') { // C++ ->* 3552 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3553 SizeTmp2, Result); 3554 Kind = tok::arrowstar; 3555 } else if (Char == '>') { // -> 3556 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3557 Kind = tok::arrow; 3558 } else if (Char == '=') { // -= 3559 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3560 Kind = tok::minusequal; 3561 } else { 3562 Kind = tok::minus; 3563 } 3564 break; 3565 case '~': 3566 Kind = tok::tilde; 3567 break; 3568 case '!': 3569 if (getCharAndSize(CurPtr, SizeTmp) == '=') { 3570 Kind = tok::exclaimequal; 3571 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3572 } else { 3573 Kind = tok::exclaim; 3574 } 3575 break; 3576 case '/': 3577 // 6.4.9: Comments 3578 Char = getCharAndSize(CurPtr, SizeTmp); 3579 if (Char == '/') { // Line comment. 3580 // Even if Line comments are disabled (e.g. in C89 mode), we generally 3581 // want to lex this as a comment. There is one problem with this though, 3582 // that in one particular corner case, this can change the behavior of the 3583 // resultant program. For example, In "foo //**/ bar", C89 would lex 3584 // this as "foo / bar" and languages with Line comments would lex it as 3585 // "foo". Check to see if the character after the second slash is a '*'. 3586 // If so, we will lex that as a "/" instead of the start of a comment. 3587 // However, we never do this if we are just preprocessing. 
3588 bool TreatAsComment = LangOpts.LineComment && 3589 (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP); 3590 if (!TreatAsComment) 3591 if (!(PP && PP->isPreprocessedOutput())) 3592 TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*'; 3593 3594 if (TreatAsComment) { 3595 if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result), 3596 TokAtPhysicalStartOfLine)) 3597 return true; // There is a token to return. 3598 3599 // It is common for the tokens immediately after a // comment to be 3600 // whitespace (indentation for the next line). Instead of going through 3601 // the big switch, handle it efficiently now. 3602 goto SkipIgnoredUnits; 3603 } 3604 } 3605 3606 if (Char == '*') { // /**/ comment. 3607 if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result), 3608 TokAtPhysicalStartOfLine)) 3609 return true; // There is a token to return. 3610 3611 // We only saw whitespace, so just try again with this lexer. 3612 // (We manually eliminate the tail call to avoid recursion.) 
3613 goto LexNextToken; 3614 } 3615 3616 if (Char == '=') { 3617 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3618 Kind = tok::slashequal; 3619 } else { 3620 Kind = tok::slash; 3621 } 3622 break; 3623 case '%': 3624 Char = getCharAndSize(CurPtr, SizeTmp); 3625 if (Char == '=') { 3626 Kind = tok::percentequal; 3627 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3628 } else if (LangOpts.Digraphs && Char == '>') { 3629 Kind = tok::r_brace; // '%>' -> '}' 3630 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3631 } else if (LangOpts.Digraphs && Char == ':') { 3632 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3633 Char = getCharAndSize(CurPtr, SizeTmp); 3634 if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') { 3635 Kind = tok::hashhash; // '%:%:' -> '##' 3636 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3637 SizeTmp2, Result); 3638 } else if (Char == '@' && LangOpts.MicrosoftExt) {// %:@ -> #@ -> Charize 3639 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3640 if (!isLexingRawMode()) 3641 Diag(BufferPtr, diag::ext_charize_microsoft); 3642 Kind = tok::hashat; 3643 } else { // '%:' -> '#' 3644 // We parsed a # character. If this occurs at the start of the line, 3645 // it's actually the start of a preprocessing directive. Callback to 3646 // the preprocessor to handle it. 3647 // TODO: -fpreprocessed mode?? 
3648 if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer) 3649 goto HandleDirective; 3650 3651 Kind = tok::hash; 3652 } 3653 } else { 3654 Kind = tok::percent; 3655 } 3656 break; 3657 case '<': 3658 Char = getCharAndSize(CurPtr, SizeTmp); 3659 if (ParsingFilename) { 3660 return LexAngledStringLiteral(Result, CurPtr); 3661 } else if (Char == '<') { 3662 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 3663 if (After == '=') { 3664 Kind = tok::lesslessequal; 3665 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3666 SizeTmp2, Result); 3667 } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) { 3668 // If this is actually a '<<<<<<<' version control conflict marker, 3669 // recognize it as such and recover nicely. 3670 goto LexNextToken; 3671 } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) { 3672 // If this is '<<<<' and we're in a Perforce-style conflict marker, 3673 // ignore it. 3674 goto LexNextToken; 3675 } else if (LangOpts.CUDA && After == '<') { 3676 Kind = tok::lesslessless; 3677 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3678 SizeTmp2, Result); 3679 } else { 3680 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3681 Kind = tok::lessless; 3682 } 3683 } else if (Char == '=') { 3684 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 3685 if (After == '>') { 3686 if (getLangOpts().CPlusPlus2a) { 3687 if (!isLexingRawMode()) 3688 Diag(BufferPtr, diag::warn_cxx17_compat_spaceship); 3689 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3690 SizeTmp2, Result); 3691 Kind = tok::spaceship; 3692 break; 3693 } 3694 // Suggest adding a space between the '<=' and the '>' to avoid a 3695 // change in semantics if this turns up in C++ <=17 mode. 
3696 if (getLangOpts().CPlusPlus && !isLexingRawMode()) { 3697 Diag(BufferPtr, diag::warn_cxx2a_compat_spaceship) 3698 << FixItHint::CreateInsertion( 3699 getSourceLocation(CurPtr + SizeTmp, SizeTmp2), " "); 3700 } 3701 } 3702 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3703 Kind = tok::lessequal; 3704 } else if (LangOpts.Digraphs && Char == ':') { // '<:' -> '[' 3705 if (LangOpts.CPlusPlus11 && 3706 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') { 3707 // C++0x [lex.pptoken]p3: 3708 // Otherwise, if the next three characters are <:: and the subsequent 3709 // character is neither : nor >, the < is treated as a preprocessor 3710 // token by itself and not as the first character of the alternative 3711 // token <:. 3712 unsigned SizeTmp3; 3713 char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3); 3714 if (After != ':' && After != '>') { 3715 Kind = tok::less; 3716 if (!isLexingRawMode()) 3717 Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon); 3718 break; 3719 } 3720 } 3721 3722 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3723 Kind = tok::l_square; 3724 } else if (LangOpts.Digraphs && Char == '%') { // '<%' -> '{' 3725 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3726 Kind = tok::l_brace; 3727 } else if (Char == '#' && /*Not a trigraph*/ SizeTmp == 1 && 3728 lexEditorPlaceholder(Result, CurPtr)) { 3729 return true; 3730 } else { 3731 Kind = tok::less; 3732 } 3733 break; 3734 case '>': 3735 Char = getCharAndSize(CurPtr, SizeTmp); 3736 if (Char == '=') { 3737 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3738 Kind = tok::greaterequal; 3739 } else if (Char == '>') { 3740 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 3741 if (After == '=') { 3742 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3743 SizeTmp2, Result); 3744 Kind = tok::greatergreaterequal; 3745 } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) { 3746 // If this is actually a '>>>>' conflict marker, recognize it as such 3747 // and recover 
nicely. 3748 goto LexNextToken; 3749 } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) { 3750 // If this is '>>>>>>>' and we're in a conflict marker, ignore it. 3751 goto LexNextToken; 3752 } else if (LangOpts.CUDA && After == '>') { 3753 Kind = tok::greatergreatergreater; 3754 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3755 SizeTmp2, Result); 3756 } else { 3757 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3758 Kind = tok::greatergreater; 3759 } 3760 } else { 3761 Kind = tok::greater; 3762 } 3763 break; 3764 case '^': 3765 Char = getCharAndSize(CurPtr, SizeTmp); 3766 if (Char == '=') { 3767 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3768 Kind = tok::caretequal; 3769 } else if (LangOpts.OpenCL && Char == '^') { 3770 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3771 Kind = tok::caretcaret; 3772 } else { 3773 Kind = tok::caret; 3774 } 3775 break; 3776 case '|': 3777 Char = getCharAndSize(CurPtr, SizeTmp); 3778 if (Char == '=') { 3779 Kind = tok::pipeequal; 3780 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3781 } else if (Char == '|') { 3782 // If this is '|||||||' and we're in a conflict marker, ignore it. 
3783 if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1)) 3784 goto LexNextToken; 3785 Kind = tok::pipepipe; 3786 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3787 } else { 3788 Kind = tok::pipe; 3789 } 3790 break; 3791 case ':': 3792 Char = getCharAndSize(CurPtr, SizeTmp); 3793 if (LangOpts.Digraphs && Char == '>') { 3794 Kind = tok::r_square; // ':>' -> ']' 3795 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3796 } else if ((LangOpts.CPlusPlus || 3797 LangOpts.DoubleSquareBracketAttributes) && 3798 Char == ':') { 3799 Kind = tok::coloncolon; 3800 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3801 } else { 3802 Kind = tok::colon; 3803 } 3804 break; 3805 case ';': 3806 Kind = tok::semi; 3807 break; 3808 case '=': 3809 Char = getCharAndSize(CurPtr, SizeTmp); 3810 if (Char == '=') { 3811 // If this is '====' and we're in a conflict marker, ignore it. 3812 if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1)) 3813 goto LexNextToken; 3814 3815 Kind = tok::equalequal; 3816 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3817 } else { 3818 Kind = tok::equal; 3819 } 3820 break; 3821 case ',': 3822 Kind = tok::comma; 3823 break; 3824 case '#': 3825 Char = getCharAndSize(CurPtr, SizeTmp); 3826 if (Char == '#') { 3827 Kind = tok::hashhash; 3828 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3829 } else if (Char == '@' && LangOpts.MicrosoftExt) { // #@ -> Charize 3830 Kind = tok::hashat; 3831 if (!isLexingRawMode()) 3832 Diag(BufferPtr, diag::ext_charize_microsoft); 3833 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3834 } else { 3835 // We parsed a # character. If this occurs at the start of the line, 3836 // it's actually the start of a preprocessing directive. Callback to 3837 // the preprocessor to handle it. 3838 // TODO: -fpreprocessed mode?? 3839 if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer) 3840 goto HandleDirective; 3841 3842 Kind = tok::hash; 3843 } 3844 break; 3845 3846 case '@': 3847 // Objective C support. 
3848 if (CurPtr[-1] == '@' && LangOpts.ObjC) 3849 Kind = tok::at; 3850 else 3851 Kind = tok::unknown; 3852 break; 3853 3854 // UCNs (C99 6.4.3, C++11 [lex.charset]p2) 3855 case '\\': 3856 if (!LangOpts.AsmPreprocessor) { 3857 if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) { 3858 if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) { 3859 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 3860 return true; // KeepWhitespaceMode 3861 3862 // We only saw whitespace, so just try again with this lexer. 3863 // (We manually eliminate the tail call to avoid recursion.) 3864 goto LexNextToken; 3865 } 3866 3867 return LexUnicode(Result, CodePoint, CurPtr); 3868 } 3869 } 3870 3871 Kind = tok::unknown; 3872 break; 3873 3874 default: { 3875 if (isASCII(Char)) { 3876 Kind = tok::unknown; 3877 break; 3878 } 3879 3880 llvm::UTF32 CodePoint; 3881 3882 // We can't just reset CurPtr to BufferPtr because BufferPtr may point to 3883 // an escaped newline. 3884 --CurPtr; 3885 llvm::ConversionResult Status = 3886 llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr, 3887 (const llvm::UTF8 *)BufferEnd, 3888 &CodePoint, 3889 llvm::strictConversion); 3890 if (Status == llvm::conversionOK) { 3891 if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) { 3892 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 3893 return true; // KeepWhitespaceMode 3894 3895 // We only saw whitespace, so just try again with this lexer. 3896 // (We manually eliminate the tail call to avoid recursion.) 3897 goto LexNextToken; 3898 } 3899 return LexUnicode(Result, CodePoint, CurPtr); 3900 } 3901 3902 if (isLexingRawMode() || ParsingPreprocessorDirective || 3903 PP->isPreprocessedOutput()) { 3904 ++CurPtr; 3905 Kind = tok::unknown; 3906 break; 3907 } 3908 3909 // Non-ASCII characters tend to creep into source code unintentionally. 3910 // Instead of letting the parser complain about the unknown token, 3911 // just diagnose the invalid UTF-8, then drop the character. 
3912 Diag(CurPtr, diag::err_invalid_utf8); 3913 3914 BufferPtr = CurPtr+1; 3915 // We're pretending the character didn't exist, so just try again with 3916 // this lexer. 3917 // (We manually eliminate the tail call to avoid recursion.) 3918 goto LexNextToken; 3919 } 3920 } 3921 3922 // Notify MIOpt that we read a non-whitespace/non-comment token. 3923 MIOpt.ReadToken(); 3924 3925 // Update the location of token as well as BufferPtr. 3926 FormTokenWithChars(Result, CurPtr, Kind); 3927 return true; 3928 3929 HandleDirective: 3930 // We parsed a # character and it's the start of a preprocessing directive. 3931 3932 FormTokenWithChars(Result, CurPtr, tok::hash); 3933 PP->HandleDirective(Result); 3934 3935 if (PP->hadModuleLoaderFatalFailure()) { 3936 // With a fatal failure in the module loader, we abort parsing. 3937 assert(Result.is(tok::eof) && "Preprocessor did not set tok:eof"); 3938 return true; 3939 } 3940 3941 // We parsed the directive; lex a token with the new state. 3942 return false; 3943 } 3944