1 //===- TGLexer.cpp - Lexer for TableGen -----------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // Implement the Lexer for TableGen. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "TGLexer.h" 14 #include "llvm/ADT/ArrayRef.h" 15 #include "llvm/ADT/StringSwitch.h" 16 #include "llvm/ADT/Twine.h" 17 #include "llvm/Config/config.h" // for strtoull()/strtoll() define 18 #include "llvm/Support/Compiler.h" 19 #include "llvm/Support/MemoryBuffer.h" 20 #include "llvm/Support/SourceMgr.h" 21 #include "llvm/TableGen/Error.h" 22 #include <algorithm> 23 #include <cctype> 24 #include <cerrno> 25 #include <cstdint> 26 #include <cstdio> 27 #include <cstdlib> 28 #include <cstring> 29 30 using namespace llvm; 31 32 namespace { 33 // A list of supported preprocessing directives with their 34 // internal token kinds and names. 35 struct { 36 tgtok::TokKind Kind; 37 const char *Word; 38 } PreprocessorDirs[] = { 39 { tgtok::Ifdef, "ifdef" }, 40 { tgtok::Ifndef, "ifndef" }, 41 { tgtok::Else, "else" }, 42 { tgtok::Endif, "endif" }, 43 { tgtok::Define, "define" } 44 }; 45 } // end anonymous namespace 46 47 TGLexer::TGLexer(SourceMgr &SM, ArrayRef<std::string> Macros) : SrcMgr(SM) { 48 CurBuffer = SrcMgr.getMainFileID(); 49 CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer(); 50 CurPtr = CurBuf.begin(); 51 TokStart = nullptr; 52 53 // Pretend that we enter the "top-level" include file. 54 PrepIncludeStack.push_back( 55 std::make_unique<std::vector<PreprocessorControlDesc>>()); 56 57 // Put all macros defined in the command line into the DefinedMacros set. 58 std::for_each(Macros.begin(), Macros.end(), 59 [this](const std::string &MacroName) { 60 DefinedMacros.insert(MacroName); 61 }); 62 } 63 64 SMLoc TGLexer::getLoc() const { 65 return SMLoc::getFromPointer(TokStart); 66 } 67 68 /// ReturnError - Set the error to the specified string at the specified 69 /// location. This is defined to always return tgtok::Error. 70 tgtok::TokKind TGLexer::ReturnError(SMLoc Loc, const Twine &Msg) { 71 PrintError(Loc, Msg); 72 return tgtok::Error; 73 } 74 75 tgtok::TokKind TGLexer::ReturnError(const char *Loc, const Twine &Msg) { 76 return ReturnError(SMLoc::getFromPointer(Loc), Msg); 77 } 78 79 bool TGLexer::processEOF() { 80 SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer); 81 if (ParentIncludeLoc != SMLoc()) { 82 // If prepExitInclude() detects a problem with the preprocessing 83 // control stack, it will return false. Pretend that we reached 84 // the final EOF and stop lexing more tokens by returning false 85 // to LexToken(). 86 if (!prepExitInclude(false)) 87 return false; 88 89 CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc); 90 CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer(); 91 CurPtr = ParentIncludeLoc.getPointer(); 92 // Make sure TokStart points into the parent file's buffer. 93 // LexToken() assigns to it before calling getNextChar(), 94 // so it is pointing into the included file now. 95 TokStart = CurPtr; 96 return true; 97 } 98 99 // Pretend that we exit the "top-level" include file. 100 // Note that in case of an error (e.g. control stack imbalance) 101 // the routine will issue a fatal error. 102 prepExitInclude(true); 103 return false; 104 } 105 106 int TGLexer::getNextChar() { 107 char CurChar = *CurPtr++; 108 switch (CurChar) { 109 default: 110 return (unsigned char)CurChar; 111 case 0: { 112 // A nul character in the stream is either the end of the current buffer or 113 // a random nul in the file. Disambiguate that here. 114 if (CurPtr-1 != CurBuf.end()) 115 return 0; // Just whitespace. 116 117 // Otherwise, return end of file. 118 --CurPtr; // Another call to lex will return EOF again. 119 return EOF; 120 } 121 case '\n': 122 case '\r': 123 // Handle the newline character by ignoring it and incrementing the line 124 // count. However, be careful about 'dos style' files with \n\r in them. 125 // Only treat a \n\r or \r\n as a single line. 126 if ((*CurPtr == '\n' || (*CurPtr == '\r')) && 127 *CurPtr != CurChar) 128 ++CurPtr; // Eat the two char newline sequence. 129 return '\n'; 130 } 131 } 132 133 int TGLexer::peekNextChar(int Index) const { 134 return *(CurPtr + Index); 135 } 136 137 tgtok::TokKind TGLexer::LexToken(bool FileOrLineStart) { 138 TokStart = CurPtr; 139 // This always consumes at least one character. 140 int CurChar = getNextChar(); 141 142 switch (CurChar) { 143 default: 144 // Handle letters: [a-zA-Z_] 145 if (isalpha(CurChar) || CurChar == '_') 146 return LexIdentifier(); 147 148 // Unknown character, emit an error. 149 return ReturnError(TokStart, "Unexpected character"); 150 case EOF: 151 // Lex next token, if we just left an include file. 152 // Note that leaving an include file means that the next 153 // symbol is located at the end of 'include "..."' 154 // construct, so LexToken() is called with default 155 // false parameter. 156 if (processEOF()) 157 return LexToken(); 158 159 // Return EOF denoting the end of lexing. 160 return tgtok::Eof; 161 162 case ':': return tgtok::colon; 163 case ';': return tgtok::semi; 164 case ',': return tgtok::comma; 165 case '<': return tgtok::less; 166 case '>': return tgtok::greater; 167 case ']': return tgtok::r_square; 168 case '{': return tgtok::l_brace; 169 case '}': return tgtok::r_brace; 170 case '(': return tgtok::l_paren; 171 case ')': return tgtok::r_paren; 172 case '=': return tgtok::equal; 173 case '?': return tgtok::question; 174 case '#': 175 if (FileOrLineStart) { 176 tgtok::TokKind Kind = prepIsDirective(); 177 if (Kind != tgtok::Error) 178 return lexPreprocessor(Kind); 179 } 180 181 return tgtok::paste; 182 183 // The period is a separate case so we can recognize the "..." 184 // range punctuator. 185 case '.': 186 if (peekNextChar(0) == '.') { 187 ++CurPtr; // Eat second dot. 188 if (peekNextChar(0) == '.') { 189 ++CurPtr; // Eat third dot. 190 return tgtok::dotdotdot; 191 } 192 return ReturnError(TokStart, "Invalid '..' punctuation"); 193 } 194 return tgtok::dot; 195 196 case '\r': 197 PrintFatalError("getNextChar() must never return '\r'"); 198 return tgtok::Error; 199 200 case 0: 201 case ' ': 202 case '\t': 203 // Ignore whitespace. 204 return LexToken(FileOrLineStart); 205 case '\n': 206 // Ignore whitespace, and identify the new line. 207 return LexToken(true); 208 case '/': 209 // If this is the start of a // comment, skip until the end of the line or 210 // the end of the buffer. 211 if (*CurPtr == '/') 212 SkipBCPLComment(); 213 else if (*CurPtr == '*') { 214 if (SkipCComment()) 215 return tgtok::Error; 216 } else // Otherwise, this is an error. 217 return ReturnError(TokStart, "Unexpected character"); 218 return LexToken(FileOrLineStart); 219 case '-': case '+': 220 case '0': case '1': case '2': case '3': case '4': case '5': case '6': 221 case '7': case '8': case '9': { 222 int NextChar = 0; 223 if (isdigit(CurChar)) { 224 // Allow identifiers to start with a number if it is followed by 225 // an identifier. This can happen with paste operations like 226 // foo#8i. 227 int i = 0; 228 do { 229 NextChar = peekNextChar(i++); 230 } while (isdigit(NextChar)); 231 232 if (NextChar == 'x' || NextChar == 'b') { 233 // If this is [0-9]b[01] or [0-9]x[0-9A-fa-f] this is most 234 // likely a number. 235 int NextNextChar = peekNextChar(i); 236 switch (NextNextChar) { 237 default: 238 break; 239 case '0': case '1': 240 if (NextChar == 'b') 241 return LexNumber(); 242 LLVM_FALLTHROUGH; 243 case '2': case '3': case '4': case '5': 244 case '6': case '7': case '8': case '9': 245 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 246 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 247 if (NextChar == 'x') 248 return LexNumber(); 249 break; 250 } 251 } 252 } 253 254 if (isalpha(NextChar) || NextChar == '_') 255 return LexIdentifier(); 256 257 return LexNumber(); 258 } 259 case '"': return LexString(); 260 case '$': return LexVarName(); 261 case '[': return LexBracket(); 262 case '!': return LexExclaim(); 263 } 264 } 265 266 /// LexString - Lex "[^"]*" 267 tgtok::TokKind TGLexer::LexString() { 268 const char *StrStart = CurPtr; 269 270 CurStrVal = ""; 271 272 while (*CurPtr != '"') { 273 // If we hit the end of the buffer, report an error. 274 if (*CurPtr == 0 && CurPtr == CurBuf.end()) 275 return ReturnError(StrStart, "End of file in string literal"); 276 277 if (*CurPtr == '\n' || *CurPtr == '\r') 278 return ReturnError(StrStart, "End of line in string literal"); 279 280 if (*CurPtr != '\\') { 281 CurStrVal += *CurPtr++; 282 continue; 283 } 284 285 ++CurPtr; 286 287 switch (*CurPtr) { 288 case '\\': case '\'': case '"': 289 // These turn into their literal character. 290 CurStrVal += *CurPtr++; 291 break; 292 case 't': 293 CurStrVal += '\t'; 294 ++CurPtr; 295 break; 296 case 'n': 297 CurStrVal += '\n'; 298 ++CurPtr; 299 break; 300 301 case '\n': 302 case '\r': 303 return ReturnError(CurPtr, "escaped newlines not supported in tblgen"); 304 305 // If we hit the end of the buffer, report an error. 306 case '\0': 307 if (CurPtr == CurBuf.end()) 308 return ReturnError(StrStart, "End of file in string literal"); 309 LLVM_FALLTHROUGH; 310 default: 311 return ReturnError(CurPtr, "invalid escape in string literal"); 312 } 313 } 314 315 ++CurPtr; 316 return tgtok::StrVal; 317 } 318 319 tgtok::TokKind TGLexer::LexVarName() { 320 if (!isalpha(CurPtr[0]) && CurPtr[0] != '_') 321 return ReturnError(TokStart, "Invalid variable name"); 322 323 // Otherwise, we're ok, consume the rest of the characters. 324 const char *VarNameStart = CurPtr++; 325 326 while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_') 327 ++CurPtr; 328 329 CurStrVal.assign(VarNameStart, CurPtr); 330 return tgtok::VarName; 331 } 332 333 tgtok::TokKind TGLexer::LexIdentifier() { 334 // The first letter is [a-zA-Z_]. 335 const char *IdentStart = TokStart; 336 337 // Match the rest of the identifier regex: [0-9a-zA-Z_]* 338 while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_') 339 ++CurPtr; 340 341 // Check to see if this identifier is a keyword. 342 StringRef Str(IdentStart, CurPtr-IdentStart); 343 344 if (Str == "include") { 345 if (LexInclude()) return tgtok::Error; 346 return Lex(); 347 } 348 349 tgtok::TokKind Kind = StringSwitch<tgtok::TokKind>(Str) 350 .Case("int", tgtok::Int) 351 .Case("bit", tgtok::Bit) 352 .Case("bits", tgtok::Bits) 353 .Case("string", tgtok::String) 354 .Case("list", tgtok::List) 355 .Case("code", tgtok::Code) 356 .Case("dag", tgtok::Dag) 357 .Case("class", tgtok::Class) 358 .Case("def", tgtok::Def) 359 .Case("foreach", tgtok::Foreach) 360 .Case("defm", tgtok::Defm) 361 .Case("defset", tgtok::Defset) 362 .Case("multiclass", tgtok::MultiClass) 363 .Case("field", tgtok::Field) 364 .Case("let", tgtok::Let) 365 .Case("in", tgtok::In) 366 .Case("defvar", tgtok::Defvar) 367 .Case("if", tgtok::If) 368 .Case("then", tgtok::Then) 369 .Case("else", tgtok::ElseKW) 370 .Default(tgtok::Id); 371 372 if (Kind == tgtok::Id) 373 CurStrVal.assign(Str.begin(), Str.end()); 374 return Kind; 375 } 376 377 /// LexInclude - We just read the "include" token. Get the string token that 378 /// comes next and enter the include. 379 bool TGLexer::LexInclude() { 380 // The token after the include must be a string. 381 tgtok::TokKind Tok = LexToken(); 382 if (Tok == tgtok::Error) return true; 383 if (Tok != tgtok::StrVal) { 384 PrintError(getLoc(), "Expected filename after include"); 385 return true; 386 } 387 388 // Get the string. 389 std::string Filename = CurStrVal; 390 std::string IncludedFile; 391 392 CurBuffer = SrcMgr.AddIncludeFile(Filename, SMLoc::getFromPointer(CurPtr), 393 IncludedFile); 394 if (!CurBuffer) { 395 PrintError(getLoc(), "Could not find include file '" + Filename + "'"); 396 return true; 397 } 398 399 Dependencies.insert(IncludedFile); 400 // Save the line number and lex buffer of the includer. 401 CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer(); 402 CurPtr = CurBuf.begin(); 403 404 PrepIncludeStack.push_back( 405 std::make_unique<std::vector<PreprocessorControlDesc>>()); 406 return false; 407 } 408 409 void TGLexer::SkipBCPLComment() { 410 ++CurPtr; // skip the second slash. 411 while (true) { 412 switch (*CurPtr) { 413 case '\n': 414 case '\r': 415 return; // Newline is end of comment. 416 case 0: 417 // If this is the end of the buffer, end the comment. 418 if (CurPtr == CurBuf.end()) 419 return; 420 break; 421 } 422 // Otherwise, skip the character. 423 ++CurPtr; 424 } 425 } 426 427 /// SkipCComment - This skips C-style /**/ comments. The only difference from C 428 /// is that we allow nesting. 429 bool TGLexer::SkipCComment() { 430 ++CurPtr; // skip the star. 431 unsigned CommentDepth = 1; 432 433 while (true) { 434 int CurChar = getNextChar(); 435 switch (CurChar) { 436 case EOF: 437 PrintError(TokStart, "Unterminated comment!"); 438 return true; 439 case '*': 440 // End of the comment? 441 if (CurPtr[0] != '/') break; 442 443 ++CurPtr; // End the */. 444 if (--CommentDepth == 0) 445 return false; 446 break; 447 case '/': 448 // Start of a nested comment? 449 if (CurPtr[0] != '*') break; 450 ++CurPtr; 451 ++CommentDepth; 452 break; 453 } 454 } 455 } 456 457 /// LexNumber - Lex: 458 /// [-+]?[0-9]+ 459 /// 0x[0-9a-fA-F]+ 460 /// 0b[01]+ 461 tgtok::TokKind TGLexer::LexNumber() { 462 if (CurPtr[-1] == '0') { 463 if (CurPtr[0] == 'x') { 464 ++CurPtr; 465 const char *NumStart = CurPtr; 466 while (isxdigit(CurPtr[0])) 467 ++CurPtr; 468 469 // Requires at least one hex digit. 470 if (CurPtr == NumStart) 471 return ReturnError(TokStart, "Invalid hexadecimal number"); 472 473 errno = 0; 474 CurIntVal = strtoll(NumStart, nullptr, 16); 475 if (errno == EINVAL) 476 return ReturnError(TokStart, "Invalid hexadecimal number"); 477 if (errno == ERANGE) { 478 errno = 0; 479 CurIntVal = (int64_t)strtoull(NumStart, nullptr, 16); 480 if (errno == EINVAL) 481 return ReturnError(TokStart, "Invalid hexadecimal number"); 482 if (errno == ERANGE) 483 return ReturnError(TokStart, "Hexadecimal number out of range"); 484 } 485 return tgtok::IntVal; 486 } else if (CurPtr[0] == 'b') { 487 ++CurPtr; 488 const char *NumStart = CurPtr; 489 while (CurPtr[0] == '0' || CurPtr[0] == '1') 490 ++CurPtr; 491 492 // Requires at least one binary digit. 493 if (CurPtr == NumStart) 494 return ReturnError(CurPtr-2, "Invalid binary number"); 495 CurIntVal = strtoll(NumStart, nullptr, 2); 496 return tgtok::BinaryIntVal; 497 } 498 } 499 500 // Check for a sign without a digit. 501 if (!isdigit(CurPtr[0])) { 502 if (CurPtr[-1] == '-') 503 return tgtok::minus; 504 else if (CurPtr[-1] == '+') 505 return tgtok::plus; 506 } 507 508 while (isdigit(CurPtr[0])) 509 ++CurPtr; 510 CurIntVal = strtoll(TokStart, nullptr, 10); 511 return tgtok::IntVal; 512 } 513 514 /// LexBracket - We just read '['. If this is a code block, return it, 515 /// otherwise return the bracket. Match: '[' and '[{ ( [^}]+ | }[^]] )* }]' 516 tgtok::TokKind TGLexer::LexBracket() { 517 if (CurPtr[0] != '{') 518 return tgtok::l_square; 519 ++CurPtr; 520 const char *CodeStart = CurPtr; 521 while (true) { 522 int Char = getNextChar(); 523 if (Char == EOF) break; 524 525 if (Char != '}') continue; 526 527 Char = getNextChar(); 528 if (Char == EOF) break; 529 if (Char == ']') { 530 CurStrVal.assign(CodeStart, CurPtr-2); 531 return tgtok::CodeFragment; 532 } 533 } 534 535 return ReturnError(CodeStart-2, "Unterminated Code Block"); 536 } 537 538 /// LexExclaim - Lex '!' and '![a-zA-Z]+'. 539 tgtok::TokKind TGLexer::LexExclaim() { 540 if (!isalpha(*CurPtr)) 541 return ReturnError(CurPtr - 1, "Invalid \"!operator\""); 542 543 const char *Start = CurPtr++; 544 while (isalpha(*CurPtr)) 545 ++CurPtr; 546 547 // Check to see which operator this is. 548 tgtok::TokKind Kind = 549 StringSwitch<tgtok::TokKind>(StringRef(Start, CurPtr - Start)) 550 .Case("eq", tgtok::XEq) 551 .Case("ne", tgtok::XNe) 552 .Case("le", tgtok::XLe) 553 .Case("lt", tgtok::XLt) 554 .Case("ge", tgtok::XGe) 555 .Case("gt", tgtok::XGt) 556 .Case("if", tgtok::XIf) 557 .Case("cond", tgtok::XCond) 558 .Case("isa", tgtok::XIsA) 559 .Case("head", tgtok::XHead) 560 .Case("tail", tgtok::XTail) 561 .Case("size", tgtok::XSize) 562 .Case("con", tgtok::XConcat) 563 .Case("dag", tgtok::XDag) 564 .Case("add", tgtok::XADD) 565 .Case("mul", tgtok::XMUL) 566 .Case("not", tgtok::XNOT) 567 .Case("and", tgtok::XAND) 568 .Case("or", tgtok::XOR) 569 .Case("xor", tgtok::XXOR) 570 .Case("shl", tgtok::XSHL) 571 .Case("sra", tgtok::XSRA) 572 .Case("srl", tgtok::XSRL) 573 .Case("cast", tgtok::XCast) 574 .Case("empty", tgtok::XEmpty) 575 .Case("subst", tgtok::XSubst) 576 .Case("foldl", tgtok::XFoldl) 577 .Case("foreach", tgtok::XForEach) 578 .Case("listconcat", tgtok::XListConcat) 579 .Case("listsplat", tgtok::XListSplat) 580 .Case("strconcat", tgtok::XStrConcat) 581 .Case("setop", tgtok::XSetOp) 582 .Case("getop", tgtok::XGetOp) 583 .Default(tgtok::Error); 584 585 return Kind != tgtok::Error ? Kind : ReturnError(Start-1, "Unknown operator"); 586 } 587 588 bool TGLexer::prepExitInclude(bool IncludeStackMustBeEmpty) { 589 // Report an error, if preprocessor control stack for the current 590 // file is not empty. 591 if (!PrepIncludeStack.back()->empty()) { 592 prepReportPreprocessorStackError(); 593 594 return false; 595 } 596 597 // Pop the preprocessing controls from the include stack. 598 if (PrepIncludeStack.empty()) { 599 PrintFatalError("Preprocessor include stack is empty"); 600 } 601 602 PrepIncludeStack.pop_back(); 603 604 if (IncludeStackMustBeEmpty) { 605 if (!PrepIncludeStack.empty()) 606 PrintFatalError("Preprocessor include stack is not empty"); 607 } else { 608 if (PrepIncludeStack.empty()) 609 PrintFatalError("Preprocessor include stack is empty"); 610 } 611 612 return true; 613 } 614 615 tgtok::TokKind TGLexer::prepIsDirective() const { 616 for (unsigned ID = 0; ID < llvm::array_lengthof(PreprocessorDirs); ++ID) { 617 int NextChar = *CurPtr; 618 bool Match = true; 619 unsigned I = 0; 620 for (; I < strlen(PreprocessorDirs[ID].Word); ++I) { 621 if (NextChar != PreprocessorDirs[ID].Word[I]) { 622 Match = false; 623 break; 624 } 625 626 NextChar = peekNextChar(I + 1); 627 } 628 629 // Check for whitespace after the directive. If there is no whitespace, 630 // then we do not recognize it as a preprocessing directive. 631 if (Match) { 632 tgtok::TokKind Kind = PreprocessorDirs[ID].Kind; 633 634 // New line and EOF may follow only #else/#endif. It will be reported 635 // as an error for #ifdef/#define after the call to prepLexMacroName(). 636 if (NextChar == ' ' || NextChar == '\t' || NextChar == EOF || 637 NextChar == '\n' || 638 // It looks like TableGen does not support '\r' as the actual 639 // carriage return, e.g. getNextChar() treats a single '\r' 640 // as '\n'. So we do the same here. 641 NextChar == '\r') 642 return Kind; 643 644 // Allow comments after some directives, e.g.: 645 // #else// OR #else/**/ 646 // #endif// OR #endif/**/ 647 // 648 // Note that we do allow comments after #ifdef/#define here, e.g. 649 // #ifdef/**/ AND #ifdef// 650 // #define/**/ AND #define// 651 // 652 // These cases will be reported as incorrect after calling 653 // prepLexMacroName(). We could have supported C-style comments 654 // after #ifdef/#define, but this would complicate the code 655 // for little benefit. 656 if (NextChar == '/') { 657 NextChar = peekNextChar(I + 1); 658 659 if (NextChar == '*' || NextChar == '/') 660 return Kind; 661 662 // Pretend that we do not recognize the directive. 663 } 664 } 665 } 666 667 return tgtok::Error; 668 } 669 670 bool TGLexer::prepEatPreprocessorDirective(tgtok::TokKind Kind) { 671 TokStart = CurPtr; 672 673 for (unsigned ID = 0; ID < llvm::array_lengthof(PreprocessorDirs); ++ID) 674 if (PreprocessorDirs[ID].Kind == Kind) { 675 // Advance CurPtr to the end of the preprocessing word. 676 CurPtr += strlen(PreprocessorDirs[ID].Word); 677 return true; 678 } 679 680 PrintFatalError("Unsupported preprocessing token in " 681 "prepEatPreprocessorDirective()"); 682 return false; 683 } 684 685 tgtok::TokKind TGLexer::lexPreprocessor( 686 tgtok::TokKind Kind, bool ReturnNextLiveToken) { 687 688 // We must be looking at a preprocessing directive. Eat it! 689 if (!prepEatPreprocessorDirective(Kind)) 690 PrintFatalError("lexPreprocessor() called for unknown " 691 "preprocessor directive"); 692 693 if (Kind == tgtok::Ifdef || Kind == tgtok::Ifndef) { 694 StringRef MacroName = prepLexMacroName(); 695 StringRef IfTokName = Kind == tgtok::Ifdef ? "#ifdef" : "#ifndef"; 696 if (MacroName.empty()) 697 return ReturnError(TokStart, "Expected macro name after " + IfTokName); 698 699 bool MacroIsDefined = DefinedMacros.count(MacroName) != 0; 700 701 // Canonicalize ifndef to ifdef equivalent 702 if (Kind == tgtok::Ifndef) { 703 MacroIsDefined = !MacroIsDefined; 704 Kind = tgtok::Ifdef; 705 } 706 707 // Regardless of whether we are processing tokens or not, 708 // we put the #ifdef control on stack. 709 PrepIncludeStack.back()->push_back( 710 {Kind, MacroIsDefined, SMLoc::getFromPointer(TokStart)}); 711 712 if (!prepSkipDirectiveEnd()) 713 return ReturnError(CurPtr, "Only comments are supported after " + 714 IfTokName + " NAME"); 715 716 // If we were not processing tokens before this #ifdef, 717 // then just return back to the lines skipping code. 718 if (!ReturnNextLiveToken) 719 return Kind; 720 721 // If we were processing tokens before this #ifdef, 722 // and the macro is defined, then just return the next token. 723 if (MacroIsDefined) 724 return LexToken(); 725 726 // We were processing tokens before this #ifdef, and the macro 727 // is not defined, so we have to start skipping the lines. 728 // If the skipping is successful, it will return the token following 729 // either #else or #endif corresponding to this #ifdef. 730 if (prepSkipRegion(ReturnNextLiveToken)) 731 return LexToken(); 732 733 return tgtok::Error; 734 } else if (Kind == tgtok::Else) { 735 // Check if this #else is correct before calling prepSkipDirectiveEnd(), 736 // which will move CurPtr away from the beginning of #else. 737 if (PrepIncludeStack.back()->empty()) 738 return ReturnError(TokStart, "#else without #ifdef or #ifndef"); 739 740 PreprocessorControlDesc IfdefEntry = PrepIncludeStack.back()->back(); 741 742 if (IfdefEntry.Kind != tgtok::Ifdef) { 743 PrintError(TokStart, "double #else"); 744 return ReturnError(IfdefEntry.SrcPos, "Previous #else is here"); 745 } 746 747 // Replace the corresponding #ifdef's control with its negation 748 // on the control stack. 749 PrepIncludeStack.back()->pop_back(); 750 PrepIncludeStack.back()->push_back( 751 {Kind, !IfdefEntry.IsDefined, SMLoc::getFromPointer(TokStart)}); 752 753 if (!prepSkipDirectiveEnd()) 754 return ReturnError(CurPtr, "Only comments are supported after #else"); 755 756 // If we were processing tokens before this #else, 757 // we have to start skipping lines until the matching #endif. 758 if (ReturnNextLiveToken) { 759 if (prepSkipRegion(ReturnNextLiveToken)) 760 return LexToken(); 761 762 return tgtok::Error; 763 } 764 765 // Return to the lines skipping code. 766 return Kind; 767 } else if (Kind == tgtok::Endif) { 768 // Check if this #endif is correct before calling prepSkipDirectiveEnd(), 769 // which will move CurPtr away from the beginning of #endif. 770 if (PrepIncludeStack.back()->empty()) 771 return ReturnError(TokStart, "#endif without #ifdef"); 772 773 auto &IfdefOrElseEntry = PrepIncludeStack.back()->back(); 774 775 if (IfdefOrElseEntry.Kind != tgtok::Ifdef && 776 IfdefOrElseEntry.Kind != tgtok::Else) { 777 PrintFatalError("Invalid preprocessor control on the stack"); 778 return tgtok::Error; 779 } 780 781 if (!prepSkipDirectiveEnd()) 782 return ReturnError(CurPtr, "Only comments are supported after #endif"); 783 784 PrepIncludeStack.back()->pop_back(); 785 786 // If we were processing tokens before this #endif, then 787 // we should continue it. 788 if (ReturnNextLiveToken) { 789 return LexToken(); 790 } 791 792 // Return to the lines skipping code. 793 return Kind; 794 } else if (Kind == tgtok::Define) { 795 StringRef MacroName = prepLexMacroName(); 796 if (MacroName.empty()) 797 return ReturnError(TokStart, "Expected macro name after #define"); 798 799 if (!DefinedMacros.insert(MacroName).second) 800 PrintWarning(getLoc(), 801 "Duplicate definition of macro: " + Twine(MacroName)); 802 803 if (!prepSkipDirectiveEnd()) 804 return ReturnError(CurPtr, 805 "Only comments are supported after #define NAME"); 806 807 if (!ReturnNextLiveToken) { 808 PrintFatalError("#define must be ignored during the lines skipping"); 809 return tgtok::Error; 810 } 811 812 return LexToken(); 813 } 814 815 PrintFatalError("Preprocessing directive is not supported"); 816 return tgtok::Error; 817 } 818 819 bool TGLexer::prepSkipRegion(bool MustNeverBeFalse) { 820 if (!MustNeverBeFalse) 821 PrintFatalError("Invalid recursion."); 822 823 do { 824 // Skip all symbols to the line end. 825 prepSkipToLineEnd(); 826 827 // Find the first non-whitespace symbol in the next line(s). 828 if (!prepSkipLineBegin()) 829 return false; 830 831 // If the first non-blank/comment symbol on the line is '#', 832 // it may be a start of preprocessing directive. 833 // 834 // If it is not '#' just go to the next line. 835 if (*CurPtr == '#') 836 ++CurPtr; 837 else 838 continue; 839 840 tgtok::TokKind Kind = prepIsDirective(); 841 842 // If we did not find a preprocessing directive or it is #define, 843 // then just skip to the next line. We do not have to do anything 844 // for #define in the line-skipping mode. 845 if (Kind == tgtok::Error || Kind == tgtok::Define) 846 continue; 847 848 tgtok::TokKind ProcessedKind = lexPreprocessor(Kind, false); 849 850 // If lexPreprocessor() encountered an error during lexing this 851 // preprocessor idiom, then return false to the calling lexPreprocessor(). 852 // This will force tgtok::Error to be returned to the tokens processing. 853 if (ProcessedKind == tgtok::Error) 854 return false; 855 856 if (Kind != ProcessedKind) 857 PrintFatalError("prepIsDirective() and lexPreprocessor() " 858 "returned different token kinds"); 859 860 // If this preprocessing directive enables tokens processing, 861 // then return to the lexPreprocessor() and get to the next token. 862 // We can move from line-skipping mode to processing tokens only 863 // due to #else or #endif. 864 if (prepIsProcessingEnabled()) { 865 if (Kind != tgtok::Else && Kind != tgtok::Endif) { 866 PrintFatalError("Tokens processing was enabled by an unexpected " 867 "preprocessing directive"); 868 return false; 869 } 870 871 return true; 872 } 873 } while (CurPtr != CurBuf.end()); 874 875 // We have reached the end of the file, but never left the lines-skipping 876 // mode. This means there is no matching #endif. 877 prepReportPreprocessorStackError(); 878 return false; 879 } 880 881 StringRef TGLexer::prepLexMacroName() { 882 // Skip whitespaces between the preprocessing directive and the macro name. 883 while (*CurPtr == ' ' || *CurPtr == '\t') 884 ++CurPtr; 885 886 TokStart = CurPtr; 887 // Macro names start with [a-zA-Z_]. 888 if (*CurPtr != '_' && !isalpha(*CurPtr)) 889 return ""; 890 891 // Match the rest of the identifier regex: [0-9a-zA-Z_]* 892 while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_') 893 ++CurPtr; 894 895 return StringRef(TokStart, CurPtr - TokStart); 896 } 897 898 bool TGLexer::prepSkipLineBegin() { 899 while (CurPtr != CurBuf.end()) { 900 switch (*CurPtr) { 901 case ' ': 902 case '\t': 903 case '\n': 904 case '\r': 905 break; 906 907 case '/': { 908 int NextChar = peekNextChar(1); 909 if (NextChar == '*') { 910 // Skip C-style comment. 911 // Note that we do not care about skipping the C++-style comments. 912 // If the line contains "//", it may not contain any processable 913 // preprocessing directive. Just return CurPtr pointing to 914 // the first '/' in this case. We also do not care about 915 // incorrect symbols after the first '/' - we are in lines-skipping 916 // mode, so incorrect code is allowed to some extent. 917 918 // Set TokStart to the beginning of the comment to enable proper 919 // diagnostic printing in case of error in SkipCComment(). 920 TokStart = CurPtr; 921 922 // CurPtr must point to '*' before call to SkipCComment(). 923 ++CurPtr; 924 if (SkipCComment()) 925 return false; 926 } else { 927 // CurPtr points to the non-whitespace '/'. 928 return true; 929 } 930 931 // We must not increment CurPtr after the comment was lexed. 932 continue; 933 } 934 935 default: 936 return true; 937 } 938 939 ++CurPtr; 940 } 941 942 // We have reached the end of the file. Return to the lines skipping 943 // code, and allow it to handle the EOF as needed. 944 return true; 945 } 946 947 bool TGLexer::prepSkipDirectiveEnd() { 948 while (CurPtr != CurBuf.end()) { 949 switch (*CurPtr) { 950 case ' ': 951 case '\t': 952 break; 953 954 case '\n': 955 case '\r': 956 return true; 957 958 case '/': { 959 int NextChar = peekNextChar(1); 960 if (NextChar == '/') { 961 // Skip C++-style comment. 962 // We may just return true now, but let's skip to the line/buffer end 963 // to simplify the method specification. 964 ++CurPtr; 965 SkipBCPLComment(); 966 } else if (NextChar == '*') { 967 // When we are skipping C-style comment at the end of a preprocessing 968 // directive, we can skip several lines. If any meaningful TD token 969 // follows the end of the C-style comment on the same line, it will 970 // be considered as an invalid usage of TD token. 971 // For example, we want to forbid usages like this one: 972 // #define MACRO class Class {} 973 // But with C-style comments we also disallow the following: 974 // #define MACRO /* This macro is used 975 // to ... */ class Class {} 976 // One can argue that this should be allowed, but it does not seem 977 // to be worth of the complication. Moreover, this matches 978 // the C preprocessor behavior. 979 980 // Set TokStart to the beginning of the comment to enable proper 981 // diagnostic printer in case of error in SkipCComment(). 982 TokStart = CurPtr; 983 ++CurPtr; 984 if (SkipCComment()) 985 return false; 986 } else { 987 TokStart = CurPtr; 988 PrintError(CurPtr, "Unexpected character"); 989 return false; 990 } 991 992 // We must not increment CurPtr after the comment was lexed. 993 continue; 994 } 995 996 default: 997 // Do not allow any non-whitespaces after the directive. 998 TokStart = CurPtr; 999 return false; 1000 } 1001 1002 ++CurPtr; 1003 } 1004 1005 return true; 1006 } 1007 1008 void TGLexer::prepSkipToLineEnd() { 1009 while (*CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) 1010 ++CurPtr; 1011 } 1012 1013 bool TGLexer::prepIsProcessingEnabled() { 1014 for (auto I = PrepIncludeStack.back()->rbegin(), 1015 E = PrepIncludeStack.back()->rend(); 1016 I != E; ++I) { 1017 if (!I->IsDefined) 1018 return false; 1019 } 1020 1021 return true; 1022 } 1023 1024 void TGLexer::prepReportPreprocessorStackError() { 1025 if (PrepIncludeStack.back()->empty()) 1026 PrintFatalError("prepReportPreprocessorStackError() called with " 1027 "empty control stack"); 1028 1029 auto &PrepControl = PrepIncludeStack.back()->back(); 1030 PrintError(CurBuf.end(), "Reached EOF without matching #endif"); 1031 PrintError(PrepControl.SrcPos, "The latest preprocessor control is here"); 1032 1033 TokStart = CurPtr; 1034 } 1035