1 //===-- lib/Parser/prescan.cpp --------------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "prescan.h" 10 #include "preprocessor.h" 11 #include "token-sequence.h" 12 #include "flang/Common/idioms.h" 13 #include "flang/Parser/characters.h" 14 #include "flang/Parser/message.h" 15 #include "flang/Parser/source.h" 16 #include "llvm/Support/raw_ostream.h" 17 #include <cstddef> 18 #include <cstring> 19 #include <utility> 20 #include <vector> 21 22 namespace Fortran::parser { 23 24 using common::LanguageFeature; 25 26 static constexpr int maxPrescannerNesting{100}; 27 28 Prescanner::Prescanner(Messages &messages, CookedSource &cooked, 29 Preprocessor &preprocessor, common::LanguageFeatureControl lfc) 30 : messages_{messages}, cooked_{cooked}, preprocessor_{preprocessor}, 31 allSources_{preprocessor_.allSources()}, features_{lfc}, 32 encoding_{allSources_.encoding()} {} 33 34 Prescanner::Prescanner(const Prescanner &that) 35 : messages_{that.messages_}, cooked_{that.cooked_}, 36 preprocessor_{that.preprocessor_}, allSources_{that.allSources_}, 37 features_{that.features_}, inFixedForm_{that.inFixedForm_}, 38 fixedFormColumnLimit_{that.fixedFormColumnLimit_}, 39 encoding_{that.encoding_}, prescannerNesting_{that.prescannerNesting_ + 40 1}, 41 skipLeadingAmpersand_{that.skipLeadingAmpersand_}, 42 compilerDirectiveBloomFilter_{that.compilerDirectiveBloomFilter_}, 43 compilerDirectiveSentinels_{that.compilerDirectiveSentinels_} {} 44 45 static inline constexpr bool IsFixedFormCommentChar(char ch) { 46 return ch == '!' || ch == '*' || ch == 'C' || ch == 'c'; 47 } 48 49 static void NormalizeCompilerDirectiveCommentMarker(TokenSequence &dir) { 50 char *p{dir.GetMutableCharData()}; 51 char *limit{p + dir.SizeInChars()}; 52 for (; p < limit; ++p) { 53 if (*p != ' ') { 54 CHECK(IsFixedFormCommentChar(*p)); 55 *p = '!'; 56 return; 57 } 58 } 59 DIE("compiler directive all blank"); 60 } 61 62 void Prescanner::Prescan(ProvenanceRange range) { 63 startProvenance_ = range.start(); 64 start_ = allSources_.GetSource(range); 65 CHECK(start_); 66 limit_ = start_ + range.size(); 67 nextLine_ = start_; 68 const bool beganInFixedForm{inFixedForm_}; 69 if (prescannerNesting_ > maxPrescannerNesting) { 70 Say(GetProvenance(start_), 71 "too many nested INCLUDE/#include files, possibly circular"_err_en_US); 72 return; 73 } 74 while (!IsAtEnd()) { 75 Statement(); 76 } 77 if (inFixedForm_ != beganInFixedForm) { 78 std::string dir{"!dir$ "}; 79 if (beganInFixedForm) { 80 dir += "fixed"; 81 } else { 82 dir += "free"; 83 } 84 dir += '\n'; 85 TokenSequence tokens{dir, allSources_.AddCompilerInsertion(dir).start()}; 86 tokens.Emit(cooked_); 87 } 88 } 89 90 void Prescanner::Statement() { 91 TokenSequence tokens; 92 LineClassification line{ClassifyLine(nextLine_)}; 93 switch (line.kind) { 94 case LineClassification::Kind::Comment: 95 nextLine_ += line.payloadOffset; // advance to '!' or newline 96 NextLine(); 97 return; 98 case LineClassification::Kind::IncludeLine: 99 FortranInclude(nextLine_ + line.payloadOffset); 100 NextLine(); 101 return; 102 case LineClassification::Kind::ConditionalCompilationDirective: 103 case LineClassification::Kind::IncludeDirective: 104 case LineClassification::Kind::DefinitionDirective: 105 case LineClassification::Kind::PreprocessorDirective: 106 preprocessor_.Directive(TokenizePreprocessorDirective(), *this); 107 return; 108 case LineClassification::Kind::CompilerDirective: 109 directiveSentinel_ = line.sentinel; 110 CHECK(InCompilerDirective()); 111 BeginStatementAndAdvance(); 112 if (inFixedForm_) { 113 CHECK(IsFixedFormCommentChar(*at_)); 114 } else { 115 while (*at_ == ' ' || *at_ == '\t') { 116 ++at_, ++column_; 117 } 118 CHECK(*at_ == '!'); 119 } 120 if (directiveSentinel_[0] == '$' && directiveSentinel_[1] == '\0') { 121 // OpenMP conditional compilation line. Remove the sentinel and then 122 // treat the line as if it were normal source. 123 at_ += 2, column_ += 2; 124 if (inFixedForm_) { 125 LabelField(tokens); 126 } else { 127 SkipSpaces(); 128 } 129 } else { 130 // Compiler directive. Emit normalized sentinel. 131 EmitChar(tokens, '!'); 132 ++at_, ++column_; 133 for (const char *sp{directiveSentinel_}; *sp != '\0'; 134 ++sp, ++at_, ++column_) { 135 EmitChar(tokens, *sp); 136 } 137 if (*at_ == ' ') { 138 EmitChar(tokens, ' '); 139 ++at_, ++column_; 140 } 141 tokens.CloseToken(); 142 } 143 break; 144 case LineClassification::Kind::Source: 145 BeginStatementAndAdvance(); 146 if (inFixedForm_) { 147 if (features_.IsEnabled(LanguageFeature::OldDebugLines) && 148 (*at_ == 'D' || *at_ == 'd')) { 149 NextChar(); 150 } 151 LabelField(tokens); 152 } else if (skipLeadingAmpersand_) { 153 skipLeadingAmpersand_ = false; 154 const char *p{SkipWhiteSpace(at_)}; 155 if (p < limit_ && *p == '&') { 156 column_ += ++p - at_; 157 at_ = p; 158 } 159 } else { 160 SkipSpaces(); 161 } 162 break; 163 } 164 165 while (NextToken(tokens)) { 166 } 167 168 Provenance newlineProvenance{GetCurrentProvenance()}; 169 if (std::optional<TokenSequence> preprocessed{ 170 preprocessor_.MacroReplacement(tokens, *this)}) { 171 // Reprocess the preprocessed line. Append a newline temporarily. 172 preprocessed->PutNextTokenChar('\n', newlineProvenance); 173 preprocessed->CloseToken(); 174 const char *ppd{preprocessed->ToCharBlock().begin()}; 175 LineClassification ppl{ClassifyLine(ppd)}; 176 preprocessed->pop_back(); // remove the newline 177 switch (ppl.kind) { 178 case LineClassification::Kind::Comment: 179 break; 180 case LineClassification::Kind::IncludeLine: 181 FortranInclude(ppd + ppl.payloadOffset); 182 break; 183 case LineClassification::Kind::ConditionalCompilationDirective: 184 case LineClassification::Kind::IncludeDirective: 185 case LineClassification::Kind::DefinitionDirective: 186 case LineClassification::Kind::PreprocessorDirective: 187 Say(preprocessed->GetProvenanceRange(), 188 "Preprocessed line resembles a preprocessor directive"_warn_en_US); 189 preprocessed->ToLowerCase() 190 .CheckBadFortranCharacters(messages_) 191 .CheckBadParentheses(messages_) 192 .Emit(cooked_); 193 break; 194 case LineClassification::Kind::CompilerDirective: 195 if (preprocessed->HasRedundantBlanks()) { 196 preprocessed->RemoveRedundantBlanks(); 197 } 198 NormalizeCompilerDirectiveCommentMarker(*preprocessed); 199 preprocessed->ToLowerCase(); 200 SourceFormChange(preprocessed->ToString()); 201 preprocessed->ClipComment(true /* skip first ! */) 202 .CheckBadFortranCharacters(messages_) 203 .CheckBadParentheses(messages_) 204 .Emit(cooked_); 205 break; 206 case LineClassification::Kind::Source: 207 if (inFixedForm_) { 208 if (preprocessed->HasBlanks(/*after column*/ 6)) { 209 preprocessed->RemoveBlanks(/*after column*/ 6); 210 } 211 } else { 212 if (preprocessed->HasRedundantBlanks()) { 213 preprocessed->RemoveRedundantBlanks(); 214 } 215 } 216 preprocessed->ToLowerCase() 217 .ClipComment() 218 .CheckBadFortranCharacters(messages_) 219 .CheckBadParentheses(messages_) 220 .Emit(cooked_); 221 break; 222 } 223 } else { 224 tokens.ToLowerCase(); 225 if (line.kind == LineClassification::Kind::CompilerDirective) { 226 SourceFormChange(tokens.ToString()); 227 } 228 if (inFixedForm_ && line.kind == LineClassification::Kind::Source) { 229 EnforceStupidEndStatementRules(tokens); 230 } 231 tokens.CheckBadFortranCharacters(messages_) 232 .CheckBadParentheses(messages_) 233 .Emit(cooked_); 234 } 235 if (omitNewline_) { 236 omitNewline_ = false; 237 } else { 238 cooked_.Put('\n', newlineProvenance); 239 } 240 directiveSentinel_ = nullptr; 241 } 242 243 TokenSequence Prescanner::TokenizePreprocessorDirective() { 244 CHECK(!IsAtEnd() && !inPreprocessorDirective_); 245 inPreprocessorDirective_ = true; 246 BeginStatementAndAdvance(); 247 TokenSequence tokens; 248 while (NextToken(tokens)) { 249 } 250 inPreprocessorDirective_ = false; 251 return tokens; 252 } 253 254 void Prescanner::NextLine() { 255 void *vstart{static_cast<void *>(const_cast<char *>(nextLine_))}; 256 void *v{std::memchr(vstart, '\n', limit_ - nextLine_)}; 257 if (!v) { 258 nextLine_ = limit_; 259 } else { 260 const char *nl{const_cast<const char *>(static_cast<char *>(v))}; 261 nextLine_ = nl + 1; 262 } 263 } 264 265 void Prescanner::LabelField(TokenSequence &token) { 266 const char *bad{nullptr}; 267 int outCol{1}; 268 const char *start{at_}; 269 for (; *at_ != '\n' && column_ <= 6; ++at_) { 270 if (*at_ == '\t') { 271 ++at_; 272 column_ = 7; 273 break; 274 } 275 if (*at_ != ' ' && 276 !(*at_ == '0' && column_ == 6)) { // '0' in column 6 becomes space 277 EmitChar(token, *at_); 278 ++outCol; 279 if (!bad && !IsDecimalDigit(*at_)) { 280 bad = at_; 281 } 282 } 283 ++column_; 284 } 285 if (bad && !preprocessor_.IsNameDefined(token.CurrentOpenToken())) { 286 Say(GetProvenance(bad), 287 "Character in fixed-form label field must be a digit"_warn_en_US); 288 token.clear(); 289 at_ = start; 290 return; 291 } 292 if (outCol == 1) { // empty label field 293 // Emit a space so that, if the line is rescanned after preprocessing, 294 // a leading 'C' or 'D' won't be left-justified and then accidentally 295 // misinterpreted as a comment card. 296 EmitChar(token, ' '); 297 ++outCol; 298 } 299 token.CloseToken(); 300 SkipToNextSignificantCharacter(); 301 if (IsDecimalDigit(*at_)) { 302 Say(GetProvenance(at_), 303 "Label digit is not in fixed-form label field"_port_en_US); 304 } 305 } 306 307 // 6.3.3.5: A program unit END statement, or any other statement whose 308 // initial line resembles an END statement, shall not be continued in 309 // fixed form source. 310 void Prescanner::EnforceStupidEndStatementRules(const TokenSequence &tokens) { 311 CharBlock cBlock{tokens.ToCharBlock()}; 312 const char *str{cBlock.begin()}; 313 std::size_t n{cBlock.size()}; 314 if (n < 3) { 315 return; 316 } 317 std::size_t j{0}; 318 for (; j < n && (str[j] == ' ' || (str[j] >= '0' && str[j] <= '9')); ++j) { 319 } 320 if (j + 3 > n || std::memcmp(str + j, "end", 3) != 0) { 321 return; 322 } 323 // It starts with END, possibly after a label. 324 auto start{allSources_.GetSourcePosition(tokens.GetCharProvenance(j))}; 325 auto end{allSources_.GetSourcePosition(tokens.GetCharProvenance(n - 1))}; 326 if (!start || !end) { 327 return; 328 } 329 if (&start->file == &end->file && start->line == end->line) { 330 return; // no continuation 331 } 332 j += 3; 333 static const char *const prefixes[]{"program", "subroutine", "function", 334 "blockdata", "module", "submodule", nullptr}; 335 bool isPrefix{j == n || !IsLegalInIdentifier(str[j])}; // prefix is END 336 std::size_t endOfPrefix{j - 1}; 337 for (const char *const *p{prefixes}; *p; ++p) { 338 std::size_t pLen{std::strlen(*p)}; 339 if (j + pLen <= n && std::memcmp(str + j, *p, pLen) == 0) { 340 isPrefix = true; // END thing as prefix 341 j += pLen; 342 endOfPrefix = j - 1; 343 for (; j < n && IsLegalInIdentifier(str[j]); ++j) { 344 } 345 break; 346 } 347 } 348 if (isPrefix) { 349 auto range{tokens.GetTokenProvenanceRange(1)}; 350 if (j == n) { // END or END thing [name] 351 Say(range, 352 "Program unit END statement may not be continued in fixed form source"_err_en_US); 353 } else { 354 auto endOfPrefixPos{ 355 allSources_.GetSourcePosition(tokens.GetCharProvenance(endOfPrefix))}; 356 auto next{allSources_.GetSourcePosition(tokens.GetCharProvenance(j))}; 357 if (endOfPrefixPos && next && &endOfPrefixPos->file == &start->file && 358 endOfPrefixPos->line == start->line && 359 (&next->file != &start->file || next->line != start->line)) { 360 Say(range, 361 "Initial line of continued statement must not appear to be a program unit END in fixed form source"_err_en_US); 362 } 363 } 364 } 365 } 366 367 void Prescanner::SkipToEndOfLine() { 368 while (*at_ != '\n') { 369 ++at_, ++column_; 370 } 371 } 372 373 bool Prescanner::MustSkipToEndOfLine() const { 374 if (inFixedForm_ && column_ > fixedFormColumnLimit_ && !tabInCurrentLine_) { 375 return true; // skip over ignored columns in right margin (73:80) 376 } else if (*at_ == '!' && !inCharLiteral_) { 377 return true; // inline comment goes to end of source line 378 } else { 379 return false; 380 } 381 } 382 383 void Prescanner::NextChar() { 384 CHECK(*at_ != '\n'); 385 ++at_, ++column_; 386 while (at_[0] == '\xef' && at_[1] == '\xbb' && at_[2] == '\xbf') { 387 // UTF-8 byte order mark - treat this file as UTF-8 388 at_ += 3; 389 encoding_ = Encoding::UTF_8; 390 } 391 SkipToNextSignificantCharacter(); 392 } 393 394 // Skip everything that should be ignored until the next significant 395 // character is reached; handles C-style comments in preprocessing 396 // directives, Fortran ! comments, stuff after the right margin in 397 // fixed form, and all forms of line continuation. 398 void Prescanner::SkipToNextSignificantCharacter() { 399 if (inPreprocessorDirective_) { 400 SkipCComments(); 401 } else { 402 bool mightNeedSpace{false}; 403 if (MustSkipToEndOfLine()) { 404 SkipToEndOfLine(); 405 } else { 406 mightNeedSpace = *at_ == '\n'; 407 } 408 for (; Continuation(mightNeedSpace); mightNeedSpace = false) { 409 if (MustSkipToEndOfLine()) { 410 SkipToEndOfLine(); 411 } 412 } 413 if (*at_ == '\t') { 414 tabInCurrentLine_ = true; 415 } 416 } 417 } 418 419 void Prescanner::SkipCComments() { 420 while (true) { 421 if (IsCComment(at_)) { 422 if (const char *after{SkipCComment(at_)}) { 423 column_ += after - at_; 424 // May have skipped over one or more newlines; relocate the start of 425 // the next line. 426 nextLine_ = at_ = after; 427 NextLine(); 428 } else { 429 // Don't emit any messages about unclosed C-style comments, because 430 // the sequence /* can appear legally in a FORMAT statement. There's 431 // no ambiguity, since the sequence */ cannot appear legally. 432 break; 433 } 434 } else if (inPreprocessorDirective_ && at_[0] == '\\' && at_ + 2 < limit_ && 435 at_[1] == '\n' && !IsAtEnd()) { 436 BeginSourceLineAndAdvance(); 437 } else { 438 break; 439 } 440 } 441 } 442 443 void Prescanner::SkipSpaces() { 444 while (*at_ == ' ' || *at_ == '\t') { 445 NextChar(); 446 } 447 insertASpace_ = false; 448 } 449 450 const char *Prescanner::SkipWhiteSpace(const char *p) { 451 while (*p == ' ' || *p == '\t') { 452 ++p; 453 } 454 return p; 455 } 456 457 const char *Prescanner::SkipWhiteSpaceAndCComments(const char *p) const { 458 while (true) { 459 if (*p == ' ' || *p == '\t') { 460 ++p; 461 } else if (IsCComment(p)) { 462 if (const char *after{SkipCComment(p)}) { 463 p = after; 464 } else { 465 break; 466 } 467 } else { 468 break; 469 } 470 } 471 return p; 472 } 473 474 const char *Prescanner::SkipCComment(const char *p) const { 475 char star{' '}, slash{' '}; 476 p += 2; 477 while (star != '*' || slash != '/') { 478 if (p >= limit_) { 479 return nullptr; // signifies an unterminated comment 480 } 481 star = slash; 482 slash = *p++; 483 } 484 return p; 485 } 486 487 bool Prescanner::NextToken(TokenSequence &tokens) { 488 CHECK(at_ >= start_ && at_ < limit_); 489 if (InFixedFormSource()) { 490 SkipSpaces(); 491 } else { 492 if (*at_ == '/' && IsCComment(at_)) { 493 // Recognize and skip over classic C style /*comments*/ when 494 // outside a character literal. 495 if (features_.ShouldWarn(LanguageFeature::ClassicCComments)) { 496 Say(GetProvenance(at_), 497 "nonstandard usage: C-style comment"_port_en_US); 498 } 499 SkipCComments(); 500 } 501 if (*at_ == ' ' || *at_ == '\t') { 502 // Compress free-form white space into a single space character. 503 const auto theSpace{at_}; 504 char previous{at_ <= start_ ? ' ' : at_[-1]}; 505 NextChar(); 506 SkipSpaces(); 507 if (*at_ == '\n') { 508 // Discard white space at the end of a line. 509 } else if (!inPreprocessorDirective_ && 510 (previous == '(' || *at_ == '(' || *at_ == ')')) { 511 // Discard white space before/after '(' and before ')', unless in a 512 // preprocessor directive. This helps yield space-free contiguous 513 // names for generic interfaces like OPERATOR( + ) and 514 // READ ( UNFORMATTED ), without misinterpreting #define f (notAnArg). 515 // This has the effect of silently ignoring the illegal spaces in 516 // the array constructor ( /1,2/ ) but that seems benign; it's 517 // hard to avoid that while still removing spaces from OPERATOR( / ) 518 // and OPERATOR( // ). 519 } else { 520 // Preserve the squashed white space as a single space character. 521 tokens.PutNextTokenChar(' ', GetProvenance(theSpace)); 522 tokens.CloseToken(); 523 return true; 524 } 525 } 526 } 527 if (insertASpace_) { 528 tokens.PutNextTokenChar(' ', spaceProvenance_); 529 insertASpace_ = false; 530 } 531 if (*at_ == '\n') { 532 return false; 533 } 534 const char *start{at_}; 535 if (*at_ == '\'' || *at_ == '"') { 536 QuotedCharacterLiteral(tokens, start); 537 preventHollerith_ = false; 538 } else if (IsDecimalDigit(*at_)) { 539 int n{0}, digits{0}; 540 static constexpr int maxHollerith{256 /*lines*/ * (132 - 6 /*columns*/)}; 541 do { 542 if (n < maxHollerith) { 543 n = 10 * n + DecimalDigitValue(*at_); 544 } 545 EmitCharAndAdvance(tokens, *at_); 546 ++digits; 547 if (InFixedFormSource()) { 548 SkipSpaces(); 549 } 550 } while (IsDecimalDigit(*at_)); 551 if ((*at_ == 'h' || *at_ == 'H') && n > 0 && n < maxHollerith && 552 !preventHollerith_) { 553 Hollerith(tokens, n, start); 554 } else if (*at_ == '.') { 555 while (IsDecimalDigit(EmitCharAndAdvance(tokens, *at_))) { 556 } 557 ExponentAndKind(tokens); 558 } else if (ExponentAndKind(tokens)) { 559 } else if (digits == 1 && n == 0 && (*at_ == 'x' || *at_ == 'X') && 560 inPreprocessorDirective_) { 561 do { 562 EmitCharAndAdvance(tokens, *at_); 563 } while (IsHexadecimalDigit(*at_)); 564 } else if (IsLetter(*at_)) { 565 // Handles FORMAT(3I9HHOLLERITH) by skipping over the first I so that 566 // we don't misrecognize I9HOLLERITH as an identifier in the next case. 567 EmitCharAndAdvance(tokens, *at_); 568 } else if (at_[0] == '_' && (at_[1] == '\'' || at_[1] == '"')) { // 4_"..." 569 EmitCharAndAdvance(tokens, *at_); 570 QuotedCharacterLiteral(tokens, start); 571 } 572 preventHollerith_ = false; 573 } else if (*at_ == '.') { 574 char nch{EmitCharAndAdvance(tokens, '.')}; 575 if (!inPreprocessorDirective_ && IsDecimalDigit(nch)) { 576 while (IsDecimalDigit(EmitCharAndAdvance(tokens, *at_))) { 577 } 578 ExponentAndKind(tokens); 579 } else if (nch == '.' && EmitCharAndAdvance(tokens, '.') == '.') { 580 EmitCharAndAdvance(tokens, '.'); // variadic macro definition ellipsis 581 } 582 preventHollerith_ = false; 583 } else if (IsLegalInIdentifier(*at_)) { 584 do { 585 } while (IsLegalInIdentifier(EmitCharAndAdvance(tokens, *at_))); 586 if ((*at_ == '\'' || *at_ == '"') && 587 tokens.CharAt(tokens.SizeInChars() - 1) == '_') { // kind_"..." 588 QuotedCharacterLiteral(tokens, start); 589 } 590 preventHollerith_ = false; 591 } else if (*at_ == '*') { 592 if (EmitCharAndAdvance(tokens, '*') == '*') { 593 EmitCharAndAdvance(tokens, '*'); 594 } else { 595 // Subtle ambiguity: 596 // CHARACTER*2H declares H because *2 is a kind specifier 597 // DATAC/N*2H / is repeated Hollerith 598 preventHollerith_ = !slashInCurrentStatement_; 599 } 600 } else { 601 char ch{*at_}; 602 if (ch == '(' || ch == '[') { 603 ++delimiterNesting_; 604 } else if ((ch == ')' || ch == ']') && delimiterNesting_ > 0) { 605 --delimiterNesting_; 606 } 607 char nch{EmitCharAndAdvance(tokens, ch)}; 608 preventHollerith_ = false; 609 if ((nch == '=' && 610 (ch == '<' || ch == '>' || ch == '/' || ch == '=' || ch == '!')) || 611 (ch == nch && 612 (ch == '/' || ch == ':' || ch == '*' || ch == '#' || ch == '&' || 613 ch == '|' || ch == '<' || ch == '>')) || 614 (ch == '=' && nch == '>')) { 615 // token comprises two characters 616 EmitCharAndAdvance(tokens, nch); 617 } else if (ch == '/') { 618 slashInCurrentStatement_ = true; 619 } 620 } 621 tokens.CloseToken(); 622 return true; 623 } 624 625 bool Prescanner::ExponentAndKind(TokenSequence &tokens) { 626 char ed{ToLowerCaseLetter(*at_)}; 627 if (ed != 'e' && ed != 'd') { 628 return false; 629 } 630 EmitCharAndAdvance(tokens, ed); 631 if (*at_ == '+' || *at_ == '-') { 632 EmitCharAndAdvance(tokens, *at_); 633 } 634 while (IsDecimalDigit(*at_)) { 635 EmitCharAndAdvance(tokens, *at_); 636 } 637 if (*at_ == '_') { 638 while (IsLegalInIdentifier(EmitCharAndAdvance(tokens, *at_))) { 639 } 640 } 641 return true; 642 } 643 644 void Prescanner::QuotedCharacterLiteral( 645 TokenSequence &tokens, const char *start) { 646 char quote{*at_}; 647 const char *end{at_ + 1}; 648 inCharLiteral_ = true; 649 const auto emit{[&](char ch) { EmitChar(tokens, ch); }}; 650 const auto insert{[&](char ch) { EmitInsertedChar(tokens, ch); }}; 651 bool isEscaped{false}; 652 bool escapesEnabled{features_.IsEnabled(LanguageFeature::BackslashEscapes)}; 653 while (true) { 654 if (*at_ == '\\') { 655 if (escapesEnabled) { 656 isEscaped = !isEscaped; 657 } else { 658 // The parser always processes escape sequences, so don't confuse it 659 // when escapes are disabled. 660 insert('\\'); 661 } 662 } else { 663 isEscaped = false; 664 } 665 EmitQuotedChar(static_cast<unsigned char>(*at_), emit, insert, false, 666 Encoding::LATIN_1); 667 while (PadOutCharacterLiteral(tokens)) { 668 } 669 if (*at_ == '\n') { 670 if (!inPreprocessorDirective_) { 671 Say(GetProvenanceRange(start, end), 672 "Incomplete character literal"_err_en_US); 673 } 674 break; 675 } 676 end = at_ + 1; 677 NextChar(); 678 if (*at_ == quote && !isEscaped) { 679 // A doubled unescaped quote mark becomes a single instance of that 680 // quote character in the literal (later). There can be spaces between 681 // the quotes in fixed form source. 682 EmitChar(tokens, quote); 683 inCharLiteral_ = false; // for cases like print *, '...'!comment 684 NextChar(); 685 if (InFixedFormSource()) { 686 SkipSpaces(); 687 } 688 if (*at_ != quote) { 689 break; 690 } 691 inCharLiteral_ = true; 692 } 693 } 694 inCharLiteral_ = false; 695 } 696 697 void Prescanner::Hollerith( 698 TokenSequence &tokens, int count, const char *start) { 699 inCharLiteral_ = true; 700 CHECK(*at_ == 'h' || *at_ == 'H'); 701 EmitChar(tokens, 'H'); 702 while (count-- > 0) { 703 if (PadOutCharacterLiteral(tokens)) { 704 } else if (*at_ == '\n') { 705 Say(GetProvenanceRange(start, at_), 706 "Possible truncated Hollerith literal"_warn_en_US); 707 break; 708 } else { 709 NextChar(); 710 // Each multi-byte character encoding counts as a single character. 711 // No escape sequences are recognized. 712 // Hollerith is always emitted to the cooked character 713 // stream in UTF-8. 714 DecodedCharacter decoded{DecodeCharacter( 715 encoding_, at_, static_cast<std::size_t>(limit_ - at_), false)}; 716 if (decoded.bytes > 0) { 717 EncodedCharacter utf8{ 718 EncodeCharacter<Encoding::UTF_8>(decoded.codepoint)}; 719 for (int j{0}; j < utf8.bytes; ++j) { 720 EmitChar(tokens, utf8.buffer[j]); 721 } 722 at_ += decoded.bytes - 1; 723 } else { 724 Say(GetProvenanceRange(start, at_), 725 "Bad character in Hollerith literal"_err_en_US); 726 break; 727 } 728 } 729 } 730 if (*at_ != '\n') { 731 NextChar(); 732 } 733 inCharLiteral_ = false; 734 } 735 736 // In fixed form, source card images must be processed as if they were at 737 // least 72 columns wide, at least in character literal contexts. 738 bool Prescanner::PadOutCharacterLiteral(TokenSequence &tokens) { 739 while (inFixedForm_ && !tabInCurrentLine_ && at_[1] == '\n') { 740 if (column_ < fixedFormColumnLimit_) { 741 tokens.PutNextTokenChar(' ', spaceProvenance_); 742 ++column_; 743 return true; 744 } 745 if (!FixedFormContinuation(false /*no need to insert space*/) || 746 tabInCurrentLine_) { 747 return false; 748 } 749 CHECK(column_ == 7); 750 --at_; // point to column 6 of continuation line 751 column_ = 6; 752 } 753 return false; 754 } 755 756 bool Prescanner::IsFixedFormCommentLine(const char *start) const { 757 const char *p{start}; 758 if (IsFixedFormCommentChar(*p) || *p == '%' || // VAX %list, %eject, &c. 759 ((*p == 'D' || *p == 'd') && 760 !features_.IsEnabled(LanguageFeature::OldDebugLines))) { 761 return true; 762 } 763 bool anyTabs{false}; 764 while (true) { 765 if (*p == ' ') { 766 ++p; 767 } else if (*p == '\t') { 768 anyTabs = true; 769 ++p; 770 } else if (*p == '0' && !anyTabs && p == start + 5) { 771 ++p; // 0 in column 6 must treated as a space 772 } else { 773 break; 774 } 775 } 776 if (!anyTabs && p >= start + fixedFormColumnLimit_) { 777 return true; 778 } 779 if (*p == '!' && !inCharLiteral_ && (anyTabs || p != start + 5)) { 780 return true; 781 } 782 return *p == '\n'; 783 } 784 785 const char *Prescanner::IsFreeFormComment(const char *p) const { 786 p = SkipWhiteSpaceAndCComments(p); 787 if (*p == '!' || *p == '\n') { 788 return p; 789 } else { 790 return nullptr; 791 } 792 } 793 794 std::optional<std::size_t> Prescanner::IsIncludeLine(const char *start) const { 795 const char *p{SkipWhiteSpace(start)}; 796 for (char ch : "include"s) { 797 if (ToLowerCaseLetter(*p++) != ch) { 798 return std::nullopt; 799 } 800 } 801 p = SkipWhiteSpace(p); 802 if (*p == '"' || *p == '\'') { 803 return {p - start}; 804 } 805 return std::nullopt; 806 } 807 808 void Prescanner::FortranInclude(const char *firstQuote) { 809 const char *p{firstQuote}; 810 while (*p != '"' && *p != '\'') { 811 ++p; 812 } 813 char quote{*p}; 814 std::string path; 815 for (++p; *p != '\n'; ++p) { 816 if (*p == quote) { 817 if (p[1] != quote) { 818 break; 819 } 820 ++p; 821 } 822 path += *p; 823 } 824 if (*p != quote) { 825 Say(GetProvenanceRange(firstQuote, p), 826 "malformed path name string"_err_en_US); 827 return; 828 } 829 p = SkipWhiteSpace(p + 1); 830 if (*p != '\n' && *p != '!') { 831 const char *garbage{p}; 832 for (; *p != '\n' && *p != '!'; ++p) { 833 } 834 Say(GetProvenanceRange(garbage, p), 835 "excess characters after path name"_warn_en_US); 836 } 837 std::string buf; 838 llvm::raw_string_ostream error{buf}; 839 Provenance provenance{GetProvenance(nextLine_)}; 840 std::optional<std::string> prependPath; 841 if (const SourceFile * currentFile{allSources_.GetSourceFile(provenance)}) { 842 prependPath = DirectoryName(currentFile->path()); 843 } 844 const SourceFile *included{ 845 allSources_.Open(path, error, std::move(prependPath))}; 846 if (!included) { 847 Say(provenance, "INCLUDE: %s"_err_en_US, error.str()); 848 } else if (included->bytes() > 0) { 849 ProvenanceRange includeLineRange{ 850 provenance, static_cast<std::size_t>(p - nextLine_)}; 851 ProvenanceRange fileRange{ 852 allSources_.AddIncludedFile(*included, includeLineRange)}; 853 Prescanner{*this}.set_encoding(included->encoding()).Prescan(fileRange); 854 } 855 } 856 857 const char *Prescanner::IsPreprocessorDirectiveLine(const char *start) const { 858 const char *p{start}; 859 for (; *p == ' '; ++p) { 860 } 861 if (*p == '#') { 862 if (inFixedForm_ && p == start + 5) { 863 return nullptr; 864 } 865 } else { 866 p = SkipWhiteSpace(p); 867 if (*p != '#') { 868 return nullptr; 869 } 870 } 871 return SkipWhiteSpace(p + 1); 872 } 873 874 bool Prescanner::IsNextLinePreprocessorDirective() const { 875 return IsPreprocessorDirectiveLine(nextLine_) != nullptr; 876 } 877 878 bool Prescanner::SkipCommentLine(bool afterAmpersand) { 879 if (IsAtEnd()) { 880 if (afterAmpersand && prescannerNesting_ > 0) { 881 // A continuation marker at the end of the last line in an 882 // include file inhibits the newline for that line. 883 SkipToEndOfLine(); 884 omitNewline_ = true; 885 } 886 return false; 887 } 888 auto lineClass{ClassifyLine(nextLine_)}; 889 if (lineClass.kind == LineClassification::Kind::Comment) { 890 NextLine(); 891 return true; 892 } else if (inPreprocessorDirective_) { 893 return false; 894 } else if (lineClass.kind == 895 LineClassification::Kind::ConditionalCompilationDirective || 896 lineClass.kind == LineClassification::Kind::PreprocessorDirective) { 897 // Allow conditional compilation directives (e.g., #ifdef) to affect 898 // continuation lines. 899 // Allow other preprocessor directives, too, except #include 900 // (when it does not follow '&'), #define, and #undef (because 901 // they cannot be allowed to affect preceding text on a 902 // continued line). 903 preprocessor_.Directive(TokenizePreprocessorDirective(), *this); 904 return true; 905 } else if (afterAmpersand && 906 (lineClass.kind == LineClassification::Kind::IncludeDirective || 907 lineClass.kind == LineClassification::Kind::IncludeLine)) { 908 SkipToEndOfLine(); 909 omitNewline_ = true; 910 skipLeadingAmpersand_ = true; 911 return false; 912 } else { 913 return false; 914 } 915 } 916 917 const char *Prescanner::FixedFormContinuationLine(bool mightNeedSpace) { 918 if (IsAtEnd()) { 919 return nullptr; 920 } 921 tabInCurrentLine_ = false; 922 char col1{*nextLine_}; 923 if (InCompilerDirective()) { 924 // Must be a continued compiler directive. 925 if (!IsFixedFormCommentChar(col1)) { 926 return nullptr; 927 } 928 int j{1}; 929 for (; j < 5; ++j) { 930 char ch{directiveSentinel_[j - 1]}; 931 if (ch == '\0') { 932 break; 933 } 934 if (ch != ToLowerCaseLetter(nextLine_[j])) { 935 return nullptr; 936 } 937 } 938 for (; j < 5; ++j) { 939 if (nextLine_[j] != ' ') { 940 return nullptr; 941 } 942 } 943 char col6{nextLine_[5]}; 944 if (col6 != '\n' && col6 != '\t' && col6 != ' ' && col6 != '0') { 945 if (nextLine_[6] != ' ' && mightNeedSpace) { 946 insertASpace_ = true; 947 } 948 return nextLine_ + 6; 949 } 950 return nullptr; 951 } else { 952 // Normal case: not in a compiler directive. 953 if (col1 == '&' && 954 features_.IsEnabled( 955 LanguageFeature::FixedFormContinuationWithColumn1Ampersand)) { 956 // Extension: '&' as continuation marker 957 if (features_.ShouldWarn( 958 LanguageFeature::FixedFormContinuationWithColumn1Ampersand)) { 959 Say(GetProvenance(nextLine_), "nonstandard usage"_port_en_US); 960 } 961 return nextLine_ + 1; 962 } 963 if (col1 == '\t' && nextLine_[1] >= '1' && nextLine_[1] <= '9') { 964 tabInCurrentLine_ = true; 965 return nextLine_ + 2; // VAX extension 966 } 967 if (col1 == ' ' && nextLine_[1] == ' ' && nextLine_[2] == ' ' && 968 nextLine_[3] == ' ' && nextLine_[4] == ' ') { 969 char col6{nextLine_[5]}; 970 if (col6 != '\n' && col6 != '\t' && col6 != ' ' && col6 != '0') { 971 return nextLine_ + 6; 972 } 973 } 974 if (IsImplicitContinuation()) { 975 return nextLine_; 976 } 977 } 978 return nullptr; // not a continuation line 979 } 980 981 const char *Prescanner::FreeFormContinuationLine(bool ampersand) { 982 const char *p{nextLine_}; 983 if (p >= limit_) { 984 return nullptr; 985 } 986 p = SkipWhiteSpace(p); 987 if (InCompilerDirective()) { 988 if (*p++ != '!') { 989 return nullptr; 990 } 991 for (const char *s{directiveSentinel_}; *s != '\0'; ++p, ++s) { 992 if (*s != ToLowerCaseLetter(*p)) { 993 return nullptr; 994 } 995 } 996 p = SkipWhiteSpace(p); 997 if (*p == '&') { 998 if (!ampersand) { 999 insertASpace_ = true; 1000 } 1001 return p + 1; 1002 } else if (ampersand) { 1003 return p; 1004 } else { 1005 return nullptr; 1006 } 1007 } else { 1008 if (*p == '&') { 1009 return p + 1; 1010 } else if (*p == '!' || *p == '\n' || *p == '#') { 1011 return nullptr; 1012 } else if (ampersand || IsImplicitContinuation()) { 1013 if (p > nextLine_) { 1014 --p; 1015 } else { 1016 insertASpace_ = true; 1017 } 1018 return p; 1019 } else { 1020 return nullptr; 1021 } 1022 } 1023 } 1024 1025 bool Prescanner::FixedFormContinuation(bool mightNeedSpace) { 1026 // N.B. We accept '&' as a continuation indicator in fixed form, too, 1027 // but not in a character literal. 1028 if (*at_ == '&' && inCharLiteral_) { 1029 return false; 1030 } 1031 do { 1032 if (const char *cont{FixedFormContinuationLine(mightNeedSpace)}) { 1033 BeginSourceLine(cont); 1034 column_ = 7; 1035 NextLine(); 1036 return true; 1037 } 1038 } while (SkipCommentLine(false /* not after ampersand */)); 1039 return false; 1040 } 1041 1042 bool Prescanner::FreeFormContinuation() { 1043 const char *p{at_}; 1044 bool ampersand{*p == '&'}; 1045 if (ampersand) { 1046 p = SkipWhiteSpace(p + 1); 1047 } 1048 if (*p != '\n') { 1049 if (inCharLiteral_) { 1050 return false; 1051 } else if (*p != '!' && 1052 features_.ShouldWarn(LanguageFeature::CruftAfterAmpersand)) { 1053 Say(GetProvenance(p), "missing ! before comment after &"_warn_en_US); 1054 } 1055 } 1056 do { 1057 if (const char *cont{FreeFormContinuationLine(ampersand)}) { 1058 BeginSourceLine(cont); 1059 NextLine(); 1060 return true; 1061 } 1062 } while (SkipCommentLine(ampersand)); 1063 return false; 1064 } 1065 1066 // Implicit line continuation allows a preprocessor macro call with 1067 // arguments to span multiple lines. 1068 bool Prescanner::IsImplicitContinuation() const { 1069 return !inPreprocessorDirective_ && !inCharLiteral_ && 1070 delimiterNesting_ > 0 && !IsAtEnd() && 1071 ClassifyLine(nextLine_).kind == LineClassification::Kind::Source; 1072 } 1073 1074 bool Prescanner::Continuation(bool mightNeedFixedFormSpace) { 1075 if (*at_ == '\n' || *at_ == '&') { 1076 if (inFixedForm_) { 1077 return FixedFormContinuation(mightNeedFixedFormSpace); 1078 } else { 1079 return FreeFormContinuation(); 1080 } 1081 } else { 1082 return false; 1083 } 1084 } 1085 1086 std::optional<Prescanner::LineClassification> 1087 Prescanner::IsFixedFormCompilerDirectiveLine(const char *start) const { 1088 const char *p{start}; 1089 char col1{*p++}; 1090 if (!IsFixedFormCommentChar(col1)) { 1091 return std::nullopt; 1092 } 1093 char sentinel[5], *sp{sentinel}; 1094 int column{2}; 1095 for (; column < 6; ++column, ++p) { 1096 if (*p != ' ') { 1097 if (*p == '\n' || *p == '\t') { 1098 break; 1099 } 1100 if (sp == sentinel + 1 && sentinel[0] == '$' && IsDecimalDigit(*p)) { 1101 // OpenMP conditional compilation line: leave the label alone 1102 break; 1103 } 1104 *sp++ = ToLowerCaseLetter(*p); 1105 } 1106 } 1107 if (column == 6) { 1108 if (*p == ' ' || *p == '\t' || *p == '0') { 1109 ++p; 1110 } else { 1111 // This is a Continuation line, not an initial directive line. 1112 return std::nullopt; 1113 } 1114 } 1115 if (sp == sentinel) { 1116 return std::nullopt; 1117 } 1118 *sp = '\0'; 1119 if (const char *ss{IsCompilerDirectiveSentinel(sentinel)}) { 1120 std::size_t payloadOffset = p - start; 1121 return {LineClassification{ 1122 LineClassification::Kind::CompilerDirective, payloadOffset, ss}}; 1123 } 1124 return std::nullopt; 1125 } 1126 1127 std::optional<Prescanner::LineClassification> 1128 Prescanner::IsFreeFormCompilerDirectiveLine(const char *start) const { 1129 char sentinel[8]; 1130 const char *p{SkipWhiteSpace(start)}; 1131 if (*p++ != '!') { 1132 return std::nullopt; 1133 } 1134 for (std::size_t j{0}; j + 1 < sizeof sentinel; ++p, ++j) { 1135 if (*p == '\n') { 1136 break; 1137 } 1138 if (*p == ' ' || *p == '\t' || *p == '&') { 1139 if (j == 0) { 1140 break; 1141 } 1142 sentinel[j] = '\0'; 1143 p = SkipWhiteSpace(p + 1); 1144 if (*p == '!') { 1145 break; 1146 } 1147 if (const char *sp{IsCompilerDirectiveSentinel(sentinel)}) { 1148 std::size_t offset = p - start; 1149 return {LineClassification{ 1150 LineClassification::Kind::CompilerDirective, offset, sp}}; 1151 } 1152 break; 1153 } 1154 sentinel[j] = ToLowerCaseLetter(*p); 1155 } 1156 return std::nullopt; 1157 } 1158 1159 Prescanner &Prescanner::AddCompilerDirectiveSentinel(const std::string &dir) { 1160 std::uint64_t packed{0}; 1161 for (char ch : dir) { 1162 packed = (packed << 8) | (ToLowerCaseLetter(ch) & 0xff); 1163 } 1164 compilerDirectiveBloomFilter_.set(packed % prime1); 1165 compilerDirectiveBloomFilter_.set(packed % prime2); 1166 compilerDirectiveSentinels_.insert(dir); 1167 return *this; 1168 } 1169 1170 const char *Prescanner::IsCompilerDirectiveSentinel( 1171 const char *sentinel) const { 1172 std::uint64_t packed{0}; 1173 std::size_t n{0}; 1174 for (; sentinel[n] != '\0'; ++n) { 1175 packed = (packed << 8) | (sentinel[n] & 0xff); 1176 } 1177 if (n == 0 || !compilerDirectiveBloomFilter_.test(packed % prime1) || 1178 !compilerDirectiveBloomFilter_.test(packed % prime2)) { 1179 return nullptr; 1180 } 1181 const auto iter{compilerDirectiveSentinels_.find(std::string(sentinel, n))}; 1182 return iter == compilerDirectiveSentinels_.end() ? nullptr : iter->c_str(); 1183 } 1184 1185 constexpr bool IsDirective(const char *match, const char *dir) { 1186 for (; *match; ++match) { 1187 if (*match != ToLowerCaseLetter(*dir++)) { 1188 return false; 1189 } 1190 } 1191 return true; 1192 } 1193 1194 Prescanner::LineClassification Prescanner::ClassifyLine( 1195 const char *start) const { 1196 if (inFixedForm_) { 1197 if (std::optional<LineClassification> lc{ 1198 IsFixedFormCompilerDirectiveLine(start)}) { 1199 return std::move(*lc); 1200 } 1201 if (IsFixedFormCommentLine(start)) { 1202 return {LineClassification::Kind::Comment}; 1203 } 1204 } else { 1205 if (std::optional<LineClassification> lc{ 1206 IsFreeFormCompilerDirectiveLine(start)}) { 1207 return std::move(*lc); 1208 } 1209 if (const char *bang{IsFreeFormComment(start)}) { 1210 return {LineClassification::Kind::Comment, 1211 static_cast<std::size_t>(bang - start)}; 1212 } 1213 } 1214 if (std::optional<std::size_t> quoteOffset{IsIncludeLine(start)}) { 1215 return {LineClassification::Kind::IncludeLine, *quoteOffset}; 1216 } 1217 if (const char *dir{IsPreprocessorDirectiveLine(start)}) { 1218 if (IsDirective("if", dir) || IsDirective("elif", dir) || 1219 IsDirective("else", dir) || IsDirective("endif", dir)) { 1220 return {LineClassification::Kind::ConditionalCompilationDirective}; 1221 } else if (IsDirective("include", dir)) { 1222 return {LineClassification::Kind::IncludeDirective}; 1223 } else if (IsDirective("define", dir) || IsDirective("undef", dir)) { 1224 return {LineClassification::Kind::DefinitionDirective}; 1225 } else { 1226 return {LineClassification::Kind::PreprocessorDirective}; 1227 } 1228 } 1229 return {LineClassification::Kind::Source}; 1230 } 1231 1232 void Prescanner::SourceFormChange(std::string &&dir) { 1233 if (dir == "!dir$ free") { 1234 inFixedForm_ = false; 1235 } else if (dir == "!dir$ fixed") { 1236 inFixedForm_ = true; 1237 } 1238 } 1239 } // namespace Fortran::parser 1240