1 //===-- lib/Parser/prescan.cpp --------------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "prescan.h" 10 #include "preprocessor.h" 11 #include "token-sequence.h" 12 #include "flang/Common/idioms.h" 13 #include "flang/Parser/characters.h" 14 #include "flang/Parser/message.h" 15 #include "flang/Parser/source.h" 16 #include "llvm/Support/raw_ostream.h" 17 #include <cstddef> 18 #include <cstring> 19 #include <utility> 20 #include <vector> 21 22 namespace Fortran::parser { 23 24 using common::LanguageFeature; 25 26 static constexpr int maxPrescannerNesting{100}; 27 28 Prescanner::Prescanner(Messages &messages, CookedSource &cooked, 29 Preprocessor &preprocessor, common::LanguageFeatureControl lfc) 30 : messages_{messages}, cooked_{cooked}, preprocessor_{preprocessor}, 31 allSources_{preprocessor_.allSources()}, features_{lfc}, 32 encoding_{allSources_.encoding()} {} 33 34 Prescanner::Prescanner(const Prescanner &that) 35 : messages_{that.messages_}, cooked_{that.cooked_}, 36 preprocessor_{that.preprocessor_}, allSources_{that.allSources_}, 37 features_{that.features_}, inFixedForm_{that.inFixedForm_}, 38 fixedFormColumnLimit_{that.fixedFormColumnLimit_}, 39 encoding_{that.encoding_}, prescannerNesting_{that.prescannerNesting_ + 40 1}, 41 skipLeadingAmpersand_{that.skipLeadingAmpersand_}, 42 compilerDirectiveBloomFilter_{that.compilerDirectiveBloomFilter_}, 43 compilerDirectiveSentinels_{that.compilerDirectiveSentinels_} {} 44 45 static inline constexpr bool IsFixedFormCommentChar(char ch) { 46 return ch == '!' || ch == '*' || ch == 'C' || ch == 'c'; 47 } 48 49 static void NormalizeCompilerDirectiveCommentMarker(TokenSequence &dir) { 50 char *p{dir.GetMutableCharData()}; 51 char *limit{p + dir.SizeInChars()}; 52 for (; p < limit; ++p) { 53 if (*p != ' ') { 54 CHECK(IsFixedFormCommentChar(*p)); 55 *p = '!'; 56 return; 57 } 58 } 59 DIE("compiler directive all blank"); 60 } 61 62 void Prescanner::Prescan(ProvenanceRange range) { 63 startProvenance_ = range.start(); 64 start_ = allSources_.GetSource(range); 65 CHECK(start_); 66 limit_ = start_ + range.size(); 67 nextLine_ = start_; 68 const bool beganInFixedForm{inFixedForm_}; 69 if (prescannerNesting_ > maxPrescannerNesting) { 70 Say(GetProvenance(start_), 71 "too many nested INCLUDE/#include files, possibly circular"_err_en_US); 72 return; 73 } 74 while (!IsAtEnd()) { 75 Statement(); 76 } 77 if (inFixedForm_ != beganInFixedForm) { 78 std::string dir{"!dir$ "}; 79 if (beganInFixedForm) { 80 dir += "fixed"; 81 } else { 82 dir += "free"; 83 } 84 dir += '\n'; 85 TokenSequence tokens{dir, allSources_.AddCompilerInsertion(dir).start()}; 86 tokens.Emit(cooked_); 87 } 88 } 89 90 void Prescanner::Statement() { 91 TokenSequence tokens; 92 LineClassification line{ClassifyLine(nextLine_)}; 93 switch (line.kind) { 94 case LineClassification::Kind::Comment: 95 nextLine_ += line.payloadOffset; // advance to '!' or newline 96 NextLine(); 97 return; 98 case LineClassification::Kind::IncludeLine: 99 FortranInclude(nextLine_ + line.payloadOffset); 100 NextLine(); 101 return; 102 case LineClassification::Kind::ConditionalCompilationDirective: 103 case LineClassification::Kind::IncludeDirective: 104 case LineClassification::Kind::DefinitionDirective: 105 case LineClassification::Kind::PreprocessorDirective: 106 preprocessor_.Directive(TokenizePreprocessorDirective(), *this); 107 return; 108 case LineClassification::Kind::CompilerDirective: 109 directiveSentinel_ = line.sentinel; 110 CHECK(InCompilerDirective()); 111 BeginStatementAndAdvance(); 112 if (inFixedForm_) { 113 CHECK(IsFixedFormCommentChar(*at_)); 114 } else { 115 while (*at_ == ' ' || *at_ == '\t') { 116 ++at_, ++column_; 117 } 118 CHECK(*at_ == '!'); 119 } 120 if (directiveSentinel_[0] == '$' && directiveSentinel_[1] == '\0') { 121 // OpenMP conditional compilation line. Remove the sentinel and then 122 // treat the line as if it were normal source. 123 at_ += 2, column_ += 2; 124 if (inFixedForm_) { 125 LabelField(tokens); 126 } else { 127 SkipSpaces(); 128 } 129 } else { 130 // Compiler directive. Emit normalized sentinel. 131 EmitChar(tokens, '!'); 132 ++at_, ++column_; 133 for (const char *sp{directiveSentinel_}; *sp != '\0'; 134 ++sp, ++at_, ++column_) { 135 EmitChar(tokens, *sp); 136 } 137 if (*at_ == ' ') { 138 EmitChar(tokens, ' '); 139 ++at_, ++column_; 140 } 141 tokens.CloseToken(); 142 } 143 break; 144 case LineClassification::Kind::Source: 145 BeginStatementAndAdvance(); 146 if (inFixedForm_) { 147 LabelField(tokens); 148 } else if (skipLeadingAmpersand_) { 149 skipLeadingAmpersand_ = false; 150 const char *p{SkipWhiteSpace(at_)}; 151 if (p < limit_ && *p == '&') { 152 column_ += ++p - at_; 153 at_ = p; 154 } 155 } else { 156 SkipSpaces(); 157 } 158 break; 159 } 160 161 while (NextToken(tokens)) { 162 } 163 164 Provenance newlineProvenance{GetCurrentProvenance()}; 165 if (std::optional<TokenSequence> preprocessed{ 166 preprocessor_.MacroReplacement(tokens, *this)}) { 167 // Reprocess the preprocessed line. Append a newline temporarily. 168 preprocessed->PutNextTokenChar('\n', newlineProvenance); 169 preprocessed->CloseToken(); 170 const char *ppd{preprocessed->ToCharBlock().begin()}; 171 LineClassification ppl{ClassifyLine(ppd)}; 172 preprocessed->pop_back(); // remove the newline 173 switch (ppl.kind) { 174 case LineClassification::Kind::Comment: 175 break; 176 case LineClassification::Kind::IncludeLine: 177 FortranInclude(ppd + ppl.payloadOffset); 178 break; 179 case LineClassification::Kind::ConditionalCompilationDirective: 180 case LineClassification::Kind::IncludeDirective: 181 case LineClassification::Kind::DefinitionDirective: 182 case LineClassification::Kind::PreprocessorDirective: 183 Say(preprocessed->GetProvenanceRange(), 184 "Preprocessed line resembles a preprocessor directive"_en_US); 185 preprocessed->ToLowerCase() 186 .CheckBadFortranCharacters(messages_) 187 .CheckBadParentheses(messages_) 188 .Emit(cooked_); 189 break; 190 case LineClassification::Kind::CompilerDirective: 191 if (preprocessed->HasRedundantBlanks()) { 192 preprocessed->RemoveRedundantBlanks(); 193 } 194 NormalizeCompilerDirectiveCommentMarker(*preprocessed); 195 preprocessed->ToLowerCase(); 196 SourceFormChange(preprocessed->ToString()); 197 preprocessed->ClipComment(true /* skip first ! */) 198 .CheckBadFortranCharacters(messages_) 199 .CheckBadParentheses(messages_) 200 .Emit(cooked_); 201 break; 202 case LineClassification::Kind::Source: 203 if (inFixedForm_) { 204 if (preprocessed->HasBlanks(/*after column*/ 6)) { 205 preprocessed->RemoveBlanks(/*after column*/ 6); 206 } 207 } else { 208 if (preprocessed->HasRedundantBlanks()) { 209 preprocessed->RemoveRedundantBlanks(); 210 } 211 } 212 preprocessed->ToLowerCase() 213 .ClipComment() 214 .CheckBadFortranCharacters(messages_) 215 .CheckBadParentheses(messages_) 216 .Emit(cooked_); 217 break; 218 } 219 } else { 220 tokens.ToLowerCase(); 221 if (line.kind == LineClassification::Kind::CompilerDirective) { 222 SourceFormChange(tokens.ToString()); 223 } 224 if (inFixedForm_ && line.kind == LineClassification::Kind::Source) { 225 EnforceStupidEndStatementRules(tokens); 226 } 227 tokens.CheckBadFortranCharacters(messages_) 228 .CheckBadParentheses(messages_) 229 .Emit(cooked_); 230 } 231 if (omitNewline_) { 232 omitNewline_ = false; 233 } else { 234 cooked_.Put('\n', newlineProvenance); 235 } 236 directiveSentinel_ = nullptr; 237 } 238 239 TokenSequence Prescanner::TokenizePreprocessorDirective() { 240 CHECK(!IsAtEnd() && !inPreprocessorDirective_); 241 inPreprocessorDirective_ = true; 242 BeginStatementAndAdvance(); 243 TokenSequence tokens; 244 while (NextToken(tokens)) { 245 } 246 inPreprocessorDirective_ = false; 247 return tokens; 248 } 249 250 void Prescanner::NextLine() { 251 void *vstart{static_cast<void *>(const_cast<char *>(nextLine_))}; 252 void *v{std::memchr(vstart, '\n', limit_ - nextLine_)}; 253 if (!v) { 254 nextLine_ = limit_; 255 } else { 256 const char *nl{const_cast<const char *>(static_cast<char *>(v))}; 257 nextLine_ = nl + 1; 258 } 259 } 260 261 void Prescanner::LabelField(TokenSequence &token) { 262 const char *bad{nullptr}; 263 int outCol{1}; 264 const char *start{at_}; 265 for (; *at_ != '\n' && column_ <= 6; ++at_) { 266 if (*at_ == '\t') { 267 ++at_; 268 column_ = 7; 269 break; 270 } 271 if (*at_ != ' ' && 272 !(*at_ == '0' && column_ == 6)) { // '0' in column 6 becomes space 273 EmitChar(token, *at_); 274 ++outCol; 275 if (!bad && !IsDecimalDigit(*at_)) { 276 bad = at_; 277 } 278 } 279 ++column_; 280 } 281 if (bad && !preprocessor_.IsNameDefined(token.CurrentOpenToken())) { 282 Say(GetProvenance(bad), 283 "Character in fixed-form label field must be a digit"_en_US); 284 token.clear(); 285 at_ = start; 286 return; 287 } 288 if (outCol == 1) { // empty label field 289 // Emit a space so that, if the line is rescanned after preprocessing, 290 // a leading 'C' or 'D' won't be left-justified and then accidentally 291 // misinterpreted as a comment card. 292 EmitChar(token, ' '); 293 ++outCol; 294 } 295 token.CloseToken(); 296 SkipToNextSignificantCharacter(); 297 if (IsDecimalDigit(*at_)) { 298 Say(GetProvenance(at_), 299 "Label digit is not in fixed-form label field"_en_US); 300 } 301 } 302 303 // 6.3.3.5: A program unit END statement, or any other statement whose 304 // initial line resembles an END statement, shall not be continued in 305 // fixed form source. 306 void Prescanner::EnforceStupidEndStatementRules(const TokenSequence &tokens) { 307 CharBlock cBlock{tokens.ToCharBlock()}; 308 const char *str{cBlock.begin()}; 309 std::size_t n{cBlock.size()}; 310 if (n < 3) { 311 return; 312 } 313 std::size_t j{0}; 314 for (; j < n && (str[j] == ' ' || (str[j] >= '0' && str[j] <= '9')); ++j) { 315 } 316 if (j + 3 > n || std::memcmp(str + j, "end", 3) != 0) { 317 return; 318 } 319 // It starts with END, possibly after a label. 320 auto start{allSources_.GetSourcePosition(tokens.GetCharProvenance(j))}; 321 auto end{allSources_.GetSourcePosition(tokens.GetCharProvenance(n - 1))}; 322 if (!start || !end) { 323 return; 324 } 325 if (&start->file == &end->file && start->line == end->line) { 326 return; // no continuation 327 } 328 j += 3; 329 static const char *const prefixes[]{"program", "subroutine", "function", 330 "blockdata", "module", "submodule", nullptr}; 331 bool isPrefix{j == n || !IsLegalInIdentifier(str[j])}; // prefix is END 332 std::size_t endOfPrefix{j - 1}; 333 for (const char *const *p{prefixes}; *p; ++p) { 334 std::size_t pLen{std::strlen(*p)}; 335 if (j + pLen <= n && std::memcmp(str + j, *p, pLen) == 0) { 336 isPrefix = true; // END thing as prefix 337 j += pLen; 338 endOfPrefix = j - 1; 339 for (; j < n && IsLegalInIdentifier(str[j]); ++j) { 340 } 341 break; 342 } 343 } 344 if (isPrefix) { 345 auto range{tokens.GetTokenProvenanceRange(1)}; 346 if (j == n) { // END or END thing [name] 347 Say(range, 348 "Program unit END statement may not be continued in fixed form source"_err_en_US); 349 } else { 350 auto endOfPrefixPos{ 351 allSources_.GetSourcePosition(tokens.GetCharProvenance(endOfPrefix))}; 352 auto next{allSources_.GetSourcePosition(tokens.GetCharProvenance(j))}; 353 if (endOfPrefixPos && next && &endOfPrefixPos->file == &start->file && 354 endOfPrefixPos->line == start->line && 355 (&next->file != &start->file || next->line != start->line)) { 356 Say(range, 357 "Initial line of continued statement must not appear to be a program unit END in fixed form source"_err_en_US); 358 } 359 } 360 } 361 } 362 363 void Prescanner::SkipToEndOfLine() { 364 while (*at_ != '\n') { 365 ++at_, ++column_; 366 } 367 } 368 369 bool Prescanner::MustSkipToEndOfLine() const { 370 if (inFixedForm_ && column_ > fixedFormColumnLimit_ && !tabInCurrentLine_) { 371 return true; // skip over ignored columns in right margin (73:80) 372 } else if (*at_ == '!' && !inCharLiteral_) { 373 return true; // inline comment goes to end of source line 374 } else { 375 return false; 376 } 377 } 378 379 void Prescanner::NextChar() { 380 CHECK(*at_ != '\n'); 381 ++at_, ++column_; 382 while (at_[0] == '\xef' && at_[1] == '\xbb' && at_[2] == '\xbf') { 383 // UTF-8 byte order mark - treat this file as UTF-8 384 at_ += 3; 385 encoding_ = Encoding::UTF_8; 386 } 387 SkipToNextSignificantCharacter(); 388 } 389 390 // Skip everything that should be ignored until the next significant 391 // character is reached; handles C-style comments in preprocessing 392 // directives, Fortran ! comments, stuff after the right margin in 393 // fixed form, and all forms of line continuation. 394 void Prescanner::SkipToNextSignificantCharacter() { 395 if (inPreprocessorDirective_) { 396 SkipCComments(); 397 } else { 398 bool mightNeedSpace{false}; 399 if (MustSkipToEndOfLine()) { 400 SkipToEndOfLine(); 401 } else { 402 mightNeedSpace = *at_ == '\n'; 403 } 404 for (; Continuation(mightNeedSpace); mightNeedSpace = false) { 405 if (MustSkipToEndOfLine()) { 406 SkipToEndOfLine(); 407 } 408 } 409 if (*at_ == '\t') { 410 tabInCurrentLine_ = true; 411 } 412 } 413 } 414 415 void Prescanner::SkipCComments() { 416 while (true) { 417 if (IsCComment(at_)) { 418 if (const char *after{SkipCComment(at_)}) { 419 column_ += after - at_; 420 // May have skipped over one or more newlines; relocate the start of 421 // the next line. 422 nextLine_ = at_ = after; 423 NextLine(); 424 } else { 425 // Don't emit any messages about unclosed C-style comments, because 426 // the sequence /* can appear legally in a FORMAT statement. There's 427 // no ambiguity, since the sequence */ cannot appear legally. 428 break; 429 } 430 } else if (inPreprocessorDirective_ && at_[0] == '\\' && at_ + 2 < limit_ && 431 at_[1] == '\n' && !IsAtEnd()) { 432 BeginSourceLineAndAdvance(); 433 } else { 434 break; 435 } 436 } 437 } 438 439 void Prescanner::SkipSpaces() { 440 while (*at_ == ' ' || *at_ == '\t') { 441 NextChar(); 442 } 443 insertASpace_ = false; 444 } 445 446 const char *Prescanner::SkipWhiteSpace(const char *p) { 447 while (*p == ' ' || *p == '\t') { 448 ++p; 449 } 450 return p; 451 } 452 453 const char *Prescanner::SkipWhiteSpaceAndCComments(const char *p) const { 454 while (true) { 455 if (*p == ' ' || *p == '\t') { 456 ++p; 457 } else if (IsCComment(p)) { 458 if (const char *after{SkipCComment(p)}) { 459 p = after; 460 } else { 461 break; 462 } 463 } else { 464 break; 465 } 466 } 467 return p; 468 } 469 470 const char *Prescanner::SkipCComment(const char *p) const { 471 char star{' '}, slash{' '}; 472 p += 2; 473 while (star != '*' || slash != '/') { 474 if (p >= limit_) { 475 return nullptr; // signifies an unterminated comment 476 } 477 star = slash; 478 slash = *p++; 479 } 480 return p; 481 } 482 483 bool Prescanner::NextToken(TokenSequence &tokens) { 484 CHECK(at_ >= start_ && at_ < limit_); 485 if (InFixedFormSource()) { 486 SkipSpaces(); 487 } else { 488 if (*at_ == '/' && IsCComment(at_)) { 489 // Recognize and skip over classic C style /*comments*/ when 490 // outside a character literal. 491 if (features_.ShouldWarn(LanguageFeature::ClassicCComments)) { 492 Say(GetProvenance(at_), "nonstandard usage: C-style comment"_en_US); 493 } 494 SkipCComments(); 495 } 496 if (*at_ == ' ' || *at_ == '\t') { 497 // Compress free-form white space into a single space character. 498 const auto theSpace{at_}; 499 char previous{at_ <= start_ ? ' ' : at_[-1]}; 500 NextChar(); 501 SkipSpaces(); 502 if (*at_ == '\n') { 503 // Discard white space at the end of a line. 504 } else if (!inPreprocessorDirective_ && 505 (previous == '(' || *at_ == '(' || *at_ == ')')) { 506 // Discard white space before/after '(' and before ')', unless in a 507 // preprocessor directive. This helps yield space-free contiguous 508 // names for generic interfaces like OPERATOR( + ) and 509 // READ ( UNFORMATTED ), without misinterpreting #define f (notAnArg). 510 // This has the effect of silently ignoring the illegal spaces in 511 // the array constructor ( /1,2/ ) but that seems benign; it's 512 // hard to avoid that while still removing spaces from OPERATOR( / ) 513 // and OPERATOR( // ). 514 } else { 515 // Preserve the squashed white space as a single space character. 516 tokens.PutNextTokenChar(' ', GetProvenance(theSpace)); 517 tokens.CloseToken(); 518 return true; 519 } 520 } 521 } 522 if (insertASpace_) { 523 tokens.PutNextTokenChar(' ', spaceProvenance_); 524 insertASpace_ = false; 525 } 526 if (*at_ == '\n') { 527 return false; 528 } 529 const char *start{at_}; 530 if (*at_ == '\'' || *at_ == '"') { 531 QuotedCharacterLiteral(tokens, start); 532 preventHollerith_ = false; 533 } else if (IsDecimalDigit(*at_)) { 534 int n{0}, digits{0}; 535 static constexpr int maxHollerith{256 /*lines*/ * (132 - 6 /*columns*/)}; 536 do { 537 if (n < maxHollerith) { 538 n = 10 * n + DecimalDigitValue(*at_); 539 } 540 EmitCharAndAdvance(tokens, *at_); 541 ++digits; 542 if (InFixedFormSource()) { 543 SkipSpaces(); 544 } 545 } while (IsDecimalDigit(*at_)); 546 if ((*at_ == 'h' || *at_ == 'H') && n > 0 && n < maxHollerith && 547 !preventHollerith_) { 548 Hollerith(tokens, n, start); 549 } else if (*at_ == '.') { 550 while (IsDecimalDigit(EmitCharAndAdvance(tokens, *at_))) { 551 } 552 ExponentAndKind(tokens); 553 } else if (ExponentAndKind(tokens)) { 554 } else if (digits == 1 && n == 0 && (*at_ == 'x' || *at_ == 'X') && 555 inPreprocessorDirective_) { 556 do { 557 EmitCharAndAdvance(tokens, *at_); 558 } while (IsHexadecimalDigit(*at_)); 559 } else if (IsLetter(*at_)) { 560 // Handles FORMAT(3I9HHOLLERITH) by skipping over the first I so that 561 // we don't misrecognize I9HOLLERITH as an identifier in the next case. 562 EmitCharAndAdvance(tokens, *at_); 563 } else if (at_[0] == '_' && (at_[1] == '\'' || at_[1] == '"')) { // 4_"..." 564 EmitCharAndAdvance(tokens, *at_); 565 QuotedCharacterLiteral(tokens, start); 566 } 567 preventHollerith_ = false; 568 } else if (*at_ == '.') { 569 char nch{EmitCharAndAdvance(tokens, '.')}; 570 if (!inPreprocessorDirective_ && IsDecimalDigit(nch)) { 571 while (IsDecimalDigit(EmitCharAndAdvance(tokens, *at_))) { 572 } 573 ExponentAndKind(tokens); 574 } else if (nch == '.' && EmitCharAndAdvance(tokens, '.') == '.') { 575 EmitCharAndAdvance(tokens, '.'); // variadic macro definition ellipsis 576 } 577 preventHollerith_ = false; 578 } else if (IsLegalInIdentifier(*at_)) { 579 do { 580 } while (IsLegalInIdentifier(EmitCharAndAdvance(tokens, *at_))); 581 if ((*at_ == '\'' || *at_ == '"') && 582 tokens.CharAt(tokens.SizeInChars() - 1) == '_') { // kind_"..." 583 QuotedCharacterLiteral(tokens, start); 584 } 585 preventHollerith_ = false; 586 } else if (*at_ == '*') { 587 if (EmitCharAndAdvance(tokens, '*') == '*') { 588 EmitCharAndAdvance(tokens, '*'); 589 } else { 590 // Subtle ambiguity: 591 // CHARACTER*2H declares H because *2 is a kind specifier 592 // DATAC/N*2H / is repeated Hollerith 593 preventHollerith_ = !slashInCurrentStatement_; 594 } 595 } else { 596 char ch{*at_}; 597 if (ch == '(' || ch == '[') { 598 ++delimiterNesting_; 599 } else if ((ch == ')' || ch == ']') && delimiterNesting_ > 0) { 600 --delimiterNesting_; 601 } 602 char nch{EmitCharAndAdvance(tokens, ch)}; 603 preventHollerith_ = false; 604 if ((nch == '=' && 605 (ch == '<' || ch == '>' || ch == '/' || ch == '=' || ch == '!')) || 606 (ch == nch && 607 (ch == '/' || ch == ':' || ch == '*' || ch == '#' || ch == '&' || 608 ch == '|' || ch == '<' || ch == '>')) || 609 (ch == '=' && nch == '>')) { 610 // token comprises two characters 611 EmitCharAndAdvance(tokens, nch); 612 } else if (ch == '/') { 613 slashInCurrentStatement_ = true; 614 } 615 } 616 tokens.CloseToken(); 617 return true; 618 } 619 620 bool Prescanner::ExponentAndKind(TokenSequence &tokens) { 621 char ed{ToLowerCaseLetter(*at_)}; 622 if (ed != 'e' && ed != 'd') { 623 return false; 624 } 625 EmitCharAndAdvance(tokens, ed); 626 if (*at_ == '+' || *at_ == '-') { 627 EmitCharAndAdvance(tokens, *at_); 628 } 629 while (IsDecimalDigit(*at_)) { 630 EmitCharAndAdvance(tokens, *at_); 631 } 632 if (*at_ == '_') { 633 while (IsLegalInIdentifier(EmitCharAndAdvance(tokens, *at_))) { 634 } 635 } 636 return true; 637 } 638 639 void Prescanner::QuotedCharacterLiteral( 640 TokenSequence &tokens, const char *start) { 641 char quote{*at_}; 642 const char *end{at_ + 1}; 643 inCharLiteral_ = true; 644 const auto emit{[&](char ch) { EmitChar(tokens, ch); }}; 645 const auto insert{[&](char ch) { EmitInsertedChar(tokens, ch); }}; 646 bool isEscaped{false}; 647 bool escapesEnabled{features_.IsEnabled(LanguageFeature::BackslashEscapes)}; 648 while (true) { 649 if (*at_ == '\\') { 650 if (escapesEnabled) { 651 isEscaped = !isEscaped; 652 } else { 653 // The parser always processes escape sequences, so don't confuse it 654 // when escapes are disabled. 655 insert('\\'); 656 } 657 } else { 658 isEscaped = false; 659 } 660 EmitQuotedChar(static_cast<unsigned char>(*at_), emit, insert, false, 661 Encoding::LATIN_1); 662 while (PadOutCharacterLiteral(tokens)) { 663 } 664 if (*at_ == '\n') { 665 if (!inPreprocessorDirective_) { 666 Say(GetProvenanceRange(start, end), 667 "Incomplete character literal"_err_en_US); 668 } 669 break; 670 } 671 end = at_ + 1; 672 NextChar(); 673 if (*at_ == quote && !isEscaped) { 674 // A doubled unescaped quote mark becomes a single instance of that 675 // quote character in the literal (later). There can be spaces between 676 // the quotes in fixed form source. 677 EmitChar(tokens, quote); 678 inCharLiteral_ = false; // for cases like print *, '...'!comment 679 NextChar(); 680 if (InFixedFormSource()) { 681 SkipSpaces(); 682 } 683 if (*at_ != quote) { 684 break; 685 } 686 inCharLiteral_ = true; 687 } 688 } 689 inCharLiteral_ = false; 690 } 691 692 void Prescanner::Hollerith( 693 TokenSequence &tokens, int count, const char *start) { 694 inCharLiteral_ = true; 695 CHECK(*at_ == 'h' || *at_ == 'H'); 696 EmitChar(tokens, 'H'); 697 while (count-- > 0) { 698 if (PadOutCharacterLiteral(tokens)) { 699 } else if (*at_ == '\n') { 700 Say(GetProvenanceRange(start, at_), 701 "Possible truncated Hollerith literal"_en_US); 702 break; 703 } else { 704 NextChar(); 705 // Each multi-byte character encoding counts as a single character. 706 // No escape sequences are recognized. 707 // Hollerith is always emitted to the cooked character 708 // stream in UTF-8. 709 DecodedCharacter decoded{DecodeCharacter( 710 encoding_, at_, static_cast<std::size_t>(limit_ - at_), false)}; 711 if (decoded.bytes > 0) { 712 EncodedCharacter utf8{ 713 EncodeCharacter<Encoding::UTF_8>(decoded.codepoint)}; 714 for (int j{0}; j < utf8.bytes; ++j) { 715 EmitChar(tokens, utf8.buffer[j]); 716 } 717 at_ += decoded.bytes - 1; 718 } else { 719 Say(GetProvenanceRange(start, at_), 720 "Bad character in Hollerith literal"_err_en_US); 721 break; 722 } 723 } 724 } 725 if (*at_ != '\n') { 726 NextChar(); 727 } 728 inCharLiteral_ = false; 729 } 730 731 // In fixed form, source card images must be processed as if they were at 732 // least 72 columns wide, at least in character literal contexts. 733 bool Prescanner::PadOutCharacterLiteral(TokenSequence &tokens) { 734 while (inFixedForm_ && !tabInCurrentLine_ && at_[1] == '\n') { 735 if (column_ < fixedFormColumnLimit_) { 736 tokens.PutNextTokenChar(' ', spaceProvenance_); 737 ++column_; 738 return true; 739 } 740 if (!FixedFormContinuation(false /*no need to insert space*/) || 741 tabInCurrentLine_) { 742 return false; 743 } 744 CHECK(column_ == 7); 745 --at_; // point to column 6 of continuation line 746 column_ = 6; 747 } 748 return false; 749 } 750 751 bool Prescanner::IsFixedFormCommentLine(const char *start) const { 752 const char *p{start}; 753 if (IsFixedFormCommentChar(*p) || *p == '%' || // VAX %list, %eject, &c. 754 ((*p == 'D' || *p == 'd') && 755 !features_.IsEnabled(LanguageFeature::OldDebugLines))) { 756 return true; 757 } 758 bool anyTabs{false}; 759 while (true) { 760 if (*p == ' ') { 761 ++p; 762 } else if (*p == '\t') { 763 anyTabs = true; 764 ++p; 765 } else if (*p == '0' && !anyTabs && p == start + 5) { 766 ++p; // 0 in column 6 must treated as a space 767 } else { 768 break; 769 } 770 } 771 if (!anyTabs && p >= start + fixedFormColumnLimit_) { 772 return true; 773 } 774 if (*p == '!' && !inCharLiteral_ && (anyTabs || p != start + 5)) { 775 return true; 776 } 777 return *p == '\n'; 778 } 779 780 const char *Prescanner::IsFreeFormComment(const char *p) const { 781 p = SkipWhiteSpaceAndCComments(p); 782 if (*p == '!' || *p == '\n') { 783 return p; 784 } else { 785 return nullptr; 786 } 787 } 788 789 std::optional<std::size_t> Prescanner::IsIncludeLine(const char *start) const { 790 const char *p{SkipWhiteSpace(start)}; 791 for (char ch : "include"s) { 792 if (ToLowerCaseLetter(*p++) != ch) { 793 return std::nullopt; 794 } 795 } 796 p = SkipWhiteSpace(p); 797 if (*p == '"' || *p == '\'') { 798 return {p - start}; 799 } 800 return std::nullopt; 801 } 802 803 void Prescanner::FortranInclude(const char *firstQuote) { 804 const char *p{firstQuote}; 805 while (*p != '"' && *p != '\'') { 806 ++p; 807 } 808 char quote{*p}; 809 std::string path; 810 for (++p; *p != '\n'; ++p) { 811 if (*p == quote) { 812 if (p[1] != quote) { 813 break; 814 } 815 ++p; 816 } 817 path += *p; 818 } 819 if (*p != quote) { 820 Say(GetProvenanceRange(firstQuote, p), 821 "malformed path name string"_err_en_US); 822 return; 823 } 824 p = SkipWhiteSpace(p + 1); 825 if (*p != '\n' && *p != '!') { 826 const char *garbage{p}; 827 for (; *p != '\n' && *p != '!'; ++p) { 828 } 829 Say(GetProvenanceRange(garbage, p), 830 "excess characters after path name"_en_US); 831 } 832 std::string buf; 833 llvm::raw_string_ostream error{buf}; 834 Provenance provenance{GetProvenance(nextLine_)}; 835 std::optional<std::string> prependPath; 836 if (const SourceFile * currentFile{allSources_.GetSourceFile(provenance)}) { 837 prependPath = DirectoryName(currentFile->path()); 838 } 839 const SourceFile *included{ 840 allSources_.Open(path, error, std::move(prependPath))}; 841 if (!included) { 842 Say(provenance, "INCLUDE: %s"_err_en_US, error.str()); 843 } else if (included->bytes() > 0) { 844 ProvenanceRange includeLineRange{ 845 provenance, static_cast<std::size_t>(p - nextLine_)}; 846 ProvenanceRange fileRange{ 847 allSources_.AddIncludedFile(*included, includeLineRange)}; 848 Prescanner{*this}.set_encoding(included->encoding()).Prescan(fileRange); 849 } 850 } 851 852 const char *Prescanner::IsPreprocessorDirectiveLine(const char *start) const { 853 const char *p{start}; 854 for (; *p == ' '; ++p) { 855 } 856 if (*p == '#') { 857 if (inFixedForm_ && p == start + 5) { 858 return nullptr; 859 } 860 } else { 861 p = SkipWhiteSpace(p); 862 if (*p != '#') { 863 return nullptr; 864 } 865 } 866 return SkipWhiteSpace(p + 1); 867 } 868 869 bool Prescanner::IsNextLinePreprocessorDirective() const { 870 return IsPreprocessorDirectiveLine(nextLine_) != nullptr; 871 } 872 873 bool Prescanner::SkipCommentLine(bool afterAmpersand) { 874 if (IsAtEnd()) { 875 if (afterAmpersand && prescannerNesting_ > 0) { 876 // A continuation marker at the end of the last line in an 877 // include file inhibits the newline for that line. 878 SkipToEndOfLine(); 879 omitNewline_ = true; 880 } 881 return false; 882 } 883 auto lineClass{ClassifyLine(nextLine_)}; 884 if (lineClass.kind == LineClassification::Kind::Comment) { 885 NextLine(); 886 return true; 887 } else if (inPreprocessorDirective_) { 888 return false; 889 } else if (lineClass.kind == 890 LineClassification::Kind::ConditionalCompilationDirective || 891 lineClass.kind == LineClassification::Kind::PreprocessorDirective) { 892 // Allow conditional compilation directives (e.g., #ifdef) to affect 893 // continuation lines. 894 // Allow other preprocessor directives, too, except #include 895 // (when it does not follow '&'), #define, and #undef (because 896 // they cannot be allowed to affect preceding text on a 897 // continued line). 898 preprocessor_.Directive(TokenizePreprocessorDirective(), *this); 899 return true; 900 } else if (afterAmpersand && 901 (lineClass.kind == LineClassification::Kind::IncludeDirective || 902 lineClass.kind == LineClassification::Kind::IncludeLine)) { 903 SkipToEndOfLine(); 904 omitNewline_ = true; 905 skipLeadingAmpersand_ = true; 906 return false; 907 } else { 908 return false; 909 } 910 } 911 912 const char *Prescanner::FixedFormContinuationLine(bool mightNeedSpace) { 913 if (IsAtEnd()) { 914 return nullptr; 915 } 916 tabInCurrentLine_ = false; 917 char col1{*nextLine_}; 918 if (InCompilerDirective()) { 919 // Must be a continued compiler directive. 920 if (!IsFixedFormCommentChar(col1)) { 921 return nullptr; 922 } 923 int j{1}; 924 for (; j < 5; ++j) { 925 char ch{directiveSentinel_[j - 1]}; 926 if (ch == '\0') { 927 break; 928 } 929 if (ch != ToLowerCaseLetter(nextLine_[j])) { 930 return nullptr; 931 } 932 } 933 for (; j < 5; ++j) { 934 if (nextLine_[j] != ' ') { 935 return nullptr; 936 } 937 } 938 char col6{nextLine_[5]}; 939 if (col6 != '\n' && col6 != '\t' && col6 != ' ' && col6 != '0') { 940 if (nextLine_[6] != ' ' && mightNeedSpace) { 941 insertASpace_ = true; 942 } 943 return nextLine_ + 6; 944 } 945 return nullptr; 946 } else { 947 // Normal case: not in a compiler directive. 948 if (col1 == '&' && 949 features_.IsEnabled( 950 LanguageFeature::FixedFormContinuationWithColumn1Ampersand)) { 951 // Extension: '&' as continuation marker 952 if (features_.ShouldWarn( 953 LanguageFeature::FixedFormContinuationWithColumn1Ampersand)) { 954 Say(GetProvenance(nextLine_), "nonstandard usage"_en_US); 955 } 956 return nextLine_ + 1; 957 } 958 if (col1 == '\t' && nextLine_[1] >= '1' && nextLine_[1] <= '9') { 959 tabInCurrentLine_ = true; 960 return nextLine_ + 2; // VAX extension 961 } 962 if (col1 == ' ' && nextLine_[1] == ' ' && nextLine_[2] == ' ' && 963 nextLine_[3] == ' ' && nextLine_[4] == ' ') { 964 char col6{nextLine_[5]}; 965 if (col6 != '\n' && col6 != '\t' && col6 != ' ' && col6 != '0') { 966 return nextLine_ + 6; 967 } 968 } 969 if (IsImplicitContinuation()) { 970 return nextLine_; 971 } 972 } 973 return nullptr; // not a continuation line 974 } 975 976 const char *Prescanner::FreeFormContinuationLine(bool ampersand) { 977 const char *p{nextLine_}; 978 if (p >= limit_) { 979 return nullptr; 980 } 981 p = SkipWhiteSpace(p); 982 if (InCompilerDirective()) { 983 if (*p++ != '!') { 984 return nullptr; 985 } 986 for (const char *s{directiveSentinel_}; *s != '\0'; ++p, ++s) { 987 if (*s != ToLowerCaseLetter(*p)) { 988 return nullptr; 989 } 990 } 991 p = SkipWhiteSpace(p); 992 if (*p == '&') { 993 if (!ampersand) { 994 insertASpace_ = true; 995 } 996 return p + 1; 997 } else if (ampersand) { 998 return p; 999 } else { 1000 return nullptr; 1001 } 1002 } else { 1003 if (*p == '&') { 1004 return p + 1; 1005 } else if (*p == '!' || *p == '\n' || *p == '#') { 1006 return nullptr; 1007 } else if (ampersand || IsImplicitContinuation()) { 1008 if (p > nextLine_) { 1009 --p; 1010 } else { 1011 insertASpace_ = true; 1012 } 1013 return p; 1014 } else { 1015 return nullptr; 1016 } 1017 } 1018 } 1019 1020 bool Prescanner::FixedFormContinuation(bool mightNeedSpace) { 1021 // N.B. We accept '&' as a continuation indicator in fixed form, too, 1022 // but not in a character literal. 1023 if (*at_ == '&' && inCharLiteral_) { 1024 return false; 1025 } 1026 do { 1027 if (const char *cont{FixedFormContinuationLine(mightNeedSpace)}) { 1028 BeginSourceLine(cont); 1029 column_ = 7; 1030 NextLine(); 1031 return true; 1032 } 1033 } while (SkipCommentLine(false /* not after ampersand */)); 1034 return false; 1035 } 1036 1037 bool Prescanner::FreeFormContinuation() { 1038 const char *p{at_}; 1039 bool ampersand{*p == '&'}; 1040 if (ampersand) { 1041 p = SkipWhiteSpace(p + 1); 1042 } 1043 if (*p != '\n') { 1044 if (inCharLiteral_) { 1045 return false; 1046 } else if (*p != '!' && 1047 features_.ShouldWarn(LanguageFeature::CruftAfterAmpersand)) { 1048 Say(GetProvenance(p), "missing ! before comment after &"_en_US); 1049 } 1050 } 1051 do { 1052 if (const char *cont{FreeFormContinuationLine(ampersand)}) { 1053 BeginSourceLine(cont); 1054 NextLine(); 1055 return true; 1056 } 1057 } while (SkipCommentLine(ampersand)); 1058 return false; 1059 } 1060 1061 // Implicit line continuation allows a preprocessor macro call with 1062 // arguments to span multiple lines. 1063 bool Prescanner::IsImplicitContinuation() const { 1064 return !inPreprocessorDirective_ && !inCharLiteral_ && 1065 delimiterNesting_ > 0 && !IsAtEnd() && 1066 ClassifyLine(nextLine_).kind == LineClassification::Kind::Source; 1067 } 1068 1069 bool Prescanner::Continuation(bool mightNeedFixedFormSpace) { 1070 if (*at_ == '\n' || *at_ == '&') { 1071 if (inFixedForm_) { 1072 return FixedFormContinuation(mightNeedFixedFormSpace); 1073 } else { 1074 return FreeFormContinuation(); 1075 } 1076 } else { 1077 return false; 1078 } 1079 } 1080 1081 std::optional<Prescanner::LineClassification> 1082 Prescanner::IsFixedFormCompilerDirectiveLine(const char *start) const { 1083 const char *p{start}; 1084 char col1{*p++}; 1085 if (!IsFixedFormCommentChar(col1)) { 1086 return std::nullopt; 1087 } 1088 char sentinel[5], *sp{sentinel}; 1089 int column{2}; 1090 for (; column < 6; ++column, ++p) { 1091 if (*p != ' ') { 1092 if (*p == '\n' || *p == '\t') { 1093 break; 1094 } 1095 if (sp == sentinel + 1 && sentinel[0] == '$' && IsDecimalDigit(*p)) { 1096 // OpenMP conditional compilation line: leave the label alone 1097 break; 1098 } 1099 *sp++ = ToLowerCaseLetter(*p); 1100 } 1101 } 1102 if (column == 6) { 1103 if (*p == ' ' || *p == '\t' || *p == '0') { 1104 ++p; 1105 } else { 1106 // This is a Continuation line, not an initial directive line. 1107 return std::nullopt; 1108 } 1109 } 1110 if (sp == sentinel) { 1111 return std::nullopt; 1112 } 1113 *sp = '\0'; 1114 if (const char *ss{IsCompilerDirectiveSentinel(sentinel)}) { 1115 std::size_t payloadOffset = p - start; 1116 return {LineClassification{ 1117 LineClassification::Kind::CompilerDirective, payloadOffset, ss}}; 1118 } 1119 return std::nullopt; 1120 } 1121 1122 std::optional<Prescanner::LineClassification> 1123 Prescanner::IsFreeFormCompilerDirectiveLine(const char *start) const { 1124 char sentinel[8]; 1125 const char *p{SkipWhiteSpace(start)}; 1126 if (*p++ != '!') { 1127 return std::nullopt; 1128 } 1129 for (std::size_t j{0}; j + 1 < sizeof sentinel; ++p, ++j) { 1130 if (*p == '\n') { 1131 break; 1132 } 1133 if (*p == ' ' || *p == '\t' || *p == '&') { 1134 if (j == 0) { 1135 break; 1136 } 1137 sentinel[j] = '\0'; 1138 p = SkipWhiteSpace(p + 1); 1139 if (*p == '!') { 1140 break; 1141 } 1142 if (const char *sp{IsCompilerDirectiveSentinel(sentinel)}) { 1143 std::size_t offset = p - start; 1144 return {LineClassification{ 1145 LineClassification::Kind::CompilerDirective, offset, sp}}; 1146 } 1147 break; 1148 } 1149 sentinel[j] = ToLowerCaseLetter(*p); 1150 } 1151 return std::nullopt; 1152 } 1153 1154 Prescanner &Prescanner::AddCompilerDirectiveSentinel(const std::string &dir) { 1155 std::uint64_t packed{0}; 1156 for (char ch : dir) { 1157 packed = (packed << 8) | (ToLowerCaseLetter(ch) & 0xff); 1158 } 1159 compilerDirectiveBloomFilter_.set(packed % prime1); 1160 compilerDirectiveBloomFilter_.set(packed % prime2); 1161 compilerDirectiveSentinels_.insert(dir); 1162 return *this; 1163 } 1164 1165 const char *Prescanner::IsCompilerDirectiveSentinel( 1166 const char *sentinel) const { 1167 std::uint64_t packed{0}; 1168 std::size_t n{0}; 1169 for (; sentinel[n] != '\0'; ++n) { 1170 packed = (packed << 8) | (sentinel[n] & 0xff); 1171 } 1172 if (n == 0 || !compilerDirectiveBloomFilter_.test(packed % prime1) || 1173 !compilerDirectiveBloomFilter_.test(packed % prime2)) { 1174 return nullptr; 1175 } 1176 const auto iter{compilerDirectiveSentinels_.find(std::string(sentinel, n))}; 1177 return iter == compilerDirectiveSentinels_.end() ? nullptr : iter->c_str(); 1178 } 1179 1180 constexpr bool IsDirective(const char *match, const char *dir) { 1181 for (; *match; ++match) { 1182 if (*match != ToLowerCaseLetter(*dir++)) { 1183 return false; 1184 } 1185 } 1186 return true; 1187 } 1188 1189 Prescanner::LineClassification Prescanner::ClassifyLine( 1190 const char *start) const { 1191 if (inFixedForm_) { 1192 if (std::optional<LineClassification> lc{ 1193 IsFixedFormCompilerDirectiveLine(start)}) { 1194 return std::move(*lc); 1195 } 1196 if (IsFixedFormCommentLine(start)) { 1197 return {LineClassification::Kind::Comment}; 1198 } 1199 } else { 1200 if (std::optional<LineClassification> lc{ 1201 IsFreeFormCompilerDirectiveLine(start)}) { 1202 return std::move(*lc); 1203 } 1204 if (const char *bang{IsFreeFormComment(start)}) { 1205 return {LineClassification::Kind::Comment, 1206 static_cast<std::size_t>(bang - start)}; 1207 } 1208 } 1209 if (std::optional<std::size_t> quoteOffset{IsIncludeLine(start)}) { 1210 return {LineClassification::Kind::IncludeLine, *quoteOffset}; 1211 } 1212 if (const char *dir{IsPreprocessorDirectiveLine(start)}) { 1213 if (IsDirective("if", dir) || IsDirective("elif", dir) || 1214 IsDirective("else", dir) || IsDirective("endif", dir)) { 1215 return {LineClassification::Kind::ConditionalCompilationDirective}; 1216 } else if (IsDirective("include", dir)) { 1217 return {LineClassification::Kind::IncludeDirective}; 1218 } else if (IsDirective("define", dir) || IsDirective("undef", dir)) { 1219 return {LineClassification::Kind::DefinitionDirective}; 1220 } else { 1221 return {LineClassification::Kind::PreprocessorDirective}; 1222 } 1223 } 1224 return {LineClassification::Kind::Source}; 1225 } 1226 1227 void Prescanner::SourceFormChange(std::string &&dir) { 1228 if (dir == "!dir$ free") { 1229 inFixedForm_ = false; 1230 } else if (dir == "!dir$ fixed") { 1231 inFixedForm_ = true; 1232 } 1233 } 1234 } // namespace Fortran::parser 1235