1 //===-- lib/Parser/prescan.cpp --------------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "prescan.h" 10 #include "preprocessor.h" 11 #include "token-sequence.h" 12 #include "flang/Common/idioms.h" 13 #include "flang/Parser/characters.h" 14 #include "flang/Parser/message.h" 15 #include "flang/Parser/source.h" 16 #include "llvm/Support/raw_ostream.h" 17 #include <cstddef> 18 #include <cstring> 19 #include <utility> 20 #include <vector> 21 22 namespace Fortran::parser { 23 24 using common::LanguageFeature; 25 26 static constexpr int maxPrescannerNesting{100}; 27 28 Prescanner::Prescanner(Messages &messages, CookedSource &cooked, 29 Preprocessor &preprocessor, common::LanguageFeatureControl lfc) 30 : messages_{messages}, cooked_{cooked}, preprocessor_{preprocessor}, 31 allSources_{preprocessor_.allSources()}, features_{lfc}, 32 encoding_{allSources_.encoding()} {} 33 34 Prescanner::Prescanner(const Prescanner &that) 35 : messages_{that.messages_}, cooked_{that.cooked_}, 36 preprocessor_{that.preprocessor_}, allSources_{that.allSources_}, 37 features_{that.features_}, inFixedForm_{that.inFixedForm_}, 38 fixedFormColumnLimit_{that.fixedFormColumnLimit_}, 39 encoding_{that.encoding_}, prescannerNesting_{that.prescannerNesting_ + 40 1}, 41 skipLeadingAmpersand_{that.skipLeadingAmpersand_}, 42 compilerDirectiveBloomFilter_{that.compilerDirectiveBloomFilter_}, 43 compilerDirectiveSentinels_{that.compilerDirectiveSentinels_} {} 44 45 static inline constexpr bool IsFixedFormCommentChar(char ch) { 46 return ch == '!' 
|| ch == '*' || ch == 'C' || ch == 'c'; 47 } 48 49 static void NormalizeCompilerDirectiveCommentMarker(TokenSequence &dir) { 50 char *p{dir.GetMutableCharData()}; 51 char *limit{p + dir.SizeInChars()}; 52 for (; p < limit; ++p) { 53 if (*p != ' ') { 54 CHECK(IsFixedFormCommentChar(*p)); 55 *p = '!'; 56 return; 57 } 58 } 59 DIE("compiler directive all blank"); 60 } 61 62 void Prescanner::Prescan(ProvenanceRange range) { 63 startProvenance_ = range.start(); 64 start_ = allSources_.GetSource(range); 65 CHECK(start_); 66 limit_ = start_ + range.size(); 67 nextLine_ = start_; 68 const bool beganInFixedForm{inFixedForm_}; 69 if (prescannerNesting_ > maxPrescannerNesting) { 70 Say(GetProvenance(start_), 71 "too many nested INCLUDE/#include files, possibly circular"_err_en_US); 72 return; 73 } 74 while (!IsAtEnd()) { 75 Statement(); 76 } 77 if (inFixedForm_ != beganInFixedForm) { 78 std::string dir{"!dir$ "}; 79 if (beganInFixedForm) { 80 dir += "fixed"; 81 } else { 82 dir += "free"; 83 } 84 dir += '\n'; 85 TokenSequence tokens{dir, allSources_.AddCompilerInsertion(dir).start()}; 86 tokens.Emit(cooked_); 87 } 88 } 89 90 void Prescanner::Statement() { 91 TokenSequence tokens; 92 LineClassification line{ClassifyLine(nextLine_)}; 93 switch (line.kind) { 94 case LineClassification::Kind::Comment: 95 nextLine_ += line.payloadOffset; // advance to '!' 
or newline 96 NextLine(); 97 return; 98 case LineClassification::Kind::IncludeLine: 99 FortranInclude(nextLine_ + line.payloadOffset); 100 NextLine(); 101 return; 102 case LineClassification::Kind::ConditionalCompilationDirective: 103 case LineClassification::Kind::IncludeDirective: 104 case LineClassification::Kind::DefinitionDirective: 105 case LineClassification::Kind::PreprocessorDirective: 106 preprocessor_.Directive(TokenizePreprocessorDirective(), *this); 107 return; 108 case LineClassification::Kind::CompilerDirective: 109 directiveSentinel_ = line.sentinel; 110 CHECK(InCompilerDirective()); 111 BeginStatementAndAdvance(); 112 if (inFixedForm_) { 113 CHECK(IsFixedFormCommentChar(*at_)); 114 } else { 115 while (*at_ == ' ' || *at_ == '\t') { 116 ++at_, ++column_; 117 } 118 CHECK(*at_ == '!'); 119 } 120 if (directiveSentinel_[0] == '$' && directiveSentinel_[1] == '\0') { 121 // OpenMP conditional compilation line. Remove the sentinel and then 122 // treat the line as if it were normal source. 123 at_ += 2, column_ += 2; 124 if (inFixedForm_) { 125 LabelField(tokens); 126 } else { 127 SkipSpaces(); 128 } 129 } else { 130 // Compiler directive. Emit normalized sentinel. 
131 EmitChar(tokens, '!'); 132 ++at_, ++column_; 133 for (const char *sp{directiveSentinel_}; *sp != '\0'; 134 ++sp, ++at_, ++column_) { 135 EmitChar(tokens, *sp); 136 } 137 if (*at_ == ' ') { 138 EmitChar(tokens, ' '); 139 ++at_, ++column_; 140 } 141 tokens.CloseToken(); 142 } 143 break; 144 case LineClassification::Kind::Source: 145 BeginStatementAndAdvance(); 146 if (inFixedForm_) { 147 LabelField(tokens); 148 } else if (skipLeadingAmpersand_) { 149 skipLeadingAmpersand_ = false; 150 const char *p{SkipWhiteSpace(at_)}; 151 if (p < limit_ && *p == '&') { 152 column_ += ++p - at_; 153 at_ = p; 154 } 155 } else { 156 SkipSpaces(); 157 } 158 break; 159 } 160 161 while (NextToken(tokens)) { 162 } 163 164 Provenance newlineProvenance{GetCurrentProvenance()}; 165 if (std::optional<TokenSequence> preprocessed{ 166 preprocessor_.MacroReplacement(tokens, *this)}) { 167 // Reprocess the preprocessed line. Append a newline temporarily. 168 preprocessed->PutNextTokenChar('\n', newlineProvenance); 169 preprocessed->CloseToken(); 170 const char *ppd{preprocessed->ToCharBlock().begin()}; 171 LineClassification ppl{ClassifyLine(ppd)}; 172 preprocessed->pop_back(); // remove the newline 173 switch (ppl.kind) { 174 case LineClassification::Kind::Comment: 175 break; 176 case LineClassification::Kind::IncludeLine: 177 FortranInclude(ppd + ppl.payloadOffset); 178 break; 179 case LineClassification::Kind::ConditionalCompilationDirective: 180 case LineClassification::Kind::IncludeDirective: 181 case LineClassification::Kind::DefinitionDirective: 182 case LineClassification::Kind::PreprocessorDirective: 183 Say(preprocessed->GetProvenanceRange(), 184 "Preprocessed line resembles a preprocessor directive"_warn_en_US); 185 preprocessed->ToLowerCase() 186 .CheckBadFortranCharacters(messages_) 187 .CheckBadParentheses(messages_) 188 .Emit(cooked_); 189 break; 190 case LineClassification::Kind::CompilerDirective: 191 if (preprocessed->HasRedundantBlanks()) { 192 
preprocessed->RemoveRedundantBlanks(); 193 } 194 NormalizeCompilerDirectiveCommentMarker(*preprocessed); 195 preprocessed->ToLowerCase(); 196 SourceFormChange(preprocessed->ToString()); 197 preprocessed->ClipComment(true /* skip first ! */) 198 .CheckBadFortranCharacters(messages_) 199 .CheckBadParentheses(messages_) 200 .Emit(cooked_); 201 break; 202 case LineClassification::Kind::Source: 203 if (inFixedForm_) { 204 if (preprocessed->HasBlanks(/*after column*/ 6)) { 205 preprocessed->RemoveBlanks(/*after column*/ 6); 206 } 207 } else { 208 if (preprocessed->HasRedundantBlanks()) { 209 preprocessed->RemoveRedundantBlanks(); 210 } 211 } 212 preprocessed->ToLowerCase() 213 .ClipComment() 214 .CheckBadFortranCharacters(messages_) 215 .CheckBadParentheses(messages_) 216 .Emit(cooked_); 217 break; 218 } 219 } else { 220 tokens.ToLowerCase(); 221 if (line.kind == LineClassification::Kind::CompilerDirective) { 222 SourceFormChange(tokens.ToString()); 223 } 224 if (inFixedForm_ && line.kind == LineClassification::Kind::Source) { 225 EnforceStupidEndStatementRules(tokens); 226 } 227 tokens.CheckBadFortranCharacters(messages_) 228 .CheckBadParentheses(messages_) 229 .Emit(cooked_); 230 } 231 if (omitNewline_) { 232 omitNewline_ = false; 233 } else { 234 cooked_.Put('\n', newlineProvenance); 235 } 236 directiveSentinel_ = nullptr; 237 } 238 239 TokenSequence Prescanner::TokenizePreprocessorDirective() { 240 CHECK(!IsAtEnd() && !inPreprocessorDirective_); 241 inPreprocessorDirective_ = true; 242 BeginStatementAndAdvance(); 243 TokenSequence tokens; 244 while (NextToken(tokens)) { 245 } 246 inPreprocessorDirective_ = false; 247 return tokens; 248 } 249 250 void Prescanner::NextLine() { 251 void *vstart{static_cast<void *>(const_cast<char *>(nextLine_))}; 252 void *v{std::memchr(vstart, '\n', limit_ - nextLine_)}; 253 if (!v) { 254 nextLine_ = limit_; 255 } else { 256 const char *nl{const_cast<const char *>(static_cast<char *>(v))}; 257 nextLine_ = nl + 1; 258 } 259 } 260 261 
// Scans the label field (columns 1-6) of a fixed-form line into `token`.
// Blanks, and a '0' in column 6, are not emitted; a tab ends the field and
// sets the column to 7.  If a non-digit character is found and the field's
// text is not the name of a defined macro, a warning is issued, the token is
// cleared, and scanning restarts from the beginning of the line.
void Prescanner::LabelField(TokenSequence &token) {
  const char *bad{nullptr}; // first non-digit character emitted, if any
  int outCol{1}; // 1 + number of characters emitted so far
  const char *start{at_};
  for (; *at_ != '\n' && column_ <= 6; ++at_) {
    if (*at_ == '\t') {
      ++at_;
      column_ = 7;
      break;
    }
    if (*at_ != ' ' &&
        !(*at_ == '0' && column_ == 6)) { // '0' in column 6 becomes space
      EmitChar(token, *at_);
      ++outCol;
      if (!bad && !IsDecimalDigit(*at_)) {
        bad = at_;
      }
    }
    ++column_;
  }
  if (bad && !preprocessor_.IsNameDefined(token.CurrentOpenToken())) {
    Say(GetProvenance(bad),
        "Character in fixed-form label field must be a digit"_warn_en_US);
    token.clear();
    at_ = start;
    return;
  }
  if (outCol == 1) { // empty label field
    // Emit a space so that, if the line is rescanned after preprocessing,
    // a leading 'C' or 'D' won't be left-justified and then accidentally
    // misinterpreted as a comment card.
    EmitChar(token, ' ');
    ++outCol;
  }
  token.CloseToken();
  SkipToNextSignificantCharacter();
  if (IsDecimalDigit(*at_)) {
    Say(GetProvenance(at_),
        "Label digit is not in fixed-form label field"_port_en_US);
  }
}

// 6.3.3.5: A program unit END statement, or any other statement whose
// initial line resembles an END statement, shall not be continued in
// fixed form source.
//
// `tokens` is expected to hold a lower-cased statement (the comparison below
// is against lower-case keywords).  Source positions of individual characters
// are compared to discover whether the statement spans more than one line.
void Prescanner::EnforceStupidEndStatementRules(const TokenSequence &tokens) {
  CharBlock cBlock{tokens.ToCharBlock()};
  const char *str{cBlock.begin()};
  std::size_t n{cBlock.size()};
  if (n < 3) {
    return;
  }
  // Skip any leading blanks and label digits.
  std::size_t j{0};
  for (; j < n && (str[j] == ' ' || (str[j] >= '0' && str[j] <= '9')); ++j) {
  }
  if (j + 3 > n || std::memcmp(str + j, "end", 3) != 0) {
    return;
  }
  // It starts with END, possibly after a label.
  auto start{allSources_.GetSourcePosition(tokens.GetCharProvenance(j))};
  auto end{allSources_.GetSourcePosition(tokens.GetCharProvenance(n - 1))};
  if (!start || !end) {
    return;
  }
  if (&start->file == &end->file && start->line == end->line) {
    return; // no continuation
  }
  j += 3;
  static const char *const prefixes[]{"program", "subroutine", "function",
      "blockdata", "module", "submodule", nullptr};
  bool isPrefix{j == n || !IsLegalInIdentifier(str[j])}; // prefix is END
  std::size_t endOfPrefix{j - 1};
  for (const char *const *p{prefixes}; *p; ++p) {
    std::size_t pLen{std::strlen(*p)};
    if (j + pLen <= n && std::memcmp(str + j, *p, pLen) == 0) {
      isPrefix = true; // END thing as prefix
      j += pLen;
      endOfPrefix = j - 1;
      // Skip over the optional program unit name.
      for (; j < n && IsLegalInIdentifier(str[j]); ++j) {
      }
      break;
    }
  }
  if (isPrefix) {
    auto range{tokens.GetTokenProvenanceRange(1)};
    if (j == n) { // END or END thing [name]
      Say(range,
          "Program unit END statement may not be continued in fixed form source"_err_en_US);
    } else {
      // Something follows the END prefix.  Complain only if the prefix lies
      // entirely on the initial line and the following text does not.
      auto endOfPrefixPos{
          allSources_.GetSourcePosition(tokens.GetCharProvenance(endOfPrefix))};
      auto next{allSources_.GetSourcePosition(tokens.GetCharProvenance(j))};
      if (endOfPrefixPos && next && &endOfPrefixPos->file == &start->file &&
          endOfPrefixPos->line == start->line &&
          (&next->file != &start->file || next->line != start->line)) {
        Say(range,
            "Initial line of continued statement must not appear to be a program unit END in fixed form source"_err_en_US);
      }
    }
  }
}

// Advances at_ and column_ to the newline that ends the current line.
void Prescanner::SkipToEndOfLine() {
  while (*at_ != '\n') {
    ++at_, ++column_;
  }
}

// True when the remainder of the current line must be ignored: either the
// fixed-form column limit has been passed on a line without tabs, or at_ is
// at a '!' outside a character literal.
bool Prescanner::MustSkipToEndOfLine() const {
  if (inFixedForm_ && column_ > fixedFormColumnLimit_ && !tabInCurrentLine_) {
    return true; // skip over ignored columns in right margin (73:80)
  } else if (*at_ == '!' && !inCharLiteral_) {
    return true; // inline comment goes to end of source line
  } else {
    return false;
  }
}

// Advances past the current character, which must not be a newline, and then
// on to the next significant character.
void Prescanner::NextChar() {
  CHECK(*at_ != '\n');
  ++at_, ++column_;
  while (at_[0] == '\xef' && at_[1] == '\xbb' && at_[2] == '\xbf') {
    // UTF-8 byte order mark - treat this file as UTF-8
    at_ += 3;
    encoding_ = Encoding::UTF_8;
  }
  SkipToNextSignificantCharacter();
}

// Skip everything that should be ignored until the next significant
// character is reached; handles C-style comments in preprocessing
// directives, Fortran ! comments, stuff after the right margin in
// fixed form, and all forms of line continuation.
void Prescanner::SkipToNextSignificantCharacter() {
  if (inPreprocessorDirective_) {
    SkipCComments();
  } else {
    bool mightNeedSpace{false};
    if (MustSkipToEndOfLine()) {
      SkipToEndOfLine();
    } else {
      mightNeedSpace = *at_ == '\n';
    }
    // Follow as many continuation lines as apply; only the first attempt
    // may request an inserted space.
    for (; Continuation(mightNeedSpace); mightNeedSpace = false) {
      if (MustSkipToEndOfLine()) {
        SkipToEndOfLine();
      }
    }
    if (*at_ == '\t') {
      tabInCurrentLine_ = true;
    }
  }
}

// Skips any sequence of C-style /* comments */ at at_ and, within a
// preprocessor directive, backslash-newline line splices.
void Prescanner::SkipCComments() {
  while (true) {
    if (IsCComment(at_)) {
      if (const char *after{SkipCComment(at_)}) {
        column_ += after - at_;
        // May have skipped over one or more newlines; relocate the start of
        // the next line.
        nextLine_ = at_ = after;
        NextLine();
      } else {
        // Don't emit any messages about unclosed C-style comments, because
        // the sequence /* can appear legally in a FORMAT statement.  There's
        // no ambiguity, since the sequence */ cannot appear legally.
        break;
      }
    } else if (inPreprocessorDirective_ && at_[0] == '\\' && at_ + 2 < limit_ &&
        at_[1] == '\n' && !IsAtEnd()) {
      BeginSourceLineAndAdvance();
    } else {
      break;
    }
  }
}

// Skips blanks and tabs at at_ (via NextChar(), so comments and continuation
// lines are followed too) and cancels any pending inserted space.
void Prescanner::SkipSpaces() {
  while (*at_ == ' ' || *at_ == '\t') {
    NextChar();
  }
  insertASpace_ = false;
}

// Returns the first position at or after `p` that holds neither a blank nor
// a tab.  Purely a pointer scan; no prescanner state is changed.
const char *Prescanner::SkipWhiteSpace(const char *p) {
  while (*p == ' ' || *p == '\t') {
    ++p;
  }
  return p;
}

// Like SkipWhiteSpace(), but also skips over complete C-style comments.
// Stops in front of an unterminated comment.
const char *Prescanner::SkipWhiteSpaceAndCComments(const char *p) const {
  while (true) {
    if (*p == ' ' || *p == '\t') {
      ++p;
    } else if (IsCComment(p)) {
      if (const char *after{SkipCComment(p)}) {
        p = after;
      } else {
        break;
      }
    } else {
      break;
    }
  }
  return p;
}

// Given `p` at the two-character opening of a C-style comment, returns the
// position just past the closing "*/", or nullptr if limit_ is reached
// first.
const char *Prescanner::SkipCComment(const char *p) const {
  char star{' '}, slash{' '}; // the two most recently scanned characters
  p += 2;
  while (star != '*' || slash != '/') {
    if (p >= limit_) {
      return nullptr; // signifies an unterminated comment
    }
    star = slash;
    slash = *p++;
  }
  return p;
}

// Scans the next token of the current statement or directive and appends it
// to `tokens`.  Returns false, without closing a token, when at_ is at the
// newline that ends the statement; returns true otherwise.
bool Prescanner::NextToken(TokenSequence &tokens) {
  CHECK(at_ >= start_ && at_ < limit_);
  if (InFixedFormSource()) {
    SkipSpaces();
  } else {
    if (*at_ == '/' && IsCComment(at_)) {
      // Recognize and skip over classic C style /*comments*/ when
      // outside a character literal.
      if (features_.ShouldWarn(LanguageFeature::ClassicCComments)) {
        Say(GetProvenance(at_),
            "nonstandard usage: C-style comment"_port_en_US);
      }
      SkipCComments();
    }
    if (*at_ == ' ' || *at_ == '\t') {
      // Compress free-form white space into a single space character.
      const auto theSpace{at_};
      char previous{at_ <= start_ ? ' ' : at_[-1]};
      NextChar();
      SkipSpaces();
      if (*at_ == '\n') {
        // Discard white space at the end of a line.
      } else if (!inPreprocessorDirective_ &&
          (previous == '(' || *at_ == '(' || *at_ == ')')) {
        // Discard white space before/after '(' and before ')', unless in a
        // preprocessor directive.  This helps yield space-free contiguous
        // names for generic interfaces like OPERATOR( + ) and
        // READ ( UNFORMATTED ), without misinterpreting #define f (notAnArg).
        // This has the effect of silently ignoring the illegal spaces in
        // the array constructor ( /1,2/ ) but that seems benign; it's
        // hard to avoid that while still removing spaces from OPERATOR( / )
        // and OPERATOR( // ).
      } else {
        // Preserve the squashed white space as a single space character.
        tokens.PutNextTokenChar(' ', GetProvenance(theSpace));
        tokens.CloseToken();
        return true;
      }
    }
  }
  if (insertASpace_) {
    // A continuation requested a space in front of the next token.
    tokens.PutNextTokenChar(' ', spaceProvenance_);
    insertASpace_ = false;
  }
  if (*at_ == '\n') {
    return false;
  }
  const char *start{at_};
  if (*at_ == '\'' || *at_ == '"') {
    QuotedCharacterLiteral(tokens, start);
    preventHollerith_ = false;
  } else if (IsDecimalDigit(*at_)) {
    // A digit string; `n` is its value, which stops accumulating once it
    // reaches maxHollerith.
    int n{0}, digits{0};
    static constexpr int maxHollerith{256 /*lines*/ * (132 - 6 /*columns*/)};
    do {
      if (n < maxHollerith) {
        n = 10 * n + DecimalDigitValue(*at_);
      }
      EmitCharAndAdvance(tokens, *at_);
      ++digits;
      if (InFixedFormSource()) {
        SkipSpaces();
      }
    } while (IsDecimalDigit(*at_));
    if ((*at_ == 'h' || *at_ == 'H') && n > 0 && n < maxHollerith &&
        !preventHollerith_) {
      Hollerith(tokens, n, start);
    } else if (*at_ == '.') {
      while (IsDecimalDigit(EmitCharAndAdvance(tokens, *at_))) {
      }
      ExponentAndKind(tokens);
    } else if (ExponentAndKind(tokens)) {
    } else if (digits == 1 && n == 0 && (*at_ == 'x' || *at_ == 'X') &&
        inPreprocessorDirective_) {
      // 0x... hexadecimal constant in a preprocessor directive
      do {
        EmitCharAndAdvance(tokens, *at_);
      } while (IsHexadecimalDigit(*at_));
    } else if (IsLetter(*at_)) {
      // Handles FORMAT(3I9HHOLLERITH) by skipping over the first I so that
      // we don't misrecognize I9HOLLERITH as an identifier in the next case.
      EmitCharAndAdvance(tokens, *at_);
    } else if (at_[0] == '_' && (at_[1] == '\'' || at_[1] == '"')) { // 4_"..."
      EmitCharAndAdvance(tokens, *at_);
      QuotedCharacterLiteral(tokens, start);
    }
    preventHollerith_ = false;
  } else if (*at_ == '.') {
    char nch{EmitCharAndAdvance(tokens, '.')};
    if (!inPreprocessorDirective_ && IsDecimalDigit(nch)) {
      // real literal constant with a leading decimal point
      while (IsDecimalDigit(EmitCharAndAdvance(tokens, *at_))) {
      }
      ExponentAndKind(tokens);
    } else if (nch == '.' && EmitCharAndAdvance(tokens, '.') == '.') {
      EmitCharAndAdvance(tokens, '.'); // variadic macro definition ellipsis
    }
    preventHollerith_ = false;
  } else if (IsLegalInIdentifier(*at_)) {
    do {
    } while (IsLegalInIdentifier(EmitCharAndAdvance(tokens, *at_)));
    if ((*at_ == '\'' || *at_ == '"') &&
        tokens.CharAt(tokens.SizeInChars() - 1) == '_') { // kind_"..."
      QuotedCharacterLiteral(tokens, start);
    }
    preventHollerith_ = false;
  } else if (*at_ == '*') {
    if (EmitCharAndAdvance(tokens, '*') == '*') {
      EmitCharAndAdvance(tokens, '*');
    } else {
      // Subtle ambiguity:
      //  CHARACTER*2H     declares H because *2 is a kind specifier
      //  DATAC/N*2H  /    is repeated Hollerith
      preventHollerith_ = !slashInCurrentStatement_;
    }
  } else {
    // Any other character: punctuation or an operator.
    char ch{*at_};
    if (ch == '(' || ch == '[') {
      ++delimiterNesting_;
    } else if ((ch == ')' || ch == ']') && delimiterNesting_ > 0) {
      --delimiterNesting_;
    }
    char nch{EmitCharAndAdvance(tokens, ch)};
    preventHollerith_ = false;
    if ((nch == '=' &&
            (ch == '<' || ch == '>' || ch == '/' || ch == '=' || ch == '!')) ||
        (ch == nch &&
            (ch == '/' || ch == ':' || ch == '*' || ch == '#' || ch == '&' ||
                ch == '|' || ch == '<' || ch == '>')) ||
        (ch == '=' && nch == '>')) {
      // token comprises two characters
      EmitCharAndAdvance(tokens, nch);
    } else if (ch == '/') {
      slashInCurrentStatement_ = true;
    }
  }
  tokens.CloseToken();
  return true;
}

// If at_ is at an exponent letter (E or D, either case), emits it in lower
// case followed by an optional sign, any digits, and an optional
// "_kind" suffix, and returns true.  Otherwise emits nothing and returns
// false.
bool Prescanner::ExponentAndKind(TokenSequence &tokens) {
  char ed{ToLowerCaseLetter(*at_)};
  if (ed != 'e' && ed != 'd') {
    return false;
  }
  EmitCharAndAdvance(tokens, ed);
  if (*at_ == '+' || *at_ == '-') {
    EmitCharAndAdvance(tokens, *at_);
  }
  while (IsDecimalDigit(*at_)) {
    EmitCharAndAdvance(tokens, *at_);
  }
  if (*at_ == '_') {
    while (IsLegalInIdentifier(EmitCharAndAdvance(tokens, *at_))) {
    }
  }
  return true;
}

// Scans a quoted character literal whose opening quote mark is at at_ and
// emits it into the current token.  `start` is the first character of the
// token (which may precede the quote when there is a kind prefix) and is
// used only for the "Incomplete character literal" error message.
void Prescanner::QuotedCharacterLiteral(
    TokenSequence &tokens, const char *start) {
  char quote{*at_};
  const char *end{at_ + 1};
  inCharLiteral_ = true;
  const auto emit{[&](char ch) { EmitChar(tokens, ch); }};
  const auto insert{[&](char ch) { EmitInsertedChar(tokens, ch); }};
  bool isEscaped{false};
  bool escapesEnabled{features_.IsEnabled(LanguageFeature::BackslashEscapes)};
  while (true) {
    if (*at_ == '\\') {
      if (escapesEnabled) {
        isEscaped = !isEscaped;
      } else {
        // The parser always processes escape sequences, so don't confuse it
        // when escapes are disabled.
        insert('\\');
      }
    } else {
      isEscaped = false;
    }
    EmitQuotedChar(static_cast<unsigned char>(*at_), emit, insert, false,
        Encoding::LATIN_1);
    while (PadOutCharacterLiteral(tokens)) {
    }
    if (*at_ == '\n') {
      if (!inPreprocessorDirective_) {
        Say(GetProvenanceRange(start, end),
            "Incomplete character literal"_err_en_US);
      }
      break;
    }
    end = at_ + 1;
    NextChar();
    if (*at_ == quote && !isEscaped) {
      // A doubled unescaped quote mark becomes a single instance of that
      // quote character in the literal (later).  There can be spaces between
      // the quotes in fixed form source.
      EmitChar(tokens, quote);
      inCharLiteral_ = false; // for cases like print *, '...'!comment
      NextChar();
      if (InFixedFormSource()) {
        SkipSpaces();
      }
      if (*at_ != quote) {
        break;
      }
      inCharLiteral_ = true;
    }
  }
  inCharLiteral_ = false;
}

// Scans a Hollerith literal of `count` characters.  at_ must be at the 'H'
// that follows the count; `start` is the first character of the token and is
// used only for diagnostics.
void Prescanner::Hollerith(
    TokenSequence &tokens, int count, const char *start) {
  inCharLiteral_ = true;
  CHECK(*at_ == 'h' || *at_ == 'H');
  EmitChar(tokens, 'H');
  while (count-- > 0) {
    if (PadOutCharacterLiteral(tokens)) {
      // a padding blank was emitted and counts as one character
    } else if (*at_ == '\n') {
      Say(GetProvenanceRange(start, at_),
          "Possible truncated Hollerith literal"_warn_en_US);
      break;
    } else {
      NextChar();
      // Each multi-byte character encoding counts as a single character.
      // No escape sequences are recognized.
      // Hollerith is always emitted to the cooked character
      // stream in UTF-8.
      DecodedCharacter decoded{DecodeCharacter(
          encoding_, at_, static_cast<std::size_t>(limit_ - at_), false)};
      if (decoded.bytes > 0) {
        EncodedCharacter utf8{
            EncodeCharacter<Encoding::UTF_8>(decoded.codepoint)};
        for (int j{0}; j < utf8.bytes; ++j) {
          EmitChar(tokens, utf8.buffer[j]);
        }
        // Leave at_ on the last byte of the character just consumed.
        at_ += decoded.bytes - 1;
      } else {
        Say(GetProvenanceRange(start, at_),
            "Bad character in Hollerith literal"_err_en_US);
        break;
      }
    }
  }
  if (*at_ != '\n') {
    NextChar();
  }
  inCharLiteral_ = false;
}

// In fixed form, source card images must be processed as if they were at
// least 72 columns wide, at least in character literal contexts.
//
// Returns true after emitting one padding blank; returns false when no
// padding applies.  When the column limit has been reached and a
// continuation line follows, scanning moves to column 6 of that line.
bool Prescanner::PadOutCharacterLiteral(TokenSequence &tokens) {
  while (inFixedForm_ && !tabInCurrentLine_ && at_[1] == '\n') {
    if (column_ < fixedFormColumnLimit_) {
      tokens.PutNextTokenChar(' ', spaceProvenance_);
      ++column_;
      return true;
    }
    if (!FixedFormContinuation(false /*no need to insert space*/) ||
        tabInCurrentLine_) {
      return false;
    }
    CHECK(column_ == 7);
    --at_; // point to column 6 of continuation line
    column_ = 6;
  }
  return false;
}

// True if the line that begins at `start` is a comment line in fixed form:
// it has a comment character (or '%', or 'D'/'d' when old debug lines are
// not enabled) in column 1, it is blank, it is blank through the column
// limit, or its first non-blank character is a '!' that is not in column 6.
bool Prescanner::IsFixedFormCommentLine(const char *start) const {
  const char *p{start};
  if (IsFixedFormCommentChar(*p) || *p == '%' || // VAX %list, %eject, &c.
      ((*p == 'D' || *p == 'd') &&
          !features_.IsEnabled(LanguageFeature::OldDebugLines))) {
    return true;
  }
  bool anyTabs{false};
  while (true) {
    if (*p == ' ') {
      ++p;
    } else if (*p == '\t') {
      anyTabs = true;
      ++p;
    } else if (*p == '0' && !anyTabs && p == start + 5) {
      ++p; // 0 in column 6 must be treated as a space
    } else {
      break;
    }
  }
  if (!anyTabs && p >= start + fixedFormColumnLimit_) {
    return true;
  }
  if (*p == '!' && !inCharLiteral_ && (anyTabs || p != start + 5)) {
    return true;
  }
  return *p == '\n';
}

// If the line at `p` holds only white space and C-style comments before a
// '!' or the newline, returns a pointer to that '!' or newline; otherwise
// returns nullptr.
const char *Prescanner::IsFreeFormComment(const char *p) const {
  p = SkipWhiteSpaceAndCComments(p);
  if (*p == '!' || *p == '\n') {
    return p;
  } else {
    return nullptr;
  }
}

// If the line at `start` consists of optional white space, the word INCLUDE
// (in any case), optional white space, and then a quote mark, returns the
// offset of that quote mark from `start`; otherwise returns nullopt.
std::optional<std::size_t> Prescanner::IsIncludeLine(const char *start) const {
  const char *p{SkipWhiteSpace(start)};
  for (char ch : "include"s) {
    if (ToLowerCaseLetter(*p++) != ch) {
      return std::nullopt;
    }
  }
  p = SkipWhiteSpace(p);
  if (*p == '"' || *p == '\'') {
    return {p - start};
  }
  return std::nullopt;
}

// Processes a Fortran INCLUDE line.  `firstQuote` points at (or before) the
// opening quote of the path; a quote mark is assumed to be present on the
// line.  The named file is opened, searching first the directory of the
// current source file when that is known, and prescanned by a nested
// prescanner.
// NOTE(review): Statement() also calls this with a pointer into a
// macro-replaced token buffer; in that case the arithmetic against nextLine_
// and the provenance ranges computed from `firstQuote`/`p` below may not be
// meaningful -- confirm.
void Prescanner::FortranInclude(const char *firstQuote) {
  const char *p{firstQuote};
  while (*p != '"' && *p != '\'') {
    ++p;
  }
  char quote{*p};
  std::string path;
  for (++p; *p != '\n'; ++p) {
    if (*p == quote) {
      if (p[1] != quote) {
        break;
      }
      ++p; // doubled quote mark stands for one quote character in the path
    }
    path += *p;
  }
  if (*p != quote) {
    Say(GetProvenanceRange(firstQuote, p),
        "malformed path name string"_err_en_US);
    return;
  }
  p = SkipWhiteSpace(p + 1);
  if (*p != '\n' && *p != '!') {
    const char *garbage{p};
    for (; *p != '\n' && *p != '!'; ++p) {
    }
    Say(GetProvenanceRange(garbage, p),
        "excess characters after path name"_warn_en_US);
  }
  std::string buf;
  llvm::raw_string_ostream error{buf};
  Provenance provenance{GetProvenance(nextLine_)};
  std::optional<std::string> prependPath;
  if (const SourceFile * currentFile{allSources_.GetSourceFile(provenance)}) {
    prependPath = DirectoryName(currentFile->path());
  }
  const SourceFile *included{
      allSources_.Open(path, error, std::move(prependPath))};
  if (!included) {
    Say(provenance, "INCLUDE: %s"_err_en_US, error.str());
  } else if (included->bytes() > 0) {
    ProvenanceRange includeLineRange{
        provenance, static_cast<std::size_t>(p - nextLine_)};
    ProvenanceRange fileRange{
        allSources_.AddIncludedFile(*included, includeLineRange)};
    Prescanner{*this}.set_encoding(included->encoding()).Prescan(fileRange);
  }
}

// If the line at `start` is a preprocessor directive, returns a pointer to
// the first character after the '#' and any white space that follows it;
// otherwise returns nullptr.  In fixed form, a '#' in column 6 that is
// preceded only by blanks is not a directive.
const char *Prescanner::IsPreprocessorDirectiveLine(const char *start) const {
  const char *p{start};
  for (; *p == ' '; ++p) {
  }
  if (*p == '#') {
    if (inFixedForm_ && p == start + 5) {
      return nullptr;
    }
  } else {
    p = SkipWhiteSpace(p);
    if (*p != '#') {
      return nullptr;
    }
  }
  return SkipWhiteSpace(p + 1);
}

// True if the line at nextLine_ is a preprocessor directive.
bool Prescanner::IsNextLinePreprocessorDirective() const {
  return IsPreprocessorDirectiveLine(nextLine_) != nullptr;
}

// Called while looking for a continuation line.  Returns true if the line at
// nextLine_ was consumed (a comment line, or a preprocessor directive that
// was processed in place), in which case the caller should try again with
// the following line.  Returns false otherwise.  `afterAmpersand` tells
// whether the current line ended with a free-form '&'.
bool Prescanner::SkipCommentLine(bool afterAmpersand) {
  if (IsAtEnd()) {
    if (afterAmpersand && prescannerNesting_ > 0) {
      // A continuation marker at the end of the last line in an
      // include file inhibits the newline for that line.
      SkipToEndOfLine();
      omitNewline_ = true;
    }
    return false;
  }
  auto lineClass{ClassifyLine(nextLine_)};
  if (lineClass.kind == LineClassification::Kind::Comment) {
    NextLine();
    return true;
  } else if (inPreprocessorDirective_) {
    return false;
  } else if (lineClass.kind ==
          LineClassification::Kind::ConditionalCompilationDirective ||
      lineClass.kind == LineClassification::Kind::PreprocessorDirective) {
    // Allow conditional compilation directives (e.g., #ifdef) to affect
    // continuation lines.
    // Allow other preprocessor directives, too, except #include
    // (when it does not follow '&'), #define, and #undef (because
    // they cannot be allowed to affect preceding text on a
    // continued line).
    preprocessor_.Directive(TokenizePreprocessorDirective(), *this);
    return true;
  } else if (afterAmpersand &&
      (lineClass.kind == LineClassification::Kind::IncludeDirective ||
          lineClass.kind == LineClassification::Kind::IncludeLine)) {
    // An include follows a line that ended in '&': end the current line
    // without a newline and have Statement() skip a leading '&' later.
    SkipToEndOfLine();
    omitNewline_ = true;
    skipLeadingAmpersand_ = true;
    return false;
  } else {
    return false;
  }
}

// If the line at nextLine_ is a fixed-form continuation line, returns a
// pointer to the position in that line at which scanning should resume;
// otherwise returns nullptr.  May set insertASpace_ (for a continued
// compiler directive) and tabInCurrentLine_.
const char *Prescanner::FixedFormContinuationLine(bool mightNeedSpace) {
  if (IsAtEnd()) {
    return nullptr;
  }
  tabInCurrentLine_ = false;
  char col1{*nextLine_};
  if (InCompilerDirective()) {
    // Must be a continued compiler directive.
    if (!IsFixedFormCommentChar(col1)) {
      return nullptr;
    }
    // Columns 2 onward must repeat the sentinel (compared in lower case) ...
    int j{1};
    for (; j < 5; ++j) {
      char ch{directiveSentinel_[j - 1]};
      if (ch == '\0') {
        break;
      }
      if (ch != ToLowerCaseLetter(nextLine_[j])) {
        return nullptr;
      }
    }
    // ... followed by blanks through column 5.
    for (; j < 5; ++j) {
      if (nextLine_[j] != ' ') {
        return nullptr;
      }
    }
    char col6{nextLine_[5]};
    if (col6 != '\n' && col6 != '\t' && col6 != ' ' && col6 != '0') {
      if (nextLine_[6] != ' ' && mightNeedSpace) {
        insertASpace_ = true;
      }
      return nextLine_ + 6;
    }
    return nullptr;
  } else {
    // Normal case: not in a compiler directive.
    if (col1 == '&' &&
        features_.IsEnabled(
            LanguageFeature::FixedFormContinuationWithColumn1Ampersand)) {
      // Extension: '&' as continuation marker
      if (features_.ShouldWarn(
              LanguageFeature::FixedFormContinuationWithColumn1Ampersand)) {
        Say(GetProvenance(nextLine_), "nonstandard usage"_port_en_US);
      }
      return nextLine_ + 1;
    }
    if (col1 == '\t' && nextLine_[1] >= '1' && nextLine_[1] <= '9') {
      tabInCurrentLine_ = true;
      return nextLine_ + 2; // VAX extension
    }
    if (col1 == ' ' && nextLine_[1] == ' ' && nextLine_[2] == ' ' &&
        nextLine_[3] == ' ' && nextLine_[4] == ' ') {
      // Columns 1-5 are blank; a continuation has a character in column 6
      // that is neither blank nor '0'.
      char col6{nextLine_[5]};
      if (col6 != '\n' && col6 != '\t' && col6 != ' ' && col6 != '0') {
        return nextLine_ + 6;
      }
    }
    if (IsImplicitContinuation()) {
      return nextLine_;
    }
  }
  return nullptr; // not a continuation line
}

// If the line at nextLine_ continues the current free-form line, returns a
// pointer to the position in that line at which scanning should resume;
// otherwise returns nullptr.  `ampersand` tells whether the current line
// ended with '&'.  May set insertASpace_.
const char *Prescanner::FreeFormContinuationLine(bool ampersand) {
  const char *p{nextLine_};
  if (p >= limit_) {
    return nullptr;
  }
  p = SkipWhiteSpace(p);
  if (InCompilerDirective()) {
    // A continued compiler directive must repeat '!' and the sentinel.
    if (*p++ != '!') {
      return nullptr;
    }
    for (const char *s{directiveSentinel_}; *s != '\0'; ++p, ++s) {
      if (*s != ToLowerCaseLetter(*p)) {
        return nullptr;
      }
    }
    p = SkipWhiteSpace(p);
    if (*p == '&') {
      if (!ampersand) {
        insertASpace_ = true;
      }
      return p + 1;
    } else if (ampersand) {
      return p;
    } else {
      return nullptr;
    }
  } else {
    if (*p == '&') {
      return p + 1;
    } else if (*p == '!' || *p == '\n' || *p == '#') {
      return nullptr;
    } else if (ampersand || IsImplicitContinuation()) {
      if (p > nextLine_) {
        --p; // resume on the last leading white space character
      } else {
        insertASpace_ = true;
      }
      return p;
    } else {
      return nullptr;
    }
  }
}

// Attempts to continue the current fixed-form line onto a following line,
// skipping any intervening comment lines.  On success, scanning resumes at
// column 7 of the continuation line and true is returned.
bool Prescanner::FixedFormContinuation(bool mightNeedSpace) {
  // N.B. We accept '&' as a continuation indicator in fixed form, too,
  // but not in a character literal.
  if (*at_ == '&' && inCharLiteral_) {
    return false;
  }
  do {
    if (const char *cont{FixedFormContinuationLine(mightNeedSpace)}) {
      BeginSourceLine(cont);
      column_ = 7;
      NextLine();
      return true;
    }
  } while (SkipCommentLine(false /* not after ampersand */));
  return false;
}

// Attempts to continue the current free-form line onto a following line,
// skipping any intervening comment lines.  at_ must be at a '&' or at the
// end of the line.  Returns true if scanning has moved to a continuation
// line.
bool Prescanner::FreeFormContinuation() {
  const char *p{at_};
  bool ampersand{*p == '&'};
  if (ampersand) {
    p = SkipWhiteSpace(p + 1);
  }
  if (*p != '\n') {
    if (inCharLiteral_) {
      return false;
    } else if (*p != '!' &&
        features_.ShouldWarn(LanguageFeature::CruftAfterAmpersand)) {
      Say(GetProvenance(p), "missing ! before comment after &"_warn_en_US);
    }
  }
  do {
    if (const char *cont{FreeFormContinuationLine(ampersand)}) {
      BeginSourceLine(cont);
      NextLine();
      return true;
    }
  } while (SkipCommentLine(ampersand));
  return false;
}

// Implicit line continuation allows a preprocessor macro call with
// arguments to span multiple lines.
1064 bool Prescanner::IsImplicitContinuation() const { 1065 return !inPreprocessorDirective_ && !inCharLiteral_ && 1066 delimiterNesting_ > 0 && !IsAtEnd() && 1067 ClassifyLine(nextLine_).kind == LineClassification::Kind::Source; 1068 } 1069 1070 bool Prescanner::Continuation(bool mightNeedFixedFormSpace) { 1071 if (*at_ == '\n' || *at_ == '&') { 1072 if (inFixedForm_) { 1073 return FixedFormContinuation(mightNeedFixedFormSpace); 1074 } else { 1075 return FreeFormContinuation(); 1076 } 1077 } else { 1078 return false; 1079 } 1080 } 1081 1082 std::optional<Prescanner::LineClassification> 1083 Prescanner::IsFixedFormCompilerDirectiveLine(const char *start) const { 1084 const char *p{start}; 1085 char col1{*p++}; 1086 if (!IsFixedFormCommentChar(col1)) { 1087 return std::nullopt; 1088 } 1089 char sentinel[5], *sp{sentinel}; 1090 int column{2}; 1091 for (; column < 6; ++column, ++p) { 1092 if (*p != ' ') { 1093 if (*p == '\n' || *p == '\t') { 1094 break; 1095 } 1096 if (sp == sentinel + 1 && sentinel[0] == '$' && IsDecimalDigit(*p)) { 1097 // OpenMP conditional compilation line: leave the label alone 1098 break; 1099 } 1100 *sp++ = ToLowerCaseLetter(*p); 1101 } 1102 } 1103 if (column == 6) { 1104 if (*p == ' ' || *p == '\t' || *p == '0') { 1105 ++p; 1106 } else { 1107 // This is a Continuation line, not an initial directive line. 
1108 return std::nullopt; 1109 } 1110 } 1111 if (sp == sentinel) { 1112 return std::nullopt; 1113 } 1114 *sp = '\0'; 1115 if (const char *ss{IsCompilerDirectiveSentinel(sentinel)}) { 1116 std::size_t payloadOffset = p - start; 1117 return {LineClassification{ 1118 LineClassification::Kind::CompilerDirective, payloadOffset, ss}}; 1119 } 1120 return std::nullopt; 1121 } 1122 1123 std::optional<Prescanner::LineClassification> 1124 Prescanner::IsFreeFormCompilerDirectiveLine(const char *start) const { 1125 char sentinel[8]; 1126 const char *p{SkipWhiteSpace(start)}; 1127 if (*p++ != '!') { 1128 return std::nullopt; 1129 } 1130 for (std::size_t j{0}; j + 1 < sizeof sentinel; ++p, ++j) { 1131 if (*p == '\n') { 1132 break; 1133 } 1134 if (*p == ' ' || *p == '\t' || *p == '&') { 1135 if (j == 0) { 1136 break; 1137 } 1138 sentinel[j] = '\0'; 1139 p = SkipWhiteSpace(p + 1); 1140 if (*p == '!') { 1141 break; 1142 } 1143 if (const char *sp{IsCompilerDirectiveSentinel(sentinel)}) { 1144 std::size_t offset = p - start; 1145 return {LineClassification{ 1146 LineClassification::Kind::CompilerDirective, offset, sp}}; 1147 } 1148 break; 1149 } 1150 sentinel[j] = ToLowerCaseLetter(*p); 1151 } 1152 return std::nullopt; 1153 } 1154 1155 Prescanner &Prescanner::AddCompilerDirectiveSentinel(const std::string &dir) { 1156 std::uint64_t packed{0}; 1157 for (char ch : dir) { 1158 packed = (packed << 8) | (ToLowerCaseLetter(ch) & 0xff); 1159 } 1160 compilerDirectiveBloomFilter_.set(packed % prime1); 1161 compilerDirectiveBloomFilter_.set(packed % prime2); 1162 compilerDirectiveSentinels_.insert(dir); 1163 return *this; 1164 } 1165 1166 const char *Prescanner::IsCompilerDirectiveSentinel( 1167 const char *sentinel) const { 1168 std::uint64_t packed{0}; 1169 std::size_t n{0}; 1170 for (; sentinel[n] != '\0'; ++n) { 1171 packed = (packed << 8) | (sentinel[n] & 0xff); 1172 } 1173 if (n == 0 || !compilerDirectiveBloomFilter_.test(packed % prime1) || 1174 !compilerDirectiveBloomFilter_.test(packed 
% prime2)) { 1175 return nullptr; 1176 } 1177 const auto iter{compilerDirectiveSentinels_.find(std::string(sentinel, n))}; 1178 return iter == compilerDirectiveSentinels_.end() ? nullptr : iter->c_str(); 1179 } 1180 1181 constexpr bool IsDirective(const char *match, const char *dir) { 1182 for (; *match; ++match) { 1183 if (*match != ToLowerCaseLetter(*dir++)) { 1184 return false; 1185 } 1186 } 1187 return true; 1188 } 1189 1190 Prescanner::LineClassification Prescanner::ClassifyLine( 1191 const char *start) const { 1192 if (inFixedForm_) { 1193 if (std::optional<LineClassification> lc{ 1194 IsFixedFormCompilerDirectiveLine(start)}) { 1195 return std::move(*lc); 1196 } 1197 if (IsFixedFormCommentLine(start)) { 1198 return {LineClassification::Kind::Comment}; 1199 } 1200 } else { 1201 if (std::optional<LineClassification> lc{ 1202 IsFreeFormCompilerDirectiveLine(start)}) { 1203 return std::move(*lc); 1204 } 1205 if (const char *bang{IsFreeFormComment(start)}) { 1206 return {LineClassification::Kind::Comment, 1207 static_cast<std::size_t>(bang - start)}; 1208 } 1209 } 1210 if (std::optional<std::size_t> quoteOffset{IsIncludeLine(start)}) { 1211 return {LineClassification::Kind::IncludeLine, *quoteOffset}; 1212 } 1213 if (const char *dir{IsPreprocessorDirectiveLine(start)}) { 1214 if (IsDirective("if", dir) || IsDirective("elif", dir) || 1215 IsDirective("else", dir) || IsDirective("endif", dir)) { 1216 return {LineClassification::Kind::ConditionalCompilationDirective}; 1217 } else if (IsDirective("include", dir)) { 1218 return {LineClassification::Kind::IncludeDirective}; 1219 } else if (IsDirective("define", dir) || IsDirective("undef", dir)) { 1220 return {LineClassification::Kind::DefinitionDirective}; 1221 } else { 1222 return {LineClassification::Kind::PreprocessorDirective}; 1223 } 1224 } 1225 return {LineClassification::Kind::Source}; 1226 } 1227 1228 void Prescanner::SourceFormChange(std::string &&dir) { 1229 if (dir == "!dir$ free") { 1230 inFixedForm_ = 
false; 1231 } else if (dir == "!dir$ fixed") { 1232 inFixedForm_ = true; 1233 } 1234 } 1235 } // namespace Fortran::parser 1236