1 //===-- lib/Parser/prescan.cpp --------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8
9 #include "prescan.h"
10 #include "preprocessor.h"
11 #include "token-sequence.h"
12 #include "flang/Common/idioms.h"
13 #include "flang/Parser/characters.h"
14 #include "flang/Parser/message.h"
15 #include "flang/Parser/source.h"
16 #include "llvm/Support/raw_ostream.h"
17 #include <cstddef>
18 #include <cstring>
19 #include <utility>
20 #include <vector>
21
22 namespace Fortran::parser {
23
24 using common::LanguageFeature;
25
// Upper bound on Prescanner nesting depth.  Each copy-constructed Prescanner
// is one level deeper than its source; Prescan() reports an error ("too many
// nested INCLUDE/#include files, possibly circular") once the depth exceeds
// this value.
static constexpr int maxPrescannerNesting{100};
27
// Constructs a top-level prescanner that reports to `messages`, emits to
// `cooked`, and uses `preprocessor` for directives and macro replacement.
// The source collection and the initial character encoding are obtained
// from the preprocessor's AllSources.
// NOTE(review): allSources_ is initialized through the preprocessor_ member
// (not the parameter), so this relies on preprocessor_ being declared before
// allSources_ in the class -- confirm in prescan.h.
Prescanner::Prescanner(Messages &messages, CookedSource &cooked,
    Preprocessor &preprocessor, common::LanguageFeatureControl lfc)
    : messages_{messages}, cooked_{cooked}, preprocessor_{preprocessor},
      allSources_{preprocessor_.allSources()}, features_{lfc},
      encoding_{allSources_.encoding()} {}
33
// Constructs a nested prescanner from `that` (used by FortranInclude() to
// scan an included file).  It shares the same messages, cooked source,
// preprocessor, and source collection, and copies the source form, column
// limit, encoding, leading-ampersand flag, and compiler directive sentinel
// tables.  The nesting depth is one greater than that of `that`.  Members
// that are not listed here (e.g., the scanning position) are not copied.
Prescanner::Prescanner(const Prescanner &that)
    : messages_{that.messages_}, cooked_{that.cooked_},
      preprocessor_{that.preprocessor_}, allSources_{that.allSources_},
      features_{that.features_}, inFixedForm_{that.inFixedForm_},
      fixedFormColumnLimit_{that.fixedFormColumnLimit_},
      encoding_{that.encoding_}, prescannerNesting_{that.prescannerNesting_ +
                                     1},
      skipLeadingAmpersand_{that.skipLeadingAmpersand_},
      compilerDirectiveBloomFilter_{that.compilerDirectiveBloomFilter_},
      compilerDirectiveSentinels_{that.compilerDirectiveSentinels_} {}
44
// True when `ch` is one of the characters ('!', '*', 'C', or 'c') that this
// file accepts as a fixed-form comment marker.
static inline constexpr bool IsFixedFormCommentChar(char ch) {
  switch (ch) {
  case '!':
  case '*':
  case 'C':
  case 'c':
    return true;
  default:
    return false;
  }
}
48
NormalizeCompilerDirectiveCommentMarker(TokenSequence & dir)49 static void NormalizeCompilerDirectiveCommentMarker(TokenSequence &dir) {
50 char *p{dir.GetMutableCharData()};
51 char *limit{p + dir.SizeInChars()};
52 for (; p < limit; ++p) {
53 if (*p != ' ') {
54 CHECK(IsFixedFormCommentChar(*p));
55 *p = '!';
56 return;
57 }
58 }
59 DIE("compiler directive all blank");
60 }
61
Prescan(ProvenanceRange range)62 void Prescanner::Prescan(ProvenanceRange range) {
63 startProvenance_ = range.start();
64 start_ = allSources_.GetSource(range);
65 CHECK(start_);
66 limit_ = start_ + range.size();
67 nextLine_ = start_;
68 const bool beganInFixedForm{inFixedForm_};
69 if (prescannerNesting_ > maxPrescannerNesting) {
70 Say(GetProvenance(start_),
71 "too many nested INCLUDE/#include files, possibly circular"_err_en_US);
72 return;
73 }
74 while (!IsAtEnd()) {
75 Statement();
76 }
77 if (inFixedForm_ != beganInFixedForm) {
78 std::string dir{"!dir$ "};
79 if (beganInFixedForm) {
80 dir += "fixed";
81 } else {
82 dir += "free";
83 }
84 dir += '\n';
85 TokenSequence tokens{dir, allSources_.AddCompilerInsertion(dir).start()};
86 tokens.Emit(cooked_);
87 }
88 }
89
// Prescans one statement beginning at nextLine_.
// The line is classified first.  Comment lines, INCLUDE lines, and
// preprocessor directives are handled completely in the first switch and
// return without emitting anything here.  Compiler directives and ordinary
// source lines are tokenized (NextToken() is called until it returns false),
// submitted to the preprocessor for macro replacement, checked, and emitted
// to cooked_, followed by a newline unless omitNewline_ has been set.
void Prescanner::Statement() {
  TokenSequence tokens;
  LineClassification line{ClassifyLine(nextLine_)};
  switch (line.kind) {
  case LineClassification::Kind::Comment:
    nextLine_ += line.payloadOffset; // advance to '!' or newline
    NextLine();
    return;
  case LineClassification::Kind::IncludeLine:
    // payloadOffset locates the opening quote of the path (see
    // IsIncludeLine()).
    FortranInclude(nextLine_ + line.payloadOffset);
    NextLine();
    return;
  case LineClassification::Kind::ConditionalCompilationDirective:
  case LineClassification::Kind::IncludeDirective:
  case LineClassification::Kind::DefinitionDirective:
  case LineClassification::Kind::PreprocessorDirective:
    preprocessor_.Directive(TokenizePreprocessorDirective(), *this);
    return;
  case LineClassification::Kind::CompilerDirective:
    // directiveSentinel_ stays set while the directive (and any
    // continuation lines) is scanned; it is reset at the end of this
    // function.
    directiveSentinel_ = line.sentinel;
    CHECK(InCompilerDirective());
    BeginStatementAndAdvance();
    if (inFixedForm_) {
      CHECK(IsFixedFormCommentChar(*at_));
    } else {
      // Free form: the '!' may be preceded by blanks and tabs.
      while (*at_ == ' ' || *at_ == '\t') {
        ++at_, ++column_;
      }
      CHECK(*at_ == '!');
    }
    if (directiveSentinel_[0] == '$' && directiveSentinel_[1] == '\0') {
      // OpenMP conditional compilation line. Remove the sentinel and then
      // treat the line as if it were normal source.
      at_ += 2, column_ += 2;
      if (inFixedForm_) {
        LabelField(tokens);
      } else {
        SkipSpaces();
      }
    } else {
      // Compiler directive. Emit normalized sentinel.
      EmitChar(tokens, '!');
      ++at_, ++column_;
      // The sentinel text is emitted from directiveSentinel_, not from the
      // source, while the source position advances in step with it.
      for (const char *sp{directiveSentinel_}; *sp != '\0';
           ++sp, ++at_, ++column_) {
        EmitChar(tokens, *sp);
      }
      if (*at_ == ' ') {
        EmitChar(tokens, ' ');
        ++at_, ++column_;
      }
      tokens.CloseToken();
    }
    break;
  case LineClassification::Kind::Source:
    BeginStatementAndAdvance();
    if (inFixedForm_) {
      // With the OldDebugLines feature enabled, a leading 'D' or 'd' is
      // skipped and the rest of the line is scanned as source.
      if (features_.IsEnabled(LanguageFeature::OldDebugLines) &&
          (*at_ == 'D' || *at_ == 'd')) {
        NextChar();
      }
      LabelField(tokens);
    } else if (skipLeadingAmpersand_) {
      // skipLeadingAmpersand_ is set by SkipCommentLine() when an include
      // follows a line ending in '&'; consume a leading '&' on this line.
      skipLeadingAmpersand_ = false;
      const char *p{SkipWhiteSpace(at_)};
      if (p < limit_ && *p == '&') {
        column_ += ++p - at_;
        at_ = p;
      }
    } else {
      SkipSpaces();
    }
    break;
  }

  // Tokenize the remainder of the statement, including continuation lines.
  while (NextToken(tokens)) {
  }

  Provenance newlineProvenance{GetCurrentProvenance()};
  if (std::optional<TokenSequence> preprocessed{
          preprocessor_.MacroReplacement(tokens, *this)}) {
    // Reprocess the preprocessed line. Append a newline temporarily.
    preprocessed->PutNextTokenChar('\n', newlineProvenance);
    preprocessed->CloseToken();
    const char *ppd{preprocessed->ToCharBlock().begin()};
    LineClassification ppl{ClassifyLine(ppd)};
    preprocessed->pop_back(); // remove the newline
    switch (ppl.kind) {
    case LineClassification::Kind::Comment:
      // The replaced line is a comment: nothing is emitted for it.
      break;
    case LineClassification::Kind::IncludeLine:
      FortranInclude(ppd + ppl.payloadOffset);
      break;
    case LineClassification::Kind::ConditionalCompilationDirective:
    case LineClassification::Kind::IncludeDirective:
    case LineClassification::Kind::DefinitionDirective:
    case LineClassification::Kind::PreprocessorDirective:
      // Directives produced by macro replacement are not executed; the
      // line is emitted as text after a warning.
      Say(preprocessed->GetProvenanceRange(),
          "Preprocessed line resembles a preprocessor directive"_warn_en_US);
      preprocessed->ToLowerCase()
          .CheckBadFortranCharacters(messages_)
          .CheckBadParentheses(messages_)
          .Emit(cooked_);
      break;
    case LineClassification::Kind::CompilerDirective:
      if (preprocessed->HasRedundantBlanks()) {
        preprocessed->RemoveRedundantBlanks();
      }
      NormalizeCompilerDirectiveCommentMarker(*preprocessed);
      preprocessed->ToLowerCase();
      SourceFormChange(preprocessed->ToString());
      preprocessed->ClipComment(true /* skip first ! */)
          .CheckBadFortranCharacters(messages_)
          .CheckBadParentheses(messages_)
          .Emit(cooked_);
      break;
    case LineClassification::Kind::Source:
      if (inFixedForm_) {
        if (preprocessed->HasBlanks(/*after column*/ 6)) {
          preprocessed->RemoveBlanks(/*after column*/ 6);
        }
      } else {
        if (preprocessed->HasRedundantBlanks()) {
          preprocessed->RemoveRedundantBlanks();
        }
      }
      preprocessed->ToLowerCase()
          .ClipComment()
          .CheckBadFortranCharacters(messages_)
          .CheckBadParentheses(messages_)
          .Emit(cooked_);
      break;
    }
  } else {
    // No macro replacement took place: emit the original tokens.
    tokens.ToLowerCase();
    if (line.kind == LineClassification::Kind::CompilerDirective) {
      SourceFormChange(tokens.ToString());
    }
    if (inFixedForm_ && line.kind == LineClassification::Kind::Source) {
      EnforceStupidEndStatementRules(tokens);
    }
    tokens.CheckBadFortranCharacters(messages_)
        .CheckBadParentheses(messages_)
        .Emit(cooked_);
  }
  // omitNewline_ is set by SkipCommentLine() to suppress the newline of a
  // line whose continuation is supplied by included text.
  if (omitNewline_) {
    omitNewline_ = false;
  } else {
    cooked_.Put('\n', newlineProvenance);
  }
  directiveSentinel_ = nullptr;
}
242
TokenizePreprocessorDirective()243 TokenSequence Prescanner::TokenizePreprocessorDirective() {
244 CHECK(!IsAtEnd() && !inPreprocessorDirective_);
245 inPreprocessorDirective_ = true;
246 BeginStatementAndAdvance();
247 TokenSequence tokens;
248 while (NextToken(tokens)) {
249 }
250 inPreprocessorDirective_ = false;
251 return tokens;
252 }
253
NextLine()254 void Prescanner::NextLine() {
255 void *vstart{static_cast<void *>(const_cast<char *>(nextLine_))};
256 void *v{std::memchr(vstart, '\n', limit_ - nextLine_)};
257 if (!v) {
258 nextLine_ = limit_;
259 } else {
260 const char *nl{const_cast<const char *>(static_cast<char *>(v))};
261 nextLine_ = nl + 1;
262 }
263 }
264
LabelField(TokenSequence & token)265 void Prescanner::LabelField(TokenSequence &token) {
266 const char *bad{nullptr};
267 int outCol{1};
268 const char *start{at_};
269 for (; *at_ != '\n' && column_ <= 6; ++at_) {
270 if (*at_ == '\t') {
271 ++at_;
272 column_ = 7;
273 break;
274 }
275 if (*at_ != ' ' &&
276 !(*at_ == '0' && column_ == 6)) { // '0' in column 6 becomes space
277 EmitChar(token, *at_);
278 ++outCol;
279 if (!bad && !IsDecimalDigit(*at_)) {
280 bad = at_;
281 }
282 }
283 ++column_;
284 }
285 if (bad && !preprocessor_.IsNameDefined(token.CurrentOpenToken())) {
286 Say(GetProvenance(bad),
287 "Character in fixed-form label field must be a digit"_warn_en_US);
288 token.clear();
289 at_ = start;
290 return;
291 }
292 if (outCol == 1) { // empty label field
293 // Emit a space so that, if the line is rescanned after preprocessing,
294 // a leading 'C' or 'D' won't be left-justified and then accidentally
295 // misinterpreted as a comment card.
296 EmitChar(token, ' ');
297 ++outCol;
298 }
299 token.CloseToken();
300 SkipToNextSignificantCharacter();
301 if (IsDecimalDigit(*at_)) {
302 Say(GetProvenance(at_),
303 "Label digit is not in fixed-form label field"_port_en_US);
304 }
305 }
306
// 6.3.3.5: A program unit END statement, or any other statement whose
// initial line resembles an END statement, shall not be continued in
// fixed form source.
//
// `tokens` holds one complete statement; the keyword comparisons below are
// against lower-case text, and the caller (Statement()) converts the tokens
// to lower case before calling.  Source positions of individual characters
// are used to tell whether the statement spans more than one source line.
// Emits at most one error; returns silently when the statement does not
// begin with "end", is not continued, or positions are unavailable.
void Prescanner::EnforceStupidEndStatementRules(const TokenSequence &tokens) {
  CharBlock cBlock{tokens.ToCharBlock()};
  const char *str{cBlock.begin()};
  std::size_t n{cBlock.size()};
  if (n < 3) {
    return;
  }
  // Skip over any label and blanks.
  std::size_t j{0};
  for (; j < n && (str[j] == ' ' || (str[j] >= '0' && str[j] <= '9')); ++j) {
  }
  if (j + 3 > n || std::memcmp(str + j, "end", 3) != 0) {
    return;
  }
  // It starts with END, possibly after a label.
  auto start{allSources_.GetSourcePosition(tokens.GetCharProvenance(j))};
  auto end{allSources_.GetSourcePosition(tokens.GetCharProvenance(n - 1))};
  if (!start || !end) {
    return;
  }
  if (&start->file == &end->file && start->line == end->line) {
    return; // no continuation
  }
  j += 3;
  static const char *const prefixes[]{"program", "subroutine", "function",
      "blockdata", "module", "submodule", nullptr};
  bool isPrefix{j == n || !IsLegalInIdentifier(str[j])}; // prefix is END
  std::size_t endOfPrefix{j - 1};
  for (const char *const *p{prefixes}; *p; ++p) {
    std::size_t pLen{std::strlen(*p)};
    if (j + pLen <= n && std::memcmp(str + j, *p, pLen) == 0) {
      isPrefix = true; // END thing as prefix
      j += pLen;
      endOfPrefix = j - 1;
      // Skip over the program unit name, if any.
      for (; j < n && IsLegalInIdentifier(str[j]); ++j) {
      }
      break;
    }
  }
  if (isPrefix) {
    auto range{tokens.GetTokenProvenanceRange(1)};
    if (j == n) { // END or END thing [name]
      Say(range,
          "Program unit END statement may not be continued in fixed form source"_err_en_US);
    } else {
      // Something else follows.  It's an error only when the END prefix
      // ends on the initial line and the following text is on another line.
      auto endOfPrefixPos{
          allSources_.GetSourcePosition(tokens.GetCharProvenance(endOfPrefix))};
      auto next{allSources_.GetSourcePosition(tokens.GetCharProvenance(j))};
      if (endOfPrefixPos && next && &endOfPrefixPos->file == &start->file &&
          endOfPrefixPos->line == start->line &&
          (&next->file != &start->file || next->line != start->line)) {
        Say(range,
            "Initial line of continued statement must not appear to be a program unit END in fixed form source"_err_en_US);
      }
    }
  }
}
366
SkipToEndOfLine()367 void Prescanner::SkipToEndOfLine() {
368 while (*at_ != '\n') {
369 ++at_, ++column_;
370 }
371 }
372
MustSkipToEndOfLine() const373 bool Prescanner::MustSkipToEndOfLine() const {
374 if (inFixedForm_ && column_ > fixedFormColumnLimit_ && !tabInCurrentLine_) {
375 return true; // skip over ignored columns in right margin (73:80)
376 } else if (*at_ == '!' && !inCharLiteral_) {
377 return true; // inline comment goes to end of source line
378 } else {
379 return false;
380 }
381 }
382
NextChar()383 void Prescanner::NextChar() {
384 CHECK(*at_ != '\n');
385 ++at_, ++column_;
386 while (at_[0] == '\xef' && at_[1] == '\xbb' && at_[2] == '\xbf') {
387 // UTF-8 byte order mark - treat this file as UTF-8
388 at_ += 3;
389 encoding_ = Encoding::UTF_8;
390 }
391 SkipToNextSignificantCharacter();
392 }
393
394 // Skip everything that should be ignored until the next significant
395 // character is reached; handles C-style comments in preprocessing
396 // directives, Fortran ! comments, stuff after the right margin in
397 // fixed form, and all forms of line continuation.
SkipToNextSignificantCharacter()398 void Prescanner::SkipToNextSignificantCharacter() {
399 if (inPreprocessorDirective_) {
400 SkipCComments();
401 } else {
402 bool mightNeedSpace{false};
403 if (MustSkipToEndOfLine()) {
404 SkipToEndOfLine();
405 } else {
406 mightNeedSpace = *at_ == '\n';
407 }
408 for (; Continuation(mightNeedSpace); mightNeedSpace = false) {
409 if (MustSkipToEndOfLine()) {
410 SkipToEndOfLine();
411 }
412 }
413 if (*at_ == '\t') {
414 tabInCurrentLine_ = true;
415 }
416 }
417 }
418
SkipCComments()419 void Prescanner::SkipCComments() {
420 while (true) {
421 if (IsCComment(at_)) {
422 if (const char *after{SkipCComment(at_)}) {
423 column_ += after - at_;
424 // May have skipped over one or more newlines; relocate the start of
425 // the next line.
426 nextLine_ = at_ = after;
427 NextLine();
428 } else {
429 // Don't emit any messages about unclosed C-style comments, because
430 // the sequence /* can appear legally in a FORMAT statement. There's
431 // no ambiguity, since the sequence */ cannot appear legally.
432 break;
433 }
434 } else if (inPreprocessorDirective_ && at_[0] == '\\' && at_ + 2 < limit_ &&
435 at_[1] == '\n' && !IsAtEnd()) {
436 BeginSourceLineAndAdvance();
437 } else {
438 break;
439 }
440 }
441 }
442
SkipSpaces()443 void Prescanner::SkipSpaces() {
444 while (*at_ == ' ' || *at_ == '\t') {
445 NextChar();
446 }
447 insertASpace_ = false;
448 }
449
SkipWhiteSpace(const char * p)450 const char *Prescanner::SkipWhiteSpace(const char *p) {
451 while (*p == ' ' || *p == '\t') {
452 ++p;
453 }
454 return p;
455 }
456
SkipWhiteSpaceAndCComments(const char * p) const457 const char *Prescanner::SkipWhiteSpaceAndCComments(const char *p) const {
458 while (true) {
459 if (*p == ' ' || *p == '\t') {
460 ++p;
461 } else if (IsCComment(p)) {
462 if (const char *after{SkipCComment(p)}) {
463 p = after;
464 } else {
465 break;
466 }
467 } else {
468 break;
469 }
470 }
471 return p;
472 }
473
SkipCComment(const char * p) const474 const char *Prescanner::SkipCComment(const char *p) const {
475 char star{' '}, slash{' '};
476 p += 2;
477 while (star != '*' || slash != '/') {
478 if (p >= limit_) {
479 return nullptr; // signifies an unterminated comment
480 }
481 star = slash;
482 slash = *p++;
483 }
484 return p;
485 }
486
// Scans the next token of the current statement at at_ and appends it to
// `tokens`.  Returns false when the end of the statement (a newline) has
// been reached and no token was closed; returns true after closing a token.
// Recognizes, in order: white space (free form only, compressed to a single
// blank token or discarded), quoted character literals, numeric literals
// (including Hollerith, real exponents/kinds, and 0x... in directives),
// tokens that start with '.', identifiers (with kind_"..." literals), '*'
// and "**", and single- or double-character operators.
bool Prescanner::NextToken(TokenSequence &tokens) {
  CHECK(at_ >= start_ && at_ < limit_);
  if (InFixedFormSource()) {
    SkipSpaces();
  } else {
    if (*at_ == '/' && IsCComment(at_)) {
      // Recognize and skip over classic C style /*comments*/ when
      // outside a character literal.
      if (features_.ShouldWarn(LanguageFeature::ClassicCComments)) {
        Say(GetProvenance(at_),
            "nonstandard usage: C-style comment"_port_en_US);
      }
      SkipCComments();
    }
    if (*at_ == ' ' || *at_ == '\t') {
      // Compress free-form white space into a single space character.
      const auto theSpace{at_};
      char previous{at_ <= start_ ? ' ' : at_[-1]};
      NextChar();
      SkipSpaces();
      if (*at_ == '\n') {
        // Discard white space at the end of a line.
      } else if (!inPreprocessorDirective_ &&
          (previous == '(' || *at_ == '(' || *at_ == ')')) {
        // Discard white space before/after '(' and before ')', unless in a
        // preprocessor directive. This helps yield space-free contiguous
        // names for generic interfaces like OPERATOR( + ) and
        // READ ( UNFORMATTED ), without misinterpreting #define f (notAnArg).
        // This has the effect of silently ignoring the illegal spaces in
        // the array constructor ( /1,2/ ) but that seems benign; it's
        // hard to avoid that while still removing spaces from OPERATOR( / )
        // and OPERATOR( // ).
      } else {
        // Preserve the squashed white space as a single space character.
        tokens.PutNextTokenChar(' ', GetProvenance(theSpace));
        tokens.CloseToken();
        return true;
      }
    }
  }
  // A space requested by a continuation line is placed at the start of the
  // next token.
  if (insertASpace_) {
    tokens.PutNextTokenChar(' ', spaceProvenance_);
    insertASpace_ = false;
  }
  if (*at_ == '\n') {
    return false;
  }
  const char *start{at_};
  if (*at_ == '\'' || *at_ == '"') {
    QuotedCharacterLiteral(tokens, start);
    preventHollerith_ = false;
  } else if (IsDecimalDigit(*at_)) {
    // n accumulates the value of the digit string (as a possible Hollerith
    // count) and stops growing once it reaches maxHollerith.
    int n{0}, digits{0};
    static constexpr int maxHollerith{256 /*lines*/ * (132 - 6 /*columns*/)};
    do {
      if (n < maxHollerith) {
        n = 10 * n + DecimalDigitValue(*at_);
      }
      EmitCharAndAdvance(tokens, *at_);
      ++digits;
      if (InFixedFormSource()) {
        SkipSpaces();
      }
    } while (IsDecimalDigit(*at_));
    if ((*at_ == 'h' || *at_ == 'H') && n > 0 && n < maxHollerith &&
        !preventHollerith_) {
      Hollerith(tokens, n, start);
    } else if (*at_ == '.') {
      while (IsDecimalDigit(EmitCharAndAdvance(tokens, *at_))) {
      }
      ExponentAndKind(tokens);
    } else if (ExponentAndKind(tokens)) {
    } else if (digits == 1 && n == 0 && (*at_ == 'x' || *at_ == 'X') &&
        inPreprocessorDirective_) {
      // Hexadecimal literal (0x...) in a preprocessor directive.
      do {
        EmitCharAndAdvance(tokens, *at_);
      } while (IsHexadecimalDigit(*at_));
    } else if (IsLetter(*at_)) {
      // Handles FORMAT(3I9HHOLLERITH) by skipping over the first I so that
      // we don't misrecognize I9HOLLERITH as an identifier in the next case.
      EmitCharAndAdvance(tokens, *at_);
    } else if (at_[0] == '_' && (at_[1] == '\'' || at_[1] == '"')) { // 4_"..."
      EmitCharAndAdvance(tokens, *at_);
      QuotedCharacterLiteral(tokens, start);
    }
    preventHollerith_ = false;
  } else if (*at_ == '.') {
    char nch{EmitCharAndAdvance(tokens, '.')};
    if (!inPreprocessorDirective_ && IsDecimalDigit(nch)) {
      // Real literal with no digits before the decimal point.
      while (IsDecimalDigit(EmitCharAndAdvance(tokens, *at_))) {
      }
      ExponentAndKind(tokens);
    } else if (nch == '.' && EmitCharAndAdvance(tokens, '.') == '.') {
      EmitCharAndAdvance(tokens, '.'); // variadic macro definition ellipsis
    }
    preventHollerith_ = false;
  } else if (IsLegalInIdentifier(*at_)) {
    do {
    } while (IsLegalInIdentifier(EmitCharAndAdvance(tokens, *at_)));
    if ((*at_ == '\'' || *at_ == '"') &&
        tokens.CharAt(tokens.SizeInChars() - 1) == '_') { // kind_"..."
      QuotedCharacterLiteral(tokens, start);
    }
    preventHollerith_ = false;
  } else if (*at_ == '*') {
    if (EmitCharAndAdvance(tokens, '*') == '*') {
      EmitCharAndAdvance(tokens, '*');
    } else {
      // Subtle ambiguity:
      // CHARACTER*2H declares H because *2 is a kind specifier
      // DATAC/N*2H / is repeated Hollerith
      preventHollerith_ = !slashInCurrentStatement_;
    }
  } else {
    char ch{*at_};
    // Track the nesting of parentheses and brackets.
    if (ch == '(' || ch == '[') {
      ++delimiterNesting_;
    } else if ((ch == ')' || ch == ']') && delimiterNesting_ > 0) {
      --delimiterNesting_;
    }
    char nch{EmitCharAndAdvance(tokens, ch)};
    preventHollerith_ = false;
    if ((nch == '=' &&
            (ch == '<' || ch == '>' || ch == '/' || ch == '=' || ch == '!')) ||
        (ch == nch &&
            (ch == '/' || ch == ':' || ch == '*' || ch == '#' || ch == '&' ||
                ch == '|' || ch == '<' || ch == '>')) ||
        (ch == '=' && nch == '>')) {
      // token comprises two characters
      EmitCharAndAdvance(tokens, nch);
    } else if (ch == '/') {
      // Remembered for the '*' Hollerith ambiguity above.
      slashInCurrentStatement_ = true;
    }
  }
  tokens.CloseToken();
  return true;
}
624
ExponentAndKind(TokenSequence & tokens)625 bool Prescanner::ExponentAndKind(TokenSequence &tokens) {
626 char ed{ToLowerCaseLetter(*at_)};
627 if (ed != 'e' && ed != 'd') {
628 return false;
629 }
630 EmitCharAndAdvance(tokens, ed);
631 if (*at_ == '+' || *at_ == '-') {
632 EmitCharAndAdvance(tokens, *at_);
633 }
634 while (IsDecimalDigit(*at_)) {
635 EmitCharAndAdvance(tokens, *at_);
636 }
637 if (*at_ == '_') {
638 while (IsLegalInIdentifier(EmitCharAndAdvance(tokens, *at_))) {
639 }
640 }
641 return true;
642 }
643
// Scans a quoted character literal whose opening quote mark is at at_ and
// appends it to `tokens`.  `start` marks the beginning of the token (it may
// precede the quote when the literal has a kind prefix) and is used here
// only for the provenance range of the "Incomplete character literal"
// diagnostic.  inCharLiteral_ is set while inside the literal so that '!'
// is not taken to begin a comment.
void Prescanner::QuotedCharacterLiteral(
    TokenSequence &tokens, const char *start) {
  char quote{*at_};
  const char *end{at_ + 1};
  inCharLiteral_ = true;
  const auto emit{[&](char ch) { EmitChar(tokens, ch); }};
  const auto insert{[&](char ch) { EmitInsertedChar(tokens, ch); }};
  bool isEscaped{false};
  bool escapesEnabled{features_.IsEnabled(LanguageFeature::BackslashEscapes)};
  while (true) {
    if (*at_ == '\\') {
      if (escapesEnabled) {
        // Toggle, so that a doubled backslash does not escape what follows.
        isEscaped = !isEscaped;
      } else {
        // The parser always processes escape sequences, so don't confuse it
        // when escapes are disabled.
        insert('\\');
      }
    } else {
      isEscaped = false;
    }
    EmitQuotedChar(static_cast<unsigned char>(*at_), emit, insert, false,
        Encoding::LATIN_1);
    // In fixed form, pad the literal with blanks out to the column limit
    // when it is continued onto another line.
    while (PadOutCharacterLiteral(tokens)) {
    }
    if (*at_ == '\n') {
      if (!inPreprocessorDirective_) {
        Say(GetProvenanceRange(start, end),
            "Incomplete character literal"_err_en_US);
      }
      break;
    }
    end = at_ + 1;
    NextChar();
    if (*at_ == quote && !isEscaped) {
      // A doubled unescaped quote mark becomes a single instance of that
      // quote character in the literal (later). There can be spaces between
      // the quotes in fixed form source.
      EmitChar(tokens, quote);
      inCharLiteral_ = false; // for cases like print *, '...'!comment
      NextChar();
      if (InFixedFormSource()) {
        SkipSpaces();
      }
      if (*at_ != quote) {
        break; // that was the closing quote
      }
      inCharLiteral_ = true;
    }
  }
  inCharLiteral_ = false;
}
696
// Scans a Hollerith literal.  On entry at_ is at the 'h' or 'H' that
// follows the count, whose digits have already been emitted by the caller;
// `count` is the number of characters in the literal and `start` is the
// beginning of the token, used for diagnostics.  An upper-case 'H' is
// emitted followed by up to `count` characters re-encoded as UTF-8.
void Prescanner::Hollerith(
    TokenSequence &tokens, int count, const char *start) {
  inCharLiteral_ = true;
  CHECK(*at_ == 'h' || *at_ == 'H');
  EmitChar(tokens, 'H');
  while (count-- > 0) {
    if (PadOutCharacterLiteral(tokens)) {
      // A padding blank was emitted; it counts as one character.
    } else if (*at_ == '\n') {
      Say(GetProvenanceRange(start, at_),
          "Possible truncated Hollerith literal"_warn_en_US);
      break;
    } else {
      NextChar();
      // Each multi-byte character encoding counts as a single character.
      // No escape sequences are recognized.
      // Hollerith is always emitted to the cooked character
      // stream in UTF-8.
      DecodedCharacter decoded{DecodeCharacter(
          encoding_, at_, static_cast<std::size_t>(limit_ - at_), false)};
      if (decoded.bytes > 0) {
        EncodedCharacter utf8{
            EncodeCharacter<Encoding::UTF_8>(decoded.codepoint)};
        for (int j{0}; j < utf8.bytes; ++j) {
          EmitChar(tokens, utf8.buffer[j]);
        }
        // Leave at_ on the last byte of the decoded character.
        at_ += decoded.bytes - 1;
      } else {
        Say(GetProvenanceRange(start, at_),
            "Bad character in Hollerith literal"_err_en_US);
        break;
      }
    }
  }
  // Step past the last character of the literal (or the 'H' when empty).
  if (*at_ != '\n') {
    NextChar();
  }
  inCharLiteral_ = false;
}
735
// In fixed form, source card images must be processed as if they were at
// least 72 columns wide, at least in character literal contexts.
//
// Called while scanning a character or Hollerith literal.  When the next
// character is a newline in fixed form on a line without tabs, and the
// column limit has not yet been reached, emits one padding blank into
// `tokens`, advances column_, and returns true.  Once the limit has been
// reached, attempts a fixed-form continuation and positions at_ at column 6
// of the continuation line.  Returns false when no blank was emitted.
bool Prescanner::PadOutCharacterLiteral(TokenSequence &tokens) {
  while (inFixedForm_ && !tabInCurrentLine_ && at_[1] == '\n') {
    if (column_ < fixedFormColumnLimit_) {
      tokens.PutNextTokenChar(' ', spaceProvenance_);
      ++column_;
      return true;
    }
    if (!FixedFormContinuation(false /*no need to insert space*/) ||
        tabInCurrentLine_) {
      return false;
    }
    CHECK(column_ == 7);
    --at_; // point to column 6 of continuation line
    column_ = 6;
  }
  return false;
}
755
IsFixedFormCommentLine(const char * start) const756 bool Prescanner::IsFixedFormCommentLine(const char *start) const {
757 const char *p{start};
758 if (IsFixedFormCommentChar(*p) || *p == '%' || // VAX %list, %eject, &c.
759 ((*p == 'D' || *p == 'd') &&
760 !features_.IsEnabled(LanguageFeature::OldDebugLines))) {
761 return true;
762 }
763 bool anyTabs{false};
764 while (true) {
765 if (*p == ' ') {
766 ++p;
767 } else if (*p == '\t') {
768 anyTabs = true;
769 ++p;
770 } else if (*p == '0' && !anyTabs && p == start + 5) {
771 ++p; // 0 in column 6 must treated as a space
772 } else {
773 break;
774 }
775 }
776 if (!anyTabs && p >= start + fixedFormColumnLimit_) {
777 return true;
778 }
779 if (*p == '!' && !inCharLiteral_ && (anyTabs || p != start + 5)) {
780 return true;
781 }
782 return *p == '\n';
783 }
784
IsFreeFormComment(const char * p) const785 const char *Prescanner::IsFreeFormComment(const char *p) const {
786 p = SkipWhiteSpaceAndCComments(p);
787 if (*p == '!' || *p == '\n') {
788 return p;
789 } else {
790 return nullptr;
791 }
792 }
793
IsIncludeLine(const char * start) const794 std::optional<std::size_t> Prescanner::IsIncludeLine(const char *start) const {
795 const char *p{SkipWhiteSpace(start)};
796 for (char ch : "include"s) {
797 if (ToLowerCaseLetter(*p++) != ch) {
798 return std::nullopt;
799 }
800 }
801 p = SkipWhiteSpace(p);
802 if (*p == '"' || *p == '\'') {
803 return {p - start};
804 }
805 return std::nullopt;
806 }
807
// Processes a Fortran INCLUDE line.  `firstQuote` points at (or before) the
// opening quote mark of the path.  The path is extracted, the file is
// opened via allSources_ (with the directory of the current source file, if
// any, supplied as a prepended path), and a nested Prescanner copied from
// this one prescans the file's contents.  Errors are reported for a
// malformed path and for a file that can't be opened; trailing text other
// than a '!' comment draws a warning.  An empty file is ignored.
void Prescanner::FortranInclude(const char *firstQuote) {
  const char *p{firstQuote};
  while (*p != '"' && *p != '\'') {
    ++p;
  }
  char quote{*p};
  std::string path;
  for (++p; *p != '\n'; ++p) {
    if (*p == quote) {
      if (p[1] != quote) {
        break; // closing quote
      }
      ++p; // a doubled quote mark stands for one quote in the path
    }
    path += *p;
  }
  if (*p != quote) {
    Say(GetProvenanceRange(firstQuote, p),
        "malformed path name string"_err_en_US);
    return;
  }
  p = SkipWhiteSpace(p + 1);
  if (*p != '\n' && *p != '!') {
    const char *garbage{p};
    for (; *p != '\n' && *p != '!'; ++p) {
    }
    Say(GetProvenanceRange(garbage, p),
        "excess characters after path name"_warn_en_US);
  }
  std::string buf;
  llvm::raw_string_ostream error{buf};
  Provenance provenance{GetProvenance(nextLine_)};
  std::optional<std::string> prependPath;
  if (const SourceFile * currentFile{allSources_.GetSourceFile(provenance)}) {
    prependPath = DirectoryName(currentFile->path());
  }
  const SourceFile *included{
      allSources_.Open(path, error, std::move(prependPath))};
  if (!included) {
    Say(provenance, "INCLUDE: %s"_err_en_US, error.str());
  } else if (included->bytes() > 0) {
    // The provenance of the included text records the INCLUDE line from
    // its start through the end of the path (and any trailing text).
    ProvenanceRange includeLineRange{
        provenance, static_cast<std::size_t>(p - nextLine_)};
    ProvenanceRange fileRange{
        allSources_.AddIncludedFile(*included, includeLineRange)};
    // The copy has a nesting depth one greater than this prescanner's.
    Prescanner{*this}.set_encoding(included->encoding()).Prescan(fileRange);
  }
}
856
IsPreprocessorDirectiveLine(const char * start) const857 const char *Prescanner::IsPreprocessorDirectiveLine(const char *start) const {
858 const char *p{start};
859 for (; *p == ' '; ++p) {
860 }
861 if (*p == '#') {
862 if (inFixedForm_ && p == start + 5) {
863 return nullptr;
864 }
865 } else {
866 p = SkipWhiteSpace(p);
867 if (*p != '#') {
868 return nullptr;
869 }
870 }
871 return SkipWhiteSpace(p + 1);
872 }
873
IsNextLinePreprocessorDirective() const874 bool Prescanner::IsNextLinePreprocessorDirective() const {
875 return IsPreprocessorDirectiveLine(nextLine_) != nullptr;
876 }
877
SkipCommentLine(bool afterAmpersand)878 bool Prescanner::SkipCommentLine(bool afterAmpersand) {
879 if (IsAtEnd()) {
880 if (afterAmpersand && prescannerNesting_ > 0) {
881 // A continuation marker at the end of the last line in an
882 // include file inhibits the newline for that line.
883 SkipToEndOfLine();
884 omitNewline_ = true;
885 }
886 return false;
887 }
888 auto lineClass{ClassifyLine(nextLine_)};
889 if (lineClass.kind == LineClassification::Kind::Comment) {
890 NextLine();
891 return true;
892 } else if (inPreprocessorDirective_) {
893 return false;
894 } else if (lineClass.kind ==
895 LineClassification::Kind::ConditionalCompilationDirective ||
896 lineClass.kind == LineClassification::Kind::PreprocessorDirective) {
897 // Allow conditional compilation directives (e.g., #ifdef) to affect
898 // continuation lines.
899 // Allow other preprocessor directives, too, except #include
900 // (when it does not follow '&'), #define, and #undef (because
901 // they cannot be allowed to affect preceding text on a
902 // continued line).
903 preprocessor_.Directive(TokenizePreprocessorDirective(), *this);
904 return true;
905 } else if (afterAmpersand &&
906 (lineClass.kind == LineClassification::Kind::IncludeDirective ||
907 lineClass.kind == LineClassification::Kind::IncludeLine)) {
908 SkipToEndOfLine();
909 omitNewline_ = true;
910 skipLeadingAmpersand_ = true;
911 return false;
912 } else {
913 return false;
914 }
915 }
916
// Determines whether the line at nextLine_ is a fixed-form continuation
// line.  If so, returns a pointer to the position in that line at which
// scanning is to resume; otherwise returns nullptr.  Clears
// tabInCurrentLine_ (unless at end of input) and may set it again, and may
// set insertASpace_ when continuing a compiler directive.
// `mightNeedSpace` is consulted only for compiler directive continuations.
const char *Prescanner::FixedFormContinuationLine(bool mightNeedSpace) {
  if (IsAtEnd()) {
    return nullptr;
  }
  tabInCurrentLine_ = false;
  char col1{*nextLine_};
  if (InCompilerDirective()) {
    // Must be a continued compiler directive.
    if (!IsFixedFormCommentChar(col1)) {
      return nullptr;
    }
    // Columns 2 through 5 must match the sentinel (ignoring letter case),
    // padded with blanks.
    int j{1};
    for (; j < 5; ++j) {
      char ch{directiveSentinel_[j - 1]};
      if (ch == '\0') {
        break;
      }
      if (ch != ToLowerCaseLetter(nextLine_[j])) {
        return nullptr;
      }
    }
    for (; j < 5; ++j) {
      if (nextLine_[j] != ' ') {
        return nullptr;
      }
    }
    // Column 6 must hold a continuation marker: anything other than a
    // newline, tab, blank, or '0'.
    char col6{nextLine_[5]};
    if (col6 != '\n' && col6 != '\t' && col6 != ' ' && col6 != '0') {
      if (nextLine_[6] != ' ' && mightNeedSpace) {
        insertASpace_ = true;
      }
      return nextLine_ + 6;
    }
    return nullptr;
  } else {
    // Normal case: not in a compiler directive.
    if (col1 == '&' &&
        features_.IsEnabled(
            LanguageFeature::FixedFormContinuationWithColumn1Ampersand)) {
      // Extension: '&' as continuation marker
      if (features_.ShouldWarn(
              LanguageFeature::FixedFormContinuationWithColumn1Ampersand)) {
        Say(GetProvenance(nextLine_), "nonstandard usage"_port_en_US);
      }
      return nextLine_ + 1;
    }
    if (col1 == '\t' && nextLine_[1] >= '1' && nextLine_[1] <= '9') {
      tabInCurrentLine_ = true;
      return nextLine_ + 2; // VAX extension
    }
    // Standard form: five blanks and then a continuation marker in column 6.
    if (col1 == ' ' && nextLine_[1] == ' ' && nextLine_[2] == ' ' &&
        nextLine_[3] == ' ' && nextLine_[4] == ' ') {
      char col6{nextLine_[5]};
      if (col6 != '\n' && col6 != '\t' && col6 != ' ' && col6 != '0') {
        return nextLine_ + 6;
      }
    }
    if (IsImplicitContinuation()) {
      return nextLine_;
    }
  }
  return nullptr; // not a continuation line
}
980
// Determines whether the line at nextLine_ continues the current free-form
// line.  `ampersand` is true when the current line ended with '&'.  Returns
// a pointer to the position in the next line at which scanning is to
// resume, or nullptr when the next line is not a continuation.  May set
// insertASpace_.
const char *Prescanner::FreeFormContinuationLine(bool ampersand) {
  const char *p{nextLine_};
  if (p >= limit_) {
    return nullptr;
  }
  p = SkipWhiteSpace(p);
  if (InCompilerDirective()) {
    // The continuation of a compiler directive must begin with '!' and the
    // same sentinel (ignoring letter case).
    if (*p++ != '!') {
      return nullptr;
    }
    for (const char *s{directiveSentinel_}; *s != '\0'; ++p, ++s) {
      if (*s != ToLowerCaseLetter(*p)) {
        return nullptr;
      }
    }
    p = SkipWhiteSpace(p);
    if (*p == '&') {
      if (!ampersand) {
        insertASpace_ = true;
      }
      return p + 1;
    } else if (ampersand) {
      return p;
    } else {
      return nullptr;
    }
  } else {
    if (*p == '&') {
      return p + 1; // resume after the leading '&'
    } else if (*p == '!' || *p == '\n' || *p == '#') {
      return nullptr;
    } else if (ampersand || IsImplicitContinuation()) {
      // No leading '&': resume one character before the first non-blank
      // when there is leading white space, else request an inserted space.
      if (p > nextLine_) {
        --p;
      } else {
        insertASpace_ = true;
      }
      return p;
    } else {
      return nullptr;
    }
  }
}
1024
FixedFormContinuation(bool mightNeedSpace)1025 bool Prescanner::FixedFormContinuation(bool mightNeedSpace) {
1026 // N.B. We accept '&' as a continuation indicator in fixed form, too,
1027 // but not in a character literal.
1028 if (*at_ == '&' && inCharLiteral_) {
1029 return false;
1030 }
1031 do {
1032 if (const char *cont{FixedFormContinuationLine(mightNeedSpace)}) {
1033 BeginSourceLine(cont);
1034 column_ = 7;
1035 NextLine();
1036 return true;
1037 }
1038 } while (SkipCommentLine(false /* not after ampersand */));
1039 return false;
1040 }
1041
FreeFormContinuation()1042 bool Prescanner::FreeFormContinuation() {
1043 const char *p{at_};
1044 bool ampersand{*p == '&'};
1045 if (ampersand) {
1046 p = SkipWhiteSpace(p + 1);
1047 }
1048 if (*p != '\n') {
1049 if (inCharLiteral_) {
1050 return false;
1051 } else if (*p != '!' &&
1052 features_.ShouldWarn(LanguageFeature::CruftAfterAmpersand)) {
1053 Say(GetProvenance(p), "missing ! before comment after &"_warn_en_US);
1054 }
1055 }
1056 do {
1057 if (const char *cont{FreeFormContinuationLine(ampersand)}) {
1058 BeginSourceLine(cont);
1059 NextLine();
1060 return true;
1061 }
1062 } while (SkipCommentLine(ampersand));
1063 return false;
1064 }
1065
1066 // Implicit line continuation allows a preprocessor macro call with
1067 // arguments to span multiple lines.
IsImplicitContinuation() const1068 bool Prescanner::IsImplicitContinuation() const {
1069 return !inPreprocessorDirective_ && !inCharLiteral_ &&
1070 delimiterNesting_ > 0 && !IsAtEnd() &&
1071 ClassifyLine(nextLine_).kind == LineClassification::Kind::Source;
1072 }
1073
Continuation(bool mightNeedFixedFormSpace)1074 bool Prescanner::Continuation(bool mightNeedFixedFormSpace) {
1075 if (*at_ == '\n' || *at_ == '&') {
1076 if (inFixedForm_) {
1077 return FixedFormContinuation(mightNeedFixedFormSpace);
1078 } else {
1079 return FreeFormContinuation();
1080 }
1081 } else {
1082 return false;
1083 }
1084 }
1085
1086 std::optional<Prescanner::LineClassification>
IsFixedFormCompilerDirectiveLine(const char * start) const1087 Prescanner::IsFixedFormCompilerDirectiveLine(const char *start) const {
1088 const char *p{start};
1089 char col1{*p++};
1090 if (!IsFixedFormCommentChar(col1)) {
1091 return std::nullopt;
1092 }
1093 char sentinel[5], *sp{sentinel};
1094 int column{2};
1095 for (; column < 6; ++column, ++p) {
1096 if (*p != ' ') {
1097 if (*p == '\n' || *p == '\t') {
1098 break;
1099 }
1100 if (sp == sentinel + 1 && sentinel[0] == '$' && IsDecimalDigit(*p)) {
1101 // OpenMP conditional compilation line: leave the label alone
1102 break;
1103 }
1104 *sp++ = ToLowerCaseLetter(*p);
1105 }
1106 }
1107 if (column == 6) {
1108 if (*p == ' ' || *p == '\t' || *p == '0') {
1109 ++p;
1110 } else {
1111 // This is a Continuation line, not an initial directive line.
1112 return std::nullopt;
1113 }
1114 }
1115 if (sp == sentinel) {
1116 return std::nullopt;
1117 }
1118 *sp = '\0';
1119 if (const char *ss{IsCompilerDirectiveSentinel(sentinel)}) {
1120 std::size_t payloadOffset = p - start;
1121 return {LineClassification{
1122 LineClassification::Kind::CompilerDirective, payloadOffset, ss}};
1123 }
1124 return std::nullopt;
1125 }
1126
1127 std::optional<Prescanner::LineClassification>
IsFreeFormCompilerDirectiveLine(const char * start) const1128 Prescanner::IsFreeFormCompilerDirectiveLine(const char *start) const {
1129 char sentinel[8];
1130 const char *p{SkipWhiteSpace(start)};
1131 if (*p++ != '!') {
1132 return std::nullopt;
1133 }
1134 for (std::size_t j{0}; j + 1 < sizeof sentinel; ++p, ++j) {
1135 if (*p == '\n') {
1136 break;
1137 }
1138 if (*p == ' ' || *p == '\t' || *p == '&') {
1139 if (j == 0) {
1140 break;
1141 }
1142 sentinel[j] = '\0';
1143 p = SkipWhiteSpace(p + 1);
1144 if (*p == '!') {
1145 break;
1146 }
1147 if (const char *sp{IsCompilerDirectiveSentinel(sentinel)}) {
1148 std::size_t offset = p - start;
1149 return {LineClassification{
1150 LineClassification::Kind::CompilerDirective, offset, sp}};
1151 }
1152 break;
1153 }
1154 sentinel[j] = ToLowerCaseLetter(*p);
1155 }
1156 return std::nullopt;
1157 }
1158
AddCompilerDirectiveSentinel(const std::string & dir)1159 Prescanner &Prescanner::AddCompilerDirectiveSentinel(const std::string &dir) {
1160 std::uint64_t packed{0};
1161 for (char ch : dir) {
1162 packed = (packed << 8) | (ToLowerCaseLetter(ch) & 0xff);
1163 }
1164 compilerDirectiveBloomFilter_.set(packed % prime1);
1165 compilerDirectiveBloomFilter_.set(packed % prime2);
1166 compilerDirectiveSentinels_.insert(dir);
1167 return *this;
1168 }
1169
IsCompilerDirectiveSentinel(const char * sentinel) const1170 const char *Prescanner::IsCompilerDirectiveSentinel(
1171 const char *sentinel) const {
1172 std::uint64_t packed{0};
1173 std::size_t n{0};
1174 for (; sentinel[n] != '\0'; ++n) {
1175 packed = (packed << 8) | (sentinel[n] & 0xff);
1176 }
1177 if (n == 0 || !compilerDirectiveBloomFilter_.test(packed % prime1) ||
1178 !compilerDirectiveBloomFilter_.test(packed % prime2)) {
1179 return nullptr;
1180 }
1181 const auto iter{compilerDirectiveSentinels_.find(std::string(sentinel, n))};
1182 return iter == compilerDirectiveSentinels_.end() ? nullptr : iter->c_str();
1183 }
1184
IsDirective(const char * match,const char * dir)1185 constexpr bool IsDirective(const char *match, const char *dir) {
1186 for (; *match; ++match) {
1187 if (*match != ToLowerCaseLetter(*dir++)) {
1188 return false;
1189 }
1190 }
1191 return true;
1192 }
1193
ClassifyLine(const char * start) const1194 Prescanner::LineClassification Prescanner::ClassifyLine(
1195 const char *start) const {
1196 if (inFixedForm_) {
1197 if (std::optional<LineClassification> lc{
1198 IsFixedFormCompilerDirectiveLine(start)}) {
1199 return std::move(*lc);
1200 }
1201 if (IsFixedFormCommentLine(start)) {
1202 return {LineClassification::Kind::Comment};
1203 }
1204 } else {
1205 if (std::optional<LineClassification> lc{
1206 IsFreeFormCompilerDirectiveLine(start)}) {
1207 return std::move(*lc);
1208 }
1209 if (const char *bang{IsFreeFormComment(start)}) {
1210 return {LineClassification::Kind::Comment,
1211 static_cast<std::size_t>(bang - start)};
1212 }
1213 }
1214 if (std::optional<std::size_t> quoteOffset{IsIncludeLine(start)}) {
1215 return {LineClassification::Kind::IncludeLine, *quoteOffset};
1216 }
1217 if (const char *dir{IsPreprocessorDirectiveLine(start)}) {
1218 if (IsDirective("if", dir) || IsDirective("elif", dir) ||
1219 IsDirective("else", dir) || IsDirective("endif", dir)) {
1220 return {LineClassification::Kind::ConditionalCompilationDirective};
1221 } else if (IsDirective("include", dir)) {
1222 return {LineClassification::Kind::IncludeDirective};
1223 } else if (IsDirective("define", dir) || IsDirective("undef", dir)) {
1224 return {LineClassification::Kind::DefinitionDirective};
1225 } else {
1226 return {LineClassification::Kind::PreprocessorDirective};
1227 }
1228 }
1229 return {LineClassification::Kind::Source};
1230 }
1231
SourceFormChange(std::string && dir)1232 void Prescanner::SourceFormChange(std::string &&dir) {
1233 if (dir == "!dir$ free") {
1234 inFixedForm_ = false;
1235 } else if (dir == "!dir$ fixed") {
1236 inFixedForm_ = true;
1237 }
1238 }
1239 } // namespace Fortran::parser
1240