1 //===-- lib/Parser/prescan.cpp --------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "prescan.h"
10 #include "preprocessor.h"
11 #include "token-sequence.h"
12 #include "flang/Common/idioms.h"
13 #include "flang/Parser/characters.h"
14 #include "flang/Parser/message.h"
15 #include "flang/Parser/source.h"
16 #include "llvm/Support/raw_ostream.h"
17 #include <cstddef>
18 #include <cstring>
19 #include <utility>
20 #include <vector>
21 
22 namespace Fortran::parser {
23 
24 using common::LanguageFeature;
25 
// Upper bound on prescannerNesting_: Prescan() reports "too many nested
// INCLUDE/#include files" and stops once the nesting depth exceeds it.
static constexpr int maxPrescannerNesting{100};
27 
// Constructs a prescanner that reports into `messages`, emits into `cooked`,
// and cooperates with `preprocessor`.  The source collection is taken from
// preprocessor_.allSources(), and the initial encoding from that collection.
Prescanner::Prescanner(Messages &messages, CookedSource &cooked,
    Preprocessor &preprocessor, common::LanguageFeatureControl lfc)
    : messages_{messages}, cooked_{cooked}, preprocessor_{preprocessor},
      allSources_{preprocessor_.allSources()}, features_{lfc},
      encoding_{allSources_.encoding()} {}
33 
// Constructs a nested prescanner from `that` (used by FortranInclude() to
// scan an included file).  The configuration members named below are copied;
// the nesting depth is one greater than that of `that`.  Members that are
// not named in this initializer list are not copied here.
Prescanner::Prescanner(const Prescanner &that)
    : messages_{that.messages_}, cooked_{that.cooked_},
      preprocessor_{that.preprocessor_}, allSources_{that.allSources_},
      features_{that.features_}, inFixedForm_{that.inFixedForm_},
      fixedFormColumnLimit_{that.fixedFormColumnLimit_},
      encoding_{that.encoding_}, prescannerNesting_{that.prescannerNesting_ +
                                     1},
      skipLeadingAmpersand_{that.skipLeadingAmpersand_},
      compilerDirectiveBloomFilter_{that.compilerDirectiveBloomFilter_},
      compilerDirectiveSentinels_{that.compilerDirectiveSentinels_} {}
44 
// True when `ch` is one of the characters ('!', '*', 'C', or 'c') that this
// prescanner accepts as a fixed-form comment marker.
static inline constexpr bool IsFixedFormCommentChar(char ch) {
  switch (ch) {
  case '!':
  case '*':
  case 'C':
  case 'c':
    return true;
  default:
    return false;
  }
}
48 
NormalizeCompilerDirectiveCommentMarker(TokenSequence & dir)49 static void NormalizeCompilerDirectiveCommentMarker(TokenSequence &dir) {
50   char *p{dir.GetMutableCharData()};
51   char *limit{p + dir.SizeInChars()};
52   for (; p < limit; ++p) {
53     if (*p != ' ') {
54       CHECK(IsFixedFormCommentChar(*p));
55       *p = '!';
56       return;
57     }
58   }
59   DIE("compiler directive all blank");
60 }
61 
Prescan(ProvenanceRange range)62 void Prescanner::Prescan(ProvenanceRange range) {
63   startProvenance_ = range.start();
64   start_ = allSources_.GetSource(range);
65   CHECK(start_);
66   limit_ = start_ + range.size();
67   nextLine_ = start_;
68   const bool beganInFixedForm{inFixedForm_};
69   if (prescannerNesting_ > maxPrescannerNesting) {
70     Say(GetProvenance(start_),
71         "too many nested INCLUDE/#include files, possibly circular"_err_en_US);
72     return;
73   }
74   while (!IsAtEnd()) {
75     Statement();
76   }
77   if (inFixedForm_ != beganInFixedForm) {
78     std::string dir{"!dir$ "};
79     if (beganInFixedForm) {
80       dir += "fixed";
81     } else {
82       dir += "free";
83     }
84     dir += '\n';
85     TokenSequence tokens{dir, allSources_.AddCompilerInsertion(dir).start()};
86     tokens.Emit(cooked_);
87   }
88 }
89 
// Processes one statement beginning at nextLine_.  The line is classified
// first: comment lines are skipped, INCLUDE lines and preprocessor
// directives are handed off and the function returns.  Compiler directives
// and ordinary source lines are tokenized, passed through macro replacement,
// checked, and emitted to cooked_, followed by a newline unless omitNewline_
// has been set during scanning.
void Prescanner::Statement() {
  TokenSequence tokens;
  LineClassification line{ClassifyLine(nextLine_)};
  switch (line.kind) {
  case LineClassification::Kind::Comment:
    nextLine_ += line.payloadOffset; // advance to '!' or newline
    NextLine();
    return;
  case LineClassification::Kind::IncludeLine:
    FortranInclude(nextLine_ + line.payloadOffset);
    NextLine();
    return;
  case LineClassification::Kind::ConditionalCompilationDirective:
  case LineClassification::Kind::IncludeDirective:
  case LineClassification::Kind::DefinitionDirective:
  case LineClassification::Kind::PreprocessorDirective:
    preprocessor_.Directive(TokenizePreprocessorDirective(), *this);
    return;
  case LineClassification::Kind::CompilerDirective:
    // Record the sentinel for the duration of this statement; it is reset
    // to nullptr at the end of this function.
    directiveSentinel_ = line.sentinel;
    CHECK(InCompilerDirective());
    BeginStatementAndAdvance();
    if (inFixedForm_) {
      CHECK(IsFixedFormCommentChar(*at_));
    } else {
      while (*at_ == ' ' || *at_ == '\t') {
        ++at_, ++column_;
      }
      CHECK(*at_ == '!');
    }
    if (directiveSentinel_[0] == '$' && directiveSentinel_[1] == '\0') {
      // OpenMP conditional compilation line.  Remove the sentinel and then
      // treat the line as if it were normal source.
      at_ += 2, column_ += 2;
      if (inFixedForm_) {
        LabelField(tokens);
      } else {
        SkipSpaces();
      }
    } else {
      // Compiler directive.  Emit normalized sentinel.
      EmitChar(tokens, '!');
      ++at_, ++column_;
      // Emit the sentinel's characters from directiveSentinel_ rather than
      // from the source text, advancing past the source's copy in step.
      for (const char *sp{directiveSentinel_}; *sp != '\0';
           ++sp, ++at_, ++column_) {
        EmitChar(tokens, *sp);
      }
      if (*at_ == ' ') {
        EmitChar(tokens, ' ');
        ++at_, ++column_;
      }
      tokens.CloseToken();
    }
    break;
  case LineClassification::Kind::Source:
    BeginStatementAndAdvance();
    if (inFixedForm_) {
      // With OldDebugLines enabled, a leading 'D'/'d' is skipped so that
      // the rest of the line is scanned as ordinary source.
      if (features_.IsEnabled(LanguageFeature::OldDebugLines) &&
          (*at_ == 'D' || *at_ == 'd')) {
        NextChar();
      }
      LabelField(tokens);
    } else if (skipLeadingAmpersand_) {
      // One-shot flag (set in SkipCommentLine()): skip a leading '&' on
      // this free-form line if there is one.
      skipLeadingAmpersand_ = false;
      const char *p{SkipWhiteSpace(at_)};
      if (p < limit_ && *p == '&') {
        column_ += ++p - at_;
        at_ = p;
      }
    } else {
      SkipSpaces();
    }
    break;
  }

  // Tokenize the remainder of the statement.
  while (NextToken(tokens)) {
  }

  Provenance newlineProvenance{GetCurrentProvenance()};
  if (std::optional<TokenSequence> preprocessed{
          preprocessor_.MacroReplacement(tokens, *this)}) {
    // Reprocess the preprocessed line.  Append a newline temporarily.
    preprocessed->PutNextTokenChar('\n', newlineProvenance);
    preprocessed->CloseToken();
    const char *ppd{preprocessed->ToCharBlock().begin()};
    LineClassification ppl{ClassifyLine(ppd)};
    preprocessed->pop_back(); // remove the newline
    switch (ppl.kind) {
    case LineClassification::Kind::Comment:
      break;
    case LineClassification::Kind::IncludeLine:
      FortranInclude(ppd + ppl.payloadOffset);
      break;
    case LineClassification::Kind::ConditionalCompilationDirective:
    case LineClassification::Kind::IncludeDirective:
    case LineClassification::Kind::DefinitionDirective:
    case LineClassification::Kind::PreprocessorDirective:
      // The result of macro replacement is not executed as a directive; it
      // is warned about and emitted like other text.
      Say(preprocessed->GetProvenanceRange(),
          "Preprocessed line resembles a preprocessor directive"_warn_en_US);
      preprocessed->ToLowerCase()
          .CheckBadFortranCharacters(messages_)
          .CheckBadParentheses(messages_)
          .Emit(cooked_);
      break;
    case LineClassification::Kind::CompilerDirective:
      if (preprocessed->HasRedundantBlanks()) {
        preprocessed->RemoveRedundantBlanks();
      }
      NormalizeCompilerDirectiveCommentMarker(*preprocessed);
      preprocessed->ToLowerCase();
      SourceFormChange(preprocessed->ToString());
      preprocessed->ClipComment(true /* skip first ! */)
          .CheckBadFortranCharacters(messages_)
          .CheckBadParentheses(messages_)
          .Emit(cooked_);
      break;
    case LineClassification::Kind::Source:
      if (inFixedForm_) {
        if (preprocessed->HasBlanks(/*after column*/ 6)) {
          preprocessed->RemoveBlanks(/*after column*/ 6);
        }
      } else {
        if (preprocessed->HasRedundantBlanks()) {
          preprocessed->RemoveRedundantBlanks();
        }
      }
      preprocessed->ToLowerCase()
          .ClipComment()
          .CheckBadFortranCharacters(messages_)
          .CheckBadParentheses(messages_)
          .Emit(cooked_);
      break;
    }
  } else {
    // No macro replacement took place; emit the original tokens.
    tokens.ToLowerCase();
    if (line.kind == LineClassification::Kind::CompilerDirective) {
      SourceFormChange(tokens.ToString());
    }
    if (inFixedForm_ && line.kind == LineClassification::Kind::Source) {
      EnforceStupidEndStatementRules(tokens);
    }
    tokens.CheckBadFortranCharacters(messages_)
        .CheckBadParentheses(messages_)
        .Emit(cooked_);
  }
  // omitNewline_ suppresses the newline for this one statement only.
  if (omitNewline_) {
    omitNewline_ = false;
  } else {
    cooked_.Put('\n', newlineProvenance);
  }
  directiveSentinel_ = nullptr;
}
242 
TokenizePreprocessorDirective()243 TokenSequence Prescanner::TokenizePreprocessorDirective() {
244   CHECK(!IsAtEnd() && !inPreprocessorDirective_);
245   inPreprocessorDirective_ = true;
246   BeginStatementAndAdvance();
247   TokenSequence tokens;
248   while (NextToken(tokens)) {
249   }
250   inPreprocessorDirective_ = false;
251   return tokens;
252 }
253 
NextLine()254 void Prescanner::NextLine() {
255   void *vstart{static_cast<void *>(const_cast<char *>(nextLine_))};
256   void *v{std::memchr(vstart, '\n', limit_ - nextLine_)};
257   if (!v) {
258     nextLine_ = limit_;
259   } else {
260     const char *nl{const_cast<const char *>(static_cast<char *>(v))};
261     nextLine_ = nl + 1;
262   }
263 }
264 
LabelField(TokenSequence & token)265 void Prescanner::LabelField(TokenSequence &token) {
266   const char *bad{nullptr};
267   int outCol{1};
268   const char *start{at_};
269   for (; *at_ != '\n' && column_ <= 6; ++at_) {
270     if (*at_ == '\t') {
271       ++at_;
272       column_ = 7;
273       break;
274     }
275     if (*at_ != ' ' &&
276         !(*at_ == '0' && column_ == 6)) { // '0' in column 6 becomes space
277       EmitChar(token, *at_);
278       ++outCol;
279       if (!bad && !IsDecimalDigit(*at_)) {
280         bad = at_;
281       }
282     }
283     ++column_;
284   }
285   if (bad && !preprocessor_.IsNameDefined(token.CurrentOpenToken())) {
286     Say(GetProvenance(bad),
287         "Character in fixed-form label field must be a digit"_warn_en_US);
288     token.clear();
289     at_ = start;
290     return;
291   }
292   if (outCol == 1) { // empty label field
293     // Emit a space so that, if the line is rescanned after preprocessing,
294     // a leading 'C' or 'D' won't be left-justified and then accidentally
295     // misinterpreted as a comment card.
296     EmitChar(token, ' ');
297     ++outCol;
298   }
299   token.CloseToken();
300   SkipToNextSignificantCharacter();
301   if (IsDecimalDigit(*at_)) {
302     Say(GetProvenance(at_),
303         "Label digit is not in fixed-form label field"_port_en_US);
304   }
305 }
306 
307 // 6.3.3.5: A program unit END statement, or any other statement whose
308 // initial line resembles an END statement, shall not be continued in
309 // fixed form source.
EnforceStupidEndStatementRules(const TokenSequence & tokens)310 void Prescanner::EnforceStupidEndStatementRules(const TokenSequence &tokens) {
311   CharBlock cBlock{tokens.ToCharBlock()};
312   const char *str{cBlock.begin()};
313   std::size_t n{cBlock.size()};
314   if (n < 3) {
315     return;
316   }
317   std::size_t j{0};
318   for (; j < n && (str[j] == ' ' || (str[j] >= '0' && str[j] <= '9')); ++j) {
319   }
320   if (j + 3 > n || std::memcmp(str + j, "end", 3) != 0) {
321     return;
322   }
323   // It starts with END, possibly after a label.
324   auto start{allSources_.GetSourcePosition(tokens.GetCharProvenance(j))};
325   auto end{allSources_.GetSourcePosition(tokens.GetCharProvenance(n - 1))};
326   if (!start || !end) {
327     return;
328   }
329   if (&start->file == &end->file && start->line == end->line) {
330     return; // no continuation
331   }
332   j += 3;
333   static const char *const prefixes[]{"program", "subroutine", "function",
334       "blockdata", "module", "submodule", nullptr};
335   bool isPrefix{j == n || !IsLegalInIdentifier(str[j])}; // prefix is END
336   std::size_t endOfPrefix{j - 1};
337   for (const char *const *p{prefixes}; *p; ++p) {
338     std::size_t pLen{std::strlen(*p)};
339     if (j + pLen <= n && std::memcmp(str + j, *p, pLen) == 0) {
340       isPrefix = true; // END thing as prefix
341       j += pLen;
342       endOfPrefix = j - 1;
343       for (; j < n && IsLegalInIdentifier(str[j]); ++j) {
344       }
345       break;
346     }
347   }
348   if (isPrefix) {
349     auto range{tokens.GetTokenProvenanceRange(1)};
350     if (j == n) { // END or END thing [name]
351       Say(range,
352           "Program unit END statement may not be continued in fixed form source"_err_en_US);
353     } else {
354       auto endOfPrefixPos{
355           allSources_.GetSourcePosition(tokens.GetCharProvenance(endOfPrefix))};
356       auto next{allSources_.GetSourcePosition(tokens.GetCharProvenance(j))};
357       if (endOfPrefixPos && next && &endOfPrefixPos->file == &start->file &&
358           endOfPrefixPos->line == start->line &&
359           (&next->file != &start->file || next->line != start->line)) {
360         Say(range,
361             "Initial line of continued statement must not appear to be a program unit END in fixed form source"_err_en_US);
362       }
363     }
364   }
365 }
366 
SkipToEndOfLine()367 void Prescanner::SkipToEndOfLine() {
368   while (*at_ != '\n') {
369     ++at_, ++column_;
370   }
371 }
372 
MustSkipToEndOfLine() const373 bool Prescanner::MustSkipToEndOfLine() const {
374   if (inFixedForm_ && column_ > fixedFormColumnLimit_ && !tabInCurrentLine_) {
375     return true; // skip over ignored columns in right margin (73:80)
376   } else if (*at_ == '!' && !inCharLiteral_) {
377     return true; // inline comment goes to end of source line
378   } else {
379     return false;
380   }
381 }
382 
NextChar()383 void Prescanner::NextChar() {
384   CHECK(*at_ != '\n');
385   ++at_, ++column_;
386   while (at_[0] == '\xef' && at_[1] == '\xbb' && at_[2] == '\xbf') {
387     // UTF-8 byte order mark - treat this file as UTF-8
388     at_ += 3;
389     encoding_ = Encoding::UTF_8;
390   }
391   SkipToNextSignificantCharacter();
392 }
393 
394 // Skip everything that should be ignored until the next significant
395 // character is reached; handles C-style comments in preprocessing
396 // directives, Fortran ! comments, stuff after the right margin in
397 // fixed form, and all forms of line continuation.
SkipToNextSignificantCharacter()398 void Prescanner::SkipToNextSignificantCharacter() {
399   if (inPreprocessorDirective_) {
400     SkipCComments();
401   } else {
402     bool mightNeedSpace{false};
403     if (MustSkipToEndOfLine()) {
404       SkipToEndOfLine();
405     } else {
406       mightNeedSpace = *at_ == '\n';
407     }
408     for (; Continuation(mightNeedSpace); mightNeedSpace = false) {
409       if (MustSkipToEndOfLine()) {
410         SkipToEndOfLine();
411       }
412     }
413     if (*at_ == '\t') {
414       tabInCurrentLine_ = true;
415     }
416   }
417 }
418 
SkipCComments()419 void Prescanner::SkipCComments() {
420   while (true) {
421     if (IsCComment(at_)) {
422       if (const char *after{SkipCComment(at_)}) {
423         column_ += after - at_;
424         // May have skipped over one or more newlines; relocate the start of
425         // the next line.
426         nextLine_ = at_ = after;
427         NextLine();
428       } else {
429         // Don't emit any messages about unclosed C-style comments, because
430         // the sequence /* can appear legally in a FORMAT statement.  There's
431         // no ambiguity, since the sequence */ cannot appear legally.
432         break;
433       }
434     } else if (inPreprocessorDirective_ && at_[0] == '\\' && at_ + 2 < limit_ &&
435         at_[1] == '\n' && !IsAtEnd()) {
436       BeginSourceLineAndAdvance();
437     } else {
438       break;
439     }
440   }
441 }
442 
SkipSpaces()443 void Prescanner::SkipSpaces() {
444   while (*at_ == ' ' || *at_ == '\t') {
445     NextChar();
446   }
447   insertASpace_ = false;
448 }
449 
SkipWhiteSpace(const char * p)450 const char *Prescanner::SkipWhiteSpace(const char *p) {
451   while (*p == ' ' || *p == '\t') {
452     ++p;
453   }
454   return p;
455 }
456 
SkipWhiteSpaceAndCComments(const char * p) const457 const char *Prescanner::SkipWhiteSpaceAndCComments(const char *p) const {
458   while (true) {
459     if (*p == ' ' || *p == '\t') {
460       ++p;
461     } else if (IsCComment(p)) {
462       if (const char *after{SkipCComment(p)}) {
463         p = after;
464       } else {
465         break;
466       }
467     } else {
468       break;
469     }
470   }
471   return p;
472 }
473 
SkipCComment(const char * p) const474 const char *Prescanner::SkipCComment(const char *p) const {
475   char star{' '}, slash{' '};
476   p += 2;
477   while (star != '*' || slash != '/') {
478     if (p >= limit_) {
479       return nullptr; // signifies an unterminated comment
480     }
481     star = slash;
482     slash = *p++;
483   }
484   return p;
485 }
486 
// Scans one token at at_ and appends it to `tokens`.  Returns false, with
// no token closed, once the scan reaches a newline; returns true after a
// token (possibly a single compressed blank in free form) has been closed.
// Handles white space, character and Hollerith literals, numeric literals,
// identifiers, and one- and two-character operators.
bool Prescanner::NextToken(TokenSequence &tokens) {
  CHECK(at_ >= start_ && at_ < limit_);
  if (InFixedFormSource()) {
    SkipSpaces();
  } else {
    if (*at_ == '/' && IsCComment(at_)) {
      // Recognize and skip over classic C style /*comments*/ when
      // outside a character literal.
      if (features_.ShouldWarn(LanguageFeature::ClassicCComments)) {
        Say(GetProvenance(at_),
            "nonstandard usage: C-style comment"_port_en_US);
      }
      SkipCComments();
    }
    if (*at_ == ' ' || *at_ == '\t') {
      // Compress free-form white space into a single space character.
      const auto theSpace{at_};
      char previous{at_ <= start_ ? ' ' : at_[-1]};
      NextChar();
      SkipSpaces();
      if (*at_ == '\n') {
        // Discard white space at the end of a line.
      } else if (!inPreprocessorDirective_ &&
          (previous == '(' || *at_ == '(' || *at_ == ')')) {
        // Discard white space before/after '(' and before ')', unless in a
        // preprocessor directive.  This helps yield space-free contiguous
        // names for generic interfaces like OPERATOR( + ) and
        // READ ( UNFORMATTED ), without misinterpreting #define f (notAnArg).
        // This has the effect of silently ignoring the illegal spaces in
        // the array constructor ( /1,2/ ) but that seems benign; it's
        // hard to avoid that while still removing spaces from OPERATOR( / )
        // and OPERATOR( // ).
      } else {
        // Preserve the squashed white space as a single space character.
        tokens.PutNextTokenChar(' ', GetProvenance(theSpace));
        tokens.CloseToken();
        return true;
      }
    }
  }
  // A space requested earlier (insertASpace_) is placed at the start of the
  // token that is about to be scanned.
  if (insertASpace_) {
    tokens.PutNextTokenChar(' ', spaceProvenance_);
    insertASpace_ = false;
  }
  if (*at_ == '\n') {
    return false;
  }
  const char *start{at_};
  if (*at_ == '\'' || *at_ == '"') {
    QuotedCharacterLiteral(tokens, start);
    preventHollerith_ = false;
  } else if (IsDecimalDigit(*at_)) {
    // Digit string: may turn out to be a Hollerith count, a real literal,
    // a hexadecimal constant in a directive, or a kind prefix of a
    // character literal.
    int n{0}, digits{0};
    static constexpr int maxHollerith{256 /*lines*/ * (132 - 6 /*columns*/)};
    do {
      // Stop accumulating once n reaches maxHollerith; larger values are
      // never treated as Hollerith counts below.
      if (n < maxHollerith) {
        n = 10 * n + DecimalDigitValue(*at_);
      }
      EmitCharAndAdvance(tokens, *at_);
      ++digits;
      if (InFixedFormSource()) {
        SkipSpaces();
      }
    } while (IsDecimalDigit(*at_));
    if ((*at_ == 'h' || *at_ == 'H') && n > 0 && n < maxHollerith &&
        !preventHollerith_) {
      Hollerith(tokens, n, start);
    } else if (*at_ == '.') {
      while (IsDecimalDigit(EmitCharAndAdvance(tokens, *at_))) {
      }
      ExponentAndKind(tokens);
    } else if (ExponentAndKind(tokens)) {
    } else if (digits == 1 && n == 0 && (*at_ == 'x' || *at_ == 'X') &&
        inPreprocessorDirective_) {
      // "0x..." / "0X..." in a preprocessor directive
      do {
        EmitCharAndAdvance(tokens, *at_);
      } while (IsHexadecimalDigit(*at_));
    } else if (IsLetter(*at_)) {
      // Handles FORMAT(3I9HHOLLERITH) by skipping over the first I so that
      // we don't misrecognize I9HOLLERITH as an identifier in the next case.
      EmitCharAndAdvance(tokens, *at_);
    } else if (at_[0] == '_' && (at_[1] == '\'' || at_[1] == '"')) { // 4_"..."
      EmitCharAndAdvance(tokens, *at_);
      QuotedCharacterLiteral(tokens, start);
    }
    preventHollerith_ = false;
  } else if (*at_ == '.') {
    char nch{EmitCharAndAdvance(tokens, '.')};
    if (!inPreprocessorDirective_ && IsDecimalDigit(nch)) {
      while (IsDecimalDigit(EmitCharAndAdvance(tokens, *at_))) {
      }
      ExponentAndKind(tokens);
    } else if (nch == '.' && EmitCharAndAdvance(tokens, '.') == '.') {
      EmitCharAndAdvance(tokens, '.'); // variadic macro definition ellipsis
    }
    preventHollerith_ = false;
  } else if (IsLegalInIdentifier(*at_)) {
    do {
    } while (IsLegalInIdentifier(EmitCharAndAdvance(tokens, *at_)));
    if ((*at_ == '\'' || *at_ == '"') &&
        tokens.CharAt(tokens.SizeInChars() - 1) == '_') { // kind_"..."
      QuotedCharacterLiteral(tokens, start);
    }
    preventHollerith_ = false;
  } else if (*at_ == '*') {
    if (EmitCharAndAdvance(tokens, '*') == '*') {
      EmitCharAndAdvance(tokens, '*');
    } else {
      // Subtle ambiguity:
      //  CHARACTER*2H     declares H because *2 is a kind specifier
      //  DATAC/N*2H  /    is repeated Hollerith
      preventHollerith_ = !slashInCurrentStatement_;
    }
  } else {
    // Any other character: track bracket nesting and combine recognized
    // two-character operators into a single token.
    char ch{*at_};
    if (ch == '(' || ch == '[') {
      ++delimiterNesting_;
    } else if ((ch == ')' || ch == ']') && delimiterNesting_ > 0) {
      --delimiterNesting_;
    }
    char nch{EmitCharAndAdvance(tokens, ch)};
    preventHollerith_ = false;
    if ((nch == '=' &&
            (ch == '<' || ch == '>' || ch == '/' || ch == '=' || ch == '!')) ||
        (ch == nch &&
            (ch == '/' || ch == ':' || ch == '*' || ch == '#' || ch == '&' ||
                ch == '|' || ch == '<' || ch == '>')) ||
        (ch == '=' && nch == '>')) {
      // token comprises two characters
      EmitCharAndAdvance(tokens, nch);
    } else if (ch == '/') {
      slashInCurrentStatement_ = true;
    }
  }
  tokens.CloseToken();
  return true;
}
624 
ExponentAndKind(TokenSequence & tokens)625 bool Prescanner::ExponentAndKind(TokenSequence &tokens) {
626   char ed{ToLowerCaseLetter(*at_)};
627   if (ed != 'e' && ed != 'd') {
628     return false;
629   }
630   EmitCharAndAdvance(tokens, ed);
631   if (*at_ == '+' || *at_ == '-') {
632     EmitCharAndAdvance(tokens, *at_);
633   }
634   while (IsDecimalDigit(*at_)) {
635     EmitCharAndAdvance(tokens, *at_);
636   }
637   if (*at_ == '_') {
638     while (IsLegalInIdentifier(EmitCharAndAdvance(tokens, *at_))) {
639     }
640   }
641   return true;
642 }
643 
QuotedCharacterLiteral(TokenSequence & tokens,const char * start)644 void Prescanner::QuotedCharacterLiteral(
645     TokenSequence &tokens, const char *start) {
646   char quote{*at_};
647   const char *end{at_ + 1};
648   inCharLiteral_ = true;
649   const auto emit{[&](char ch) { EmitChar(tokens, ch); }};
650   const auto insert{[&](char ch) { EmitInsertedChar(tokens, ch); }};
651   bool isEscaped{false};
652   bool escapesEnabled{features_.IsEnabled(LanguageFeature::BackslashEscapes)};
653   while (true) {
654     if (*at_ == '\\') {
655       if (escapesEnabled) {
656         isEscaped = !isEscaped;
657       } else {
658         // The parser always processes escape sequences, so don't confuse it
659         // when escapes are disabled.
660         insert('\\');
661       }
662     } else {
663       isEscaped = false;
664     }
665     EmitQuotedChar(static_cast<unsigned char>(*at_), emit, insert, false,
666         Encoding::LATIN_1);
667     while (PadOutCharacterLiteral(tokens)) {
668     }
669     if (*at_ == '\n') {
670       if (!inPreprocessorDirective_) {
671         Say(GetProvenanceRange(start, end),
672             "Incomplete character literal"_err_en_US);
673       }
674       break;
675     }
676     end = at_ + 1;
677     NextChar();
678     if (*at_ == quote && !isEscaped) {
679       // A doubled unescaped quote mark becomes a single instance of that
680       // quote character in the literal (later).  There can be spaces between
681       // the quotes in fixed form source.
682       EmitChar(tokens, quote);
683       inCharLiteral_ = false; // for cases like print *, '...'!comment
684       NextChar();
685       if (InFixedFormSource()) {
686         SkipSpaces();
687       }
688       if (*at_ != quote) {
689         break;
690       }
691       inCharLiteral_ = true;
692     }
693   }
694   inCharLiteral_ = false;
695 }
696 
Hollerith(TokenSequence & tokens,int count,const char * start)697 void Prescanner::Hollerith(
698     TokenSequence &tokens, int count, const char *start) {
699   inCharLiteral_ = true;
700   CHECK(*at_ == 'h' || *at_ == 'H');
701   EmitChar(tokens, 'H');
702   while (count-- > 0) {
703     if (PadOutCharacterLiteral(tokens)) {
704     } else if (*at_ == '\n') {
705       Say(GetProvenanceRange(start, at_),
706           "Possible truncated Hollerith literal"_warn_en_US);
707       break;
708     } else {
709       NextChar();
710       // Each multi-byte character encoding counts as a single character.
711       // No escape sequences are recognized.
712       // Hollerith is always emitted to the cooked character
713       // stream in UTF-8.
714       DecodedCharacter decoded{DecodeCharacter(
715           encoding_, at_, static_cast<std::size_t>(limit_ - at_), false)};
716       if (decoded.bytes > 0) {
717         EncodedCharacter utf8{
718             EncodeCharacter<Encoding::UTF_8>(decoded.codepoint)};
719         for (int j{0}; j < utf8.bytes; ++j) {
720           EmitChar(tokens, utf8.buffer[j]);
721         }
722         at_ += decoded.bytes - 1;
723       } else {
724         Say(GetProvenanceRange(start, at_),
725             "Bad character in Hollerith literal"_err_en_US);
726         break;
727       }
728     }
729   }
730   if (*at_ != '\n') {
731     NextChar();
732   }
733   inCharLiteral_ = false;
734 }
735 
736 // In fixed form, source card images must be processed as if they were at
737 // least 72 columns wide, at least in character literal contexts.
PadOutCharacterLiteral(TokenSequence & tokens)738 bool Prescanner::PadOutCharacterLiteral(TokenSequence &tokens) {
739   while (inFixedForm_ && !tabInCurrentLine_ && at_[1] == '\n') {
740     if (column_ < fixedFormColumnLimit_) {
741       tokens.PutNextTokenChar(' ', spaceProvenance_);
742       ++column_;
743       return true;
744     }
745     if (!FixedFormContinuation(false /*no need to insert space*/) ||
746         tabInCurrentLine_) {
747       return false;
748     }
749     CHECK(column_ == 7);
750     --at_; // point to column 6 of continuation line
751     column_ = 6;
752   }
753   return false;
754 }
755 
IsFixedFormCommentLine(const char * start) const756 bool Prescanner::IsFixedFormCommentLine(const char *start) const {
757   const char *p{start};
758   if (IsFixedFormCommentChar(*p) || *p == '%' || // VAX %list, %eject, &c.
759       ((*p == 'D' || *p == 'd') &&
760           !features_.IsEnabled(LanguageFeature::OldDebugLines))) {
761     return true;
762   }
763   bool anyTabs{false};
764   while (true) {
765     if (*p == ' ') {
766       ++p;
767     } else if (*p == '\t') {
768       anyTabs = true;
769       ++p;
770     } else if (*p == '0' && !anyTabs && p == start + 5) {
771       ++p; // 0 in column 6 must treated as a space
772     } else {
773       break;
774     }
775   }
776   if (!anyTabs && p >= start + fixedFormColumnLimit_) {
777     return true;
778   }
779   if (*p == '!' && !inCharLiteral_ && (anyTabs || p != start + 5)) {
780     return true;
781   }
782   return *p == '\n';
783 }
784 
IsFreeFormComment(const char * p) const785 const char *Prescanner::IsFreeFormComment(const char *p) const {
786   p = SkipWhiteSpaceAndCComments(p);
787   if (*p == '!' || *p == '\n') {
788     return p;
789   } else {
790     return nullptr;
791   }
792 }
793 
IsIncludeLine(const char * start) const794 std::optional<std::size_t> Prescanner::IsIncludeLine(const char *start) const {
795   const char *p{SkipWhiteSpace(start)};
796   for (char ch : "include"s) {
797     if (ToLowerCaseLetter(*p++) != ch) {
798       return std::nullopt;
799     }
800   }
801   p = SkipWhiteSpace(p);
802   if (*p == '"' || *p == '\'') {
803     return {p - start};
804   }
805   return std::nullopt;
806 }
807 
FortranInclude(const char * firstQuote)808 void Prescanner::FortranInclude(const char *firstQuote) {
809   const char *p{firstQuote};
810   while (*p != '"' && *p != '\'') {
811     ++p;
812   }
813   char quote{*p};
814   std::string path;
815   for (++p; *p != '\n'; ++p) {
816     if (*p == quote) {
817       if (p[1] != quote) {
818         break;
819       }
820       ++p;
821     }
822     path += *p;
823   }
824   if (*p != quote) {
825     Say(GetProvenanceRange(firstQuote, p),
826         "malformed path name string"_err_en_US);
827     return;
828   }
829   p = SkipWhiteSpace(p + 1);
830   if (*p != '\n' && *p != '!') {
831     const char *garbage{p};
832     for (; *p != '\n' && *p != '!'; ++p) {
833     }
834     Say(GetProvenanceRange(garbage, p),
835         "excess characters after path name"_warn_en_US);
836   }
837   std::string buf;
838   llvm::raw_string_ostream error{buf};
839   Provenance provenance{GetProvenance(nextLine_)};
840   std::optional<std::string> prependPath;
841   if (const SourceFile * currentFile{allSources_.GetSourceFile(provenance)}) {
842     prependPath = DirectoryName(currentFile->path());
843   }
844   const SourceFile *included{
845       allSources_.Open(path, error, std::move(prependPath))};
846   if (!included) {
847     Say(provenance, "INCLUDE: %s"_err_en_US, error.str());
848   } else if (included->bytes() > 0) {
849     ProvenanceRange includeLineRange{
850         provenance, static_cast<std::size_t>(p - nextLine_)};
851     ProvenanceRange fileRange{
852         allSources_.AddIncludedFile(*included, includeLineRange)};
853     Prescanner{*this}.set_encoding(included->encoding()).Prescan(fileRange);
854   }
855 }
856 
IsPreprocessorDirectiveLine(const char * start) const857 const char *Prescanner::IsPreprocessorDirectiveLine(const char *start) const {
858   const char *p{start};
859   for (; *p == ' '; ++p) {
860   }
861   if (*p == '#') {
862     if (inFixedForm_ && p == start + 5) {
863       return nullptr;
864     }
865   } else {
866     p = SkipWhiteSpace(p);
867     if (*p != '#') {
868       return nullptr;
869     }
870   }
871   return SkipWhiteSpace(p + 1);
872 }
873 
IsNextLinePreprocessorDirective() const874 bool Prescanner::IsNextLinePreprocessorDirective() const {
875   return IsPreprocessorDirectiveLine(nextLine_) != nullptr;
876 }
877 
SkipCommentLine(bool afterAmpersand)878 bool Prescanner::SkipCommentLine(bool afterAmpersand) {
879   if (IsAtEnd()) {
880     if (afterAmpersand && prescannerNesting_ > 0) {
881       // A continuation marker at the end of the last line in an
882       // include file inhibits the newline for that line.
883       SkipToEndOfLine();
884       omitNewline_ = true;
885     }
886     return false;
887   }
888   auto lineClass{ClassifyLine(nextLine_)};
889   if (lineClass.kind == LineClassification::Kind::Comment) {
890     NextLine();
891     return true;
892   } else if (inPreprocessorDirective_) {
893     return false;
894   } else if (lineClass.kind ==
895           LineClassification::Kind::ConditionalCompilationDirective ||
896       lineClass.kind == LineClassification::Kind::PreprocessorDirective) {
897     // Allow conditional compilation directives (e.g., #ifdef) to affect
898     // continuation lines.
899     // Allow other preprocessor directives, too, except #include
900     // (when it does not follow '&'), #define, and #undef (because
901     // they cannot be allowed to affect preceding text on a
902     // continued line).
903     preprocessor_.Directive(TokenizePreprocessorDirective(), *this);
904     return true;
905   } else if (afterAmpersand &&
906       (lineClass.kind == LineClassification::Kind::IncludeDirective ||
907           lineClass.kind == LineClassification::Kind::IncludeLine)) {
908     SkipToEndOfLine();
909     omitNewline_ = true;
910     skipLeadingAmpersand_ = true;
911     return false;
912   } else {
913     return false;
914   }
915 }
916 
FixedFormContinuationLine(bool mightNeedSpace)917 const char *Prescanner::FixedFormContinuationLine(bool mightNeedSpace) {
918   if (IsAtEnd()) {
919     return nullptr;
920   }
921   tabInCurrentLine_ = false;
922   char col1{*nextLine_};
923   if (InCompilerDirective()) {
924     // Must be a continued compiler directive.
925     if (!IsFixedFormCommentChar(col1)) {
926       return nullptr;
927     }
928     int j{1};
929     for (; j < 5; ++j) {
930       char ch{directiveSentinel_[j - 1]};
931       if (ch == '\0') {
932         break;
933       }
934       if (ch != ToLowerCaseLetter(nextLine_[j])) {
935         return nullptr;
936       }
937     }
938     for (; j < 5; ++j) {
939       if (nextLine_[j] != ' ') {
940         return nullptr;
941       }
942     }
943     char col6{nextLine_[5]};
944     if (col6 != '\n' && col6 != '\t' && col6 != ' ' && col6 != '0') {
945       if (nextLine_[6] != ' ' && mightNeedSpace) {
946         insertASpace_ = true;
947       }
948       return nextLine_ + 6;
949     }
950     return nullptr;
951   } else {
952     // Normal case: not in a compiler directive.
953     if (col1 == '&' &&
954         features_.IsEnabled(
955             LanguageFeature::FixedFormContinuationWithColumn1Ampersand)) {
956       // Extension: '&' as continuation marker
957       if (features_.ShouldWarn(
958               LanguageFeature::FixedFormContinuationWithColumn1Ampersand)) {
959         Say(GetProvenance(nextLine_), "nonstandard usage"_port_en_US);
960       }
961       return nextLine_ + 1;
962     }
963     if (col1 == '\t' && nextLine_[1] >= '1' && nextLine_[1] <= '9') {
964       tabInCurrentLine_ = true;
965       return nextLine_ + 2; // VAX extension
966     }
967     if (col1 == ' ' && nextLine_[1] == ' ' && nextLine_[2] == ' ' &&
968         nextLine_[3] == ' ' && nextLine_[4] == ' ') {
969       char col6{nextLine_[5]};
970       if (col6 != '\n' && col6 != '\t' && col6 != ' ' && col6 != '0') {
971         return nextLine_ + 6;
972       }
973     }
974     if (IsImplicitContinuation()) {
975       return nextLine_;
976     }
977   }
978   return nullptr; // not a continuation line
979 }
980 
FreeFormContinuationLine(bool ampersand)981 const char *Prescanner::FreeFormContinuationLine(bool ampersand) {
982   const char *p{nextLine_};
983   if (p >= limit_) {
984     return nullptr;
985   }
986   p = SkipWhiteSpace(p);
987   if (InCompilerDirective()) {
988     if (*p++ != '!') {
989       return nullptr;
990     }
991     for (const char *s{directiveSentinel_}; *s != '\0'; ++p, ++s) {
992       if (*s != ToLowerCaseLetter(*p)) {
993         return nullptr;
994       }
995     }
996     p = SkipWhiteSpace(p);
997     if (*p == '&') {
998       if (!ampersand) {
999         insertASpace_ = true;
1000       }
1001       return p + 1;
1002     } else if (ampersand) {
1003       return p;
1004     } else {
1005       return nullptr;
1006     }
1007   } else {
1008     if (*p == '&') {
1009       return p + 1;
1010     } else if (*p == '!' || *p == '\n' || *p == '#') {
1011       return nullptr;
1012     } else if (ampersand || IsImplicitContinuation()) {
1013       if (p > nextLine_) {
1014         --p;
1015       } else {
1016         insertASpace_ = true;
1017       }
1018       return p;
1019     } else {
1020       return nullptr;
1021     }
1022   }
1023 }
1024 
FixedFormContinuation(bool mightNeedSpace)1025 bool Prescanner::FixedFormContinuation(bool mightNeedSpace) {
1026   // N.B. We accept '&' as a continuation indicator in fixed form, too,
1027   // but not in a character literal.
1028   if (*at_ == '&' && inCharLiteral_) {
1029     return false;
1030   }
1031   do {
1032     if (const char *cont{FixedFormContinuationLine(mightNeedSpace)}) {
1033       BeginSourceLine(cont);
1034       column_ = 7;
1035       NextLine();
1036       return true;
1037     }
1038   } while (SkipCommentLine(false /* not after ampersand */));
1039   return false;
1040 }
1041 
FreeFormContinuation()1042 bool Prescanner::FreeFormContinuation() {
1043   const char *p{at_};
1044   bool ampersand{*p == '&'};
1045   if (ampersand) {
1046     p = SkipWhiteSpace(p + 1);
1047   }
1048   if (*p != '\n') {
1049     if (inCharLiteral_) {
1050       return false;
1051     } else if (*p != '!' &&
1052         features_.ShouldWarn(LanguageFeature::CruftAfterAmpersand)) {
1053       Say(GetProvenance(p), "missing ! before comment after &"_warn_en_US);
1054     }
1055   }
1056   do {
1057     if (const char *cont{FreeFormContinuationLine(ampersand)}) {
1058       BeginSourceLine(cont);
1059       NextLine();
1060       return true;
1061     }
1062   } while (SkipCommentLine(ampersand));
1063   return false;
1064 }
1065 
1066 // Implicit line continuation allows a preprocessor macro call with
1067 // arguments to span multiple lines.
IsImplicitContinuation() const1068 bool Prescanner::IsImplicitContinuation() const {
1069   return !inPreprocessorDirective_ && !inCharLiteral_ &&
1070       delimiterNesting_ > 0 && !IsAtEnd() &&
1071       ClassifyLine(nextLine_).kind == LineClassification::Kind::Source;
1072 }
1073 
Continuation(bool mightNeedFixedFormSpace)1074 bool Prescanner::Continuation(bool mightNeedFixedFormSpace) {
1075   if (*at_ == '\n' || *at_ == '&') {
1076     if (inFixedForm_) {
1077       return FixedFormContinuation(mightNeedFixedFormSpace);
1078     } else {
1079       return FreeFormContinuation();
1080     }
1081   } else {
1082     return false;
1083   }
1084 }
1085 
1086 std::optional<Prescanner::LineClassification>
IsFixedFormCompilerDirectiveLine(const char * start) const1087 Prescanner::IsFixedFormCompilerDirectiveLine(const char *start) const {
1088   const char *p{start};
1089   char col1{*p++};
1090   if (!IsFixedFormCommentChar(col1)) {
1091     return std::nullopt;
1092   }
1093   char sentinel[5], *sp{sentinel};
1094   int column{2};
1095   for (; column < 6; ++column, ++p) {
1096     if (*p != ' ') {
1097       if (*p == '\n' || *p == '\t') {
1098         break;
1099       }
1100       if (sp == sentinel + 1 && sentinel[0] == '$' && IsDecimalDigit(*p)) {
1101         // OpenMP conditional compilation line: leave the label alone
1102         break;
1103       }
1104       *sp++ = ToLowerCaseLetter(*p);
1105     }
1106   }
1107   if (column == 6) {
1108     if (*p == ' ' || *p == '\t' || *p == '0') {
1109       ++p;
1110     } else {
1111       // This is a Continuation line, not an initial directive line.
1112       return std::nullopt;
1113     }
1114   }
1115   if (sp == sentinel) {
1116     return std::nullopt;
1117   }
1118   *sp = '\0';
1119   if (const char *ss{IsCompilerDirectiveSentinel(sentinel)}) {
1120     std::size_t payloadOffset = p - start;
1121     return {LineClassification{
1122         LineClassification::Kind::CompilerDirective, payloadOffset, ss}};
1123   }
1124   return std::nullopt;
1125 }
1126 
1127 std::optional<Prescanner::LineClassification>
IsFreeFormCompilerDirectiveLine(const char * start) const1128 Prescanner::IsFreeFormCompilerDirectiveLine(const char *start) const {
1129   char sentinel[8];
1130   const char *p{SkipWhiteSpace(start)};
1131   if (*p++ != '!') {
1132     return std::nullopt;
1133   }
1134   for (std::size_t j{0}; j + 1 < sizeof sentinel; ++p, ++j) {
1135     if (*p == '\n') {
1136       break;
1137     }
1138     if (*p == ' ' || *p == '\t' || *p == '&') {
1139       if (j == 0) {
1140         break;
1141       }
1142       sentinel[j] = '\0';
1143       p = SkipWhiteSpace(p + 1);
1144       if (*p == '!') {
1145         break;
1146       }
1147       if (const char *sp{IsCompilerDirectiveSentinel(sentinel)}) {
1148         std::size_t offset = p - start;
1149         return {LineClassification{
1150             LineClassification::Kind::CompilerDirective, offset, sp}};
1151       }
1152       break;
1153     }
1154     sentinel[j] = ToLowerCaseLetter(*p);
1155   }
1156   return std::nullopt;
1157 }
1158 
AddCompilerDirectiveSentinel(const std::string & dir)1159 Prescanner &Prescanner::AddCompilerDirectiveSentinel(const std::string &dir) {
1160   std::uint64_t packed{0};
1161   for (char ch : dir) {
1162     packed = (packed << 8) | (ToLowerCaseLetter(ch) & 0xff);
1163   }
1164   compilerDirectiveBloomFilter_.set(packed % prime1);
1165   compilerDirectiveBloomFilter_.set(packed % prime2);
1166   compilerDirectiveSentinels_.insert(dir);
1167   return *this;
1168 }
1169 
IsCompilerDirectiveSentinel(const char * sentinel) const1170 const char *Prescanner::IsCompilerDirectiveSentinel(
1171     const char *sentinel) const {
1172   std::uint64_t packed{0};
1173   std::size_t n{0};
1174   for (; sentinel[n] != '\0'; ++n) {
1175     packed = (packed << 8) | (sentinel[n] & 0xff);
1176   }
1177   if (n == 0 || !compilerDirectiveBloomFilter_.test(packed % prime1) ||
1178       !compilerDirectiveBloomFilter_.test(packed % prime2)) {
1179     return nullptr;
1180   }
1181   const auto iter{compilerDirectiveSentinels_.find(std::string(sentinel, n))};
1182   return iter == compilerDirectiveSentinels_.end() ? nullptr : iter->c_str();
1183 }
1184 
IsDirective(const char * match,const char * dir)1185 constexpr bool IsDirective(const char *match, const char *dir) {
1186   for (; *match; ++match) {
1187     if (*match != ToLowerCaseLetter(*dir++)) {
1188       return false;
1189     }
1190   }
1191   return true;
1192 }
1193 
ClassifyLine(const char * start) const1194 Prescanner::LineClassification Prescanner::ClassifyLine(
1195     const char *start) const {
1196   if (inFixedForm_) {
1197     if (std::optional<LineClassification> lc{
1198             IsFixedFormCompilerDirectiveLine(start)}) {
1199       return std::move(*lc);
1200     }
1201     if (IsFixedFormCommentLine(start)) {
1202       return {LineClassification::Kind::Comment};
1203     }
1204   } else {
1205     if (std::optional<LineClassification> lc{
1206             IsFreeFormCompilerDirectiveLine(start)}) {
1207       return std::move(*lc);
1208     }
1209     if (const char *bang{IsFreeFormComment(start)}) {
1210       return {LineClassification::Kind::Comment,
1211           static_cast<std::size_t>(bang - start)};
1212     }
1213   }
1214   if (std::optional<std::size_t> quoteOffset{IsIncludeLine(start)}) {
1215     return {LineClassification::Kind::IncludeLine, *quoteOffset};
1216   }
1217   if (const char *dir{IsPreprocessorDirectiveLine(start)}) {
1218     if (IsDirective("if", dir) || IsDirective("elif", dir) ||
1219         IsDirective("else", dir) || IsDirective("endif", dir)) {
1220       return {LineClassification::Kind::ConditionalCompilationDirective};
1221     } else if (IsDirective("include", dir)) {
1222       return {LineClassification::Kind::IncludeDirective};
1223     } else if (IsDirective("define", dir) || IsDirective("undef", dir)) {
1224       return {LineClassification::Kind::DefinitionDirective};
1225     } else {
1226       return {LineClassification::Kind::PreprocessorDirective};
1227     }
1228   }
1229   return {LineClassification::Kind::Source};
1230 }
1231 
SourceFormChange(std::string && dir)1232 void Prescanner::SourceFormChange(std::string &&dir) {
1233   if (dir == "!dir$ free") {
1234     inFixedForm_ = false;
1235   } else if (dir == "!dir$ fixed") {
1236     inFixedForm_ = true;
1237   }
1238 }
1239 } // namespace Fortran::parser
1240