1 //===-- lib/Parser/prescan.cpp --------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "prescan.h"
10 #include "preprocessor.h"
11 #include "token-sequence.h"
12 #include "flang/Common/idioms.h"
13 #include "flang/Parser/characters.h"
14 #include "flang/Parser/message.h"
15 #include "flang/Parser/source.h"
16 #include "llvm/Support/raw_ostream.h"
17 #include <cstddef>
18 #include <cstring>
19 #include <utility>
20 #include <vector>
21 
22 namespace Fortran::parser {
23 
24 using common::LanguageFeature;
25 
// Deepest chain of nested prescanners (each copy-constructed prescanner is
// one level deeper than its parent) that Prescan() accepts before reporting
// a probable circular INCLUDE/#include.
constexpr int maxPrescannerNesting{100};
27 
28 Prescanner::Prescanner(Messages &messages, CookedSource &cooked,
29     Preprocessor &preprocessor, common::LanguageFeatureControl lfc)
30     : messages_{messages}, cooked_{cooked}, preprocessor_{preprocessor},
31       allSources_{preprocessor_.allSources()}, features_{lfc},
32       encoding_{allSources_.encoding()} {}
33 
34 Prescanner::Prescanner(const Prescanner &that)
35     : messages_{that.messages_}, cooked_{that.cooked_},
36       preprocessor_{that.preprocessor_}, allSources_{that.allSources_},
37       features_{that.features_}, inFixedForm_{that.inFixedForm_},
38       fixedFormColumnLimit_{that.fixedFormColumnLimit_},
39       encoding_{that.encoding_}, prescannerNesting_{that.prescannerNesting_ +
40                                      1},
41       skipLeadingAmpersand_{that.skipLeadingAmpersand_},
42       compilerDirectiveBloomFilter_{that.compilerDirectiveBloomFilter_},
43       compilerDirectiveSentinels_{that.compilerDirectiveSentinels_} {}
44 
// True when `ch` is one of the characters accepted here as a fixed-form
// comment marker: '!', '*', or the letter C in either case.
static inline constexpr bool IsFixedFormCommentChar(char ch) {
  switch (ch) {
  case '!':
  case '*':
  case 'C':
  case 'c':
    return true;
  default:
    return false;
  }
}
48 
49 static void NormalizeCompilerDirectiveCommentMarker(TokenSequence &dir) {
50   char *p{dir.GetMutableCharData()};
51   char *limit{p + dir.SizeInChars()};
52   for (; p < limit; ++p) {
53     if (*p != ' ') {
54       CHECK(IsFixedFormCommentChar(*p));
55       *p = '!';
56       return;
57     }
58   }
59   DIE("compiler directive all blank");
60 }
61 
62 void Prescanner::Prescan(ProvenanceRange range) {
63   startProvenance_ = range.start();
64   start_ = allSources_.GetSource(range);
65   CHECK(start_);
66   limit_ = start_ + range.size();
67   nextLine_ = start_;
68   const bool beganInFixedForm{inFixedForm_};
69   if (prescannerNesting_ > maxPrescannerNesting) {
70     Say(GetProvenance(start_),
71         "too many nested INCLUDE/#include files, possibly circular"_err_en_US);
72     return;
73   }
74   while (!IsAtEnd()) {
75     Statement();
76   }
77   if (inFixedForm_ != beganInFixedForm) {
78     std::string dir{"!dir$ "};
79     if (beganInFixedForm) {
80       dir += "fixed";
81     } else {
82       dir += "free";
83     }
84     dir += '\n';
85     TokenSequence tokens{dir, allSources_.AddCompilerInsertion(dir).start()};
86     tokens.Emit(cooked_);
87   }
88 }
89 
90 void Prescanner::Statement() {
91   TokenSequence tokens;
92   LineClassification line{ClassifyLine(nextLine_)};
93   switch (line.kind) {
94   case LineClassification::Kind::Comment:
95     nextLine_ += line.payloadOffset; // advance to '!' or newline
96     NextLine();
97     return;
98   case LineClassification::Kind::IncludeLine:
99     FortranInclude(nextLine_ + line.payloadOffset);
100     NextLine();
101     return;
102   case LineClassification::Kind::ConditionalCompilationDirective:
103   case LineClassification::Kind::IncludeDirective:
104   case LineClassification::Kind::DefinitionDirective:
105   case LineClassification::Kind::PreprocessorDirective:
106     preprocessor_.Directive(TokenizePreprocessorDirective(), *this);
107     return;
108   case LineClassification::Kind::CompilerDirective:
109     directiveSentinel_ = line.sentinel;
110     CHECK(InCompilerDirective());
111     BeginStatementAndAdvance();
112     if (inFixedForm_) {
113       CHECK(IsFixedFormCommentChar(*at_));
114     } else {
115       while (*at_ == ' ' || *at_ == '\t') {
116         ++at_, ++column_;
117       }
118       CHECK(*at_ == '!');
119     }
120     if (directiveSentinel_[0] == '$' && directiveSentinel_[1] == '\0') {
121       // OpenMP conditional compilation line.  Remove the sentinel and then
122       // treat the line as if it were normal source.
123       at_ += 2, column_ += 2;
124       if (inFixedForm_) {
125         LabelField(tokens);
126       } else {
127         SkipSpaces();
128       }
129     } else {
130       // Compiler directive.  Emit normalized sentinel.
131       EmitChar(tokens, '!');
132       ++at_, ++column_;
133       for (const char *sp{directiveSentinel_}; *sp != '\0';
134            ++sp, ++at_, ++column_) {
135         EmitChar(tokens, *sp);
136       }
137       if (*at_ == ' ') {
138         EmitChar(tokens, ' ');
139         ++at_, ++column_;
140       }
141       tokens.CloseToken();
142     }
143     break;
144   case LineClassification::Kind::Source:
145     BeginStatementAndAdvance();
146     if (inFixedForm_) {
147       LabelField(tokens);
148     } else if (skipLeadingAmpersand_) {
149       skipLeadingAmpersand_ = false;
150       const char *p{SkipWhiteSpace(at_)};
151       if (p < limit_ && *p == '&') {
152         column_ += ++p - at_;
153         at_ = p;
154       }
155     } else {
156       SkipSpaces();
157     }
158     break;
159   }
160 
161   while (NextToken(tokens)) {
162   }
163 
164   Provenance newlineProvenance{GetCurrentProvenance()};
165   if (std::optional<TokenSequence> preprocessed{
166           preprocessor_.MacroReplacement(tokens, *this)}) {
167     // Reprocess the preprocessed line.  Append a newline temporarily.
168     preprocessed->PutNextTokenChar('\n', newlineProvenance);
169     preprocessed->CloseToken();
170     const char *ppd{preprocessed->ToCharBlock().begin()};
171     LineClassification ppl{ClassifyLine(ppd)};
172     preprocessed->pop_back(); // remove the newline
173     switch (ppl.kind) {
174     case LineClassification::Kind::Comment:
175       break;
176     case LineClassification::Kind::IncludeLine:
177       FortranInclude(ppd + ppl.payloadOffset);
178       break;
179     case LineClassification::Kind::ConditionalCompilationDirective:
180     case LineClassification::Kind::IncludeDirective:
181     case LineClassification::Kind::DefinitionDirective:
182     case LineClassification::Kind::PreprocessorDirective:
183       Say(preprocessed->GetProvenanceRange(),
184           "Preprocessed line resembles a preprocessor directive"_warn_en_US);
185       preprocessed->ToLowerCase()
186           .CheckBadFortranCharacters(messages_)
187           .CheckBadParentheses(messages_)
188           .Emit(cooked_);
189       break;
190     case LineClassification::Kind::CompilerDirective:
191       if (preprocessed->HasRedundantBlanks()) {
192         preprocessed->RemoveRedundantBlanks();
193       }
194       NormalizeCompilerDirectiveCommentMarker(*preprocessed);
195       preprocessed->ToLowerCase();
196       SourceFormChange(preprocessed->ToString());
197       preprocessed->ClipComment(true /* skip first ! */)
198           .CheckBadFortranCharacters(messages_)
199           .CheckBadParentheses(messages_)
200           .Emit(cooked_);
201       break;
202     case LineClassification::Kind::Source:
203       if (inFixedForm_) {
204         if (preprocessed->HasBlanks(/*after column*/ 6)) {
205           preprocessed->RemoveBlanks(/*after column*/ 6);
206         }
207       } else {
208         if (preprocessed->HasRedundantBlanks()) {
209           preprocessed->RemoveRedundantBlanks();
210         }
211       }
212       preprocessed->ToLowerCase()
213           .ClipComment()
214           .CheckBadFortranCharacters(messages_)
215           .CheckBadParentheses(messages_)
216           .Emit(cooked_);
217       break;
218     }
219   } else {
220     tokens.ToLowerCase();
221     if (line.kind == LineClassification::Kind::CompilerDirective) {
222       SourceFormChange(tokens.ToString());
223     }
224     if (inFixedForm_ && line.kind == LineClassification::Kind::Source) {
225       EnforceStupidEndStatementRules(tokens);
226     }
227     tokens.CheckBadFortranCharacters(messages_)
228         .CheckBadParentheses(messages_)
229         .Emit(cooked_);
230   }
231   if (omitNewline_) {
232     omitNewline_ = false;
233   } else {
234     cooked_.Put('\n', newlineProvenance);
235   }
236   directiveSentinel_ = nullptr;
237 }
238 
239 TokenSequence Prescanner::TokenizePreprocessorDirective() {
240   CHECK(!IsAtEnd() && !inPreprocessorDirective_);
241   inPreprocessorDirective_ = true;
242   BeginStatementAndAdvance();
243   TokenSequence tokens;
244   while (NextToken(tokens)) {
245   }
246   inPreprocessorDirective_ = false;
247   return tokens;
248 }
249 
250 void Prescanner::NextLine() {
251   void *vstart{static_cast<void *>(const_cast<char *>(nextLine_))};
252   void *v{std::memchr(vstart, '\n', limit_ - nextLine_)};
253   if (!v) {
254     nextLine_ = limit_;
255   } else {
256     const char *nl{const_cast<const char *>(static_cast<char *>(v))};
257     nextLine_ = nl + 1;
258   }
259 }
260 
261 void Prescanner::LabelField(TokenSequence &token) {
262   const char *bad{nullptr};
263   int outCol{1};
264   const char *start{at_};
265   for (; *at_ != '\n' && column_ <= 6; ++at_) {
266     if (*at_ == '\t') {
267       ++at_;
268       column_ = 7;
269       break;
270     }
271     if (*at_ != ' ' &&
272         !(*at_ == '0' && column_ == 6)) { // '0' in column 6 becomes space
273       EmitChar(token, *at_);
274       ++outCol;
275       if (!bad && !IsDecimalDigit(*at_)) {
276         bad = at_;
277       }
278     }
279     ++column_;
280   }
281   if (bad && !preprocessor_.IsNameDefined(token.CurrentOpenToken())) {
282     Say(GetProvenance(bad),
283         "Character in fixed-form label field must be a digit"_warn_en_US);
284     token.clear();
285     at_ = start;
286     return;
287   }
288   if (outCol == 1) { // empty label field
289     // Emit a space so that, if the line is rescanned after preprocessing,
290     // a leading 'C' or 'D' won't be left-justified and then accidentally
291     // misinterpreted as a comment card.
292     EmitChar(token, ' ');
293     ++outCol;
294   }
295   token.CloseToken();
296   SkipToNextSignificantCharacter();
297   if (IsDecimalDigit(*at_)) {
298     Say(GetProvenance(at_),
299         "Label digit is not in fixed-form label field"_port_en_US);
300   }
301 }
302 
303 // 6.3.3.5: A program unit END statement, or any other statement whose
304 // initial line resembles an END statement, shall not be continued in
305 // fixed form source.
306 void Prescanner::EnforceStupidEndStatementRules(const TokenSequence &tokens) {
307   CharBlock cBlock{tokens.ToCharBlock()};
308   const char *str{cBlock.begin()};
309   std::size_t n{cBlock.size()};
310   if (n < 3) {
311     return;
312   }
313   std::size_t j{0};
314   for (; j < n && (str[j] == ' ' || (str[j] >= '0' && str[j] <= '9')); ++j) {
315   }
316   if (j + 3 > n || std::memcmp(str + j, "end", 3) != 0) {
317     return;
318   }
319   // It starts with END, possibly after a label.
320   auto start{allSources_.GetSourcePosition(tokens.GetCharProvenance(j))};
321   auto end{allSources_.GetSourcePosition(tokens.GetCharProvenance(n - 1))};
322   if (!start || !end) {
323     return;
324   }
325   if (&start->file == &end->file && start->line == end->line) {
326     return; // no continuation
327   }
328   j += 3;
329   static const char *const prefixes[]{"program", "subroutine", "function",
330       "blockdata", "module", "submodule", nullptr};
331   bool isPrefix{j == n || !IsLegalInIdentifier(str[j])}; // prefix is END
332   std::size_t endOfPrefix{j - 1};
333   for (const char *const *p{prefixes}; *p; ++p) {
334     std::size_t pLen{std::strlen(*p)};
335     if (j + pLen <= n && std::memcmp(str + j, *p, pLen) == 0) {
336       isPrefix = true; // END thing as prefix
337       j += pLen;
338       endOfPrefix = j - 1;
339       for (; j < n && IsLegalInIdentifier(str[j]); ++j) {
340       }
341       break;
342     }
343   }
344   if (isPrefix) {
345     auto range{tokens.GetTokenProvenanceRange(1)};
346     if (j == n) { // END or END thing [name]
347       Say(range,
348           "Program unit END statement may not be continued in fixed form source"_err_en_US);
349     } else {
350       auto endOfPrefixPos{
351           allSources_.GetSourcePosition(tokens.GetCharProvenance(endOfPrefix))};
352       auto next{allSources_.GetSourcePosition(tokens.GetCharProvenance(j))};
353       if (endOfPrefixPos && next && &endOfPrefixPos->file == &start->file &&
354           endOfPrefixPos->line == start->line &&
355           (&next->file != &start->file || next->line != start->line)) {
356         Say(range,
357             "Initial line of continued statement must not appear to be a program unit END in fixed form source"_err_en_US);
358       }
359     }
360   }
361 }
362 
363 void Prescanner::SkipToEndOfLine() {
364   while (*at_ != '\n') {
365     ++at_, ++column_;
366   }
367 }
368 
369 bool Prescanner::MustSkipToEndOfLine() const {
370   if (inFixedForm_ && column_ > fixedFormColumnLimit_ && !tabInCurrentLine_) {
371     return true; // skip over ignored columns in right margin (73:80)
372   } else if (*at_ == '!' && !inCharLiteral_) {
373     return true; // inline comment goes to end of source line
374   } else {
375     return false;
376   }
377 }
378 
379 void Prescanner::NextChar() {
380   CHECK(*at_ != '\n');
381   ++at_, ++column_;
382   while (at_[0] == '\xef' && at_[1] == '\xbb' && at_[2] == '\xbf') {
383     // UTF-8 byte order mark - treat this file as UTF-8
384     at_ += 3;
385     encoding_ = Encoding::UTF_8;
386   }
387   SkipToNextSignificantCharacter();
388 }
389 
390 // Skip everything that should be ignored until the next significant
391 // character is reached; handles C-style comments in preprocessing
392 // directives, Fortran ! comments, stuff after the right margin in
393 // fixed form, and all forms of line continuation.
394 void Prescanner::SkipToNextSignificantCharacter() {
395   if (inPreprocessorDirective_) {
396     SkipCComments();
397   } else {
398     bool mightNeedSpace{false};
399     if (MustSkipToEndOfLine()) {
400       SkipToEndOfLine();
401     } else {
402       mightNeedSpace = *at_ == '\n';
403     }
404     for (; Continuation(mightNeedSpace); mightNeedSpace = false) {
405       if (MustSkipToEndOfLine()) {
406         SkipToEndOfLine();
407       }
408     }
409     if (*at_ == '\t') {
410       tabInCurrentLine_ = true;
411     }
412   }
413 }
414 
415 void Prescanner::SkipCComments() {
416   while (true) {
417     if (IsCComment(at_)) {
418       if (const char *after{SkipCComment(at_)}) {
419         column_ += after - at_;
420         // May have skipped over one or more newlines; relocate the start of
421         // the next line.
422         nextLine_ = at_ = after;
423         NextLine();
424       } else {
425         // Don't emit any messages about unclosed C-style comments, because
426         // the sequence /* can appear legally in a FORMAT statement.  There's
427         // no ambiguity, since the sequence */ cannot appear legally.
428         break;
429       }
430     } else if (inPreprocessorDirective_ && at_[0] == '\\' && at_ + 2 < limit_ &&
431         at_[1] == '\n' && !IsAtEnd()) {
432       BeginSourceLineAndAdvance();
433     } else {
434       break;
435     }
436   }
437 }
438 
439 void Prescanner::SkipSpaces() {
440   while (*at_ == ' ' || *at_ == '\t') {
441     NextChar();
442   }
443   insertASpace_ = false;
444 }
445 
446 const char *Prescanner::SkipWhiteSpace(const char *p) {
447   while (*p == ' ' || *p == '\t') {
448     ++p;
449   }
450   return p;
451 }
452 
453 const char *Prescanner::SkipWhiteSpaceAndCComments(const char *p) const {
454   while (true) {
455     if (*p == ' ' || *p == '\t') {
456       ++p;
457     } else if (IsCComment(p)) {
458       if (const char *after{SkipCComment(p)}) {
459         p = after;
460       } else {
461         break;
462       }
463     } else {
464       break;
465     }
466   }
467   return p;
468 }
469 
470 const char *Prescanner::SkipCComment(const char *p) const {
471   char star{' '}, slash{' '};
472   p += 2;
473   while (star != '*' || slash != '/') {
474     if (p >= limit_) {
475       return nullptr; // signifies an unterminated comment
476     }
477     star = slash;
478     slash = *p++;
479   }
480   return p;
481 }
482 
483 bool Prescanner::NextToken(TokenSequence &tokens) {
484   CHECK(at_ >= start_ && at_ < limit_);
485   if (InFixedFormSource()) {
486     SkipSpaces();
487   } else {
488     if (*at_ == '/' && IsCComment(at_)) {
489       // Recognize and skip over classic C style /*comments*/ when
490       // outside a character literal.
491       if (features_.ShouldWarn(LanguageFeature::ClassicCComments)) {
492         Say(GetProvenance(at_),
493             "nonstandard usage: C-style comment"_port_en_US);
494       }
495       SkipCComments();
496     }
497     if (*at_ == ' ' || *at_ == '\t') {
498       // Compress free-form white space into a single space character.
499       const auto theSpace{at_};
500       char previous{at_ <= start_ ? ' ' : at_[-1]};
501       NextChar();
502       SkipSpaces();
503       if (*at_ == '\n') {
504         // Discard white space at the end of a line.
505       } else if (!inPreprocessorDirective_ &&
506           (previous == '(' || *at_ == '(' || *at_ == ')')) {
507         // Discard white space before/after '(' and before ')', unless in a
508         // preprocessor directive.  This helps yield space-free contiguous
509         // names for generic interfaces like OPERATOR( + ) and
510         // READ ( UNFORMATTED ), without misinterpreting #define f (notAnArg).
511         // This has the effect of silently ignoring the illegal spaces in
512         // the array constructor ( /1,2/ ) but that seems benign; it's
513         // hard to avoid that while still removing spaces from OPERATOR( / )
514         // and OPERATOR( // ).
515       } else {
516         // Preserve the squashed white space as a single space character.
517         tokens.PutNextTokenChar(' ', GetProvenance(theSpace));
518         tokens.CloseToken();
519         return true;
520       }
521     }
522   }
523   if (insertASpace_) {
524     tokens.PutNextTokenChar(' ', spaceProvenance_);
525     insertASpace_ = false;
526   }
527   if (*at_ == '\n') {
528     return false;
529   }
530   const char *start{at_};
531   if (*at_ == '\'' || *at_ == '"') {
532     QuotedCharacterLiteral(tokens, start);
533     preventHollerith_ = false;
534   } else if (IsDecimalDigit(*at_)) {
535     int n{0}, digits{0};
536     static constexpr int maxHollerith{256 /*lines*/ * (132 - 6 /*columns*/)};
537     do {
538       if (n < maxHollerith) {
539         n = 10 * n + DecimalDigitValue(*at_);
540       }
541       EmitCharAndAdvance(tokens, *at_);
542       ++digits;
543       if (InFixedFormSource()) {
544         SkipSpaces();
545       }
546     } while (IsDecimalDigit(*at_));
547     if ((*at_ == 'h' || *at_ == 'H') && n > 0 && n < maxHollerith &&
548         !preventHollerith_) {
549       Hollerith(tokens, n, start);
550     } else if (*at_ == '.') {
551       while (IsDecimalDigit(EmitCharAndAdvance(tokens, *at_))) {
552       }
553       ExponentAndKind(tokens);
554     } else if (ExponentAndKind(tokens)) {
555     } else if (digits == 1 && n == 0 && (*at_ == 'x' || *at_ == 'X') &&
556         inPreprocessorDirective_) {
557       do {
558         EmitCharAndAdvance(tokens, *at_);
559       } while (IsHexadecimalDigit(*at_));
560     } else if (IsLetter(*at_)) {
561       // Handles FORMAT(3I9HHOLLERITH) by skipping over the first I so that
562       // we don't misrecognize I9HOLLERITH as an identifier in the next case.
563       EmitCharAndAdvance(tokens, *at_);
564     } else if (at_[0] == '_' && (at_[1] == '\'' || at_[1] == '"')) { // 4_"..."
565       EmitCharAndAdvance(tokens, *at_);
566       QuotedCharacterLiteral(tokens, start);
567     }
568     preventHollerith_ = false;
569   } else if (*at_ == '.') {
570     char nch{EmitCharAndAdvance(tokens, '.')};
571     if (!inPreprocessorDirective_ && IsDecimalDigit(nch)) {
572       while (IsDecimalDigit(EmitCharAndAdvance(tokens, *at_))) {
573       }
574       ExponentAndKind(tokens);
575     } else if (nch == '.' && EmitCharAndAdvance(tokens, '.') == '.') {
576       EmitCharAndAdvance(tokens, '.'); // variadic macro definition ellipsis
577     }
578     preventHollerith_ = false;
579   } else if (IsLegalInIdentifier(*at_)) {
580     do {
581     } while (IsLegalInIdentifier(EmitCharAndAdvance(tokens, *at_)));
582     if ((*at_ == '\'' || *at_ == '"') &&
583         tokens.CharAt(tokens.SizeInChars() - 1) == '_') { // kind_"..."
584       QuotedCharacterLiteral(tokens, start);
585     }
586     preventHollerith_ = false;
587   } else if (*at_ == '*') {
588     if (EmitCharAndAdvance(tokens, '*') == '*') {
589       EmitCharAndAdvance(tokens, '*');
590     } else {
591       // Subtle ambiguity:
592       //  CHARACTER*2H     declares H because *2 is a kind specifier
593       //  DATAC/N*2H  /    is repeated Hollerith
594       preventHollerith_ = !slashInCurrentStatement_;
595     }
596   } else {
597     char ch{*at_};
598     if (ch == '(' || ch == '[') {
599       ++delimiterNesting_;
600     } else if ((ch == ')' || ch == ']') && delimiterNesting_ > 0) {
601       --delimiterNesting_;
602     }
603     char nch{EmitCharAndAdvance(tokens, ch)};
604     preventHollerith_ = false;
605     if ((nch == '=' &&
606             (ch == '<' || ch == '>' || ch == '/' || ch == '=' || ch == '!')) ||
607         (ch == nch &&
608             (ch == '/' || ch == ':' || ch == '*' || ch == '#' || ch == '&' ||
609                 ch == '|' || ch == '<' || ch == '>')) ||
610         (ch == '=' && nch == '>')) {
611       // token comprises two characters
612       EmitCharAndAdvance(tokens, nch);
613     } else if (ch == '/') {
614       slashInCurrentStatement_ = true;
615     }
616   }
617   tokens.CloseToken();
618   return true;
619 }
620 
621 bool Prescanner::ExponentAndKind(TokenSequence &tokens) {
622   char ed{ToLowerCaseLetter(*at_)};
623   if (ed != 'e' && ed != 'd') {
624     return false;
625   }
626   EmitCharAndAdvance(tokens, ed);
627   if (*at_ == '+' || *at_ == '-') {
628     EmitCharAndAdvance(tokens, *at_);
629   }
630   while (IsDecimalDigit(*at_)) {
631     EmitCharAndAdvance(tokens, *at_);
632   }
633   if (*at_ == '_') {
634     while (IsLegalInIdentifier(EmitCharAndAdvance(tokens, *at_))) {
635     }
636   }
637   return true;
638 }
639 
640 void Prescanner::QuotedCharacterLiteral(
641     TokenSequence &tokens, const char *start) {
642   char quote{*at_};
643   const char *end{at_ + 1};
644   inCharLiteral_ = true;
645   const auto emit{[&](char ch) { EmitChar(tokens, ch); }};
646   const auto insert{[&](char ch) { EmitInsertedChar(tokens, ch); }};
647   bool isEscaped{false};
648   bool escapesEnabled{features_.IsEnabled(LanguageFeature::BackslashEscapes)};
649   while (true) {
650     if (*at_ == '\\') {
651       if (escapesEnabled) {
652         isEscaped = !isEscaped;
653       } else {
654         // The parser always processes escape sequences, so don't confuse it
655         // when escapes are disabled.
656         insert('\\');
657       }
658     } else {
659       isEscaped = false;
660     }
661     EmitQuotedChar(static_cast<unsigned char>(*at_), emit, insert, false,
662         Encoding::LATIN_1);
663     while (PadOutCharacterLiteral(tokens)) {
664     }
665     if (*at_ == '\n') {
666       if (!inPreprocessorDirective_) {
667         Say(GetProvenanceRange(start, end),
668             "Incomplete character literal"_err_en_US);
669       }
670       break;
671     }
672     end = at_ + 1;
673     NextChar();
674     if (*at_ == quote && !isEscaped) {
675       // A doubled unescaped quote mark becomes a single instance of that
676       // quote character in the literal (later).  There can be spaces between
677       // the quotes in fixed form source.
678       EmitChar(tokens, quote);
679       inCharLiteral_ = false; // for cases like print *, '...'!comment
680       NextChar();
681       if (InFixedFormSource()) {
682         SkipSpaces();
683       }
684       if (*at_ != quote) {
685         break;
686       }
687       inCharLiteral_ = true;
688     }
689   }
690   inCharLiteral_ = false;
691 }
692 
693 void Prescanner::Hollerith(
694     TokenSequence &tokens, int count, const char *start) {
695   inCharLiteral_ = true;
696   CHECK(*at_ == 'h' || *at_ == 'H');
697   EmitChar(tokens, 'H');
698   while (count-- > 0) {
699     if (PadOutCharacterLiteral(tokens)) {
700     } else if (*at_ == '\n') {
701       Say(GetProvenanceRange(start, at_),
702           "Possible truncated Hollerith literal"_warn_en_US);
703       break;
704     } else {
705       NextChar();
706       // Each multi-byte character encoding counts as a single character.
707       // No escape sequences are recognized.
708       // Hollerith is always emitted to the cooked character
709       // stream in UTF-8.
710       DecodedCharacter decoded{DecodeCharacter(
711           encoding_, at_, static_cast<std::size_t>(limit_ - at_), false)};
712       if (decoded.bytes > 0) {
713         EncodedCharacter utf8{
714             EncodeCharacter<Encoding::UTF_8>(decoded.codepoint)};
715         for (int j{0}; j < utf8.bytes; ++j) {
716           EmitChar(tokens, utf8.buffer[j]);
717         }
718         at_ += decoded.bytes - 1;
719       } else {
720         Say(GetProvenanceRange(start, at_),
721             "Bad character in Hollerith literal"_err_en_US);
722         break;
723       }
724     }
725   }
726   if (*at_ != '\n') {
727     NextChar();
728   }
729   inCharLiteral_ = false;
730 }
731 
732 // In fixed form, source card images must be processed as if they were at
733 // least 72 columns wide, at least in character literal contexts.
734 bool Prescanner::PadOutCharacterLiteral(TokenSequence &tokens) {
735   while (inFixedForm_ && !tabInCurrentLine_ && at_[1] == '\n') {
736     if (column_ < fixedFormColumnLimit_) {
737       tokens.PutNextTokenChar(' ', spaceProvenance_);
738       ++column_;
739       return true;
740     }
741     if (!FixedFormContinuation(false /*no need to insert space*/) ||
742         tabInCurrentLine_) {
743       return false;
744     }
745     CHECK(column_ == 7);
746     --at_; // point to column 6 of continuation line
747     column_ = 6;
748   }
749   return false;
750 }
751 
752 bool Prescanner::IsFixedFormCommentLine(const char *start) const {
753   const char *p{start};
754   if (IsFixedFormCommentChar(*p) || *p == '%' || // VAX %list, %eject, &c.
755       ((*p == 'D' || *p == 'd') &&
756           !features_.IsEnabled(LanguageFeature::OldDebugLines))) {
757     return true;
758   }
759   bool anyTabs{false};
760   while (true) {
761     if (*p == ' ') {
762       ++p;
763     } else if (*p == '\t') {
764       anyTabs = true;
765       ++p;
766     } else if (*p == '0' && !anyTabs && p == start + 5) {
767       ++p; // 0 in column 6 must treated as a space
768     } else {
769       break;
770     }
771   }
772   if (!anyTabs && p >= start + fixedFormColumnLimit_) {
773     return true;
774   }
775   if (*p == '!' && !inCharLiteral_ && (anyTabs || p != start + 5)) {
776     return true;
777   }
778   return *p == '\n';
779 }
780 
781 const char *Prescanner::IsFreeFormComment(const char *p) const {
782   p = SkipWhiteSpaceAndCComments(p);
783   if (*p == '!' || *p == '\n') {
784     return p;
785   } else {
786     return nullptr;
787   }
788 }
789 
790 std::optional<std::size_t> Prescanner::IsIncludeLine(const char *start) const {
791   const char *p{SkipWhiteSpace(start)};
792   for (char ch : "include"s) {
793     if (ToLowerCaseLetter(*p++) != ch) {
794       return std::nullopt;
795     }
796   }
797   p = SkipWhiteSpace(p);
798   if (*p == '"' || *p == '\'') {
799     return {p - start};
800   }
801   return std::nullopt;
802 }
803 
804 void Prescanner::FortranInclude(const char *firstQuote) {
805   const char *p{firstQuote};
806   while (*p != '"' && *p != '\'') {
807     ++p;
808   }
809   char quote{*p};
810   std::string path;
811   for (++p; *p != '\n'; ++p) {
812     if (*p == quote) {
813       if (p[1] != quote) {
814         break;
815       }
816       ++p;
817     }
818     path += *p;
819   }
820   if (*p != quote) {
821     Say(GetProvenanceRange(firstQuote, p),
822         "malformed path name string"_err_en_US);
823     return;
824   }
825   p = SkipWhiteSpace(p + 1);
826   if (*p != '\n' && *p != '!') {
827     const char *garbage{p};
828     for (; *p != '\n' && *p != '!'; ++p) {
829     }
830     Say(GetProvenanceRange(garbage, p),
831         "excess characters after path name"_warn_en_US);
832   }
833   std::string buf;
834   llvm::raw_string_ostream error{buf};
835   Provenance provenance{GetProvenance(nextLine_)};
836   std::optional<std::string> prependPath;
837   if (const SourceFile * currentFile{allSources_.GetSourceFile(provenance)}) {
838     prependPath = DirectoryName(currentFile->path());
839   }
840   const SourceFile *included{
841       allSources_.Open(path, error, std::move(prependPath))};
842   if (!included) {
843     Say(provenance, "INCLUDE: %s"_err_en_US, error.str());
844   } else if (included->bytes() > 0) {
845     ProvenanceRange includeLineRange{
846         provenance, static_cast<std::size_t>(p - nextLine_)};
847     ProvenanceRange fileRange{
848         allSources_.AddIncludedFile(*included, includeLineRange)};
849     Prescanner{*this}.set_encoding(included->encoding()).Prescan(fileRange);
850   }
851 }
852 
853 const char *Prescanner::IsPreprocessorDirectiveLine(const char *start) const {
854   const char *p{start};
855   for (; *p == ' '; ++p) {
856   }
857   if (*p == '#') {
858     if (inFixedForm_ && p == start + 5) {
859       return nullptr;
860     }
861   } else {
862     p = SkipWhiteSpace(p);
863     if (*p != '#') {
864       return nullptr;
865     }
866   }
867   return SkipWhiteSpace(p + 1);
868 }
869 
870 bool Prescanner::IsNextLinePreprocessorDirective() const {
871   return IsPreprocessorDirectiveLine(nextLine_) != nullptr;
872 }
873 
874 bool Prescanner::SkipCommentLine(bool afterAmpersand) {
875   if (IsAtEnd()) {
876     if (afterAmpersand && prescannerNesting_ > 0) {
877       // A continuation marker at the end of the last line in an
878       // include file inhibits the newline for that line.
879       SkipToEndOfLine();
880       omitNewline_ = true;
881     }
882     return false;
883   }
884   auto lineClass{ClassifyLine(nextLine_)};
885   if (lineClass.kind == LineClassification::Kind::Comment) {
886     NextLine();
887     return true;
888   } else if (inPreprocessorDirective_) {
889     return false;
890   } else if (lineClass.kind ==
891           LineClassification::Kind::ConditionalCompilationDirective ||
892       lineClass.kind == LineClassification::Kind::PreprocessorDirective) {
893     // Allow conditional compilation directives (e.g., #ifdef) to affect
894     // continuation lines.
895     // Allow other preprocessor directives, too, except #include
896     // (when it does not follow '&'), #define, and #undef (because
897     // they cannot be allowed to affect preceding text on a
898     // continued line).
899     preprocessor_.Directive(TokenizePreprocessorDirective(), *this);
900     return true;
901   } else if (afterAmpersand &&
902       (lineClass.kind == LineClassification::Kind::IncludeDirective ||
903           lineClass.kind == LineClassification::Kind::IncludeLine)) {
904     SkipToEndOfLine();
905     omitNewline_ = true;
906     skipLeadingAmpersand_ = true;
907     return false;
908   } else {
909     return false;
910   }
911 }
912 
913 const char *Prescanner::FixedFormContinuationLine(bool mightNeedSpace) {
914   if (IsAtEnd()) {
915     return nullptr;
916   }
917   tabInCurrentLine_ = false;
918   char col1{*nextLine_};
919   if (InCompilerDirective()) {
920     // Must be a continued compiler directive.
921     if (!IsFixedFormCommentChar(col1)) {
922       return nullptr;
923     }
924     int j{1};
925     for (; j < 5; ++j) {
926       char ch{directiveSentinel_[j - 1]};
927       if (ch == '\0') {
928         break;
929       }
930       if (ch != ToLowerCaseLetter(nextLine_[j])) {
931         return nullptr;
932       }
933     }
934     for (; j < 5; ++j) {
935       if (nextLine_[j] != ' ') {
936         return nullptr;
937       }
938     }
939     char col6{nextLine_[5]};
940     if (col6 != '\n' && col6 != '\t' && col6 != ' ' && col6 != '0') {
941       if (nextLine_[6] != ' ' && mightNeedSpace) {
942         insertASpace_ = true;
943       }
944       return nextLine_ + 6;
945     }
946     return nullptr;
947   } else {
948     // Normal case: not in a compiler directive.
949     if (col1 == '&' &&
950         features_.IsEnabled(
951             LanguageFeature::FixedFormContinuationWithColumn1Ampersand)) {
952       // Extension: '&' as continuation marker
953       if (features_.ShouldWarn(
954               LanguageFeature::FixedFormContinuationWithColumn1Ampersand)) {
955         Say(GetProvenance(nextLine_), "nonstandard usage"_port_en_US);
956       }
957       return nextLine_ + 1;
958     }
959     if (col1 == '\t' && nextLine_[1] >= '1' && nextLine_[1] <= '9') {
960       tabInCurrentLine_ = true;
961       return nextLine_ + 2; // VAX extension
962     }
963     if (col1 == ' ' && nextLine_[1] == ' ' && nextLine_[2] == ' ' &&
964         nextLine_[3] == ' ' && nextLine_[4] == ' ') {
965       char col6{nextLine_[5]};
966       if (col6 != '\n' && col6 != '\t' && col6 != ' ' && col6 != '0') {
967         return nextLine_ + 6;
968       }
969     }
970     if (IsImplicitContinuation()) {
971       return nextLine_;
972     }
973   }
974   return nullptr; // not a continuation line
975 }
976 
977 const char *Prescanner::FreeFormContinuationLine(bool ampersand) {
978   const char *p{nextLine_};
979   if (p >= limit_) {
980     return nullptr;
981   }
982   p = SkipWhiteSpace(p);
983   if (InCompilerDirective()) {
984     if (*p++ != '!') {
985       return nullptr;
986     }
987     for (const char *s{directiveSentinel_}; *s != '\0'; ++p, ++s) {
988       if (*s != ToLowerCaseLetter(*p)) {
989         return nullptr;
990       }
991     }
992     p = SkipWhiteSpace(p);
993     if (*p == '&') {
994       if (!ampersand) {
995         insertASpace_ = true;
996       }
997       return p + 1;
998     } else if (ampersand) {
999       return p;
1000     } else {
1001       return nullptr;
1002     }
1003   } else {
1004     if (*p == '&') {
1005       return p + 1;
1006     } else if (*p == '!' || *p == '\n' || *p == '#') {
1007       return nullptr;
1008     } else if (ampersand || IsImplicitContinuation()) {
1009       if (p > nextLine_) {
1010         --p;
1011       } else {
1012         insertASpace_ = true;
1013       }
1014       return p;
1015     } else {
1016       return nullptr;
1017     }
1018   }
1019 }
1020 
1021 bool Prescanner::FixedFormContinuation(bool mightNeedSpace) {
1022   // N.B. We accept '&' as a continuation indicator in fixed form, too,
1023   // but not in a character literal.
1024   if (*at_ == '&' && inCharLiteral_) {
1025     return false;
1026   }
1027   do {
1028     if (const char *cont{FixedFormContinuationLine(mightNeedSpace)}) {
1029       BeginSourceLine(cont);
1030       column_ = 7;
1031       NextLine();
1032       return true;
1033     }
1034   } while (SkipCommentLine(false /* not after ampersand */));
1035   return false;
1036 }
1037 
1038 bool Prescanner::FreeFormContinuation() {
1039   const char *p{at_};
1040   bool ampersand{*p == '&'};
1041   if (ampersand) {
1042     p = SkipWhiteSpace(p + 1);
1043   }
1044   if (*p != '\n') {
1045     if (inCharLiteral_) {
1046       return false;
1047     } else if (*p != '!' &&
1048         features_.ShouldWarn(LanguageFeature::CruftAfterAmpersand)) {
1049       Say(GetProvenance(p), "missing ! before comment after &"_warn_en_US);
1050     }
1051   }
1052   do {
1053     if (const char *cont{FreeFormContinuationLine(ampersand)}) {
1054       BeginSourceLine(cont);
1055       NextLine();
1056       return true;
1057     }
1058   } while (SkipCommentLine(ampersand));
1059   return false;
1060 }
1061 
1062 // Implicit line continuation allows a preprocessor macro call with
1063 // arguments to span multiple lines.
1064 bool Prescanner::IsImplicitContinuation() const {
1065   return !inPreprocessorDirective_ && !inCharLiteral_ &&
1066       delimiterNesting_ > 0 && !IsAtEnd() &&
1067       ClassifyLine(nextLine_).kind == LineClassification::Kind::Source;
1068 }
1069 
1070 bool Prescanner::Continuation(bool mightNeedFixedFormSpace) {
1071   if (*at_ == '\n' || *at_ == '&') {
1072     if (inFixedForm_) {
1073       return FixedFormContinuation(mightNeedFixedFormSpace);
1074     } else {
1075       return FreeFormContinuation();
1076     }
1077   } else {
1078     return false;
1079   }
1080 }
1081 
1082 std::optional<Prescanner::LineClassification>
1083 Prescanner::IsFixedFormCompilerDirectiveLine(const char *start) const {
1084   const char *p{start};
1085   char col1{*p++};
1086   if (!IsFixedFormCommentChar(col1)) {
1087     return std::nullopt;
1088   }
1089   char sentinel[5], *sp{sentinel};
1090   int column{2};
1091   for (; column < 6; ++column, ++p) {
1092     if (*p != ' ') {
1093       if (*p == '\n' || *p == '\t') {
1094         break;
1095       }
1096       if (sp == sentinel + 1 && sentinel[0] == '$' && IsDecimalDigit(*p)) {
1097         // OpenMP conditional compilation line: leave the label alone
1098         break;
1099       }
1100       *sp++ = ToLowerCaseLetter(*p);
1101     }
1102   }
1103   if (column == 6) {
1104     if (*p == ' ' || *p == '\t' || *p == '0') {
1105       ++p;
1106     } else {
1107       // This is a Continuation line, not an initial directive line.
1108       return std::nullopt;
1109     }
1110   }
1111   if (sp == sentinel) {
1112     return std::nullopt;
1113   }
1114   *sp = '\0';
1115   if (const char *ss{IsCompilerDirectiveSentinel(sentinel)}) {
1116     std::size_t payloadOffset = p - start;
1117     return {LineClassification{
1118         LineClassification::Kind::CompilerDirective, payloadOffset, ss}};
1119   }
1120   return std::nullopt;
1121 }
1122 
1123 std::optional<Prescanner::LineClassification>
1124 Prescanner::IsFreeFormCompilerDirectiveLine(const char *start) const {
1125   char sentinel[8];
1126   const char *p{SkipWhiteSpace(start)};
1127   if (*p++ != '!') {
1128     return std::nullopt;
1129   }
1130   for (std::size_t j{0}; j + 1 < sizeof sentinel; ++p, ++j) {
1131     if (*p == '\n') {
1132       break;
1133     }
1134     if (*p == ' ' || *p == '\t' || *p == '&') {
1135       if (j == 0) {
1136         break;
1137       }
1138       sentinel[j] = '\0';
1139       p = SkipWhiteSpace(p + 1);
1140       if (*p == '!') {
1141         break;
1142       }
1143       if (const char *sp{IsCompilerDirectiveSentinel(sentinel)}) {
1144         std::size_t offset = p - start;
1145         return {LineClassification{
1146             LineClassification::Kind::CompilerDirective, offset, sp}};
1147       }
1148       break;
1149     }
1150     sentinel[j] = ToLowerCaseLetter(*p);
1151   }
1152   return std::nullopt;
1153 }
1154 
1155 Prescanner &Prescanner::AddCompilerDirectiveSentinel(const std::string &dir) {
1156   std::uint64_t packed{0};
1157   for (char ch : dir) {
1158     packed = (packed << 8) | (ToLowerCaseLetter(ch) & 0xff);
1159   }
1160   compilerDirectiveBloomFilter_.set(packed % prime1);
1161   compilerDirectiveBloomFilter_.set(packed % prime2);
1162   compilerDirectiveSentinels_.insert(dir);
1163   return *this;
1164 }
1165 
1166 const char *Prescanner::IsCompilerDirectiveSentinel(
1167     const char *sentinel) const {
1168   std::uint64_t packed{0};
1169   std::size_t n{0};
1170   for (; sentinel[n] != '\0'; ++n) {
1171     packed = (packed << 8) | (sentinel[n] & 0xff);
1172   }
1173   if (n == 0 || !compilerDirectiveBloomFilter_.test(packed % prime1) ||
1174       !compilerDirectiveBloomFilter_.test(packed % prime2)) {
1175     return nullptr;
1176   }
1177   const auto iter{compilerDirectiveSentinels_.find(std::string(sentinel, n))};
1178   return iter == compilerDirectiveSentinels_.end() ? nullptr : iter->c_str();
1179 }
1180 
1181 constexpr bool IsDirective(const char *match, const char *dir) {
1182   for (; *match; ++match) {
1183     if (*match != ToLowerCaseLetter(*dir++)) {
1184       return false;
1185     }
1186   }
1187   return true;
1188 }
1189 
1190 Prescanner::LineClassification Prescanner::ClassifyLine(
1191     const char *start) const {
1192   if (inFixedForm_) {
1193     if (std::optional<LineClassification> lc{
1194             IsFixedFormCompilerDirectiveLine(start)}) {
1195       return std::move(*lc);
1196     }
1197     if (IsFixedFormCommentLine(start)) {
1198       return {LineClassification::Kind::Comment};
1199     }
1200   } else {
1201     if (std::optional<LineClassification> lc{
1202             IsFreeFormCompilerDirectiveLine(start)}) {
1203       return std::move(*lc);
1204     }
1205     if (const char *bang{IsFreeFormComment(start)}) {
1206       return {LineClassification::Kind::Comment,
1207           static_cast<std::size_t>(bang - start)};
1208     }
1209   }
1210   if (std::optional<std::size_t> quoteOffset{IsIncludeLine(start)}) {
1211     return {LineClassification::Kind::IncludeLine, *quoteOffset};
1212   }
1213   if (const char *dir{IsPreprocessorDirectiveLine(start)}) {
1214     if (IsDirective("if", dir) || IsDirective("elif", dir) ||
1215         IsDirective("else", dir) || IsDirective("endif", dir)) {
1216       return {LineClassification::Kind::ConditionalCompilationDirective};
1217     } else if (IsDirective("include", dir)) {
1218       return {LineClassification::Kind::IncludeDirective};
1219     } else if (IsDirective("define", dir) || IsDirective("undef", dir)) {
1220       return {LineClassification::Kind::DefinitionDirective};
1221     } else {
1222       return {LineClassification::Kind::PreprocessorDirective};
1223     }
1224   }
1225   return {LineClassification::Kind::Source};
1226 }
1227 
1228 void Prescanner::SourceFormChange(std::string &&dir) {
1229   if (dir == "!dir$ free") {
1230     inFixedForm_ = false;
1231   } else if (dir == "!dir$ fixed") {
1232     inFixedForm_ = true;
1233   }
1234 }
1235 } // namespace Fortran::parser
1236