1 //===-- lib/Parser/prescan.cpp --------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "prescan.h"
10 #include "preprocessor.h"
11 #include "token-sequence.h"
12 #include "flang/Common/idioms.h"
13 #include "flang/Parser/characters.h"
14 #include "flang/Parser/message.h"
15 #include "flang/Parser/source.h"
16 #include "llvm/Support/raw_ostream.h"
17 #include <cstddef>
18 #include <cstring>
19 #include <utility>
20 #include <vector>
21 
22 namespace Fortran::parser {
23 
24 using common::LanguageFeature;
25 
// Upper bound on prescanner nesting depth.  Prescan() reports an error and
// stops as soon as prescannerNesting_ exceeds this value.
static constexpr int maxPrescannerNesting{100};
27 
// Constructs a prescanner that reports to "messages", emits its output to
// "cooked", and consults "preprocessor" for directives and macro
// replacement.  The source collection and the initial character encoding
// are both obtained from the preprocessor's AllSources instance; "lfc"
// supplies the set of language features that are enabled or warned about.
Prescanner::Prescanner(Messages &messages, CookedSource &cooked,
    Preprocessor &preprocessor, common::LanguageFeatureControl lfc)
    : messages_{messages}, cooked_{cooked}, preprocessor_{preprocessor},
      allSources_{preprocessor_.allSources()}, features_{lfc},
      encoding_{allSources_.encoding()} {}
33 
// Constructs a nested prescanner from an existing one.  The sinks, source
// form settings, encoding, and compiler directive sentinel tables listed
// below are carried over; the nesting depth is one greater than that of
// "that" so that Prescan() can detect runaway INCLUDE nesting.  Only the
// members named in this initializer list are copied.
Prescanner::Prescanner(const Prescanner &that)
    : messages_{that.messages_}, cooked_{that.cooked_},
      preprocessor_{that.preprocessor_}, allSources_{that.allSources_},
      features_{that.features_}, inFixedForm_{that.inFixedForm_},
      fixedFormColumnLimit_{that.fixedFormColumnLimit_},
      encoding_{that.encoding_}, prescannerNesting_{that.prescannerNesting_ +
                                     1},
      skipLeadingAmpersand_{that.skipLeadingAmpersand_},
      compilerDirectiveBloomFilter_{that.compilerDirectiveBloomFilter_},
      compilerDirectiveSentinels_{that.compilerDirectiveSentinels_} {}
44 
// True when "ch" is one of the characters ('!', '*', 'C', 'c') that this
// prescanner accepts as a fixed-form comment marker.
static inline constexpr bool IsFixedFormCommentChar(char ch) {
  switch (ch) {
  case '!':
  case '*':
  case 'C':
  case 'c':
    return true;
  default:
    return false;
  }
}
48 
49 static void NormalizeCompilerDirectiveCommentMarker(TokenSequence &dir) {
50   char *p{dir.GetMutableCharData()};
51   char *limit{p + dir.SizeInChars()};
52   for (; p < limit; ++p) {
53     if (*p != ' ') {
54       CHECK(IsFixedFormCommentChar(*p));
55       *p = '!';
56       return;
57     }
58   }
59   DIE("compiler directive all blank");
60 }
61 
62 void Prescanner::Prescan(ProvenanceRange range) {
63   startProvenance_ = range.start();
64   start_ = allSources_.GetSource(range);
65   CHECK(start_);
66   limit_ = start_ + range.size();
67   nextLine_ = start_;
68   const bool beganInFixedForm{inFixedForm_};
69   if (prescannerNesting_ > maxPrescannerNesting) {
70     Say(GetProvenance(start_),
71         "too many nested INCLUDE/#include files, possibly circular"_err_en_US);
72     return;
73   }
74   while (!IsAtEnd()) {
75     Statement();
76   }
77   if (inFixedForm_ != beganInFixedForm) {
78     std::string dir{"!dir$ "};
79     if (beganInFixedForm) {
80       dir += "fixed";
81     } else {
82       dir += "free";
83     }
84     dir += '\n';
85     TokenSequence tokens{dir, allSources_.AddCompilerInsertion(dir).start()};
86     tokens.Emit(cooked_);
87   }
88 }
89 
// Processes the line (and any continuation lines) that begins at nextLine_.
// Comment lines, INCLUDE lines, and preprocessor directives are handled
// immediately and return early.  Compiler directives and ordinary source
// lines are tokenized, passed through macro replacement, checked, and
// emitted to cooked_, followed by a newline unless omitNewline_ was set
// while scanning.
void Prescanner::Statement() {
  TokenSequence tokens;
  LineClassification line{ClassifyLine(nextLine_)};
  switch (line.kind) {
  case LineClassification::Kind::Comment:
    nextLine_ += line.payloadOffset; // advance to '!' or newline
    NextLine();
    return;
  case LineClassification::Kind::IncludeLine:
    FortranInclude(nextLine_ + line.payloadOffset);
    NextLine();
    return;
  case LineClassification::Kind::ConditionalCompilationDirective:
  case LineClassification::Kind::IncludeDirective:
  case LineClassification::Kind::DefinitionDirective:
  case LineClassification::Kind::PreprocessorDirective:
    // All '#' directives are tokenized and handed to the preprocessor.
    preprocessor_.Directive(TokenizePreprocessorDirective(), *this);
    return;
  case LineClassification::Kind::CompilerDirective:
    directiveSentinel_ = line.sentinel;
    CHECK(InCompilerDirective());
    BeginStatementAndAdvance();
    if (inFixedForm_) {
      CHECK(IsFixedFormCommentChar(*at_));
    } else {
      // Free form: the '!' may be preceded by blanks and tabs.
      while (*at_ == ' ' || *at_ == '\t') {
        ++at_, ++column_;
      }
      CHECK(*at_ == '!');
    }
    if (directiveSentinel_[0] == '$' && directiveSentinel_[1] == '\0') {
      // OpenMP conditional compilation line.  Remove the sentinel and then
      // treat the line as if it were normal source.
      at_ += 2, column_ += 2;
      if (inFixedForm_) {
        LabelField(tokens);
      } else {
        SkipSpaces();
      }
    } else {
      // Compiler directive.  Emit normalized sentinel.
      EmitChar(tokens, '!');
      ++at_, ++column_;
      // The sentinel text comes from directiveSentinel_ rather than from the
      // source, while the source cursor advances in step with it.
      for (const char *sp{directiveSentinel_}; *sp != '\0';
           ++sp, ++at_, ++column_) {
        EmitChar(tokens, *sp);
      }
      if (*at_ == ' ') {
        EmitChar(tokens, ' ');
        ++at_, ++column_;
      }
      tokens.CloseToken();
    }
    break;
  case LineClassification::Kind::Source:
    BeginStatementAndAdvance();
    if (inFixedForm_) {
      LabelField(tokens);
    } else if (skipLeadingAmpersand_) {
      // Set by SkipCommentLine() when an include follows a trailing '&';
      // consume the optional leading '&' of this line.
      skipLeadingAmpersand_ = false;
      const char *p{SkipWhiteSpace(at_)};
      if (p < limit_ && *p == '&') {
        column_ += ++p - at_;
        at_ = p;
      }
    } else {
      SkipSpaces();
    }
    break;
  }

  // Tokenize the rest of the statement, including continuation lines.
  while (NextToken(tokens)) {
  }

  Provenance newlineProvenance{GetCurrentProvenance()};
  if (std::optional<TokenSequence> preprocessed{
          preprocessor_.MacroReplacement(tokens, *this)}) {
    // Reprocess the preprocessed line.  Append a newline temporarily.
    preprocessed->PutNextTokenChar('\n', newlineProvenance);
    preprocessed->CloseToken();
    const char *ppd{preprocessed->ToCharBlock().begin()};
    LineClassification ppl{ClassifyLine(ppd)};
    preprocessed->pop_back(); // remove the newline
    switch (ppl.kind) {
    case LineClassification::Kind::Comment:
      // The replaced text is a comment: nothing is emitted for it.
      break;
    case LineClassification::Kind::IncludeLine:
      FortranInclude(ppd + ppl.payloadOffset);
      break;
    case LineClassification::Kind::ConditionalCompilationDirective:
    case LineClassification::Kind::IncludeDirective:
    case LineClassification::Kind::DefinitionDirective:
    case LineClassification::Kind::PreprocessorDirective:
      // Not reinterpreted as a directive: a message is issued and the text
      // is emitted like any other line.
      Say(preprocessed->GetProvenanceRange(),
          "Preprocessed line resembles a preprocessor directive"_en_US);
      preprocessed->ToLowerCase()
          .CheckBadFortranCharacters(messages_)
          .CheckBadParentheses(messages_)
          .Emit(cooked_);
      break;
    case LineClassification::Kind::CompilerDirective:
      if (preprocessed->HasRedundantBlanks()) {
        preprocessed->RemoveRedundantBlanks();
      }
      NormalizeCompilerDirectiveCommentMarker(*preprocessed);
      preprocessed->ToLowerCase();
      SourceFormChange(preprocessed->ToString());
      preprocessed->ClipComment(true /* skip first ! */)
          .CheckBadFortranCharacters(messages_)
          .CheckBadParentheses(messages_)
          .Emit(cooked_);
      break;
    case LineClassification::Kind::Source:
      if (inFixedForm_) {
        if (preprocessed->HasBlanks(/*after column*/ 6)) {
          preprocessed->RemoveBlanks(/*after column*/ 6);
        }
      } else {
        if (preprocessed->HasRedundantBlanks()) {
          preprocessed->RemoveRedundantBlanks();
        }
      }
      preprocessed->ToLowerCase()
          .ClipComment()
          .CheckBadFortranCharacters(messages_)
          .CheckBadParentheses(messages_)
          .Emit(cooked_);
      break;
    }
  } else {
    // No macro replacement took place: emit the original tokens.
    tokens.ToLowerCase();
    if (line.kind == LineClassification::Kind::CompilerDirective) {
      SourceFormChange(tokens.ToString());
    }
    if (inFixedForm_ && line.kind == LineClassification::Kind::Source) {
      EnforceStupidEndStatementRules(tokens);
    }
    tokens.CheckBadFortranCharacters(messages_)
        .CheckBadParentheses(messages_)
        .Emit(cooked_);
  }
  // omitNewline_ is a one-shot request (see SkipCommentLine()) to suppress
  // the newline that would otherwise terminate this statement.
  if (omitNewline_) {
    omitNewline_ = false;
  } else {
    cooked_.Put('\n', newlineProvenance);
  }
  directiveSentinel_ = nullptr;
}
238 
239 TokenSequence Prescanner::TokenizePreprocessorDirective() {
240   CHECK(!IsAtEnd() && !inPreprocessorDirective_);
241   inPreprocessorDirective_ = true;
242   BeginStatementAndAdvance();
243   TokenSequence tokens;
244   while (NextToken(tokens)) {
245   }
246   inPreprocessorDirective_ = false;
247   return tokens;
248 }
249 
250 void Prescanner::NextLine() {
251   void *vstart{static_cast<void *>(const_cast<char *>(nextLine_))};
252   void *v{std::memchr(vstart, '\n', limit_ - nextLine_)};
253   if (!v) {
254     nextLine_ = limit_;
255   } else {
256     const char *nl{const_cast<const char *>(static_cast<char *>(v))};
257     nextLine_ = nl + 1;
258   }
259 }
260 
261 void Prescanner::LabelField(TokenSequence &token) {
262   const char *bad{nullptr};
263   int outCol{1};
264   const char *start{at_};
265   for (; *at_ != '\n' && column_ <= 6; ++at_) {
266     if (*at_ == '\t') {
267       ++at_;
268       column_ = 7;
269       break;
270     }
271     if (*at_ != ' ' &&
272         !(*at_ == '0' && column_ == 6)) { // '0' in column 6 becomes space
273       EmitChar(token, *at_);
274       ++outCol;
275       if (!bad && !IsDecimalDigit(*at_)) {
276         bad = at_;
277       }
278     }
279     ++column_;
280   }
281   if (bad && !preprocessor_.IsNameDefined(token.CurrentOpenToken())) {
282     Say(GetProvenance(bad),
283         "Character in fixed-form label field must be a digit"_en_US);
284     token.clear();
285     at_ = start;
286     return;
287   }
288   if (outCol == 1) { // empty label field
289     // Emit a space so that, if the line is rescanned after preprocessing,
290     // a leading 'C' or 'D' won't be left-justified and then accidentally
291     // misinterpreted as a comment card.
292     EmitChar(token, ' ');
293     ++outCol;
294   }
295   token.CloseToken();
296   SkipToNextSignificantCharacter();
297   if (IsDecimalDigit(*at_)) {
298     Say(GetProvenance(at_),
299         "Label digit is not in fixed-form label field"_en_US);
300   }
301 }
302 
// 6.3.3.5: A program unit END statement, or any other statement whose
// initial line resembles an END statement, shall not be continued in
// fixed form source.
//
// "tokens" holds a fixed-form statement that has already been converted to
// lower case by the caller.  Nothing is reported unless the statement
// begins (after blanks and label digits) with "end" and its first and last
// characters come from different source lines.
void Prescanner::EnforceStupidEndStatementRules(const TokenSequence &tokens) {
  CharBlock cBlock{tokens.ToCharBlock()};
  const char *str{cBlock.begin()};
  std::size_t n{cBlock.size()};
  if (n < 3) {
    return;
  }
  // Skip leading blanks and label digits.
  std::size_t j{0};
  for (; j < n && (str[j] == ' ' || (str[j] >= '0' && str[j] <= '9')); ++j) {
  }
  if (j + 3 > n || std::memcmp(str + j, "end", 3) != 0) {
    return;
  }
  // It starts with END, possibly after a label.
  auto start{allSources_.GetSourcePosition(tokens.GetCharProvenance(j))};
  auto end{allSources_.GetSourcePosition(tokens.GetCharProvenance(n - 1))};
  if (!start || !end) {
    return;
  }
  if (&start->file == &end->file && start->line == end->line) {
    return; // no continuation
  }
  j += 3;
  static const char *const prefixes[]{"program", "subroutine", "function",
      "blockdata", "module", "submodule", nullptr};
  bool isPrefix{j == n || !IsLegalInIdentifier(str[j])}; // prefix is END
  std::size_t endOfPrefix{j - 1}; // index of the last character of the prefix
  for (const char *const *p{prefixes}; *p; ++p) {
    std::size_t pLen{std::strlen(*p)};
    if (j + pLen <= n && std::memcmp(str + j, *p, pLen) == 0) {
      isPrefix = true; // END thing as prefix
      j += pLen;
      endOfPrefix = j - 1;
      // Skip over a program unit name, if any, that follows the keyword.
      for (; j < n && IsLegalInIdentifier(str[j]); ++j) {
      }
      break;
    }
  }
  if (isPrefix) {
    auto range{tokens.GetTokenProvenanceRange(1)};
    if (j == n) { // END or END thing [name]
      Say(range,
          "Program unit END statement may not be continued in fixed form source"_err_en_US);
    } else {
      // More text follows.  It is an error only if the prefix ends on the
      // initial line and the following character is on a different line,
      // i.e. the initial line by itself looks like a program unit END.
      auto endOfPrefixPos{
          allSources_.GetSourcePosition(tokens.GetCharProvenance(endOfPrefix))};
      auto next{allSources_.GetSourcePosition(tokens.GetCharProvenance(j))};
      if (endOfPrefixPos && next && &endOfPrefixPos->file == &start->file &&
          endOfPrefixPos->line == start->line &&
          (&next->file != &start->file || next->line != start->line)) {
        Say(range,
            "Initial line of continued statement must not appear to be a program unit END in fixed form source"_err_en_US);
      }
    }
  }
}
362 
363 void Prescanner::SkipToEndOfLine() {
364   while (*at_ != '\n') {
365     ++at_, ++column_;
366   }
367 }
368 
369 bool Prescanner::MustSkipToEndOfLine() const {
370   if (inFixedForm_ && column_ > fixedFormColumnLimit_ && !tabInCurrentLine_) {
371     return true; // skip over ignored columns in right margin (73:80)
372   } else if (*at_ == '!' && !inCharLiteral_) {
373     return true; // inline comment goes to end of source line
374   } else {
375     return false;
376   }
377 }
378 
379 void Prescanner::NextChar() {
380   CHECK(*at_ != '\n');
381   ++at_, ++column_;
382   while (at_[0] == '\xef' && at_[1] == '\xbb' && at_[2] == '\xbf') {
383     // UTF-8 byte order mark - treat this file as UTF-8
384     at_ += 3;
385     encoding_ = Encoding::UTF_8;
386   }
387   SkipToNextSignificantCharacter();
388 }
389 
390 // Skip everything that should be ignored until the next significant
391 // character is reached; handles C-style comments in preprocessing
392 // directives, Fortran ! comments, stuff after the right margin in
393 // fixed form, and all forms of line continuation.
394 void Prescanner::SkipToNextSignificantCharacter() {
395   if (inPreprocessorDirective_) {
396     SkipCComments();
397   } else {
398     bool mightNeedSpace{false};
399     if (MustSkipToEndOfLine()) {
400       SkipToEndOfLine();
401     } else {
402       mightNeedSpace = *at_ == '\n';
403     }
404     for (; Continuation(mightNeedSpace); mightNeedSpace = false) {
405       if (MustSkipToEndOfLine()) {
406         SkipToEndOfLine();
407       }
408     }
409     if (*at_ == '\t') {
410       tabInCurrentLine_ = true;
411     }
412   }
413 }
414 
415 void Prescanner::SkipCComments() {
416   while (true) {
417     if (IsCComment(at_)) {
418       if (const char *after{SkipCComment(at_)}) {
419         column_ += after - at_;
420         // May have skipped over one or more newlines; relocate the start of
421         // the next line.
422         nextLine_ = at_ = after;
423         NextLine();
424       } else {
425         // Don't emit any messages about unclosed C-style comments, because
426         // the sequence /* can appear legally in a FORMAT statement.  There's
427         // no ambiguity, since the sequence */ cannot appear legally.
428         break;
429       }
430     } else if (inPreprocessorDirective_ && at_[0] == '\\' && at_ + 2 < limit_ &&
431         at_[1] == '\n' && !IsAtEnd()) {
432       BeginSourceLineAndAdvance();
433     } else {
434       break;
435     }
436   }
437 }
438 
439 void Prescanner::SkipSpaces() {
440   while (*at_ == ' ' || *at_ == '\t') {
441     NextChar();
442   }
443   insertASpace_ = false;
444 }
445 
446 const char *Prescanner::SkipWhiteSpace(const char *p) {
447   while (*p == ' ' || *p == '\t') {
448     ++p;
449   }
450   return p;
451 }
452 
453 const char *Prescanner::SkipWhiteSpaceAndCComments(const char *p) const {
454   while (true) {
455     if (*p == ' ' || *p == '\t') {
456       ++p;
457     } else if (IsCComment(p)) {
458       if (const char *after{SkipCComment(p)}) {
459         p = after;
460       } else {
461         break;
462       }
463     } else {
464       break;
465     }
466   }
467   return p;
468 }
469 
470 const char *Prescanner::SkipCComment(const char *p) const {
471   char star{' '}, slash{' '};
472   p += 2;
473   while (star != '*' || slash != '/') {
474     if (p >= limit_) {
475       return nullptr; // signifies an unterminated comment
476     }
477     star = slash;
478     slash = *p++;
479   }
480   return p;
481 }
482 
// Scans one token at the cursor and appends it to "tokens".  Returns false
// when the cursor is at the newline that ends the statement; otherwise the
// token is closed and true is returned.  In free form (and in preprocessor
// directives), runs of white space are compressed to a single blank token
// or dropped; in fixed form source they are skipped.
bool Prescanner::NextToken(TokenSequence &tokens) {
  CHECK(at_ >= start_ && at_ < limit_);
  if (InFixedFormSource()) {
    SkipSpaces();
  } else {
    if (*at_ == '/' && IsCComment(at_)) {
      // Recognize and skip over classic C style /*comments*/ when
      // outside a character literal.
      if (features_.ShouldWarn(LanguageFeature::ClassicCComments)) {
        Say(GetProvenance(at_), "nonstandard usage: C-style comment"_en_US);
      }
      SkipCComments();
    }
    if (*at_ == ' ' || *at_ == '\t') {
      // Compress free-form white space into a single space character.
      const auto theSpace{at_};
      char previous{at_ <= start_ ? ' ' : at_[-1]};
      NextChar();
      SkipSpaces();
      if (*at_ == '\n') {
        // Discard white space at the end of a line.
      } else if (!inPreprocessorDirective_ &&
          (previous == '(' || *at_ == '(' || *at_ == ')')) {
        // Discard white space before/after '(' and before ')', unless in a
        // preprocessor directive.  This helps yield space-free contiguous
        // names for generic interfaces like OPERATOR( + ) and
        // READ ( UNFORMATTED ), without misinterpreting #define f (notAnArg).
        // This has the effect of silently ignoring the illegal spaces in
        // the array constructor ( /1,2/ ) but that seems benign; it's
        // hard to avoid that while still removing spaces from OPERATOR( / )
        // and OPERATOR( // ).
      } else {
        // Preserve the squashed white space as a single space character.
        tokens.PutNextTokenChar(' ', GetProvenance(theSpace));
        tokens.CloseToken();
        return true;
      }
    }
  }
  // A pending inserted space (requested while handling a continuation line)
  // becomes the first character of the token that follows.
  if (insertASpace_) {
    tokens.PutNextTokenChar(' ', spaceProvenance_);
    insertASpace_ = false;
  }
  if (*at_ == '\n') {
    return false;
  }
  const char *start{at_};
  if (*at_ == '\'' || *at_ == '"') {
    QuotedCharacterLiteral(tokens, start);
    preventHollerith_ = false;
  } else if (IsDecimalDigit(*at_)) {
    // Digit string: may be a Hollerith count, a real literal, a hexadecimal
    // constant in a directive, or a kind prefix of a character literal.
    int n{0}, digits{0};
    static constexpr int maxHollerith{256 /*lines*/ * (132 - 6 /*columns*/)};
    do {
      // n stops accumulating once it reaches maxHollerith, so it cannot
      // grow without bound on a long digit string.
      if (n < maxHollerith) {
        n = 10 * n + DecimalDigitValue(*at_);
      }
      EmitCharAndAdvance(tokens, *at_);
      ++digits;
      if (InFixedFormSource()) {
        SkipSpaces();
      }
    } while (IsDecimalDigit(*at_));
    if ((*at_ == 'h' || *at_ == 'H') && n > 0 && n < maxHollerith &&
        !preventHollerith_) {
      Hollerith(tokens, n, start);
    } else if (*at_ == '.') {
      while (IsDecimalDigit(EmitCharAndAdvance(tokens, *at_))) {
      }
      ExponentAndKind(tokens);
    } else if (ExponentAndKind(tokens)) {
    } else if (digits == 1 && n == 0 && (*at_ == 'x' || *at_ == 'X') &&
        inPreprocessorDirective_) {
      // 0x... hexadecimal constant, recognized only in directives.
      do {
        EmitCharAndAdvance(tokens, *at_);
      } while (IsHexadecimalDigit(*at_));
    } else if (IsLetter(*at_)) {
      // Handles FORMAT(3I9HHOLLERITH) by skipping over the first I so that
      // we don't misrecognize I9HOLLERITH as an identifier in the next case.
      EmitCharAndAdvance(tokens, *at_);
    } else if (at_[0] == '_' && (at_[1] == '\'' || at_[1] == '"')) { // 4_"..."
      EmitCharAndAdvance(tokens, *at_);
      QuotedCharacterLiteral(tokens, start);
    }
    preventHollerith_ = false;
  } else if (*at_ == '.') {
    char nch{EmitCharAndAdvance(tokens, '.')};
    if (!inPreprocessorDirective_ && IsDecimalDigit(nch)) {
      // Real literal that begins with a decimal point.
      while (IsDecimalDigit(EmitCharAndAdvance(tokens, *at_))) {
      }
      ExponentAndKind(tokens);
    } else if (nch == '.' && EmitCharAndAdvance(tokens, '.') == '.') {
      EmitCharAndAdvance(tokens, '.'); // variadic macro definition ellipsis
    }
    preventHollerith_ = false;
  } else if (IsLegalInIdentifier(*at_)) {
    do {
    } while (IsLegalInIdentifier(EmitCharAndAdvance(tokens, *at_)));
    if ((*at_ == '\'' || *at_ == '"') &&
        tokens.CharAt(tokens.SizeInChars() - 1) == '_') { // kind_"..."
      QuotedCharacterLiteral(tokens, start);
    }
    preventHollerith_ = false;
  } else if (*at_ == '*') {
    if (EmitCharAndAdvance(tokens, '*') == '*') {
      EmitCharAndAdvance(tokens, '*');
    } else {
      // Subtle ambiguity:
      //  CHARACTER*2H     declares H because *2 is a kind specifier
      //  DATAC/N*2H  /    is repeated Hollerith
      preventHollerith_ = !slashInCurrentStatement_;
    }
  } else {
    // Any other character: track bracket nesting and combine two-character
    // operators into a single token.
    char ch{*at_};
    if (ch == '(' || ch == '[') {
      ++delimiterNesting_;
    } else if ((ch == ')' || ch == ']') && delimiterNesting_ > 0) {
      --delimiterNesting_;
    }
    char nch{EmitCharAndAdvance(tokens, ch)};
    preventHollerith_ = false;
    if ((nch == '=' &&
            (ch == '<' || ch == '>' || ch == '/' || ch == '=' || ch == '!')) ||
        (ch == nch &&
            (ch == '/' || ch == ':' || ch == '*' || ch == '#' || ch == '&' ||
                ch == '|' || ch == '<' || ch == '>')) ||
        (ch == '=' && nch == '>')) {
      // token comprises two characters
      EmitCharAndAdvance(tokens, nch);
    } else if (ch == '/') {
      slashInCurrentStatement_ = true;
    }
  }
  tokens.CloseToken();
  return true;
}
619 
620 bool Prescanner::ExponentAndKind(TokenSequence &tokens) {
621   char ed{ToLowerCaseLetter(*at_)};
622   if (ed != 'e' && ed != 'd') {
623     return false;
624   }
625   EmitCharAndAdvance(tokens, ed);
626   if (*at_ == '+' || *at_ == '-') {
627     EmitCharAndAdvance(tokens, *at_);
628   }
629   while (IsDecimalDigit(*at_)) {
630     EmitCharAndAdvance(tokens, *at_);
631   }
632   if (*at_ == '_') {
633     while (IsLegalInIdentifier(EmitCharAndAdvance(tokens, *at_))) {
634     }
635   }
636   return true;
637 }
638 
// Scans a character literal whose opening quote mark is at the cursor and
// emits it to "tokens".  "start" is where the enclosing token began and is
// used only for the provenance range of the "Incomplete character literal"
// message.  inCharLiteral_ is set while inside the literal so that '!' is
// not taken to begin a comment.
void Prescanner::QuotedCharacterLiteral(
    TokenSequence &tokens, const char *start) {
  char quote{*at_};
  const char *end{at_ + 1};
  inCharLiteral_ = true;
  const auto emit{[&](char ch) { EmitChar(tokens, ch); }};
  const auto insert{[&](char ch) { EmitInsertedChar(tokens, ch); }};
  bool isEscaped{false};
  bool escapesEnabled{features_.IsEnabled(LanguageFeature::BackslashEscapes)};
  while (true) {
    if (*at_ == '\\') {
      if (escapesEnabled) {
        // Toggle, so that a doubled backslash does not escape what follows.
        isEscaped = !isEscaped;
      } else {
        // The parser always processes escape sequences, so don't confuse it
        // when escapes are disabled.
        insert('\\');
      }
    } else {
      isEscaped = false;
    }
    EmitQuotedChar(static_cast<unsigned char>(*at_), emit, insert, false,
        Encoding::LATIN_1);
    // Fixed form: pad a short line with blanks out to the column limit.
    while (PadOutCharacterLiteral(tokens)) {
    }
    if (*at_ == '\n') {
      if (!inPreprocessorDirective_) {
        Say(GetProvenanceRange(start, end),
            "Incomplete character literal"_err_en_US);
      }
      break;
    }
    end = at_ + 1;
    NextChar();
    if (*at_ == quote && !isEscaped) {
      // A doubled unescaped quote mark becomes a single instance of that
      // quote character in the literal (later).  There can be spaces between
      // the quotes in fixed form source.
      EmitChar(tokens, quote);
      inCharLiteral_ = false; // for cases like print *, '...'!comment
      NextChar();
      if (InFixedFormSource()) {
        SkipSpaces();
      }
      if (*at_ != quote) {
        break;
      }
      inCharLiteral_ = true;
    }
  }
  inCharLiteral_ = false;
}
691 
// Scans a Hollerith literal of "count" characters.  On entry the cursor is
// at the 'h' or 'H' that follows the count, whose digits have already been
// emitted by the caller; "start" is where the token began and is used for
// message provenance.  An upper case 'H' is always emitted.
void Prescanner::Hollerith(
    TokenSequence &tokens, int count, const char *start) {
  inCharLiteral_ = true;
  CHECK(*at_ == 'h' || *at_ == 'H');
  EmitChar(tokens, 'H');
  while (count-- > 0) {
    if (PadOutCharacterLiteral(tokens)) {
      // A padding blank was emitted and counts as one character.
    } else if (*at_ == '\n') {
      Say(GetProvenanceRange(start, at_),
          "Possible truncated Hollerith literal"_en_US);
      break;
    } else {
      NextChar();
      // Each multi-byte character encoding counts as a single character.
      // No escape sequences are recognized.
      // Hollerith is always emitted to the cooked character
      // stream in UTF-8.
      DecodedCharacter decoded{DecodeCharacter(
          encoding_, at_, static_cast<std::size_t>(limit_ - at_), false)};
      if (decoded.bytes > 0) {
        EncodedCharacter utf8{
            EncodeCharacter<Encoding::UTF_8>(decoded.codepoint)};
        for (int j{0}; j < utf8.bytes; ++j) {
          EmitChar(tokens, utf8.buffer[j]);
        }
        // Leave the cursor on the last byte of the decoded character; the
        // next NextChar() call steps past it.
        at_ += decoded.bytes - 1;
      } else {
        Say(GetProvenanceRange(start, at_),
            "Bad character in Hollerith literal"_err_en_US);
        break;
      }
    }
  }
  // Step past the final character of the literal unless the line has ended.
  if (*at_ != '\n') {
    NextChar();
  }
  inCharLiteral_ = false;
}
730 
// In fixed form, source card images are processed as if they were at least
// as wide as the fixed-form column limit when inside a character literal.
// When the character after the cursor is a newline on a tab-free fixed-form
// line that is shorter than the limit, emits one padding blank, advances the
// column, and returns true.  Once the limit has been reached, tries to move
// on to a fixed-form continuation line instead.  Returns false when no
// padding blank was emitted.
bool Prescanner::PadOutCharacterLiteral(TokenSequence &tokens) {
  while (inFixedForm_ && !tabInCurrentLine_ && at_[1] == '\n') {
    if (column_ < fixedFormColumnLimit_) {
      tokens.PutNextTokenChar(' ', spaceProvenance_);
      ++column_;
      return true;
    }
    if (!FixedFormContinuation(false /*no need to insert space*/) ||
        tabInCurrentLine_) {
      return false;
    }
    CHECK(column_ == 7);
    // Back up by one so that the caller's next advance lands on the first
    // character of the continuation line's statement field.
    --at_; // point to column 6 of continuation line
    column_ = 6;
  }
  return false;
}
750 
751 bool Prescanner::IsFixedFormCommentLine(const char *start) const {
752   const char *p{start};
753   if (IsFixedFormCommentChar(*p) || *p == '%' || // VAX %list, %eject, &c.
754       ((*p == 'D' || *p == 'd') &&
755           !features_.IsEnabled(LanguageFeature::OldDebugLines))) {
756     return true;
757   }
758   bool anyTabs{false};
759   while (true) {
760     if (*p == ' ') {
761       ++p;
762     } else if (*p == '\t') {
763       anyTabs = true;
764       ++p;
765     } else if (*p == '0' && !anyTabs && p == start + 5) {
766       ++p; // 0 in column 6 must treated as a space
767     } else {
768       break;
769     }
770   }
771   if (!anyTabs && p >= start + fixedFormColumnLimit_) {
772     return true;
773   }
774   if (*p == '!' && !inCharLiteral_ && (anyTabs || p != start + 5)) {
775     return true;
776   }
777   return *p == '\n';
778 }
779 
780 const char *Prescanner::IsFreeFormComment(const char *p) const {
781   p = SkipWhiteSpaceAndCComments(p);
782   if (*p == '!' || *p == '\n') {
783     return p;
784   } else {
785     return nullptr;
786   }
787 }
788 
789 std::optional<std::size_t> Prescanner::IsIncludeLine(const char *start) const {
790   const char *p{SkipWhiteSpace(start)};
791   for (char ch : "include"s) {
792     if (ToLowerCaseLetter(*p++) != ch) {
793       return std::nullopt;
794     }
795   }
796   p = SkipWhiteSpace(p);
797   if (*p == '"' || *p == '\'') {
798     return {p - start};
799   }
800   return std::nullopt;
801 }
802 
803 void Prescanner::FortranInclude(const char *firstQuote) {
804   const char *p{firstQuote};
805   while (*p != '"' && *p != '\'') {
806     ++p;
807   }
808   char quote{*p};
809   std::string path;
810   for (++p; *p != '\n'; ++p) {
811     if (*p == quote) {
812       if (p[1] != quote) {
813         break;
814       }
815       ++p;
816     }
817     path += *p;
818   }
819   if (*p != quote) {
820     Say(GetProvenanceRange(firstQuote, p),
821         "malformed path name string"_err_en_US);
822     return;
823   }
824   p = SkipWhiteSpace(p + 1);
825   if (*p != '\n' && *p != '!') {
826     const char *garbage{p};
827     for (; *p != '\n' && *p != '!'; ++p) {
828     }
829     Say(GetProvenanceRange(garbage, p),
830         "excess characters after path name"_en_US);
831   }
832   std::string buf;
833   llvm::raw_string_ostream error{buf};
834   Provenance provenance{GetProvenance(nextLine_)};
835   std::optional<std::string> prependPath;
836   if (const SourceFile * currentFile{allSources_.GetSourceFile(provenance)}) {
837     prependPath = DirectoryName(currentFile->path());
838   }
839   const SourceFile *included{
840       allSources_.Open(path, error, std::move(prependPath))};
841   if (!included) {
842     Say(provenance, "INCLUDE: %s"_err_en_US, error.str());
843   } else if (included->bytes() > 0) {
844     ProvenanceRange includeLineRange{
845         provenance, static_cast<std::size_t>(p - nextLine_)};
846     ProvenanceRange fileRange{
847         allSources_.AddIncludedFile(*included, includeLineRange)};
848     Prescanner{*this}.set_encoding(included->encoding()).Prescan(fileRange);
849   }
850 }
851 
852 const char *Prescanner::IsPreprocessorDirectiveLine(const char *start) const {
853   const char *p{start};
854   for (; *p == ' '; ++p) {
855   }
856   if (*p == '#') {
857     if (inFixedForm_ && p == start + 5) {
858       return nullptr;
859     }
860   } else {
861     p = SkipWhiteSpace(p);
862     if (*p != '#') {
863       return nullptr;
864     }
865   }
866   return SkipWhiteSpace(p + 1);
867 }
868 
869 bool Prescanner::IsNextLinePreprocessorDirective() const {
870   return IsPreprocessorDirectiveLine(nextLine_) != nullptr;
871 }
872 
873 bool Prescanner::SkipCommentLine(bool afterAmpersand) {
874   if (IsAtEnd()) {
875     if (afterAmpersand && prescannerNesting_ > 0) {
876       // A continuation marker at the end of the last line in an
877       // include file inhibits the newline for that line.
878       SkipToEndOfLine();
879       omitNewline_ = true;
880     }
881     return false;
882   }
883   auto lineClass{ClassifyLine(nextLine_)};
884   if (lineClass.kind == LineClassification::Kind::Comment) {
885     NextLine();
886     return true;
887   } else if (inPreprocessorDirective_) {
888     return false;
889   } else if (lineClass.kind ==
890           LineClassification::Kind::ConditionalCompilationDirective ||
891       lineClass.kind == LineClassification::Kind::PreprocessorDirective) {
892     // Allow conditional compilation directives (e.g., #ifdef) to affect
893     // continuation lines.
894     // Allow other preprocessor directives, too, except #include
895     // (when it does not follow '&'), #define, and #undef (because
896     // they cannot be allowed to affect preceding text on a
897     // continued line).
898     preprocessor_.Directive(TokenizePreprocessorDirective(), *this);
899     return true;
900   } else if (afterAmpersand &&
901       (lineClass.kind == LineClassification::Kind::IncludeDirective ||
902           lineClass.kind == LineClassification::Kind::IncludeLine)) {
903     SkipToEndOfLine();
904     omitNewline_ = true;
905     skipLeadingAmpersand_ = true;
906     return false;
907   } else {
908     return false;
909   }
910 }
911 
912 const char *Prescanner::FixedFormContinuationLine(bool mightNeedSpace) {
913   if (IsAtEnd()) {
914     return nullptr;
915   }
916   tabInCurrentLine_ = false;
917   char col1{*nextLine_};
918   if (InCompilerDirective()) {
919     // Must be a continued compiler directive.
920     if (!IsFixedFormCommentChar(col1)) {
921       return nullptr;
922     }
923     int j{1};
924     for (; j < 5; ++j) {
925       char ch{directiveSentinel_[j - 1]};
926       if (ch == '\0') {
927         break;
928       }
929       if (ch != ToLowerCaseLetter(nextLine_[j])) {
930         return nullptr;
931       }
932     }
933     for (; j < 5; ++j) {
934       if (nextLine_[j] != ' ') {
935         return nullptr;
936       }
937     }
938     char col6{nextLine_[5]};
939     if (col6 != '\n' && col6 != '\t' && col6 != ' ' && col6 != '0') {
940       if (nextLine_[6] != ' ' && mightNeedSpace) {
941         insertASpace_ = true;
942       }
943       return nextLine_ + 6;
944     }
945     return nullptr;
946   } else {
947     // Normal case: not in a compiler directive.
948     if (col1 == '&' &&
949         features_.IsEnabled(
950             LanguageFeature::FixedFormContinuationWithColumn1Ampersand)) {
951       // Extension: '&' as continuation marker
952       if (features_.ShouldWarn(
953               LanguageFeature::FixedFormContinuationWithColumn1Ampersand)) {
954         Say(GetProvenance(nextLine_), "nonstandard usage"_en_US);
955       }
956       return nextLine_ + 1;
957     }
958     if (col1 == '\t' && nextLine_[1] >= '1' && nextLine_[1] <= '9') {
959       tabInCurrentLine_ = true;
960       return nextLine_ + 2; // VAX extension
961     }
962     if (col1 == ' ' && nextLine_[1] == ' ' && nextLine_[2] == ' ' &&
963         nextLine_[3] == ' ' && nextLine_[4] == ' ') {
964       char col6{nextLine_[5]};
965       if (col6 != '\n' && col6 != '\t' && col6 != ' ' && col6 != '0') {
966         return nextLine_ + 6;
967       }
968     }
969     if (IsImplicitContinuation()) {
970       return nextLine_;
971     }
972   }
973   return nullptr; // not a continuation line
974 }
975 
976 const char *Prescanner::FreeFormContinuationLine(bool ampersand) {
977   const char *p{nextLine_};
978   if (p >= limit_) {
979     return nullptr;
980   }
981   p = SkipWhiteSpace(p);
982   if (InCompilerDirective()) {
983     if (*p++ != '!') {
984       return nullptr;
985     }
986     for (const char *s{directiveSentinel_}; *s != '\0'; ++p, ++s) {
987       if (*s != ToLowerCaseLetter(*p)) {
988         return nullptr;
989       }
990     }
991     p = SkipWhiteSpace(p);
992     if (*p == '&') {
993       if (!ampersand) {
994         insertASpace_ = true;
995       }
996       return p + 1;
997     } else if (ampersand) {
998       return p;
999     } else {
1000       return nullptr;
1001     }
1002   } else {
1003     if (*p == '&') {
1004       return p + 1;
1005     } else if (*p == '!' || *p == '\n' || *p == '#') {
1006       return nullptr;
1007     } else if (ampersand || IsImplicitContinuation()) {
1008       if (p > nextLine_) {
1009         --p;
1010       } else {
1011         insertASpace_ = true;
1012       }
1013       return p;
1014     } else {
1015       return nullptr;
1016     }
1017   }
1018 }
1019 
1020 bool Prescanner::FixedFormContinuation(bool mightNeedSpace) {
1021   // N.B. We accept '&' as a continuation indicator in fixed form, too,
1022   // but not in a character literal.
1023   if (*at_ == '&' && inCharLiteral_) {
1024     return false;
1025   }
1026   do {
1027     if (const char *cont{FixedFormContinuationLine(mightNeedSpace)}) {
1028       BeginSourceLine(cont);
1029       column_ = 7;
1030       NextLine();
1031       return true;
1032     }
1033   } while (SkipCommentLine(false /* not after ampersand */));
1034   return false;
1035 }
1036 
1037 bool Prescanner::FreeFormContinuation() {
1038   const char *p{at_};
1039   bool ampersand{*p == '&'};
1040   if (ampersand) {
1041     p = SkipWhiteSpace(p + 1);
1042   }
1043   if (*p != '\n') {
1044     if (inCharLiteral_) {
1045       return false;
1046     } else if (*p != '!' &&
1047         features_.ShouldWarn(LanguageFeature::CruftAfterAmpersand)) {
1048       Say(GetProvenance(p), "missing ! before comment after &"_en_US);
1049     }
1050   }
1051   do {
1052     if (const char *cont{FreeFormContinuationLine(ampersand)}) {
1053       BeginSourceLine(cont);
1054       NextLine();
1055       return true;
1056     }
1057   } while (SkipCommentLine(ampersand));
1058   return false;
1059 }
1060 
1061 // Implicit line continuation allows a preprocessor macro call with
1062 // arguments to span multiple lines.
1063 bool Prescanner::IsImplicitContinuation() const {
1064   return !inPreprocessorDirective_ && !inCharLiteral_ &&
1065       delimiterNesting_ > 0 && !IsAtEnd() &&
1066       ClassifyLine(nextLine_).kind == LineClassification::Kind::Source;
1067 }
1068 
1069 bool Prescanner::Continuation(bool mightNeedFixedFormSpace) {
1070   if (*at_ == '\n' || *at_ == '&') {
1071     if (inFixedForm_) {
1072       return FixedFormContinuation(mightNeedFixedFormSpace);
1073     } else {
1074       return FreeFormContinuation();
1075     }
1076   } else {
1077     return false;
1078   }
1079 }
1080 
1081 std::optional<Prescanner::LineClassification>
1082 Prescanner::IsFixedFormCompilerDirectiveLine(const char *start) const {
1083   const char *p{start};
1084   char col1{*p++};
1085   if (!IsFixedFormCommentChar(col1)) {
1086     return std::nullopt;
1087   }
1088   char sentinel[5], *sp{sentinel};
1089   int column{2};
1090   for (; column < 6; ++column, ++p) {
1091     if (*p != ' ') {
1092       if (*p == '\n' || *p == '\t') {
1093         break;
1094       }
1095       if (sp == sentinel + 1 && sentinel[0] == '$' && IsDecimalDigit(*p)) {
1096         // OpenMP conditional compilation line: leave the label alone
1097         break;
1098       }
1099       *sp++ = ToLowerCaseLetter(*p);
1100     }
1101   }
1102   if (column == 6) {
1103     if (*p == ' ' || *p == '\t' || *p == '0') {
1104       ++p;
1105     } else {
1106       // This is a Continuation line, not an initial directive line.
1107       return std::nullopt;
1108     }
1109   }
1110   if (sp == sentinel) {
1111     return std::nullopt;
1112   }
1113   *sp = '\0';
1114   if (const char *ss{IsCompilerDirectiveSentinel(sentinel)}) {
1115     std::size_t payloadOffset = p - start;
1116     return {LineClassification{
1117         LineClassification::Kind::CompilerDirective, payloadOffset, ss}};
1118   }
1119   return std::nullopt;
1120 }
1121 
1122 std::optional<Prescanner::LineClassification>
1123 Prescanner::IsFreeFormCompilerDirectiveLine(const char *start) const {
1124   char sentinel[8];
1125   const char *p{SkipWhiteSpace(start)};
1126   if (*p++ != '!') {
1127     return std::nullopt;
1128   }
1129   for (std::size_t j{0}; j + 1 < sizeof sentinel; ++p, ++j) {
1130     if (*p == '\n') {
1131       break;
1132     }
1133     if (*p == ' ' || *p == '\t' || *p == '&') {
1134       if (j == 0) {
1135         break;
1136       }
1137       sentinel[j] = '\0';
1138       p = SkipWhiteSpace(p + 1);
1139       if (*p == '!') {
1140         break;
1141       }
1142       if (const char *sp{IsCompilerDirectiveSentinel(sentinel)}) {
1143         std::size_t offset = p - start;
1144         return {LineClassification{
1145             LineClassification::Kind::CompilerDirective, offset, sp}};
1146       }
1147       break;
1148     }
1149     sentinel[j] = ToLowerCaseLetter(*p);
1150   }
1151   return std::nullopt;
1152 }
1153 
1154 Prescanner &Prescanner::AddCompilerDirectiveSentinel(const std::string &dir) {
1155   std::uint64_t packed{0};
1156   for (char ch : dir) {
1157     packed = (packed << 8) | (ToLowerCaseLetter(ch) & 0xff);
1158   }
1159   compilerDirectiveBloomFilter_.set(packed % prime1);
1160   compilerDirectiveBloomFilter_.set(packed % prime2);
1161   compilerDirectiveSentinels_.insert(dir);
1162   return *this;
1163 }
1164 
1165 const char *Prescanner::IsCompilerDirectiveSentinel(
1166     const char *sentinel) const {
1167   std::uint64_t packed{0};
1168   std::size_t n{0};
1169   for (; sentinel[n] != '\0'; ++n) {
1170     packed = (packed << 8) | (sentinel[n] & 0xff);
1171   }
1172   if (n == 0 || !compilerDirectiveBloomFilter_.test(packed % prime1) ||
1173       !compilerDirectiveBloomFilter_.test(packed % prime2)) {
1174     return nullptr;
1175   }
1176   const auto iter{compilerDirectiveSentinels_.find(std::string(sentinel, n))};
1177   return iter == compilerDirectiveSentinels_.end() ? nullptr : iter->c_str();
1178 }
1179 
1180 constexpr bool IsDirective(const char *match, const char *dir) {
1181   for (; *match; ++match) {
1182     if (*match != ToLowerCaseLetter(*dir++)) {
1183       return false;
1184     }
1185   }
1186   return true;
1187 }
1188 
1189 Prescanner::LineClassification Prescanner::ClassifyLine(
1190     const char *start) const {
1191   if (inFixedForm_) {
1192     if (std::optional<LineClassification> lc{
1193             IsFixedFormCompilerDirectiveLine(start)}) {
1194       return std::move(*lc);
1195     }
1196     if (IsFixedFormCommentLine(start)) {
1197       return {LineClassification::Kind::Comment};
1198     }
1199   } else {
1200     if (std::optional<LineClassification> lc{
1201             IsFreeFormCompilerDirectiveLine(start)}) {
1202       return std::move(*lc);
1203     }
1204     if (const char *bang{IsFreeFormComment(start)}) {
1205       return {LineClassification::Kind::Comment,
1206           static_cast<std::size_t>(bang - start)};
1207     }
1208   }
1209   if (std::optional<std::size_t> quoteOffset{IsIncludeLine(start)}) {
1210     return {LineClassification::Kind::IncludeLine, *quoteOffset};
1211   }
1212   if (const char *dir{IsPreprocessorDirectiveLine(start)}) {
1213     if (IsDirective("if", dir) || IsDirective("elif", dir) ||
1214         IsDirective("else", dir) || IsDirective("endif", dir)) {
1215       return {LineClassification::Kind::ConditionalCompilationDirective};
1216     } else if (IsDirective("include", dir)) {
1217       return {LineClassification::Kind::IncludeDirective};
1218     } else if (IsDirective("define", dir) || IsDirective("undef", dir)) {
1219       return {LineClassification::Kind::DefinitionDirective};
1220     } else {
1221       return {LineClassification::Kind::PreprocessorDirective};
1222     }
1223   }
1224   return {LineClassification::Kind::Source};
1225 }
1226 
1227 void Prescanner::SourceFormChange(std::string &&dir) {
1228   if (dir == "!dir$ free") {
1229     inFixedForm_ = false;
1230   } else if (dir == "!dir$ fixed") {
1231     inFixedForm_ = true;
1232   }
1233 }
1234 } // namespace Fortran::parser
1235