1 //===--- DirectiveTree.cpp - Find and strip preprocessor directives -------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "clang-pseudo/DirectiveTree.h"
10 #include "clang/Basic/IdentifierTable.h"
11 #include "clang/Basic/TokenKinds.h"
12 #include "llvm/Support/FormatVariadic.h"
13 
14 namespace clang {
15 namespace pseudo {
16 namespace {
17 
18 class DirectiveParser {
19 public:
DirectiveParser(const TokenStream & Code)20   explicit DirectiveParser(const TokenStream &Code)
21       : Code(Code), Tok(&Code.front()) {}
parse(DirectiveTree * Result)22   void parse(DirectiveTree *Result) { parse(Result, /*TopLevel=*/true); }
23 
24 private:
25   // Roles that a directive might take within a conditional block.
26   enum class Cond { None, If, Else, End };
classifyDirective(tok::PPKeywordKind K)27   static Cond classifyDirective(tok::PPKeywordKind K) {
28     switch (K) {
29     case clang::tok::pp_if:
30     case clang::tok::pp_ifdef:
31     case clang::tok::pp_ifndef:
32       return Cond::If;
33     case clang::tok::pp_elif:
34     case clang::tok::pp_elifdef:
35     case clang::tok::pp_elifndef:
36     case clang::tok::pp_else:
37       return Cond::Else;
38     case clang::tok::pp_endif:
39       return Cond::End;
40     default:
41       return Cond::None;
42     }
43   }
44 
45   // Parses tokens starting at Tok into Tree.
46   // If we reach an End or Else directive that ends Tree, returns it.
47   // If TopLevel is true, then we do not expect End and always return None.
parse(DirectiveTree * Tree,bool TopLevel)48   llvm::Optional<DirectiveTree::Directive> parse(DirectiveTree *Tree,
49                                                 bool TopLevel) {
50     auto StartsDirective =
51         [&, AllowDirectiveAt((const Token *)nullptr)]() mutable {
52           if (Tok->flag(LexFlags::StartsPPLine)) {
53             // If we considered a comment at the start of a PP-line, it doesn't
54             // start a directive but the directive can still start after it.
55             if (Tok->Kind == tok::comment)
56               AllowDirectiveAt = Tok + 1;
57             return Tok->Kind == tok::hash;
58           }
59           return Tok->Kind == tok::hash && AllowDirectiveAt == Tok;
60         };
61     // Each iteration adds one chunk (or returns, if we see #endif).
62     while (Tok->Kind != tok::eof) {
63       // If there's no directive here, we have a code chunk.
64       if (!StartsDirective()) {
65         const Token *Start = Tok;
66         do
67           ++Tok;
68         while (Tok->Kind != tok::eof && !StartsDirective());
69         Tree->Chunks.push_back(DirectiveTree::Code{
70             Token::Range{Code.index(*Start), Code.index(*Tok)}});
71         continue;
72       }
73 
74       // We have some kind of directive.
75       DirectiveTree::Directive Directive;
76       parseDirective(&Directive);
77       Cond Kind = classifyDirective(Directive.Kind);
78       if (Kind == Cond::If) {
79         // #if or similar, starting a nested conditional block.
80         DirectiveTree::Conditional Conditional;
81         Conditional.Branches.emplace_back();
82         Conditional.Branches.back().first = std::move(Directive);
83         parseConditional(&Conditional);
84         Tree->Chunks.push_back(std::move(Conditional));
85       } else if ((Kind == Cond::Else || Kind == Cond::End) && !TopLevel) {
86         // #endif or similar, ending this PStructure scope.
87         // (#endif is unexpected at the top level, treat as simple directive).
88         return std::move(Directive);
89       } else {
90         // #define or similar, a simple directive at the current scope.
91         Tree->Chunks.push_back(std::move(Directive));
92       }
93     }
94     return None;
95   }
96 
97   // Parse the rest of a conditional section, after seeing the If directive.
98   // Returns after consuming the End directive.
parseConditional(DirectiveTree::Conditional * C)99   void parseConditional(DirectiveTree::Conditional *C) {
100     assert(C->Branches.size() == 1 &&
101            C->Branches.front().second.Chunks.empty() &&
102            "Should be ready to parse first branch body");
103     while (Tok->Kind != tok::eof) {
104       auto Terminator = parse(&C->Branches.back().second, /*TopLevel=*/false);
105       if (!Terminator) {
106         assert(Tok->Kind == tok::eof && "gave up parsing before eof?");
107         C->End.Tokens = Token::Range::emptyAt(Code.index(*Tok));
108         return;
109       }
110       if (classifyDirective(Terminator->Kind) == Cond::End) {
111         C->End = std::move(*Terminator);
112         return;
113       }
114       assert(classifyDirective(Terminator->Kind) == Cond::Else &&
115              "ended branch unexpectedly");
116       C->Branches.emplace_back();
117       C->Branches.back().first = std::move(*Terminator);
118     }
119   }
120 
121   // Parse a directive. Tok is the hash.
parseDirective(DirectiveTree::Directive * D)122   void parseDirective(DirectiveTree::Directive *D) {
123     assert(Tok->Kind == tok::hash);
124 
125     // Directive spans from the hash until the end of line or file.
126     const Token *Begin = Tok++;
127     while (Tok->Kind != tok::eof && !Tok->flag(LexFlags::StartsPPLine))
128       ++Tok;
129     ArrayRef<Token> Tokens{Begin, Tok};
130     D->Tokens = {Code.index(*Tokens.begin()), Code.index(*Tokens.end())};
131 
132     // Directive name is the first non-comment token after the hash.
133     Tokens = Tokens.drop_front().drop_while(
134         [](const Token &T) { return T.Kind == tok::comment; });
135     if (!Tokens.empty())
136       D->Kind = PPKeywords.get(Tokens.front().text()).getPPKeywordID();
137   }
138 
139   const TokenStream &Code;
140   const Token *Tok;
141   clang::IdentifierTable PPKeywords;
142 };
143 
144 } // namespace
145 
parse(const TokenStream & Code)146 DirectiveTree DirectiveTree::parse(const TokenStream &Code) {
147   DirectiveTree Result;
148   DirectiveParser(Code).parse(&Result);
149   return Result;
150 }
151 
152 static void dump(llvm::raw_ostream &OS, const DirectiveTree &, unsigned Indent);
dump(llvm::raw_ostream & OS,const DirectiveTree::Directive & Directive,unsigned Indent,bool Taken=false)153 static void dump(llvm::raw_ostream &OS,
154                  const DirectiveTree::Directive &Directive, unsigned Indent,
155                  bool Taken = false) {
156   OS.indent(Indent) << llvm::formatv(
157       "#{0} ({1} tokens){2}\n", tok::getPPKeywordSpelling(Directive.Kind),
158       Directive.Tokens.size(), Taken ? " TAKEN" : "");
159 }
dump(llvm::raw_ostream & OS,const DirectiveTree::Code & Code,unsigned Indent)160 static void dump(llvm::raw_ostream &OS, const DirectiveTree::Code &Code,
161                  unsigned Indent) {
162   OS.indent(Indent) << llvm::formatv("code ({0} tokens)\n", Code.Tokens.size());
163 }
dump(llvm::raw_ostream & OS,const DirectiveTree::Conditional & Conditional,unsigned Indent)164 static void dump(llvm::raw_ostream &OS,
165                  const DirectiveTree::Conditional &Conditional,
166                  unsigned Indent) {
167   for (unsigned I = 0; I < Conditional.Branches.size(); ++I) {
168     const auto &Branch = Conditional.Branches[I];
169     dump(OS, Branch.first, Indent, Conditional.Taken == I);
170     dump(OS, Branch.second, Indent + 2);
171   }
172   dump(OS, Conditional.End, Indent);
173 }
174 
dump(llvm::raw_ostream & OS,const DirectiveTree::Chunk & Chunk,unsigned Indent)175 static void dump(llvm::raw_ostream &OS, const DirectiveTree::Chunk &Chunk,
176                  unsigned Indent) {
177   switch (Chunk.kind()) {
178   case DirectiveTree::Chunk::K_Empty:
179     llvm_unreachable("invalid chunk");
180   case DirectiveTree::Chunk::K_Code:
181     return dump(OS, (const DirectiveTree::Code &)Chunk, Indent);
182   case DirectiveTree::Chunk::K_Directive:
183     return dump(OS, (const DirectiveTree::Directive &)Chunk, Indent);
184   case DirectiveTree::Chunk::K_Conditional:
185     return dump(OS, (const DirectiveTree::Conditional &)Chunk, Indent);
186   }
187 }
188 
dump(llvm::raw_ostream & OS,const DirectiveTree & Tree,unsigned Indent)189 static void dump(llvm::raw_ostream &OS, const DirectiveTree &Tree,
190                  unsigned Indent) {
191   for (const auto &Chunk : Tree.Chunks)
192     dump(OS, Chunk, Indent);
193 }
194 
195 // Define operator<< in terms of dump() functions above.
196 #define OSTREAM_DUMP(Type)                                                     \
197   llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Type &T) {        \
198     dump(OS, T, 0);                                                            \
199     return OS;                                                                 \
200   }
201 OSTREAM_DUMP(DirectiveTree)
202 OSTREAM_DUMP(DirectiveTree::Chunk)
203 OSTREAM_DUMP(DirectiveTree::Directive)
204 OSTREAM_DUMP(DirectiveTree::Conditional)
205 OSTREAM_DUMP(DirectiveTree::Code)
206 #undef OSTREAM_DUMP
207 
208 namespace {
209 // Makes choices about conditional branches.
210 //
211 // Generally it tries to maximize the amount of useful code we see.
212 //
213 // Caveat: each conditional is evaluated independently. Consider this code:
214 //   #ifdef WINDOWS
215 //     bool isWindows = true;
216 //   #endif
217 //   #ifndef WINDOWS
218 //     bool isWindows = false;
219 //   #endif
220 // We take both branches and define isWindows twice. We could track more state
221 // in order to produce a consistent view, but this is complex.
222 class BranchChooser {
223 public:
BranchChooser(const TokenStream & Code)224   BranchChooser(const TokenStream &Code) : Code(Code) {}
225 
choose(DirectiveTree & M)226   void choose(DirectiveTree &M) { walk(M); }
227 
228 private:
229   // Describes code seen by making particular branch choices. Higher is better.
230   struct Score {
231     int Tokens = 0; // excluding comments and directives
232     int Directives = 0;
233     int Errors = 0; // #error directives
234 
operator >clang::pseudo::__anon141705870411::BranchChooser::Score235     bool operator>(const Score &Other) const {
236       // Seeing errors is bad, other things are good.
237       return std::make_tuple(-Errors, Tokens, Directives) >
238              std::make_tuple(-Other.Errors, Other.Tokens, Other.Directives);
239     }
240 
operator +=clang::pseudo::__anon141705870411::BranchChooser::Score241     Score &operator+=(const Score &Other) {
242       Tokens += Other.Tokens;
243       Directives += Other.Directives;
244       Errors += Other.Errors;
245       return *this;
246     }
247   };
248 
walk(DirectiveTree::Code & C)249   Score walk(DirectiveTree::Code &C) {
250     Score S;
251     for (const Token &T : Code.tokens(C.Tokens))
252       if (T.Kind != tok::comment)
253         ++S.Tokens;
254     return S;
255   }
256 
walk(DirectiveTree::Directive & D)257   Score walk(DirectiveTree::Directive &D) {
258     Score S;
259     S.Directives = 1;
260     S.Errors = D.Kind == tok::pp_error;
261     return S;
262   }
263 
walk(DirectiveTree::Chunk & C)264   Score walk(DirectiveTree::Chunk &C) {
265     switch (C.kind()) {
266     case DirectiveTree::Chunk::K_Code:
267       return walk((DirectiveTree::Code &)C);
268     case DirectiveTree::Chunk::K_Directive:
269       return walk((DirectiveTree::Directive &)C);
270     case DirectiveTree::Chunk::K_Conditional:
271       return walk((DirectiveTree::Conditional &)C);
272     case DirectiveTree::Chunk::K_Empty:
273       break;
274     }
275     llvm_unreachable("bad chunk kind");
276   }
277 
walk(DirectiveTree & M)278   Score walk(DirectiveTree &M) {
279     Score S;
280     for (DirectiveTree::Chunk &C : M.Chunks)
281       S += walk(C);
282     return S;
283   }
284 
walk(DirectiveTree::Conditional & C)285   Score walk(DirectiveTree::Conditional &C) {
286     Score Best;
287     bool MayTakeTrivial = true;
288     bool TookTrivial = false;
289 
290     for (unsigned I = 0; I < C.Branches.size(); ++I) {
291       // Walk the branch to make its nested choices in any case.
292       Score BranchScore = walk(C.Branches[I].second);
293       // If we already took an #if 1, don't consider any other branches.
294       if (TookTrivial)
295         continue;
296       // Is this a trivial #if 0 or #if 1?
297       if (auto TriviallyTaken = isTakenWhenReached(C.Branches[I].first)) {
298         if (!*TriviallyTaken)
299           continue; // Don't consider #if 0 even if it scores well.
300         if (MayTakeTrivial)
301           TookTrivial = true;
302       } else {
303         // After a nontrivial condition, #elif 1 isn't guaranteed taken.
304         MayTakeTrivial = false;
305       }
306       // Is this the best branch so far? (Including if it's #if 1).
307       if (TookTrivial || !C.Taken || BranchScore > Best) {
308         Best = BranchScore;
309         C.Taken = I;
310       }
311     }
312     return Best;
313   }
314 
315   // Return true if the directive starts an always-taken conditional branch,
316   // false if the branch is never taken, and None otherwise.
isTakenWhenReached(const DirectiveTree::Directive & Dir)317   llvm::Optional<bool> isTakenWhenReached(const DirectiveTree::Directive &Dir) {
318     switch (Dir.Kind) {
319     case clang::tok::pp_if:
320     case clang::tok::pp_elif:
321       break; // handled below
322     case clang::tok::pp_else:
323       return true;
324     default: // #ifdef etc
325       return llvm::None;
326     }
327 
328     const auto &Tokens = Code.tokens(Dir.Tokens);
329     assert(!Tokens.empty() && Tokens.front().Kind == tok::hash);
330     const Token &Name = Tokens.front().nextNC();
331     const Token &Value = Name.nextNC();
332     // Does the condition consist of exactly one token?
333     if (&Value >= Tokens.end() || &Value.nextNC() < Tokens.end())
334       return llvm::None;
335     return llvm::StringSwitch<llvm::Optional<bool>>(Value.text())
336         .Cases("true", "1", true)
337         .Cases("false", "0", false)
338         .Default(llvm::None);
339   }
340 
341   const TokenStream &Code;
342 };
343 
344 } // namespace
345 
chooseConditionalBranches(DirectiveTree & Tree,const TokenStream & Code)346 void chooseConditionalBranches(DirectiveTree &Tree, const TokenStream &Code) {
347   BranchChooser{Code}.choose(Tree);
348 }
349 
350 namespace {
351 class Preprocessor {
352   const TokenStream &In;
353   TokenStream &Out;
354 
355 public:
Preprocessor(const TokenStream & In,TokenStream & Out)356   Preprocessor(const TokenStream &In, TokenStream &Out) : In(In), Out(Out) {}
~Preprocessor()357   ~Preprocessor() { Out.finalize(); }
358 
walk(const DirectiveTree & T)359   void walk(const DirectiveTree &T) {
360     for (const auto &C : T.Chunks)
361       walk(C);
362   }
363 
walk(const DirectiveTree::Chunk & C)364   void walk(const DirectiveTree::Chunk &C) {
365     switch (C.kind()) {
366     case DirectiveTree::Chunk::K_Code:
367       return walk((const DirectiveTree::Code &)C);
368     case DirectiveTree::Chunk::K_Directive:
369       return walk((const DirectiveTree::Directive &)C);
370     case DirectiveTree::Chunk::K_Conditional:
371       return walk((const DirectiveTree::Conditional &)C);
372     case DirectiveTree::Chunk::K_Empty:
373       break;
374     }
375     llvm_unreachable("bad chunk kind");
376   }
377 
walk(const DirectiveTree::Code & C)378   void walk(const DirectiveTree::Code &C) {
379     for (const auto &Tok : In.tokens(C.Tokens))
380       Out.push(Tok);
381   }
382 
walk(const DirectiveTree::Directive &)383   void walk(const DirectiveTree::Directive &) {}
384 
walk(const DirectiveTree::Conditional & C)385   void walk(const DirectiveTree::Conditional &C) {
386     if (C.Taken)
387       walk(C.Branches[*C.Taken].second);
388   }
389 };
390 } // namespace
391 
stripDirectives(const TokenStream & In) const392 TokenStream DirectiveTree::stripDirectives(const TokenStream &In) const {
393   TokenStream Out;
394   Preprocessor(In, Out).walk(*this);
395   return Out;
396 }
397 
398 } // namespace pseudo
399 } // namespace clang
400