1 //===--- DirectiveTree.cpp - Find and strip preprocessor directives -------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8
9 #include "clang-pseudo/DirectiveTree.h"
10 #include "clang/Basic/IdentifierTable.h"
11 #include "clang/Basic/TokenKinds.h"
12 #include "llvm/Support/FormatVariadic.h"
13
14 namespace clang {
15 namespace pseudo {
16 namespace {
17
18 class DirectiveParser {
19 public:
DirectiveParser(const TokenStream & Code)20 explicit DirectiveParser(const TokenStream &Code)
21 : Code(Code), Tok(&Code.front()) {}
parse(DirectiveTree * Result)22 void parse(DirectiveTree *Result) { parse(Result, /*TopLevel=*/true); }
23
24 private:
25 // Roles that a directive might take within a conditional block.
26 enum class Cond { None, If, Else, End };
classifyDirective(tok::PPKeywordKind K)27 static Cond classifyDirective(tok::PPKeywordKind K) {
28 switch (K) {
29 case clang::tok::pp_if:
30 case clang::tok::pp_ifdef:
31 case clang::tok::pp_ifndef:
32 return Cond::If;
33 case clang::tok::pp_elif:
34 case clang::tok::pp_elifdef:
35 case clang::tok::pp_elifndef:
36 case clang::tok::pp_else:
37 return Cond::Else;
38 case clang::tok::pp_endif:
39 return Cond::End;
40 default:
41 return Cond::None;
42 }
43 }
44
45 // Parses tokens starting at Tok into Tree.
46 // If we reach an End or Else directive that ends Tree, returns it.
47 // If TopLevel is true, then we do not expect End and always return None.
parse(DirectiveTree * Tree,bool TopLevel)48 llvm::Optional<DirectiveTree::Directive> parse(DirectiveTree *Tree,
49 bool TopLevel) {
50 auto StartsDirective =
51 [&, AllowDirectiveAt((const Token *)nullptr)]() mutable {
52 if (Tok->flag(LexFlags::StartsPPLine)) {
53 // If we considered a comment at the start of a PP-line, it doesn't
54 // start a directive but the directive can still start after it.
55 if (Tok->Kind == tok::comment)
56 AllowDirectiveAt = Tok + 1;
57 return Tok->Kind == tok::hash;
58 }
59 return Tok->Kind == tok::hash && AllowDirectiveAt == Tok;
60 };
61 // Each iteration adds one chunk (or returns, if we see #endif).
62 while (Tok->Kind != tok::eof) {
63 // If there's no directive here, we have a code chunk.
64 if (!StartsDirective()) {
65 const Token *Start = Tok;
66 do
67 ++Tok;
68 while (Tok->Kind != tok::eof && !StartsDirective());
69 Tree->Chunks.push_back(DirectiveTree::Code{
70 Token::Range{Code.index(*Start), Code.index(*Tok)}});
71 continue;
72 }
73
74 // We have some kind of directive.
75 DirectiveTree::Directive Directive;
76 parseDirective(&Directive);
77 Cond Kind = classifyDirective(Directive.Kind);
78 if (Kind == Cond::If) {
79 // #if or similar, starting a nested conditional block.
80 DirectiveTree::Conditional Conditional;
81 Conditional.Branches.emplace_back();
82 Conditional.Branches.back().first = std::move(Directive);
83 parseConditional(&Conditional);
84 Tree->Chunks.push_back(std::move(Conditional));
85 } else if ((Kind == Cond::Else || Kind == Cond::End) && !TopLevel) {
86 // #endif or similar, ending this PStructure scope.
87 // (#endif is unexpected at the top level, treat as simple directive).
88 return std::move(Directive);
89 } else {
90 // #define or similar, a simple directive at the current scope.
91 Tree->Chunks.push_back(std::move(Directive));
92 }
93 }
94 return None;
95 }
96
97 // Parse the rest of a conditional section, after seeing the If directive.
98 // Returns after consuming the End directive.
parseConditional(DirectiveTree::Conditional * C)99 void parseConditional(DirectiveTree::Conditional *C) {
100 assert(C->Branches.size() == 1 &&
101 C->Branches.front().second.Chunks.empty() &&
102 "Should be ready to parse first branch body");
103 while (Tok->Kind != tok::eof) {
104 auto Terminator = parse(&C->Branches.back().second, /*TopLevel=*/false);
105 if (!Terminator) {
106 assert(Tok->Kind == tok::eof && "gave up parsing before eof?");
107 C->End.Tokens = Token::Range::emptyAt(Code.index(*Tok));
108 return;
109 }
110 if (classifyDirective(Terminator->Kind) == Cond::End) {
111 C->End = std::move(*Terminator);
112 return;
113 }
114 assert(classifyDirective(Terminator->Kind) == Cond::Else &&
115 "ended branch unexpectedly");
116 C->Branches.emplace_back();
117 C->Branches.back().first = std::move(*Terminator);
118 }
119 }
120
121 // Parse a directive. Tok is the hash.
parseDirective(DirectiveTree::Directive * D)122 void parseDirective(DirectiveTree::Directive *D) {
123 assert(Tok->Kind == tok::hash);
124
125 // Directive spans from the hash until the end of line or file.
126 const Token *Begin = Tok++;
127 while (Tok->Kind != tok::eof && !Tok->flag(LexFlags::StartsPPLine))
128 ++Tok;
129 ArrayRef<Token> Tokens{Begin, Tok};
130 D->Tokens = {Code.index(*Tokens.begin()), Code.index(*Tokens.end())};
131
132 // Directive name is the first non-comment token after the hash.
133 Tokens = Tokens.drop_front().drop_while(
134 [](const Token &T) { return T.Kind == tok::comment; });
135 if (!Tokens.empty())
136 D->Kind = PPKeywords.get(Tokens.front().text()).getPPKeywordID();
137 }
138
139 const TokenStream &Code;
140 const Token *Tok;
141 clang::IdentifierTable PPKeywords;
142 };
143
144 } // namespace
145
parse(const TokenStream & Code)146 DirectiveTree DirectiveTree::parse(const TokenStream &Code) {
147 DirectiveTree Result;
148 DirectiveParser(Code).parse(&Result);
149 return Result;
150 }
151
152 static void dump(llvm::raw_ostream &OS, const DirectiveTree &, unsigned Indent);
dump(llvm::raw_ostream & OS,const DirectiveTree::Directive & Directive,unsigned Indent,bool Taken=false)153 static void dump(llvm::raw_ostream &OS,
154 const DirectiveTree::Directive &Directive, unsigned Indent,
155 bool Taken = false) {
156 OS.indent(Indent) << llvm::formatv(
157 "#{0} ({1} tokens){2}\n", tok::getPPKeywordSpelling(Directive.Kind),
158 Directive.Tokens.size(), Taken ? " TAKEN" : "");
159 }
dump(llvm::raw_ostream & OS,const DirectiveTree::Code & Code,unsigned Indent)160 static void dump(llvm::raw_ostream &OS, const DirectiveTree::Code &Code,
161 unsigned Indent) {
162 OS.indent(Indent) << llvm::formatv("code ({0} tokens)\n", Code.Tokens.size());
163 }
dump(llvm::raw_ostream & OS,const DirectiveTree::Conditional & Conditional,unsigned Indent)164 static void dump(llvm::raw_ostream &OS,
165 const DirectiveTree::Conditional &Conditional,
166 unsigned Indent) {
167 for (unsigned I = 0; I < Conditional.Branches.size(); ++I) {
168 const auto &Branch = Conditional.Branches[I];
169 dump(OS, Branch.first, Indent, Conditional.Taken == I);
170 dump(OS, Branch.second, Indent + 2);
171 }
172 dump(OS, Conditional.End, Indent);
173 }
174
dump(llvm::raw_ostream & OS,const DirectiveTree::Chunk & Chunk,unsigned Indent)175 static void dump(llvm::raw_ostream &OS, const DirectiveTree::Chunk &Chunk,
176 unsigned Indent) {
177 switch (Chunk.kind()) {
178 case DirectiveTree::Chunk::K_Empty:
179 llvm_unreachable("invalid chunk");
180 case DirectiveTree::Chunk::K_Code:
181 return dump(OS, (const DirectiveTree::Code &)Chunk, Indent);
182 case DirectiveTree::Chunk::K_Directive:
183 return dump(OS, (const DirectiveTree::Directive &)Chunk, Indent);
184 case DirectiveTree::Chunk::K_Conditional:
185 return dump(OS, (const DirectiveTree::Conditional &)Chunk, Indent);
186 }
187 }
188
dump(llvm::raw_ostream & OS,const DirectiveTree & Tree,unsigned Indent)189 static void dump(llvm::raw_ostream &OS, const DirectiveTree &Tree,
190 unsigned Indent) {
191 for (const auto &Chunk : Tree.Chunks)
192 dump(OS, Chunk, Indent);
193 }
194
195 // Define operator<< in terms of dump() functions above.
196 #define OSTREAM_DUMP(Type) \
197 llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Type &T) { \
198 dump(OS, T, 0); \
199 return OS; \
200 }
201 OSTREAM_DUMP(DirectiveTree)
202 OSTREAM_DUMP(DirectiveTree::Chunk)
203 OSTREAM_DUMP(DirectiveTree::Directive)
204 OSTREAM_DUMP(DirectiveTree::Conditional)
205 OSTREAM_DUMP(DirectiveTree::Code)
206 #undef OSTREAM_DUMP
207
208 namespace {
209 // Makes choices about conditional branches.
210 //
211 // Generally it tries to maximize the amount of useful code we see.
212 //
213 // Caveat: each conditional is evaluated independently. Consider this code:
214 // #ifdef WINDOWS
215 // bool isWindows = true;
216 // #endif
217 // #ifndef WINDOWS
218 // bool isWindows = false;
219 // #endif
220 // We take both branches and define isWindows twice. We could track more state
221 // in order to produce a consistent view, but this is complex.
222 class BranchChooser {
223 public:
BranchChooser(const TokenStream & Code)224 BranchChooser(const TokenStream &Code) : Code(Code) {}
225
choose(DirectiveTree & M)226 void choose(DirectiveTree &M) { walk(M); }
227
228 private:
229 // Describes code seen by making particular branch choices. Higher is better.
230 struct Score {
231 int Tokens = 0; // excluding comments and directives
232 int Directives = 0;
233 int Errors = 0; // #error directives
234
operator >clang::pseudo::__anon141705870411::BranchChooser::Score235 bool operator>(const Score &Other) const {
236 // Seeing errors is bad, other things are good.
237 return std::make_tuple(-Errors, Tokens, Directives) >
238 std::make_tuple(-Other.Errors, Other.Tokens, Other.Directives);
239 }
240
operator +=clang::pseudo::__anon141705870411::BranchChooser::Score241 Score &operator+=(const Score &Other) {
242 Tokens += Other.Tokens;
243 Directives += Other.Directives;
244 Errors += Other.Errors;
245 return *this;
246 }
247 };
248
walk(DirectiveTree::Code & C)249 Score walk(DirectiveTree::Code &C) {
250 Score S;
251 for (const Token &T : Code.tokens(C.Tokens))
252 if (T.Kind != tok::comment)
253 ++S.Tokens;
254 return S;
255 }
256
walk(DirectiveTree::Directive & D)257 Score walk(DirectiveTree::Directive &D) {
258 Score S;
259 S.Directives = 1;
260 S.Errors = D.Kind == tok::pp_error;
261 return S;
262 }
263
walk(DirectiveTree::Chunk & C)264 Score walk(DirectiveTree::Chunk &C) {
265 switch (C.kind()) {
266 case DirectiveTree::Chunk::K_Code:
267 return walk((DirectiveTree::Code &)C);
268 case DirectiveTree::Chunk::K_Directive:
269 return walk((DirectiveTree::Directive &)C);
270 case DirectiveTree::Chunk::K_Conditional:
271 return walk((DirectiveTree::Conditional &)C);
272 case DirectiveTree::Chunk::K_Empty:
273 break;
274 }
275 llvm_unreachable("bad chunk kind");
276 }
277
walk(DirectiveTree & M)278 Score walk(DirectiveTree &M) {
279 Score S;
280 for (DirectiveTree::Chunk &C : M.Chunks)
281 S += walk(C);
282 return S;
283 }
284
walk(DirectiveTree::Conditional & C)285 Score walk(DirectiveTree::Conditional &C) {
286 Score Best;
287 bool MayTakeTrivial = true;
288 bool TookTrivial = false;
289
290 for (unsigned I = 0; I < C.Branches.size(); ++I) {
291 // Walk the branch to make its nested choices in any case.
292 Score BranchScore = walk(C.Branches[I].second);
293 // If we already took an #if 1, don't consider any other branches.
294 if (TookTrivial)
295 continue;
296 // Is this a trivial #if 0 or #if 1?
297 if (auto TriviallyTaken = isTakenWhenReached(C.Branches[I].first)) {
298 if (!*TriviallyTaken)
299 continue; // Don't consider #if 0 even if it scores well.
300 if (MayTakeTrivial)
301 TookTrivial = true;
302 } else {
303 // After a nontrivial condition, #elif 1 isn't guaranteed taken.
304 MayTakeTrivial = false;
305 }
306 // Is this the best branch so far? (Including if it's #if 1).
307 if (TookTrivial || !C.Taken || BranchScore > Best) {
308 Best = BranchScore;
309 C.Taken = I;
310 }
311 }
312 return Best;
313 }
314
315 // Return true if the directive starts an always-taken conditional branch,
316 // false if the branch is never taken, and None otherwise.
isTakenWhenReached(const DirectiveTree::Directive & Dir)317 llvm::Optional<bool> isTakenWhenReached(const DirectiveTree::Directive &Dir) {
318 switch (Dir.Kind) {
319 case clang::tok::pp_if:
320 case clang::tok::pp_elif:
321 break; // handled below
322 case clang::tok::pp_else:
323 return true;
324 default: // #ifdef etc
325 return llvm::None;
326 }
327
328 const auto &Tokens = Code.tokens(Dir.Tokens);
329 assert(!Tokens.empty() && Tokens.front().Kind == tok::hash);
330 const Token &Name = Tokens.front().nextNC();
331 const Token &Value = Name.nextNC();
332 // Does the condition consist of exactly one token?
333 if (&Value >= Tokens.end() || &Value.nextNC() < Tokens.end())
334 return llvm::None;
335 return llvm::StringSwitch<llvm::Optional<bool>>(Value.text())
336 .Cases("true", "1", true)
337 .Cases("false", "0", false)
338 .Default(llvm::None);
339 }
340
341 const TokenStream &Code;
342 };
343
344 } // namespace
345
chooseConditionalBranches(DirectiveTree & Tree,const TokenStream & Code)346 void chooseConditionalBranches(DirectiveTree &Tree, const TokenStream &Code) {
347 BranchChooser{Code}.choose(Tree);
348 }
349
350 namespace {
351 class Preprocessor {
352 const TokenStream &In;
353 TokenStream &Out;
354
355 public:
Preprocessor(const TokenStream & In,TokenStream & Out)356 Preprocessor(const TokenStream &In, TokenStream &Out) : In(In), Out(Out) {}
~Preprocessor()357 ~Preprocessor() { Out.finalize(); }
358
walk(const DirectiveTree & T)359 void walk(const DirectiveTree &T) {
360 for (const auto &C : T.Chunks)
361 walk(C);
362 }
363
walk(const DirectiveTree::Chunk & C)364 void walk(const DirectiveTree::Chunk &C) {
365 switch (C.kind()) {
366 case DirectiveTree::Chunk::K_Code:
367 return walk((const DirectiveTree::Code &)C);
368 case DirectiveTree::Chunk::K_Directive:
369 return walk((const DirectiveTree::Directive &)C);
370 case DirectiveTree::Chunk::K_Conditional:
371 return walk((const DirectiveTree::Conditional &)C);
372 case DirectiveTree::Chunk::K_Empty:
373 break;
374 }
375 llvm_unreachable("bad chunk kind");
376 }
377
walk(const DirectiveTree::Code & C)378 void walk(const DirectiveTree::Code &C) {
379 for (const auto &Tok : In.tokens(C.Tokens))
380 Out.push(Tok);
381 }
382
walk(const DirectiveTree::Directive &)383 void walk(const DirectiveTree::Directive &) {}
384
walk(const DirectiveTree::Conditional & C)385 void walk(const DirectiveTree::Conditional &C) {
386 if (C.Taken)
387 walk(C.Branches[*C.Taken].second);
388 }
389 };
390 } // namespace
391
stripDirectives(const TokenStream & In) const392 TokenStream DirectiveTree::stripDirectives(const TokenStream &In) const {
393 TokenStream Out;
394 Preprocessor(In, Out).walk(*this);
395 return Out;
396 }
397
398 } // namespace pseudo
399 } // namespace clang
400