1 //===--- Preprocess.cpp - C Language Family Preprocessor Implementation ---===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 //  This file implements the Preprocessor interface.
11 //
12 //===----------------------------------------------------------------------===//
13 //
14 // Options to support:
15 //   -H       - Print the name of each header file used.
16 //   -d[DNI] - Dump various things.
17 //   -fworking-directory - #line's with preprocessor's working dir.
18 //   -fpreprocessed
19 //   -dependency-file,-M,-MM,-MF,-MG,-MP,-MT,-MQ,-MD,-MMD
20 //   -W*
21 //   -w
22 //
23 // Messages to emit:
24 //   "Multiple include guards may be useful for:\n"
25 //
26 //===----------------------------------------------------------------------===//
27 
28 #include "clang/Lex/Preprocessor.h"
29 #include "clang/Lex/HeaderSearch.h"
30 #include "clang/Lex/MacroInfo.h"
31 #include "clang/Lex/Pragma.h"
32 #include "clang/Lex/ScratchBuffer.h"
33 #include "clang/Lex/LexDiagnostic.h"
34 #include "clang/Basic/SourceManager.h"
35 #include "clang/Basic/FileManager.h"
36 #include "clang/Basic/TargetInfo.h"
37 #include "llvm/ADT/APFloat.h"
38 #include "llvm/ADT/SmallVector.h"
39 #include "llvm/Support/MemoryBuffer.h"
40 #include "llvm/Support/Streams.h"
41 #include <cstdio>
42 using namespace clang;
43 
44 //===----------------------------------------------------------------------===//
45 
// Out-of-line definition of the PreprocessorFactory destructor.  The body is
// intentionally empty.
PreprocessorFactory::~PreprocessorFactory() {}
47 
48 Preprocessor::Preprocessor(Diagnostic &diags, const LangOptions &opts,
49                            TargetInfo &target, SourceManager &SM,
50                            HeaderSearch &Headers,
51                            IdentifierInfoLookup* IILookup)
52   : Diags(&diags), Features(opts), Target(target),FileMgr(Headers.getFileMgr()),
53     SourceMgr(SM), HeaderInfo(Headers), Identifiers(opts, IILookup),
54     BuiltinInfo(Target), CurPPLexer(0), CurDirLookup(0), Callbacks(0) {
55   ScratchBuf = new ScratchBuffer(SourceMgr);
56   CounterValue = 0; // __COUNTER__ starts at 0.
57 
58   // Clear stats.
59   NumDirectives = NumDefined = NumUndefined = NumPragma = 0;
60   NumIf = NumElse = NumEndif = 0;
61   NumEnteredSourceFiles = 0;
62   NumMacroExpanded = NumFnMacroExpanded = NumBuiltinMacroExpanded = 0;
63   NumFastMacroExpanded = NumTokenPaste = NumFastTokenPaste = 0;
64   MaxIncludeStackDepth = 0;
65   NumSkipped = 0;
66 
67   // Default to discarding comments.
68   KeepComments = false;
69   KeepMacroComments = false;
70 
71   // Macro expansion is enabled.
72   DisableMacroExpansion = false;
73   InMacroArgs = false;
74   NumCachedTokenLexers = 0;
75 
76   CachedLexPos = 0;
77 
78   // "Poison" __VA_ARGS__, which can only appear in the expansion of a macro.
79   // This gets unpoisoned where it is allowed.
80   (Ident__VA_ARGS__ = getIdentifierInfo("__VA_ARGS__"))->setIsPoisoned();
81 
82   // Initialize the pragma handlers.
83   PragmaHandlers = new PragmaNamespace(0);
84   RegisterBuiltinPragmas();
85 
86   // Initialize builtin macros like __LINE__ and friends.
87   RegisterBuiltinMacros();
88 }
89 
90 Preprocessor::~Preprocessor() {
91   assert(BacktrackPositions.empty() && "EnableBacktrack/Backtrack imbalance!");
92 
93   while (!IncludeMacroStack.empty()) {
94     delete IncludeMacroStack.back().TheLexer;
95     delete IncludeMacroStack.back().TheTokenLexer;
96     IncludeMacroStack.pop_back();
97   }
98 
99   // Free any macro definitions.
100   for (llvm::DenseMap<IdentifierInfo*, MacroInfo*>::iterator I =
101        Macros.begin(), E = Macros.end(); I != E; ++I) {
102     // We don't need to free the MacroInfo objects directly.  These
103     // will be released when the BumpPtrAllocator 'BP' object gets
104     // destroyed. We still need to run the dstor, however, to free
105     // memory alocated by MacroInfo.
106     I->second->Destroy(BP);
107     I->first->setHasMacroDefinition(false);
108   }
109 
110   // Free any cached macro expanders.
111   for (unsigned i = 0, e = NumCachedTokenLexers; i != e; ++i)
112     delete TokenLexerCache[i];
113 
114   // Release pragma information.
115   delete PragmaHandlers;
116 
117   // Delete the scratch buffer info.
118   delete ScratchBuf;
119 
120   delete Callbacks;
121 }
122 
/// setPTHManager - Install \p pm as the PTH manager of this preprocessor and
/// hand the stat cache it creates to the file manager.
void Preprocessor::setPTHManager(PTHManager* pm) {
  PTH.reset(pm);
  // NOTE(review): PTH is dereferenced unconditionally here, so this appears
  // to require a non-null pm -- confirm against callers.
  FileMgr.setStatCache(PTH->createStatCache());
}
127 
128 void Preprocessor::DumpToken(const Token &Tok, bool DumpFlags) const {
129   llvm::cerr << tok::getTokenName(Tok.getKind()) << " '"
130              << getSpelling(Tok) << "'";
131 
132   if (!DumpFlags) return;
133 
134   llvm::cerr << "\t";
135   if (Tok.isAtStartOfLine())
136     llvm::cerr << " [StartOfLine]";
137   if (Tok.hasLeadingSpace())
138     llvm::cerr << " [LeadingSpace]";
139   if (Tok.isExpandDisabled())
140     llvm::cerr << " [ExpandDisabled]";
141   if (Tok.needsCleaning()) {
142     const char *Start = SourceMgr.getCharacterData(Tok.getLocation());
143     llvm::cerr << " [UnClean='" << std::string(Start, Start+Tok.getLength())
144                << "']";
145   }
146 
147   llvm::cerr << "\tLoc=<";
148   DumpLocation(Tok.getLocation());
149   llvm::cerr << ">";
150 }
151 
/// DumpLocation - Dump \p Loc by forwarding to SourceLocation::dump with this
/// preprocessor's source manager.
void Preprocessor::DumpLocation(SourceLocation Loc) const {
  Loc.dump(SourceMgr);
}
155 
156 void Preprocessor::DumpMacro(const MacroInfo &MI) const {
157   llvm::cerr << "MACRO: ";
158   for (unsigned i = 0, e = MI.getNumTokens(); i != e; ++i) {
159     DumpToken(MI.getReplacementToken(i));
160     llvm::cerr << "  ";
161   }
162   llvm::cerr << "\n";
163 }
164 
165 void Preprocessor::PrintStats() {
166   llvm::cerr << "\n*** Preprocessor Stats:\n";
167   llvm::cerr << NumDirectives << " directives found:\n";
168   llvm::cerr << "  " << NumDefined << " #define.\n";
169   llvm::cerr << "  " << NumUndefined << " #undef.\n";
170   llvm::cerr << "  #include/#include_next/#import:\n";
171   llvm::cerr << "    " << NumEnteredSourceFiles << " source files entered.\n";
172   llvm::cerr << "    " << MaxIncludeStackDepth << " max include stack depth\n";
173   llvm::cerr << "  " << NumIf << " #if/#ifndef/#ifdef.\n";
174   llvm::cerr << "  " << NumElse << " #else/#elif.\n";
175   llvm::cerr << "  " << NumEndif << " #endif.\n";
176   llvm::cerr << "  " << NumPragma << " #pragma.\n";
177   llvm::cerr << NumSkipped << " #if/#ifndef#ifdef regions skipped\n";
178 
179   llvm::cerr << NumMacroExpanded << "/" << NumFnMacroExpanded << "/"
180              << NumBuiltinMacroExpanded << " obj/fn/builtin macros expanded, "
181              << NumFastMacroExpanded << " on the fast path.\n";
182   llvm::cerr << (NumFastTokenPaste+NumTokenPaste)
183              << " token paste (##) operations performed, "
184              << NumFastTokenPaste << " on the fast path.\n";
185 }
186 
187 //===----------------------------------------------------------------------===//
188 // Token Spelling
189 //===----------------------------------------------------------------------===//
190 
191 
192 /// getSpelling() - Return the 'spelling' of this token.  The spelling of a
193 /// token are the characters used to represent the token in the source file
194 /// after trigraph expansion and escaped-newline folding.  In particular, this
195 /// wants to get the true, uncanonicalized, spelling of things like digraphs
196 /// UCNs, etc.
197 std::string Preprocessor::getSpelling(const Token &Tok) const {
198   assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
199 
200   // If this token contains nothing interesting, return it directly.
201   const char* TokStart = SourceMgr.getCharacterData(Tok.getLocation());
202   if (!Tok.needsCleaning())
203     return std::string(TokStart, TokStart+Tok.getLength());
204 
205   std::string Result;
206   Result.reserve(Tok.getLength());
207 
208   // Otherwise, hard case, relex the characters into the string.
209   for (const char *Ptr = TokStart, *End = TokStart+Tok.getLength();
210        Ptr != End; ) {
211     unsigned CharSize;
212     Result.push_back(Lexer::getCharAndSizeNoWarn(Ptr, CharSize, Features));
213     Ptr += CharSize;
214   }
215   assert(Result.size() != unsigned(Tok.getLength()) &&
216          "NeedsCleaning flag set on something that didn't need cleaning!");
217   return Result;
218 }
219 
220 /// getSpelling - This method is used to get the spelling of a token into a
221 /// preallocated buffer, instead of as an std::string.  The caller is required
222 /// to allocate enough space for the token, which is guaranteed to be at least
223 /// Tok.getLength() bytes long.  The actual length of the token is returned.
224 ///
225 /// Note that this method may do two possible things: it may either fill in
226 /// the buffer specified with characters, or it may *change the input pointer*
227 /// to point to a constant buffer with the data already in it (avoiding a
228 /// copy).  The caller is not allowed to modify the returned buffer pointer
229 /// if an internal buffer is returned.
230 unsigned Preprocessor::getSpelling(const Token &Tok,
231                                    const char *&Buffer) const {
232   assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
233 
234   // If this token is an identifier, just return the string from the identifier
235   // table, which is very quick.
236   if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
237     Buffer = II->getName();
238     return II->getLength();
239   }
240 
241   // Otherwise, compute the start of the token in the input lexer buffer.
242   const char *TokStart = 0;
243 
244   if (Tok.isLiteral())
245     TokStart = Tok.getLiteralData();
246 
247   if (TokStart == 0)
248     TokStart = SourceMgr.getCharacterData(Tok.getLocation());
249 
250   // If this token contains nothing interesting, return it directly.
251   if (!Tok.needsCleaning()) {
252     Buffer = TokStart;
253     return Tok.getLength();
254   }
255 
256   // Otherwise, hard case, relex the characters into the string.
257   char *OutBuf = const_cast<char*>(Buffer);
258   for (const char *Ptr = TokStart, *End = TokStart+Tok.getLength();
259        Ptr != End; ) {
260     unsigned CharSize;
261     *OutBuf++ = Lexer::getCharAndSizeNoWarn(Ptr, CharSize, Features);
262     Ptr += CharSize;
263   }
264   assert(unsigned(OutBuf-Buffer) != Tok.getLength() &&
265          "NeedsCleaning flag set on something that didn't need cleaning!");
266 
267   return OutBuf-Buffer;
268 }
269 
270 /// CreateString - Plop the specified string into a scratch buffer and return a
271 /// location for it.  If specified, the source location provides a source
272 /// location for the token.
273 void Preprocessor::CreateString(const char *Buf, unsigned Len, Token &Tok,
274                                 SourceLocation InstantiationLoc) {
275   Tok.setLength(Len);
276 
277   const char *DestPtr;
278   SourceLocation Loc = ScratchBuf->getToken(Buf, Len, DestPtr);
279 
280   if (InstantiationLoc.isValid())
281     Loc = SourceMgr.createInstantiationLoc(Loc, InstantiationLoc,
282                                            InstantiationLoc, Len);
283   Tok.setLocation(Loc);
284 
285   // If this is a literal token, set the pointer data.
286   if (Tok.isLiteral())
287     Tok.setLiteralData(DestPtr);
288 }
289 
290 
291 /// AdvanceToTokenCharacter - Given a location that specifies the start of a
292 /// token, return a new location that specifies a character within the token.
293 SourceLocation Preprocessor::AdvanceToTokenCharacter(SourceLocation TokStart,
294                                                      unsigned CharNo) {
295   // Figure out how many physical characters away the specified instantiation
296   // character is.  This needs to take into consideration newlines and
297   // trigraphs.
298   const char *TokPtr = SourceMgr.getCharacterData(TokStart);
299 
300   // If they request the first char of the token, we're trivially done.
301   if (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr))
302     return TokStart;
303 
304   unsigned PhysOffset = 0;
305 
306   // The usual case is that tokens don't contain anything interesting.  Skip
307   // over the uninteresting characters.  If a token only consists of simple
308   // chars, this method is extremely fast.
309   while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {
310     if (CharNo == 0)
311       return TokStart.getFileLocWithOffset(PhysOffset);
312     ++TokPtr, --CharNo, ++PhysOffset;
313   }
314 
315   // If we have a character that may be a trigraph or escaped newline, use a
316   // lexer to parse it correctly.
317   for (; CharNo; --CharNo) {
318     unsigned Size;
319     Lexer::getCharAndSizeNoWarn(TokPtr, Size, Features);
320     TokPtr += Size;
321     PhysOffset += Size;
322   }
323 
324   // Final detail: if we end up on an escaped newline, we want to return the
325   // location of the actual byte of the token.  For example foo\<newline>bar
326   // advanced by 3 should return the location of b, not of \\.  One compounding
327   // detail of this is that the escape may be made by a trigraph.
328   if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
329     PhysOffset = Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;
330 
331   return TokStart.getFileLocWithOffset(PhysOffset);
332 }
333 
334 /// \brief Computes the source location just past the end of the
335 /// token at this source location.
336 ///
337 /// This routine can be used to produce a source location that
338 /// points just past the end of the token referenced by \p Loc, and
339 /// is generally used when a diagnostic needs to point just after a
340 /// token where it expected something different that it received. If
341 /// the returned source location would not be meaningful (e.g., if
342 /// it points into a macro), this routine returns an invalid
343 /// source location.
344 SourceLocation Preprocessor::getLocForEndOfToken(SourceLocation Loc) {
345   if (Loc.isInvalid() || !Loc.isFileID())
346     return SourceLocation();
347 
348   unsigned Len = Lexer::MeasureTokenLength(Loc, getSourceManager(), Features);
349   return AdvanceToTokenCharacter(Loc, Len);
350 }
351 
352 
353 
354 //===----------------------------------------------------------------------===//
355 // Preprocessor Initialization Methods
356 //===----------------------------------------------------------------------===//
357 
358 
359 /// EnterMainSourceFile - Enter the specified FileID as the main source file,
360 /// which implicitly adds the builtin defines etc.
361 void Preprocessor::EnterMainSourceFile() {
362   // We do not allow the preprocessor to reenter the main file.  Doing so will
363   // cause FileID's to accumulate information from both runs (e.g. #line
364   // information) and predefined macros aren't guaranteed to be set properly.
365   assert(NumEnteredSourceFiles == 0 && "Cannot reenter the main file!");
366   FileID MainFileID = SourceMgr.getMainFileID();
367 
368   // Enter the main file source buffer.
369   EnterSourceFile(MainFileID, 0);
370 
371   // Tell the header info that the main file was entered.  If the file is later
372   // #imported, it won't be re-entered.
373   if (const FileEntry *FE = SourceMgr.getFileEntryForID(MainFileID))
374     HeaderInfo.IncrementIncludeCount(FE);
375 
376   std::vector<char> PrologFile;
377   PrologFile.reserve(4080);
378 
379   // FIXME: Don't make a copy.
380   PrologFile.insert(PrologFile.end(), Predefines.begin(), Predefines.end());
381 
382   // Memory buffer must end with a null byte!
383   PrologFile.push_back(0);
384 
385   // Now that we have emitted the predefined macros, #includes, etc into
386   // PrologFile, preprocess it to populate the initial preprocessor state.
387   llvm::MemoryBuffer *SB =
388     llvm::MemoryBuffer::getMemBufferCopy(&PrologFile.front(),&PrologFile.back(),
389                                          "<built-in>");
390   assert(SB && "Cannot fail to create predefined source buffer");
391   FileID FID = SourceMgr.createFileIDForMemBuffer(SB);
392   assert(!FID.isInvalid() && "Could not create FileID for predefines?");
393 
394   // Start parsing the predefines.
395   EnterSourceFile(FID, 0);
396 }
397 
398 
399 //===----------------------------------------------------------------------===//
400 // Lexer Event Handling.
401 //===----------------------------------------------------------------------===//
402 
403 /// LookUpIdentifierInfo - Given a tok::identifier token, look up the
404 /// identifier information for the token and install it into the token.
405 IdentifierInfo *Preprocessor::LookUpIdentifierInfo(Token &Identifier,
406                                                    const char *BufPtr) {
407   assert(Identifier.is(tok::identifier) && "Not an identifier!");
408   assert(Identifier.getIdentifierInfo() == 0 && "Identinfo already exists!");
409 
410   // Look up this token, see if it is a macro, or if it is a language keyword.
411   IdentifierInfo *II;
412   if (BufPtr && !Identifier.needsCleaning()) {
413     // No cleaning needed, just use the characters from the lexed buffer.
414     II = getIdentifierInfo(BufPtr, BufPtr+Identifier.getLength());
415   } else {
416     // Cleaning needed, alloca a buffer, clean into it, then use the buffer.
417     llvm::SmallVector<char, 64> IdentifierBuffer;
418     IdentifierBuffer.resize(Identifier.getLength());
419     const char *TmpBuf = &IdentifierBuffer[0];
420     unsigned Size = getSpelling(Identifier, TmpBuf);
421     II = getIdentifierInfo(TmpBuf, TmpBuf+Size);
422   }
423   Identifier.setIdentifierInfo(II);
424   return II;
425 }
426 
427 
428 /// HandleIdentifier - This callback is invoked when the lexer reads an
429 /// identifier.  This callback looks up the identifier in the map and/or
430 /// potentially macro expands it or turns it into a named token (like 'for').
431 ///
432 /// Note that callers of this method are guarded by checking the
433 /// IdentifierInfo's 'isHandleIdentifierCase' bit.  If this method changes, the
434 /// IdentifierInfo methods that compute these properties will need to change to
435 /// match.
436 void Preprocessor::HandleIdentifier(Token &Identifier) {
437   assert(Identifier.getIdentifierInfo() &&
438          "Can't handle identifiers without identifier info!");
439 
440   IdentifierInfo &II = *Identifier.getIdentifierInfo();
441 
442   // If this identifier was poisoned, and if it was not produced from a macro
443   // expansion, emit an error.
444   if (II.isPoisoned() && CurPPLexer) {
445     if (&II != Ident__VA_ARGS__)   // We warn about __VA_ARGS__ with poisoning.
446       Diag(Identifier, diag::err_pp_used_poisoned_id);
447     else
448       Diag(Identifier, diag::ext_pp_bad_vaargs_use);
449   }
450 
451   // If this is a macro to be expanded, do it.
452   if (MacroInfo *MI = getMacroInfo(&II)) {
453     if (!DisableMacroExpansion && !Identifier.isExpandDisabled()) {
454       if (MI->isEnabled()) {
455         if (!HandleMacroExpandedIdentifier(Identifier, MI))
456           return;
457       } else {
458         // C99 6.10.3.4p2 says that a disabled macro may never again be
459         // expanded, even if it's in a context where it could be expanded in the
460         // future.
461         Identifier.setFlag(Token::DisableExpand);
462       }
463     }
464   }
465 
466   // C++ 2.11p2: If this is an alternative representation of a C++ operator,
467   // then we act as if it is the actual operator and not the textual
468   // representation of it.
469   if (II.isCPlusPlusOperatorKeyword())
470     Identifier.setIdentifierInfo(0);
471 
472   // If this is an extension token, diagnose its use.
473   // We avoid diagnosing tokens that originate from macro definitions.
474   // FIXME: This warning is disabled in cases where it shouldn't be,
475   // like "#define TY typeof", "TY(1) x".
476   if (II.isExtensionToken() && !DisableMacroExpansion)
477     Diag(Identifier, diag::ext_token_used);
478 }
479 
/// AddCommentHandler - Append \p Handler to the list of comment handlers.
/// The handler must be non-null and must not already be registered; both
/// conditions are only checked by assertions.
void Preprocessor::AddCommentHandler(CommentHandler *Handler) {
  assert(Handler && "NULL comment handler");
  assert(std::find(CommentHandlers.begin(), CommentHandlers.end(), Handler) ==
         CommentHandlers.end() && "Comment handler already registered");
  CommentHandlers.push_back(Handler);
}
486 
487 void Preprocessor::RemoveCommentHandler(CommentHandler *Handler) {
488   std::vector<CommentHandler *>::iterator Pos
489   = std::find(CommentHandlers.begin(), CommentHandlers.end(), Handler);
490   assert(Pos != CommentHandlers.end() && "Comment handler not registered");
491   CommentHandlers.erase(Pos);
492 }
493 
494 void Preprocessor::HandleComment(SourceRange Comment) {
495   for (std::vector<CommentHandler *>::iterator H = CommentHandlers.begin(),
496        HEnd = CommentHandlers.end();
497        H != HEnd; ++H)
498     (*H)->HandleComment(*this, Comment);
499 }
500 
// Out-of-line definition of the CommentHandler destructor.  The body is
// intentionally empty.
CommentHandler::~CommentHandler() { }
502