1 //===--- Lex.cpp - extract token stream from source code ---------*- C++-*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "clang-pseudo/Token.h"
10 #include "clang/Basic/IdentifierTable.h"
11 #include "clang/Basic/SourceLocation.h"
12 #include "clang/Basic/TokenKinds.h"
13 #include "clang/Lex/Lexer.h"
14 #include "clang/Lex/LiteralSupport.h"
15 
16 namespace clang {
17 namespace pseudo {
18 
19 TokenStream lex(const std::string &Code, const clang::LangOptions &LangOpts) {
20   clang::SourceLocation Start;
21   // Tokenize using clang's lexer in raw mode.
22   // std::string guarantees null-termination, which the lexer needs.
23   clang::Lexer Lexer(Start, LangOpts, Code.data(), Code.data(),
24                      Code.data() + Code.size());
25   Lexer.SetCommentRetentionState(true);
26 
27   TokenStream Result;
28   clang::Token CT;
29   unsigned LastOffset = 0;
30   unsigned Line = 0;
31   unsigned Indent = 0;
32   for (Lexer.LexFromRawLexer(CT); CT.getKind() != clang::tok::eof;
33        Lexer.LexFromRawLexer(CT)) {
34     unsigned Offset =
35         CT.getLocation().getRawEncoding() - Start.getRawEncoding();
36 
37     Token Tok;
38     Tok.Data = &Code[Offset];
39     Tok.Length = CT.getLength();
40     Tok.Kind = CT.getKind();
41 
42     // Update current line number and indentation from raw source code.
43     unsigned NewLineStart = 0;
44     for (unsigned I = LastOffset; I < Offset; ++I) {
45       if (Code[I] == '\n') {
46         NewLineStart = I + 1;
47         ++Line;
48       }
49     }
50     if (NewLineStart || !LastOffset) {
51       Indent = 0;
52       for (char C : StringRef(Code).slice(NewLineStart, Offset)) {
53         if (C == ' ')
54           ++Indent;
55         else if (C == '\t')
56           Indent += 8;
57         else
58           break;
59       }
60     }
61     Tok.Indent = Indent;
62     Tok.Line = Line;
63 
64     if (CT.isAtStartOfLine())
65       Tok.setFlag(LexFlags::StartsPPLine);
66     if (CT.needsCleaning() || CT.hasUCN())
67       Tok.setFlag(LexFlags::NeedsCleaning);
68 
69     Result.push(Tok);
70     LastOffset = Offset;
71   }
72   Result.finalize();
73   return Result;
74 }
75 
76 TokenStream cook(const TokenStream &Code, const LangOptions &LangOpts) {
77   auto CleanedStorage = std::make_shared<llvm::BumpPtrAllocator>();
78   clang::IdentifierTable Identifiers(LangOpts);
79   TokenStream Result(CleanedStorage);
80 
81   for (auto Tok : Code.tokens()) {
82     if (Tok.flag(LexFlags::NeedsCleaning)) {
83       // Remove escaped newlines and trigraphs.
84       llvm::SmallString<64> CleanBuffer;
85       const char *Pos = Tok.text().begin();
86       while (Pos < Tok.text().end()) {
87         unsigned CharSize = 0;
88         CleanBuffer.push_back(
89             clang::Lexer::getCharAndSizeNoWarn(Pos, CharSize, LangOpts));
90         assert(CharSize != 0 && "no progress!");
91         Pos += CharSize;
92       }
93       // Remove universal character names (UCN).
94       llvm::SmallString<64> UCNBuffer;
95       clang::expandUCNs(UCNBuffer, CleanBuffer);
96 
97       llvm::StringRef Text = llvm::StringRef(UCNBuffer).copy(*CleanedStorage);
98       Tok.Data = Text.data();
99       Tok.Length = Text.size();
100       Tok.Flags &= ~static_cast<decltype(Tok.Flags)>(LexFlags::NeedsCleaning);
101     }
102 
103     if (Tok.Kind == tok::raw_identifier) {
104       // Cook raw_identifiers into identifier, keyword, etc.
105       Tok.Kind = Identifiers.get(Tok.text()).getTokenID();
106     } else if (Tok.Kind == tok::greatergreater) {
107       // Split the greatergreater token.
108       // FIXME: split lessless token to support Cuda triple angle brackets <<<.
109       assert(Tok.text() == ">>");
110       Tok.Kind = tok::greater;
111       Tok.Length = 1;
112       Result.push(Tok);
113       // Line is wrong if the first greater is followed by an escaped newline!
114       Tok.Data = Tok.text().data() + 1;
115     }
116 
117     Result.push(std::move(Tok));
118   }
119 
120   Result.finalize();
121   return Result;
122 }
123 
124 } // namespace pseudo
125 } // namespace clang
126