1 //===--- TokenTest.cpp ----------------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "clang-pseudo/Token.h"
10 #include "clang/Basic/LangOptions.h"
11 #include "clang/Basic/TokenKinds.h"
12 #include "gmock/gmock.h"
13 #include "gtest/gtest.h"
14 
15 namespace clang {
16 namespace pseudo {
17 namespace {
18 
19 using testing::AllOf;
20 using testing::ElementsAre;
21 using testing::ElementsAreArray;
22 using testing::Not;
23 
24 MATCHER_P2(token, Text, Kind, "") {
25   return arg.Kind == Kind && arg.text() == Text;
26 }
27 
28 MATCHER_P(hasFlag, Flag, "") { return arg.flag(Flag); }
29 
30 MATCHER_P2(lineIndent, Line, Indent, "") {
31   return arg.Line == (unsigned)Line && arg.Indent == (unsigned)Indent;
32 }
33 
34 MATCHER_P(originalIndex, index, "") {
35   return arg.OriginalIndex == (Token::Index)index;
36 }
37 
// Lexes a small program and then cooks it, checking the kind and spelling of
// every token at both stages.
TEST(TokenTest, Lex) {
  LangOptions LangOpts;
  std::string Source = R"cpp(
    #include <stdio.h>
    int main() {
      return 42; // the answer
    }
  )cpp";

  // Stage 1: raw lexing. Words come out as raw_identifier, keywords included.
  TokenStream RawStream = lex(Source, LangOpts);
  ASSERT_TRUE(RawStream.isFinalized());
  EXPECT_THAT(RawStream.tokens(),
              ElementsAreArray({
                  // The directive is lexed like ordinary code, so the
                  // <angled> header name is split into several tokens.
                  token("#", tok::hash),
                  token("include", tok::raw_identifier),
                  token("<", tok::less),
                  token("stdio", tok::raw_identifier),
                  token(".", tok::period),
                  token("h", tok::raw_identifier),
                  token(">", tok::greater),

                  token("int", tok::raw_identifier),
                  token("main", tok::raw_identifier),
                  token("(", tok::l_paren),
                  token(")", tok::r_paren),
                  token("{", tok::l_brace),
                  token("return", tok::raw_identifier),
                  token("42", tok::numeric_constant),
                  token(";", tok::semi),
                  token("// the answer", tok::comment),
                  token("}", tok::r_brace),
              }));

  // Stage 2: cooking. Raw identifiers become keywords or plain identifiers.
  TokenStream CookedStream = cook(RawStream, LangOpts);
  ASSERT_TRUE(CookedStream.isFinalized());
  EXPECT_THAT(CookedStream.tokens(),
              ElementsAreArray({
                  // Identifier kinds inside the directive carry no meaning.
                  token("#", tok::hash),
                  token("include", tok::identifier),
                  token("<", tok::less),
                  token("stdio", tok::identifier),
                  token(".", tok::period),
                  token("h", tok::identifier),
                  token(">", tok::greater),

                  token("int", tok::kw_int),
                  token("main", tok::identifier),
                  token("(", tok::l_paren),
                  token(")", tok::r_paren),
                  token("{", tok::l_brace),
                  token("return", tok::kw_return),
                  token("42", tok::numeric_constant),
                  token(";", tok::semi),
                  token("// the answer", tok::comment),
                  token("}", tok::r_brace),
              }));

  // The first raw token's text must start at the '#' inside Source itself,
  // i.e. raw tokens refer back into the original buffer.
  const char *HashInSource = &Source[Source.find('#')];
  EXPECT_EQ(RawStream.tokens().front().text().begin(), HashInSource);
}
98 
// Checks how backslash-newline sequences show up in raw tokens (kept in the
// text, flagged NeedsCleaning) and that cooking removes them.
TEST(TokenTest, LineContinuation) {
  LangOptions LangOpts;
  std::string Source = R"cpp(
one_\
token
two \
tokens
  )cpp";

  TokenStream RawStream = lex(Source, LangOpts);
  EXPECT_THAT(
      RawStream.tokens(),
      ElementsAre(
          // A continuation in the middle of an identifier stays in its text.
          AllOf(token("one_\\\ntoken", tok::raw_identifier),
                hasFlag(LexFlags::StartsPPLine),
                hasFlag(LexFlags::NeedsCleaning), lineIndent(1, 0),
                originalIndex(0)),
          // "two" precedes the continuation and is untouched by it.
          AllOf(token("two", tok::raw_identifier),
                hasFlag(LexFlags::StartsPPLine),
                Not(hasFlag(LexFlags::NeedsCleaning)), originalIndex(1)),
          // The continuation is attached to the front of the next token,
          // which does not start a new preprocessor line.
          AllOf(token("\\\ntokens", tok::raw_identifier),
                Not(hasFlag(LexFlags::StartsPPLine)),
                hasFlag(LexFlags::NeedsCleaning), originalIndex(2))));

  // After cooking, the spellings no longer contain the continuations.
  TokenStream CookedStream = cook(RawStream, LangOpts);
  EXPECT_THAT(
      CookedStream.tokens(),
      ElementsAre(
          AllOf(token("one_token", tok::identifier), lineIndent(1, 0),
                originalIndex(0)),
          AllOf(token("two", tok::identifier), originalIndex(1)),
          AllOf(token("tokens", tok::identifier), originalIndex(2))));
}
130 
// Exercises alternative spellings of tokens: operator names, digraphs,
// trigraphs and universal character names (UCNs).
TEST(TokenTest, EncodedCharacters) {
  LangOptions LangOpts;
  LangOpts.Trigraphs = true;
  LangOpts.Digraphs = true;
  LangOpts.C99 = true; // UCNs
  LangOpts.CXXOperatorNames = true;
  std::string Source = R"(and <: ??! '??=' \u00E9)";

  TokenStream RawStream = lex(Source, LangOpts);
  EXPECT_THAT(
      RawStream.tokens(),
      ElementsAre(
          // "and" stays a raw identifier here; cook() turns it into &&.
          AllOf(token("and", tok::raw_identifier),
                Not(hasFlag(LexFlags::NeedsCleaning))),
          // A digraph is merely another spelling of the token.
          AllOf(token("<:", tok::l_square),
                Not(hasFlag(LexFlags::NeedsCleaning))),
          // Trigraphs are interpreted, but the text still needs cleaning.
          AllOf(token(R"(??!)", tok::pipe), hasFlag(LexFlags::NeedsCleaning)),
          // The same holds for a trigraph inside a character constant.
          AllOf(token(R"('??=')", tok::char_constant),
                hasFlag(LexFlags::NeedsCleaning)),
          // A UCN has to be substituted as well.
          AllOf(token(R"(\u00E9)", tok::raw_identifier),
                hasFlag(LexFlags::NeedsCleaning))));

  TokenStream CookedStream = cook(RawStream, LangOpts);
  EXPECT_THAT(
      CookedStream.tokens(),
      ElementsAre(
          token("and", tok::ampamp), // alternate spelling recognized
          token("<:", tok::l_square),
          token("|", tok::pipe),            // trigraph substituted
          token("'#'", tok::char_constant), // trigraph substituted
          token("é", tok::identifier)));    // UCN substituted
}
165 
// Verifies the Line and Indent values recorded on each raw token, including
// a token that follows a backslash line continuation.
TEST(TokenTest, Indentation) {
  LangOptions LangOpts;
  std::string Source = R"cpp(   hello world
no_indent \
  line_was_continued
)cpp";

  TokenStream RawStream = lex(Source, LangOpts);
  EXPECT_THAT(RawStream.tokens(),
              ElementsAre(lineIndent(0, 3),   // hello
                          lineIndent(0, 3),   // world
                          lineIndent(1, 0),   // no_indent
                          lineIndent(2, 2))); // line_was_continued
}
180 
// After cooking and comment stripping, each ">>" appears as two ">" tokens
// that share one original index, while ">>=" remains a single token.
TEST(TokenTest, SplitGreaterGreater) {
  LangOptions LangOpts;
  std::string Source = R"cpp(
>> // split
// >> with an escaped newline in the middle, split
>\
>
>>= // not split
)cpp";

  TokenStream RawStream = lex(Source, LangOpts);
  TokenStream CookedStream = cook(RawStream, LangOpts);
  TokenStream WithoutComments = stripComments(CookedStream);
  EXPECT_THAT(
      WithoutComments.tokens(),
      ElementsAre(
          // Both halves of the first ">>" map back to original token 0.
          AllOf(token(">", tok::greater), originalIndex(0)),
          AllOf(token(">", tok::greater), originalIndex(0)),
          // Token 1 and 2 are comments.
          AllOf(token(">", tok::greater), originalIndex(3)),
          AllOf(token(">", tok::greater), originalIndex(3)),
          AllOf(token(">>=", tok::greatergreaterequal), originalIndex(4))));
}
201 
// stripComments() drops comment tokens; the surviving tokens keep the
// original indices they had in the cooked stream.
TEST(TokenTest, DropComments) {
  LangOptions LangOpts;
  std::string Source = R"cpp(
  // comment
  int /*abc*/;
)cpp";

  TokenStream CookedStream = cook(lex(Source, LangOpts), LangOpts);
  TokenStream WithoutComments = stripComments(CookedStream);

  // The cooked stream still contains both comments.
  EXPECT_THAT(
      CookedStream.tokens(),
      ElementsAre(
          AllOf(token("// comment", tok::comment), originalIndex(0)),
          AllOf(token("int", tok::kw_int), originalIndex(1)),
          AllOf(token("/*abc*/", tok::comment), originalIndex(2)),
          AllOf(token(";", tok::semi), originalIndex(3))));

  // Only the non-comment tokens remain; indices 1 and 3 are preserved.
  EXPECT_THAT(
      WithoutComments.tokens(),
      ElementsAre(AllOf(token("int", tok::kw_int), originalIndex(1)),
                  AllOf(token(";", tok::semi), originalIndex(3))));
}
221 
222 } // namespace
223 } // namespace pseudo
224 } // namespace clang
225