1 //===--- GLRTest.cpp - Test the GLR parser ----------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "clang-pseudo/GLR.h"
10 #include "clang-pseudo/Bracket.h"
11 #include "clang-pseudo/Language.h"
12 #include "clang-pseudo/Token.h"
13 #include "clang-pseudo/grammar/Grammar.h"
14 #include "clang/Basic/LangOptions.h"
15 #include "clang/Basic/TokenKinds.h"
16 #include "llvm/ADT/StringExtras.h"
17 #include "llvm/Support/FormatVariadic.h"
18 #include "gmock/gmock.h"
19 #include "gtest/gtest.h"
20 #include <memory>
21 
22 namespace clang {
23 namespace pseudo {
24 
operator <<(llvm::raw_ostream & OS,const std::vector<const GSS::Node * > & Heads)25 llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
26                               const std::vector<const GSS::Node *> &Heads) {
27   for (const auto *Head : Heads)
28     OS << *Head << "\n";
29   return OS;
30 }
31 
32 namespace {
33 
34 using StateID = LRTable::StateID;
35 using testing::AllOf;
36 using testing::ElementsAre;
37 using testing::IsEmpty;
38 using testing::UnorderedElementsAre;
39 
40 MATCHER_P(state, StateID, "") { return arg->State == StateID; }
41 MATCHER_P(parsedSymbol, FNode, "") { return arg->Payload == FNode; }
42 MATCHER_P(parsedSymbolID, SID, "") { return arg->Payload->symbol() == SID; }
43 MATCHER_P(start, Start, "") { return arg->Payload->startTokenIndex() == Start; }
44 
45 testing::Matcher<const GSS::Node *>
parents(llvm::ArrayRef<const GSS::Node * > Parents)46 parents(llvm::ArrayRef<const GSS::Node *> Parents) {
47   return testing::Property(&GSS::Node::parents,
48                            testing::UnorderedElementsAreArray(Parents));
49 }
50 
recoverBraces(Token::Index Begin,const TokenStream & Code)51 Token::Index recoverBraces(Token::Index Begin, const TokenStream &Code) {
52   EXPECT_GT(Begin, 0u);
53   const Token &Left = Code.tokens()[Begin - 1];
54   EXPECT_EQ(Left.Kind, tok::l_brace);
55   if (const auto* Right = Left.pair()) {
56     EXPECT_EQ(Right->Kind, tok::r_brace);
57     return Code.index(*Right);
58   }
59   return Token::Invalid;
60 }
61 
62 class GLRTest : public ::testing::Test {
63 public:
build(llvm::StringRef GrammarBNF)64   void build(llvm::StringRef GrammarBNF) {
65     std::vector<std::string> Diags;
66     TestLang.G = Grammar::parseBNF(GrammarBNF, Diags);
67   }
68 
emptyTokenStream()69   TokenStream emptyTokenStream() {
70     TokenStream Empty;
71     Empty.finalize();
72     return Empty;
73   }
74 
buildGrammar(std::vector<std::string> Nonterminals,std::vector<std::string> Rules)75   void buildGrammar(std::vector<std::string> Nonterminals,
76                     std::vector<std::string> Rules) {
77     Nonterminals.push_back("_");
78     llvm::sort(Nonterminals);
79     Nonterminals.erase(std::unique(Nonterminals.begin(), Nonterminals.end()),
80                        Nonterminals.end());
81     std::string FakeTestBNF;
82     for (const auto &NT : Nonterminals)
83       FakeTestBNF += llvm::formatv("{0} := {1}\n", "_", NT);
84     FakeTestBNF += llvm::join(Rules, "\n");
85     build(FakeTestBNF);
86   }
87 
id(llvm::StringRef Name) const88   SymbolID id(llvm::StringRef Name) const {
89     for (unsigned I = 0; I < NumTerminals; ++I)
90       if (TestLang.G.table().Terminals[I] == Name)
91         return tokenSymbol(static_cast<tok::TokenKind>(I));
92     for (SymbolID ID = 0; ID < TestLang.G.table().Nonterminals.size(); ++ID)
93       if (TestLang.G.table().Nonterminals[ID].Name == Name)
94         return ID;
95     ADD_FAILURE() << "No such symbol found: " << Name;
96     return 0;
97   }
extensionID(llvm::StringRef AttrValueName) const98   ExtensionID extensionID(llvm::StringRef AttrValueName) const {
99     for (ExtensionID EID = 0; EID < TestLang.G.table().AttributeValues.size();
100          ++EID)
101       if (TestLang.G.table().AttributeValues[EID] == AttrValueName)
102         return EID;
103     ADD_FAILURE() << "No such attribute value found: " << AttrValueName;
104     return 0;
105   }
106 
ruleFor(llvm::StringRef NonterminalName) const107   RuleID ruleFor(llvm::StringRef NonterminalName) const {
108     auto RuleRange =
109         TestLang.G.table().Nonterminals[id(NonterminalName)].RuleRange;
110     if (RuleRange.End - RuleRange.Start == 1)
111       return TestLang.G.table()
112           .Nonterminals[id(NonterminalName)]
113           .RuleRange.Start;
114     ADD_FAILURE() << "Expected a single rule for " << NonterminalName
115                   << ", but it has " << RuleRange.End - RuleRange.Start
116                   << " rule!\n";
117     return 0;
118   }
119 
120 protected:
121   Language TestLang;
122   ForestArena Arena;
123   GSS GSStack;
124 };
125 
TEST_F(GLRTest,ShiftMergingHeads)126 TEST_F(GLRTest, ShiftMergingHeads) {
127   // Given a test case where we have two heads 1, 2, 3 in the GSS, the heads 1,
128   // 2 have shift actions to reach state 4, and the head 3 has a shift action to
129   // reach state 5:
130   //   0--1
131   //   └--2
132   //   └--3
133   // After the shift action, the GSS (with new heads 4, 5) is:
134   //   0---1---4
135   //   └---2---┘
136   //   └---3---5
137   auto *GSSNode0 =
138       GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{});
139   auto *GSSNode1 = GSStack.addNode(/*State=*/1, /*ForestNode=*/nullptr,
140                                    /*Parents=*/{GSSNode0});
141   auto *GSSNode2 = GSStack.addNode(/*State=*/2, /*ForestNode=*/nullptr,
142                                    /*Parents=*/{GSSNode0});
143   auto *GSSNode3 = GSStack.addNode(/*State=*/3, /*ForestNode=*/nullptr,
144                                    /*Parents=*/{GSSNode0});
145 
146   buildGrammar({}, {}); // Create a fake empty grammar.
147   LRTable::Builder B(TestLang.G);
148   B.Transition[{StateID{1}, tokenSymbol(tok::semi)}] = StateID{4};
149   B.Transition[{StateID{2}, tokenSymbol(tok::semi)}] = StateID{4};
150   B.Transition[{StateID{3}, tokenSymbol(tok::semi)}] = StateID{5};
151   TestLang.Table = std::move(B).build();
152 
153   ForestNode &SemiTerminal = Arena.createTerminal(tok::semi, 0);
154   std::vector<const GSS::Node *> NewHeads;
155   glrShift({GSSNode1, GSSNode2, GSSNode3}, SemiTerminal,
156            {emptyTokenStream(), Arena, GSStack}, TestLang, NewHeads);
157 
158   EXPECT_THAT(NewHeads,
159               UnorderedElementsAre(AllOf(state(4), parsedSymbol(&SemiTerminal),
160                                          parents({GSSNode1, GSSNode2})),
161                                    AllOf(state(5), parsedSymbol(&SemiTerminal),
162                                          parents({GSSNode3}))))
163       << NewHeads;
164 }
165 
TEST_F(GLRTest,ReduceConflictsSplitting)166 TEST_F(GLRTest, ReduceConflictsSplitting) {
167   //  Before (splitting due to R/R conflict):
168   //    0--1(IDENTIFIER)
169   //  After reducing 1 by `class-name := IDENTIFIER` and
170   //                      `enum-name := IDENTIFIER`:
171   //    0--2(class-name)    // 2 is goto(0, class-name)
172   //    └--3(enum-name)     // 3 is goto(0, enum-name)
173   buildGrammar({"class-name", "enum-name"},
174                {"class-name := IDENTIFIER", "enum-name := IDENTIFIER"});
175   LRTable::Builder B(TestLang.G);
176   B.Transition[{StateID{0}, id("class-name")}] = StateID{2};
177   B.Transition[{StateID{0}, id("enum-name")}] = StateID{3};
178   B.Reduce[StateID{1}].insert(ruleFor("class-name"));
179   B.Reduce[StateID{1}].insert(ruleFor("enum-name"));
180   TestLang.Table = std::move(B).build();
181 
182   const auto *GSSNode0 =
183       GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{});
184   const auto *GSSNode1 =
185       GSStack.addNode(1, &Arena.createTerminal(tok::identifier, 0), {GSSNode0});
186 
187   std::vector<const GSS::Node *> Heads = {GSSNode1};
188   glrReduce(Heads, tokenSymbol(tok::eof),
189             {emptyTokenStream(), Arena, GSStack}, TestLang);
190   EXPECT_THAT(Heads, UnorderedElementsAre(
191                          GSSNode1,
192                          AllOf(state(2), parsedSymbolID(id("class-name")),
193                                parents({GSSNode0})),
194                          AllOf(state(3), parsedSymbolID(id("enum-name")),
195                                parents({GSSNode0}))))
196       << Heads;
197 }
198 
TEST_F(GLRTest,ReduceSplittingDueToMultipleBases)199 TEST_F(GLRTest, ReduceSplittingDueToMultipleBases) {
200   //  Before (splitting due to multiple bases):
201   //    2(class-name)--4(*)
202   //    3(enum-name)---┘
203   //  After reducing 4 by `ptr-operator := *`:
204   //    2(class-name)--5(ptr-operator)    // 5 is goto(2, ptr-operator)
205   //    3(enum-name)---6(ptr-operator)    // 6 is goto(3, ptr-operator)
206   buildGrammar({"ptr-operator", "class-name", "enum-name"},
207                {"ptr-operator := *"});
208 
209   auto *ClassNameNode = &Arena.createOpaque(id("class-name"), /*TokenIndex=*/0);
210   auto *EnumNameNode = &Arena.createOpaque(id("enum-name"), /*TokenIndex=*/0);
211 
212   const auto *GSSNode2 =
213       GSStack.addNode(/*State=*/2, /*ForestNode=*/ClassNameNode, /*Parents=*/{});
214   const auto *GSSNode3 =
215       GSStack.addNode(/*State=*/3, /*ForestNode=*/EnumNameNode, /*Parents=*/{});
216   const auto *GSSNode4 = GSStack.addNode(
217       /*State=*/4, &Arena.createTerminal(tok::star, /*TokenIndex=*/1),
218       /*Parents=*/{GSSNode2, GSSNode3});
219 
220   LRTable::Builder B(TestLang.G);
221   B.Transition[{StateID{2}, id("ptr-operator")}] = StateID{5};
222   B.Transition[{StateID{3}, id("ptr-operator")}] = StateID{6};
223   B.Reduce[StateID{4}].insert(ruleFor("ptr-operator"));
224   TestLang.Table = std::move(B).build();
225 
226   std::vector<const GSS::Node *> Heads = {GSSNode4};
227   glrReduce(Heads, tokenSymbol(tok::eof), {emptyTokenStream(), Arena, GSStack},
228             TestLang);
229 
230   EXPECT_THAT(Heads, UnorderedElementsAre(
231                          GSSNode4,
232                          AllOf(state(5), parsedSymbolID(id("ptr-operator")),
233                                parents({GSSNode2})),
234                          AllOf(state(6), parsedSymbolID(id("ptr-operator")),
235                                parents({GSSNode3}))))
236       << Heads;
237   // Verify that the payload of the two new heads is shared, only a single
238   // ptr-operator node is created in the forest.
239   EXPECT_EQ(Heads[1]->Payload, Heads[2]->Payload);
240 }
241 
TEST_F(GLRTest,ReduceJoiningWithMultipleBases)242 TEST_F(GLRTest, ReduceJoiningWithMultipleBases) {
243   //  Before (joining due to same goto state, multiple bases):
244   //    0--1(cv-qualifier)--3(class-name)
245   //    └--2(cv-qualifier)--4(enum-name)
246   //  After reducing 3 by `type-name := class-name` and
247   //                 4 by `type-name := enum-name`:
248   //    0--1(cv-qualifier)--5(type-name)  // 5 is goto(1, type-name) and
249   //    └--2(cv-qualifier)--┘             // goto(2, type-name)
250   buildGrammar({"type-name", "class-name", "enum-name", "cv-qualifier"},
251                {"type-name := class-name", "type-name := enum-name"});
252 
253   auto *CVQualifierNode =
254       &Arena.createOpaque(id("cv-qualifier"), /*TokenIndex=*/0);
255   auto *ClassNameNode = &Arena.createOpaque(id("class-name"), /*TokenIndex=*/1);
256   auto *EnumNameNode = &Arena.createOpaque(id("enum-name"), /*TokenIndex=*/1);
257 
258   const auto *GSSNode0 =
259       GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{});
260   const auto *GSSNode1 = GSStack.addNode(
261       /*State=*/1, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0});
262   const auto *GSSNode2 = GSStack.addNode(
263       /*State=*/2, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0});
264   const auto *GSSNode3 = GSStack.addNode(
265       /*State=*/3, /*ForestNode=*/ClassNameNode,
266       /*Parents=*/{GSSNode1});
267   const auto *GSSNode4 =
268       GSStack.addNode(/*State=*/4, /*ForestNode=*/EnumNameNode,
269                       /*Parents=*/{GSSNode2});
270 
271   // FIXME: figure out a way to get rid of the hard-coded reduce RuleID!
272   LRTable::Builder B(TestLang.G);
273   B.Transition[{StateID{1}, id("type-name")}] = StateID{5};
274   B.Transition[{StateID{2}, id("type-name")}] = StateID{5};
275   B.Reduce[StateID{3}].insert(/* type-name := class-name */ RuleID{0});
276   B.Reduce[StateID{4}].insert(/* type-name := enum-name */ RuleID{1});
277   TestLang.Table = std::move(B).build();
278 
279   std::vector<const GSS::Node *> Heads = {GSSNode3, GSSNode4};
280   glrReduce(Heads, tokenSymbol(tok::eof), {emptyTokenStream(), Arena, GSStack},
281             TestLang);
282 
283   // Verify that the stack heads are joint at state 5 after reduces.
284   EXPECT_THAT(Heads, UnorderedElementsAre(GSSNode3, GSSNode4,
285                                           AllOf(state(5),
286                                                 parsedSymbolID(id("type-name")),
287                                                 parents({GSSNode1, GSSNode2}))))
288       << Heads;
289   // Verify that we create an ambiguous ForestNode of two parses of `type-name`.
290   EXPECT_EQ(Heads.back()->Payload->dumpRecursive(TestLang.G),
291             "[  1, end) type-name := <ambiguous>\n"
292             "[  1, end) ├─type-name := class-name\n"
293             "[  1, end) │ └─class-name := <opaque>\n"
294             "[  1, end) └─type-name := enum-name\n"
295             "[  1, end)   └─enum-name := <opaque>\n");
296 }
297 
TEST_F(GLRTest,ReduceJoiningWithSameBase)298 TEST_F(GLRTest, ReduceJoiningWithSameBase) {
299   //  Before (joining due to same goto state, the same base):
300   //    0--1(class-name)--3(*)
301   //    └--2(enum-name)--4(*)
302   //  After reducing 3 by `pointer := class-name *` and
303   //                 2 by `pointer := enum-name *`:
304   //    0--5(pointer)  // 5 is goto(0, pointer)
305   buildGrammar({"pointer", "class-name", "enum-name"},
306                {"pointer := class-name *", "pointer := enum-name *"});
307 
308   auto *ClassNameNode = &Arena.createOpaque(id("class-name"), /*TokenIndex=*/0);
309   auto *EnumNameNode = &Arena.createOpaque(id("enum-name"), /*TokenIndex=*/0);
310   auto *StartTerminal = &Arena.createTerminal(tok::star, /*TokenIndex=*/1);
311 
312   const auto *GSSNode0 =
313       GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{});
314   const auto *GSSNode1 =
315       GSStack.addNode(/*State=*/1, /*ForestNode=*/ClassNameNode,
316                       /*Parents=*/{GSSNode0});
317   const auto *GSSNode2 =
318       GSStack.addNode(/*State=*/2, /*ForestNode=*/EnumNameNode,
319                       /*Parents=*/{GSSNode0});
320   const auto *GSSNode3 =
321       GSStack.addNode(/*State=*/3, /*ForestNode=*/StartTerminal,
322                       /*Parents=*/{GSSNode1});
323   const auto *GSSNode4 =
324       GSStack.addNode(/*State=*/4, /*ForestNode=*/StartTerminal,
325                       /*Parents=*/{GSSNode2});
326 
327   // FIXME: figure out a way to get rid of the hard-coded reduce RuleID!
328   LRTable::Builder B(TestLang.G);
329   B.Transition[{StateID{0}, id("pointer")}] = StateID{5};
330   B.Reduce[StateID{3}].insert(/* pointer := class-name */ RuleID{0});
331   B.Reduce[StateID{4}].insert(/* pointer := enum-name */ RuleID{1});
332   TestLang.Table = std::move(B).build();
333 
334   std::vector<const GSS::Node *> Heads = {GSSNode3, GSSNode4};
335   glrReduce(Heads, tokenSymbol(tok::eof),
336             {emptyTokenStream(), Arena, GSStack}, TestLang);
337 
338   EXPECT_THAT(
339       Heads, UnorderedElementsAre(GSSNode3, GSSNode4,
340                                   AllOf(state(5), parsedSymbolID(id("pointer")),
341                                         parents({GSSNode0}))))
342       << Heads;
343   EXPECT_EQ(Heads.back()->Payload->dumpRecursive(TestLang.G),
344             "[  0, end) pointer := <ambiguous>\n"
345             "[  0, end) ├─pointer := class-name *\n"
346             "[  0,   1) │ ├─class-name := <opaque>\n"
347             "[  1, end) │ └─* := tok[1]\n"
348             "[  0, end) └─pointer := enum-name *\n"
349             "[  0,   1)   ├─enum-name := <opaque>\n"
350             "[  1, end)   └─* := tok[1]\n");
351 }
352 
TEST_F(GLRTest,ReduceLookahead)353 TEST_F(GLRTest, ReduceLookahead) {
354   // A term can be followed by +, but not by -.
355   buildGrammar({"sum", "term"}, {"expr := term + term", "term := IDENTIFIER"});
356   LRTable::Builder B(TestLang.G);
357   B.Transition[{StateID{0}, id("term")}] = StateID{2};
358   B.Reduce[StateID{1}].insert(RuleID{0});
359   TestLang.Table = std::move(B).build();
360 
361   auto *Identifier = &Arena.createTerminal(tok::identifier, /*Start=*/0);
362 
363   const auto *Root =
364       GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{});
365   const auto *GSSNode1 =
366       GSStack.addNode(/*State=*/1, /*ForestNode=*/Identifier, {Root});
367 
368   // When the lookahead is +, reduce is performed.
369   std::vector<const GSS::Node *> Heads = {GSSNode1};
370   glrReduce(Heads, tokenSymbol(tok::plus), {emptyTokenStream(), Arena, GSStack},
371             TestLang);
372   EXPECT_THAT(Heads,
373               ElementsAre(GSSNode1, AllOf(state(2), parsedSymbolID(id("term")),
374                                           parents(Root))));
375 
376   // When the lookahead is -, reduce is not performed.
377   Heads = {GSSNode1};
378   glrReduce(Heads, tokenSymbol(tok::minus),
379             {emptyTokenStream(), Arena, GSStack}, TestLang);
380   EXPECT_THAT(Heads, ElementsAre(GSSNode1));
381 }
382 
TEST_F(GLRTest,Recover)383 TEST_F(GLRTest, Recover) {
384   // Recovery while parsing "word" inside braces.
385   //  Before:
386   //    0--1({)--2(?)
387   //  After recovering a `word` at state 1:
388   //    0--3(word)  // 3 is goto(1, word)
389   buildGrammar({"word", "top"}, {"top := { word [recover=Braces] }"});
390   LRTable::Builder B(TestLang.G);
391   B.Transition[{StateID{1}, id("word")}] = StateID{3};
392   B.Recoveries.push_back({StateID{1}, {extensionID("Braces"), id("word")}});
393   TestLang.Table = std::move(B).build();
394   TestLang.RecoveryStrategies.try_emplace(extensionID("Braces"), recoverBraces);
395 
396   auto *LBrace = &Arena.createTerminal(tok::l_brace, 0);
397   auto *Question1 = &Arena.createTerminal(tok::question, 1);
398   const auto *Root = GSStack.addNode(0, nullptr, {});
399   const auto *OpenedBraces = GSStack.addNode(1, LBrace, {Root});
400   const auto *AfterQuestion1 = GSStack.addNode(2, Question1, {OpenedBraces});
401 
402   // Need a token stream with paired braces so the strategy works.
403   clang::LangOptions LOptions;
404   TokenStream Tokens = cook(lex("{ ? ? ? }", LOptions), LOptions);
405   pairBrackets(Tokens);
406   std::vector<const GSS::Node *> NewHeads;
407 
408   unsigned TokenIndex = 2;
409   glrRecover({AfterQuestion1}, TokenIndex, {Tokens, Arena, GSStack}, TestLang,
410              NewHeads);
411   EXPECT_EQ(TokenIndex, 4u) << "should skip ahead to matching brace";
412   EXPECT_THAT(NewHeads, ElementsAre(AllOf(state(3), parsedSymbolID(id("word")),
413                                           parents({OpenedBraces}), start(1u))));
414   EXPECT_EQ(NewHeads.front()->Payload->kind(), ForestNode::Opaque);
415 
416   // Test recovery failure: omit closing brace so strategy fails
417   TokenStream NoRBrace = cook(lex("{ ? ? ? ?", LOptions), LOptions);
418   pairBrackets(NoRBrace);
419   NewHeads.clear();
420   TokenIndex = 2;
421   glrRecover({AfterQuestion1}, TokenIndex, {NoRBrace, Arena, GSStack}, TestLang,
422              NewHeads);
423   EXPECT_EQ(TokenIndex, 2u) << "should not advance on failure";
424   EXPECT_THAT(NewHeads, IsEmpty());
425 }
426 
TEST_F(GLRTest,RecoverRightmost)427 TEST_F(GLRTest, RecoverRightmost) {
428   // In a nested block structure, we recover at the innermost possible block.
429   //  Before:
430   //    0--1({)--1({)--1({)
431   //  After recovering a `block` at inside the second braces:
432   //    0--1({)--2(body)  // 2 is goto(1, body)
433   buildGrammar({"body", "top"}, {"top := { body [recover=Braces] }"});
434   LRTable::Builder B(TestLang.G);
435   B.Transition[{StateID{1}, id("body")}] = StateID{2};
436   B.Recoveries.push_back({StateID{1}, {extensionID("Braces"), id("body")}});
437   TestLang.Table = std::move(B).build();
438   TestLang.RecoveryStrategies.try_emplace(extensionID("Braces"), recoverBraces);
439 
440   clang::LangOptions LOptions;
441   // Innermost brace is unmatched, to test fallback to next brace.
442   TokenStream Tokens = cook(lex("{ { { ? } }", LOptions), LOptions);
443   Tokens.tokens()[0].Pair = 5;
444   Tokens.tokens()[1].Pair = 4;
445   Tokens.tokens()[4].Pair = 1;
446   Tokens.tokens()[5].Pair = 0;
447 
448   auto *Brace1 = &Arena.createTerminal(tok::l_brace, 0);
449   auto *Brace2 = &Arena.createTerminal(tok::l_brace, 1);
450   auto *Brace3 = &Arena.createTerminal(tok::l_brace, 2);
451   const auto *Root = GSStack.addNode(0, nullptr, {});
452   const auto *In1 = GSStack.addNode(1, Brace1, {Root});
453   const auto *In2 = GSStack.addNode(1, Brace2, {In1});
454   const auto *In3 = GSStack.addNode(1, Brace3, {In2});
455 
456   unsigned TokenIndex = 3;
457   std::vector<const GSS::Node *> NewHeads;
458   glrRecover({In3}, TokenIndex, {Tokens, Arena, GSStack}, TestLang, NewHeads);
459   EXPECT_EQ(TokenIndex, 5u);
460   EXPECT_THAT(NewHeads, ElementsAre(AllOf(state(2), parsedSymbolID(id("body")),
461                                           parents({In2}), start(2u))));
462 }
463 
TEST_F(GLRTest,RecoverAlternatives)464 TEST_F(GLRTest, RecoverAlternatives) {
465   // Recovery inside braces with multiple equally good options
466   //  Before:
467   //    0--1({)
468   //  After recovering either `word` or `number` inside the braces:
469   //    0--1({)--2(word)   // 2 is goto(1, word)
470   //          └--3(number) // 3 is goto(1, number)
471   buildGrammar({"number", "word", "top"},
472                {
473                    "top := { number [recover=Braces] }",
474                    "top := { word [recover=Braces] }",
475                });
476   LRTable::Builder B(TestLang.G);
477   B.Transition[{StateID{1}, id("number")}] = StateID{2};
478   B.Transition[{StateID{1}, id("word")}] = StateID{3};
479   B.Recoveries.push_back({StateID{1}, {extensionID("Braces"), id("number")}});
480   B.Recoveries.push_back({StateID{1}, {extensionID("Braces"), id("word")}});
481   TestLang.RecoveryStrategies.try_emplace(extensionID("Braces"), recoverBraces);
482   TestLang.Table = std::move(B).build();
483   auto *LBrace = &Arena.createTerminal(tok::l_brace, 0);
484   const auto *Root = GSStack.addNode(0, nullptr, {});
485   const auto *OpenedBraces = GSStack.addNode(1, LBrace, {Root});
486 
487   clang::LangOptions LOptions;
488   TokenStream Tokens = cook(lex("{ ? }", LOptions), LOptions);
489   pairBrackets(Tokens);
490   std::vector<const GSS::Node *> NewHeads;
491   unsigned TokenIndex = 1;
492 
493   glrRecover({OpenedBraces}, TokenIndex, {Tokens, Arena, GSStack}, TestLang,
494              NewHeads);
495   EXPECT_EQ(TokenIndex, 2u);
496   EXPECT_THAT(NewHeads,
497               UnorderedElementsAre(AllOf(state(2), parsedSymbolID(id("number")),
498                                          parents({OpenedBraces}), start(1u)),
499                                    AllOf(state(3), parsedSymbolID(id("word")),
500                                          parents({OpenedBraces}), start(1u))));
501 }
502 
TEST_F(GLRTest,PerfectForestNodeSharing)503 TEST_F(GLRTest, PerfectForestNodeSharing) {
504   // Run the GLR on a simple grammar and test that we build exactly one forest
505   // node per (SymbolID, token range).
506 
507   // This is a grmammar where the original parsing-stack-based forest node
508   // sharing approach will fail. In its LR0 graph, it has two states containing
509   // item `expr := • IDENTIFIER`, and both have different goto states on the
510   // nonterminal `expr`.
511   build(R"bnf(
512     _ := test
513 
514     test := { expr
515     test := { IDENTIFIER
516     test := left-paren expr
517     left-paren := {
518     expr := IDENTIFIER
519   )bnf");
520   TestLang.Table = LRTable::buildSLR(TestLang.G);
521   clang::LangOptions LOptions;
522   const TokenStream &Tokens = cook(lex("{ abc", LOptions), LOptions);
523 
524   const ForestNode &Parsed =
525       glrParse({Tokens, Arena, GSStack}, id("test"), TestLang);
526   // Verify that there is no duplicated sequence node of `expr := IDENTIFIER`
527   // in the forest, see the `#1` and `=#1` in the dump string.
528   EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
529             "[  0, end) test := <ambiguous>\n"
530             "[  0, end) ├─test := { expr\n"
531             "[  0,   1) │ ├─{ := tok[0]\n"
532             "[  1, end) │ └─expr := IDENTIFIER #1\n"
533             "[  1, end) │   └─IDENTIFIER := tok[1]\n"
534             "[  0, end) ├─test := { IDENTIFIER\n"
535             "[  0,   1) │ ├─{ := tok[0]\n"
536             "[  1, end) │ └─IDENTIFIER := tok[1]\n"
537             "[  0, end) └─test := left-paren expr\n"
538             "[  0,   1)   ├─left-paren := {\n"
539             "[  0,   1)   │ └─{ := tok[0]\n"
540             "[  1, end)   └─expr =#1\n");
541 }
542 
TEST_F(GLRTest,GLRReduceOrder)543 TEST_F(GLRTest, GLRReduceOrder) {
544   // Given the following grammar, and the input `IDENTIFIER`, reductions should
545   // be performed in the following order:
546   //  1. foo := IDENTIFIER
547   //  2. { test := IDENTIFIER, test := foo }
548   // foo should be reduced first, so that in step 2 we have completed reduces
549   // for test, and form an ambiguous forest node.
550   build(R"bnf(
551     _ := test
552 
553     test := IDENTIFIER
554     test := foo
555     foo := IDENTIFIER
556   )bnf");
557   clang::LangOptions LOptions;
558   const TokenStream &Tokens = cook(lex("IDENTIFIER", LOptions), LOptions);
559   TestLang.Table = LRTable::buildSLR(TestLang.G);
560 
561   const ForestNode &Parsed =
562       glrParse({Tokens, Arena, GSStack}, id("test"), TestLang);
563   EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
564             "[  0, end) test := <ambiguous>\n"
565             "[  0, end) ├─test := IDENTIFIER\n"
566             "[  0, end) │ └─IDENTIFIER := tok[0]\n"
567             "[  0, end) └─test := foo\n"
568             "[  0, end)   └─foo := IDENTIFIER\n"
569             "[  0, end)     └─IDENTIFIER := tok[0]\n");
570 }
571 
TEST_F(GLRTest,RecoveryEndToEnd)572 TEST_F(GLRTest, RecoveryEndToEnd) {
573   // Simple example of brace-based recovery showing:
574   //  - recovered region includes tokens both ahead of and behind the cursor
575   //  - multiple possible recovery rules
576   //  - recovery from outer scopes is rejected
577   build(R"bnf(
578     _ := block
579 
580     block := { block [recover=Braces] }
581     block := { numbers [recover=Braces] }
582     numbers := NUMERIC_CONSTANT NUMERIC_CONSTANT
583   )bnf");
584   TestLang.Table = LRTable::buildSLR(TestLang.G);
585   TestLang.RecoveryStrategies.try_emplace(extensionID("Braces"), recoverBraces);
586   clang::LangOptions LOptions;
587   TokenStream Tokens = cook(lex("{ { 42 ? } }", LOptions), LOptions);
588   pairBrackets(Tokens);
589 
590   const ForestNode &Parsed =
591       glrParse({Tokens, Arena, GSStack}, id("block"), TestLang);
592   EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
593             "[  0, end) block := { block [recover=Braces] }\n"
594             "[  0,   1) ├─{ := tok[0]\n"
595             "[  1,   5) ├─block := <ambiguous>\n"
596             "[  1,   5) │ ├─block := { block [recover=Braces] }\n"
597             "[  1,   2) │ │ ├─{ := tok[1]\n"
598             "[  2,   4) │ │ ├─block := <opaque>\n"
599             "[  4,   5) │ │ └─} := tok[4]\n"
600             "[  1,   5) │ └─block := { numbers [recover=Braces] }\n"
601             "[  1,   2) │   ├─{ := tok[1]\n"
602             "[  2,   4) │   ├─numbers := <opaque>\n"
603             "[  4,   5) │   └─} := tok[4]\n"
604             "[  5, end) └─} := tok[5]\n");
605 }
606 
TEST_F(GLRTest,RecoverTerminal)607 TEST_F(GLRTest, RecoverTerminal) {
608   build(R"bnf(
609     _ := stmt
610 
611     stmt := IDENTIFIER ; [recover=Skip]
612   )bnf");
613   TestLang.Table = LRTable::buildSLR(TestLang.G);
614   TestLang.RecoveryStrategies.try_emplace(
615       extensionID("Skip"),
616       [](Token::Index Start, const TokenStream &) { return Start + 1; });
617   clang::LangOptions LOptions;
618   TokenStream Tokens = cook(lex("foo", LOptions), LOptions);
619 
620   const ForestNode &Parsed =
621       glrParse({Tokens, Arena, GSStack}, id("stmt"), TestLang);
622   EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
623             "[  0, end) stmt := IDENTIFIER ; [recover=Skip]\n"
624             "[  0,   1) ├─IDENTIFIER := tok[0]\n"
625             "[  1, end) └─; := <opaque>\n");
626 }
627 
628 
TEST_F(GLRTest,NoExplicitAccept)629 TEST_F(GLRTest, NoExplicitAccept) {
630   build(R"bnf(
631     _ := test
632 
633     test := IDENTIFIER test
634     test := IDENTIFIER
635   )bnf");
636   clang::LangOptions LOptions;
637   // Given the following input, and the grammar above, we perform two reductions
638   // of the nonterminal `test` when the next token is `eof`, verify that the
639   // parser stops at the right state.
640   const TokenStream &Tokens = cook(lex("id id", LOptions), LOptions);
641   TestLang.Table = LRTable::buildSLR(TestLang.G);
642 
643   const ForestNode &Parsed =
644       glrParse({Tokens, Arena, GSStack}, id("test"), TestLang);
645   EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
646             "[  0, end) test := IDENTIFIER test\n"
647             "[  0,   1) ├─IDENTIFIER := tok[0]\n"
648             "[  1, end) └─test := IDENTIFIER\n"
649             "[  1, end)   └─IDENTIFIER := tok[1]\n");
650 }
651 
TEST_F(GLRTest,GuardExtension)652 TEST_F(GLRTest, GuardExtension) {
653   build(R"bnf(
654     _ := start
655 
656     start := IDENTIFIER [guard]
657   )bnf");
658   TestLang.Guards.try_emplace(
659       ruleFor("start"), [&](const GuardParams &P) {
660         assert(P.RHS.size() == 1 &&
661                P.RHS.front()->symbol() ==
662                    tokenSymbol(clang::tok::identifier));
663         return P.Tokens.tokens()[P.RHS.front()->startTokenIndex()]
664                    .text() == "test";
665       });
666   clang::LangOptions LOptions;
667   TestLang.Table = LRTable::buildSLR(TestLang.G);
668 
669   std::string Input = "test";
670   const TokenStream &Succeeded = cook(lex(Input, LOptions), LOptions);
671   EXPECT_EQ(glrParse({Succeeded, Arena, GSStack}, id("start"), TestLang)
672                 .dumpRecursive(TestLang.G),
673             "[  0, end) start := IDENTIFIER [guard]\n"
674             "[  0, end) └─IDENTIFIER := tok[0]\n");
675 
676   Input = "notest";
677   const TokenStream &Failed = cook(lex(Input, LOptions), LOptions);
678   EXPECT_EQ(glrParse({Failed, Arena, GSStack}, id("start"), TestLang)
679                 .dumpRecursive(TestLang.G),
680             "[  0, end) start := <opaque>\n");
681 }
682 
TEST(GSSTest,GC)683 TEST(GSSTest, GC) {
684   //      ┌-A-┬-AB
685   //      ├-B-┘
686   // Root-+-C
687   //      ├-D
688   //      └-E
689   GSS GSStack;
690   auto *Root = GSStack.addNode(0, nullptr, {});
691   auto *A = GSStack.addNode(0, nullptr, {Root});
692   auto *B = GSStack.addNode(0, nullptr, {Root});
693   auto *C = GSStack.addNode(0, nullptr, {Root});
694   auto *D = GSStack.addNode(0, nullptr, {Root});
695   auto *AB = GSStack.addNode(0, nullptr, {A, B});
696 
697   EXPECT_EQ(1u, GSStack.gc({AB, C})) << "D is destroyed";
698   EXPECT_EQ(0u, GSStack.gc({AB, C})) << "D is already gone";
699   auto *E = GSStack.addNode(0, nullptr, {Root});
700   EXPECT_EQ(D, E) << "Storage of GCed node D is reused for E";
701   EXPECT_EQ(3u, GSStack.gc({A, E})) << "Destroys B, AB, C";
702   EXPECT_EQ(1u, GSStack.gc({E})) << "Destroys A";
703 }
704 
705 } // namespace
706 } // namespace pseudo
707 } // namespace clang
708