1 //===--- GLR.cpp -----------------------------------------------*- C++-*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8
9 #include "clang-pseudo/GLR.h"
10 #include "clang-pseudo/Language.h"
11 #include "clang-pseudo/grammar/Grammar.h"
12 #include "clang-pseudo/grammar/LRTable.h"
13 #include "clang/Basic/TokenKinds.h"
14 #include "llvm/ADT/ArrayRef.h"
15 #include "llvm/ADT/STLExtras.h"
16 #include "llvm/ADT/ScopeExit.h"
17 #include "llvm/ADT/StringExtras.h"
18 #include "llvm/Support/Debug.h"
19 #include "llvm/Support/FormatVariadic.h"
20 #include <algorithm>
21 #include <memory>
22 #include <queue>
23
24 #define DEBUG_TYPE "GLR.cpp"
25
26 namespace clang {
27 namespace pseudo {
28 namespace {
29
findRecoveryEndpoint(ExtensionID Strategy,Token::Index Begin,const TokenStream & Tokens,const Language & Lang)30 Token::Index findRecoveryEndpoint(ExtensionID Strategy, Token::Index Begin,
31 const TokenStream &Tokens,
32 const Language &Lang) {
33 assert(Strategy != 0);
34 assert(Begin > 0);
35 if (auto S = Lang.RecoveryStrategies.lookup(Strategy))
36 return S(Begin, Tokens);
37 return Token::Invalid;
38 }
39
40 } // namespace
41
glrRecover(llvm::ArrayRef<const GSS::Node * > OldHeads,unsigned & TokenIndex,const ParseParams & Params,const Language & Lang,std::vector<const GSS::Node * > & NewHeads)42 void glrRecover(llvm::ArrayRef<const GSS::Node *> OldHeads,
43 unsigned &TokenIndex, const ParseParams &Params,
44 const Language &Lang,
45 std::vector<const GSS::Node *> &NewHeads) {
46 LLVM_DEBUG(llvm::dbgs() << "Recovery at token " << TokenIndex << "...\n");
47 // Describes a possibility to recover by forcibly interpreting a range of
48 // tokens around the cursor as a nonterminal that we expected to see.
49 struct PlaceholderRecovery {
50 // The token prior to the nonterminal which is being recovered.
51 // This starts of the region we're skipping, so higher Position is better.
52 Token::Index Position;
53 // The nonterminal which will be created in order to recover.
54 SymbolID Symbol;
55 // The heuristic used to choose the bounds of the nonterminal to recover.
56 ExtensionID Strategy;
57
58 // The GSS head where we are expecting the recovered nonterminal.
59 const GSS::Node *RecoveryNode;
60 // Payload of nodes on the way back from the OldHead to the recovery node.
61 // These represent the partial parse that is being discarded.
62 // They should become the children of the opaque recovery node.
63 // FIXME: internal structure of opaque nodes is not implemented.
64 //
65 // There may be multiple paths leading to the same recovery node, we choose
66 // one arbitrarily.
67 std::vector<const ForestNode *> DiscardedParse;
68 };
69 std::vector<PlaceholderRecovery> Options;
70
71 // Find recovery options by walking up the stack.
72 //
73 // This is similar to exception handling: we walk up the "frames" of nested
74 // rules being parsed until we find one that has a "handler" which allows us
75 // to determine the node bounds without parsing it.
76 //
77 // Unfortunately there's a significant difference: the stack contains both
78 // "upward" nodes (ancestor parses) and "leftward" ones.
79 // e.g. when parsing `{ if (1) ? }` as compound-stmt, the stack contains:
80 // stmt := IF ( expr ) . stmt - current state, we should recover here!
81 // stmt := IF ( expr . ) stmt - (left, no recovery here)
82 // stmt := IF ( . expr ) stmt - left, we should NOT recover here!
83 // stmt := IF . ( expr ) stmt - (left, no recovery here)
84 // stmt-seq := . stmt - up, we might recover here
85 // compound-stmt := { . stmt-seq } - up, we should recover here!
86 //
87 // It's not obvious how to avoid collecting "leftward" recovery options.
88 // I think the distinction is ill-defined after merging items into states.
89 // For now, we have to take this into account when defining recovery rules.
90 // (e.g. in the expr recovery above, stay inside the parentheses).
91 // FIXME: find a more satisfying way to avoid such false recovery.
92 // FIXME: Add a test for spurious recovery once tests can define strategies.
93 std::vector<const ForestNode *> Path;
94 llvm::DenseSet<const GSS::Node *> Seen;
95 auto WalkUp = [&](const GSS::Node *N, Token::Index NextTok, auto &WalkUp) {
96 if (!Seen.insert(N).second)
97 return;
98 for (auto Strategy : Lang.Table.getRecovery(N->State)) {
99 Options.push_back(PlaceholderRecovery{
100 NextTok,
101 Strategy.Result,
102 Strategy.Strategy,
103 N,
104 Path,
105 });
106 LLVM_DEBUG(llvm::dbgs()
107 << "Option: recover " << Lang.G.symbolName(Strategy.Result)
108 << " at token " << NextTok << "\n");
109 }
110 Path.push_back(N->Payload);
111 for (const GSS::Node *Parent : N->parents())
112 WalkUp(Parent, N->Payload->startTokenIndex(), WalkUp);
113 Path.pop_back();
114 };
115 for (auto *N : OldHeads)
116 WalkUp(N, TokenIndex, WalkUp);
117
118 // Now we select the option(s) we will use to recover.
119 //
120 // We prefer options starting further right, as these discard less code
121 // (e.g. we prefer to recover inner scopes rather than outer ones).
122 // The options also need to agree on an endpoint, so the parser has a
123 // consistent position afterwards.
124 //
125 // So conceptually we're sorting by the tuple (start, end), though we avoid
126 // computing `end` for options that can't be winners.
127
128 // Consider options starting further right first.
129 // Don't drop the others yet though, we may still use them if preferred fails.
130 llvm::stable_sort(Options, [&](const auto &L, const auto &R) {
131 return L.Position > R.Position;
132 });
133
134 // We may find multiple winners, but they will have the same range.
135 llvm::Optional<Token::Range> RecoveryRange;
136 std::vector<const PlaceholderRecovery *> BestOptions;
137 for (const PlaceholderRecovery &Option : Options) {
138 // If this starts further left than options we've already found, then
139 // we'll never find anything better. Skip computing End for the rest.
140 if (RecoveryRange && Option.Position < RecoveryRange->Begin)
141 break;
142
143 auto End = findRecoveryEndpoint(Option.Strategy, Option.Position,
144 Params.Code, Lang);
145 // Recovery may not take the parse backwards.
146 if (End == Token::Invalid || End < TokenIndex)
147 continue;
148 if (RecoveryRange) {
149 // If this is worse than our previous options, ignore it.
150 if (RecoveryRange->End < End)
151 continue;
152 // If this is an improvement over our previous options, then drop them.
153 if (RecoveryRange->End > End)
154 BestOptions.clear();
155 }
156 // Create recovery nodes and heads for them in the GSS. These may be
157 // discarded if a better recovery is later found, but this path isn't hot.
158 RecoveryRange = {Option.Position, End};
159 BestOptions.push_back(&Option);
160 }
161
162 if (BestOptions.empty()) {
163 LLVM_DEBUG(llvm::dbgs() << "Recovery failed after trying " << Options.size()
164 << " strategies\n");
165 return;
166 }
167
168 // We've settled on a set of recovery options, so create their nodes and
169 // advance the cursor.
170 LLVM_DEBUG({
171 llvm::dbgs() << "Recovered range=" << *RecoveryRange << ":";
172 for (const auto *Option : BestOptions)
173 llvm::dbgs() << " " << Lang.G.symbolName(Option->Symbol);
174 llvm::dbgs() << "\n";
175 });
176 // FIXME: in general, we might have the same Option->Symbol multiple times,
177 // and we risk creating redundant Forest and GSS nodes.
178 // We also may inadvertently set up the next glrReduce to create a sequence
179 // node duplicating an opaque node that we're creating here.
180 // There are various options, including simply breaking ties between options.
181 // For now it's obscure enough to ignore.
182 for (const PlaceholderRecovery *Option : BestOptions) {
183 const ForestNode &Placeholder =
184 Params.Forest.createOpaque(Option->Symbol, RecoveryRange->Begin);
185 LRTable::StateID OldState = Option->RecoveryNode->State;
186 LRTable::StateID NewState =
187 isToken(Option->Symbol)
188 ? *Lang.Table.getShiftState(OldState, Option->Symbol)
189 : *Lang.Table.getGoToState(OldState, Option->Symbol);
190 const GSS::Node *NewHead =
191 Params.GSStack.addNode(NewState, &Placeholder, {Option->RecoveryNode});
192 NewHeads.push_back(NewHead);
193 }
194 TokenIndex = RecoveryRange->End;
195 }
196
197 using StateID = LRTable::StateID;
198
operator <<(llvm::raw_ostream & OS,const GSS::Node & N)199 llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const GSS::Node &N) {
200 std::vector<std::string> ParentStates;
201 for (const auto *Parent : N.parents())
202 ParentStates.push_back(llvm::formatv("{0}", Parent->State));
203 OS << llvm::formatv("state {0}, parsed symbol {1}, parents {3}", N.State,
204 N.Payload ? N.Payload->symbol() : 0,
205 llvm::join(ParentStates, ", "));
206 return OS;
207 }
208
209 // Apply all pending shift actions.
210 // In theory, LR parsing doesn't have shift/shift conflicts on a single head.
211 // But we may have multiple active heads, and each head has a shift action.
212 //
213 // We merge the stack -- if multiple heads will reach the same state after
214 // shifting a token, we shift only once by combining these heads.
215 //
216 // E.g. we have two heads (2, 3) in the GSS, and will shift both to reach 4:
217 // 0---1---2
218 // └---3
219 // After the shift action, the GSS is:
220 // 0---1---2---4
221 // └---3---┘
glrShift(llvm::ArrayRef<const GSS::Node * > OldHeads,const ForestNode & NewTok,const ParseParams & Params,const Language & Lang,std::vector<const GSS::Node * > & NewHeads)222 void glrShift(llvm::ArrayRef<const GSS::Node *> OldHeads,
223 const ForestNode &NewTok, const ParseParams &Params,
224 const Language &Lang, std::vector<const GSS::Node *> &NewHeads) {
225 assert(NewTok.kind() == ForestNode::Terminal);
226 LLVM_DEBUG(llvm::dbgs() << llvm::formatv(" Shift {0} ({1} active heads):\n",
227 Lang.G.symbolName(NewTok.symbol()),
228 OldHeads.size()));
229
230 // We group pending shifts by their target state so we can merge them.
231 llvm::SmallVector<std::pair<StateID, const GSS::Node *>, 8> Shifts;
232 for (const auto *H : OldHeads)
233 if (auto S = Lang.Table.getShiftState(H->State, NewTok.symbol()))
234 Shifts.push_back({*S, H});
235 llvm::stable_sort(Shifts, llvm::less_first{});
236
237 auto Rest = llvm::makeArrayRef(Shifts);
238 llvm::SmallVector<const GSS::Node *> Parents;
239 while (!Rest.empty()) {
240 // Collect the batch of PendingShift that have compatible shift states.
241 // Their heads become TempParents, the parents of the new GSS node.
242 StateID NextState = Rest.front().first;
243
244 Parents.clear();
245 for (const auto &Base : Rest) {
246 if (Base.first != NextState)
247 break;
248 Parents.push_back(Base.second);
249 }
250 Rest = Rest.drop_front(Parents.size());
251
252 LLVM_DEBUG(llvm::dbgs() << llvm::formatv(" --> S{0} ({1} heads)\n",
253 NextState, Parents.size()));
254 NewHeads.push_back(Params.GSStack.addNode(NextState, &NewTok, Parents));
255 }
256 }
257
258 namespace {
259 // A KeyedQueue yields pairs of keys and values in order of the keys.
260 template <typename Key, typename Value>
261 using KeyedQueue =
262 std::priority_queue<std::pair<Key, Value>,
263 std::vector<std::pair<Key, Value>>, llvm::less_first>;
264
sortAndUnique(std::vector<T> & Vec)265 template <typename T> void sortAndUnique(std::vector<T> &Vec) {
266 llvm::sort(Vec);
267 Vec.erase(std::unique(Vec.begin(), Vec.end()), Vec.end());
268 }
269
270 // Perform reduces until no more are possible.
271 //
272 // Generally this means walking up from the heads gathering ForestNodes that
273 // will match the RHS of the rule we're reducing into a sequence ForestNode,
274 // and ending up at a base node.
275 // Then we push a new GSS node onto that base, taking care to:
276 // - pack alternative sequence ForestNodes into an ambiguous ForestNode.
277 // - use the same GSS node for multiple heads if the parse state matches.
278 //
279 // Examples of reduction:
280 // Before (simple):
281 // 0--1(expr)--2(semi)
282 // After reducing 2 by `stmt := expr semi`:
283 // 0--3(stmt) // 3 is goto(0, stmt)
284 //
285 // Before (splitting due to R/R conflict):
286 // 0--1(IDENTIFIER)
287 // After reducing 1 by `class-name := IDENTIFIER` & `enum-name := IDENTIFIER`:
288 // 0--2(class-name) // 2 is goto(0, class-name)
289 // └--3(enum-name) // 3 is goto(0, enum-name)
290 //
291 // Before (splitting due to multiple bases):
292 // 0--2(class-name)--4(STAR)
293 // └--3(enum-name)---┘
294 // After reducing 4 by `ptr-operator := STAR`:
295 // 0--2(class-name)--5(ptr-operator) // 5 is goto(2, ptr-operator)
296 // └--3(enum-name)---6(ptr-operator) // 6 is goto(3, ptr-operator)
297 //
298 // Before (joining due to same goto state, multiple bases):
299 // 0--1(cv-qualifier)--3(class-name)
300 // └--2(cv-qualifier)--4(enum-name)
301 // After reducing 3 by `type-name := class-name` and
302 // 4 by `type-name := enum-name`:
303 // 0--1(cv-qualifier)--5(type-name) // 5 is goto(1, type-name) and
304 // └--2(cv-qualifier)--┘ // goto(2, type-name)
305 //
306 // Before (joining due to same goto state, the same base):
307 // 0--1(class-name)--3(STAR)
308 // └--2(enum-name)--4(STAR)
309 // After reducing 3 by `pointer := class-name STAR` and
310 // 2 by`enum-name := class-name STAR`:
311 // 0--5(pointer) // 5 is goto(0, pointer)
312 //
313 // (This is a functor rather than a function to allow it to reuse scratch
314 // storage across calls).
315 class GLRReduce {
316 const ParseParams &Params;
317 const Language& Lang;
318 // There are two interacting complications:
319 // 1. Performing one reduce can unlock new reduces on the newly-created head.
320 // 2a. The ambiguous ForestNodes must be complete (have all sequence nodes).
321 // This means we must have unlocked all the reduces that contribute to it.
322 // 2b. Similarly, the new GSS nodes must be complete (have all parents).
323 //
324 // We define a "family" of reduces as those that produce the same symbol and
325 // cover the same range of tokens. These are exactly the set of reductions
326 // whose sequence nodes would be covered by the same ambiguous node.
327 // We wish to process a whole family at a time (to satisfy complication 2),
328 // and can address complication 1 by carefully ordering the families:
329 // - Process families covering fewer tokens first.
330 // A reduce can't depend on a longer reduce!
331 // - For equal token ranges: if S := T, process T families before S families.
332 // Parsing T can't depend on an equal-length S, as the grammar is acyclic.
333 //
334 // This isn't quite enough: we don't know the token length of the reduction
335 // until we walk up the stack to perform the pop.
336 // So we perform the pop part upfront, and place the push specification on
337 // priority queues such that we can retrieve a family at a time.
338
339 // A reduction family is characterized by its token range and symbol produced.
340 // It is used as a key in the priority queues to group pushes by family.
341 struct Family {
342 // The start of the token range of the reduce.
343 Token::Index Start;
344 SymbolID Symbol;
345 // Rule must produce Symbol and can otherwise be arbitrary.
346 // RuleIDs have the topological order based on the acyclic grammar.
347 // FIXME: should SymbolIDs be so ordered instead?
348 RuleID Rule;
349
operator ==clang::pseudo::__anonec8d3c1d0411::GLRReduce::Family350 bool operator==(const Family &Other) const {
351 return Start == Other.Start && Symbol == Other.Symbol;
352 }
353 // The larger Family is the one that should be processed first.
operator <clang::pseudo::__anonec8d3c1d0411::GLRReduce::Family354 bool operator<(const Family &Other) const {
355 if (Start != Other.Start)
356 return Start < Other.Start;
357 if (Symbol != Other.Symbol)
358 return Rule > Other.Rule;
359 assert(*this == Other);
360 return false;
361 }
362 };
363
364 // A sequence is the ForestNode payloads of the GSS nodes we are reducing.
365 using Sequence = llvm::SmallVector<const ForestNode *, Rule::MaxElements>;
366 // Like ArrayRef<const ForestNode*>, but with the missing operator<.
367 // (Sequences are big to move by value as the collections gets rearranged).
368 struct SequenceRef {
SequenceRefclang::pseudo::__anonec8d3c1d0411::GLRReduce::SequenceRef369 SequenceRef(const Sequence &S) : S(S) {}
370 llvm::ArrayRef<const ForestNode *> S;
operator ==(SequenceRef A,SequenceRef B)371 friend bool operator==(SequenceRef A, SequenceRef B) { return A.S == B.S; }
operator <(const SequenceRef & A,const SequenceRef & B)372 friend bool operator<(const SequenceRef &A, const SequenceRef &B) {
373 return std::lexicographical_compare(A.S.begin(), A.S.end(), B.S.begin(),
374 B.S.end());
375 }
376 };
377 // Underlying storage for sequences pointed to by stored SequenceRefs.
378 std::deque<Sequence> SequenceStorage;
379 // We don't actually destroy the sequences between calls, to reuse storage.
380 // Everything SequenceStorage[ >=SequenceStorageCount ] is reusable scratch.
381 unsigned SequenceStorageCount;
382
383 // Halfway through a reduction (after the pop, before the push), we have
384 // collected nodes for the RHS of a rule, and reached a base node.
385 // They specify a sequence ForestNode we may build (but we dedup first).
386 // (The RuleID is not stored here, but rather in the Family).
387 struct PushSpec {
388 // The last node popped before pushing. Its parent is the reduction base(s).
389 // (Base is more fundamental, but this is cheaper to store).
390 const GSS::Node* LastPop = nullptr;
391 Sequence *Seq = nullptr;
392 };
393 KeyedQueue<Family, PushSpec> Sequences; // FIXME: rename => PendingPushes?
394
395 // We treat Heads as a queue of Pop operations still to be performed.
396 // PoppedHeads is our position within it.
397 std::vector<const GSS::Node *> *Heads;
398 unsigned NextPopHead;
399 SymbolID Lookahead;
400
401 Sequence TempSequence;
402 public:
GLRReduce(const ParseParams & Params,const Language & Lang)403 GLRReduce(const ParseParams &Params, const Language &Lang)
404 : Params(Params), Lang(Lang) {}
405
operator ()(std::vector<const GSS::Node * > & Heads,SymbolID Lookahead)406 void operator()(std::vector<const GSS::Node *> &Heads, SymbolID Lookahead) {
407 assert(isToken(Lookahead));
408
409 NextPopHead = 0;
410 this->Heads = &Heads;
411 this->Lookahead = Lookahead;
412 assert(Sequences.empty());
413 SequenceStorageCount = 0;
414
415 popPending();
416 while (!Sequences.empty()) {
417 pushNext();
418 popPending();
419 }
420 }
421
422 private:
canReduce(const Rule & R,RuleID RID,llvm::ArrayRef<const ForestNode * > RHS) const423 bool canReduce(const Rule &R, RuleID RID,
424 llvm::ArrayRef<const ForestNode *> RHS) const {
425 if (!R.Guarded)
426 return true;
427 if (auto Guard = Lang.Guards.lookup(RID))
428 return Guard({RHS, Params.Code, Lookahead});
429 LLVM_DEBUG(llvm::dbgs()
430 << llvm::formatv("missing guard implementation for rule {0}\n",
431 Lang.G.dumpRule(RID)));
432 return true;
433 }
434 // pop walks up the parent chain(s) for a reduction from Head by to Rule.
435 // Once we reach the end, record the bases and sequences.
pop(const GSS::Node * Head,RuleID RID,const Rule & Rule)436 void pop(const GSS::Node *Head, RuleID RID, const Rule &Rule) {
437 LLVM_DEBUG(llvm::dbgs() << " Pop " << Lang.G.dumpRule(RID) << "\n");
438 Family F{/*Start=*/0, /*Symbol=*/Rule.Target, /*Rule=*/RID};
439 TempSequence.resize_for_overwrite(Rule.Size);
440 auto DFS = [&](const GSS::Node *N, unsigned I, auto &DFS) {
441 TempSequence[Rule.Size - 1 - I] = N->Payload;
442 if (I + 1 == Rule.Size) {
443 F.Start = TempSequence.front()->startTokenIndex();
444 LLVM_DEBUG({
445 for (const auto *B : N->parents())
446 llvm::dbgs() << " --> base at S" << B->State << "\n";
447 });
448 if (!canReduce(Rule, RID, TempSequence))
449 return;
450 // Copy the chain to stable storage so it can be enqueued.
451 if (SequenceStorageCount == SequenceStorage.size())
452 SequenceStorage.emplace_back();
453 SequenceStorage[SequenceStorageCount] = TempSequence;
454 Sequence *Seq = &SequenceStorage[SequenceStorageCount++];
455
456 Sequences.emplace(F, PushSpec{N, Seq});
457 return;
458 }
459 for (const GSS::Node *Parent : N->parents())
460 DFS(Parent, I + 1, DFS);
461 };
462 DFS(Head, 0, DFS);
463 }
464
465 // popPending pops every available reduction.
popPending()466 void popPending() {
467 for (; NextPopHead < Heads->size(); ++NextPopHead) {
468 // In trivial cases, we perform the complete reduce here!
469 if (popAndPushTrivial())
470 continue;
471 for (RuleID RID :
472 Lang.Table.getReduceRules((*Heads)[NextPopHead]->State)) {
473 const auto &Rule = Lang.G.lookupRule(RID);
474 if (Lang.Table.canFollow(Rule.Target, Lookahead))
475 pop((*Heads)[NextPopHead], RID, Rule);
476 }
477 }
478 }
479
480 // Storage reused by each call to pushNext.
481 std::vector<std::pair</*Goto*/ StateID, const GSS::Node *>> FamilyBases;
482 std::vector<std::pair<RuleID, SequenceRef>> FamilySequences;
483 std::vector<const GSS::Node *> Parents;
484 std::vector<const ForestNode *> SequenceNodes;
485
486 // Process one push family, forming a forest node.
487 // This produces new GSS heads which may enable more pops.
pushNext()488 void pushNext() {
489 assert(!Sequences.empty());
490 Family F = Sequences.top().first;
491
492 LLVM_DEBUG(llvm::dbgs() << " Push " << Lang.G.symbolName(F.Symbol)
493 << " from token " << F.Start << "\n");
494
495 // Grab the sequences and bases for this family.
496 // We don't care which rule yielded each base. If Family.Symbol is S, the
497 // base includes an item X := ... • S ... and since the grammar is
498 // context-free, *all* parses of S are valid here.
499 FamilySequences.clear();
500 FamilyBases.clear();
501 do {
502 const PushSpec &Push = Sequences.top().second;
503 FamilySequences.emplace_back(Sequences.top().first.Rule, *Push.Seq);
504 for (const GSS::Node *Base : Push.LastPop->parents()) {
505 auto NextState = Lang.Table.getGoToState(Base->State, F.Symbol);
506 assert(NextState.has_value() && "goto must succeed after reduce!");
507 FamilyBases.emplace_back(*NextState, Base);
508 }
509
510 Sequences.pop();
511 } while (!Sequences.empty() && Sequences.top().first == F);
512 // Build a forest node for each unique sequence.
513 sortAndUnique(FamilySequences);
514 SequenceNodes.clear();
515 for (const auto &SequenceSpec : FamilySequences)
516 SequenceNodes.push_back(&Params.Forest.createSequence(
517 F.Symbol, SequenceSpec.first, SequenceSpec.second.S));
518 // Wrap in an ambiguous node if needed.
519 const ForestNode *Parsed =
520 SequenceNodes.size() == 1
521 ? SequenceNodes.front()
522 : &Params.Forest.createAmbiguous(F.Symbol, SequenceNodes);
523 LLVM_DEBUG(llvm::dbgs() << " --> " << Parsed->dump(Lang.G) << "\n");
524
525 // Bases for this family, deduplicate them, and group by the goTo State.
526 sortAndUnique(FamilyBases);
527 // Create a GSS node for each unique goto state.
528 llvm::ArrayRef<decltype(FamilyBases)::value_type> BasesLeft = FamilyBases;
529 while (!BasesLeft.empty()) {
530 StateID NextState = BasesLeft.front().first;
531 Parents.clear();
532 for (const auto &Base : BasesLeft) {
533 if (Base.first != NextState)
534 break;
535 Parents.push_back(Base.second);
536 }
537 BasesLeft = BasesLeft.drop_front(Parents.size());
538 Heads->push_back(Params.GSStack.addNode(NextState, Parsed, Parents));
539 }
540 }
541
542 // In general we split a reduce into a pop/push, so concurrently-available
543 // reductions can run in the correct order. The data structures are expensive.
544 //
545 // When only one reduction is possible at a time, we can skip this:
546 // we pop and immediately push, as an LR parser (as opposed to GLR) would.
547 // This is valid whenever there's only one concurrent PushSpec.
548 //
549 // This function handles a trivial but common subset of these cases:
550 // - there must be no pending pushes, and only one poppable head
551 // - the head must have only one reduction rule
552 // - the reduction path must be a straight line (no multiple parents)
553 // (Roughly this means there's no local ambiguity, so the LR algorithm works).
554 //
555 // Returns true if we successfully consumed the next unpopped head.
popAndPushTrivial()556 bool popAndPushTrivial() {
557 if (!Sequences.empty() || Heads->size() != NextPopHead + 1)
558 return false;
559 const GSS::Node *Head = Heads->back();
560 llvm::Optional<RuleID> RID;
561 for (RuleID R : Lang.Table.getReduceRules(Head->State)) {
562 if (RID.has_value())
563 return false;
564 RID = R;
565 }
566 if (!RID)
567 return true; // no reductions available, but we've processed the head!
568 const auto &Rule = Lang.G.lookupRule(*RID);
569 if (!Lang.Table.canFollow(Rule.Target, Lookahead))
570 return true; // reduction is not available
571 const GSS::Node *Base = Head;
572 TempSequence.resize_for_overwrite(Rule.Size);
573 for (unsigned I = 0; I < Rule.Size; ++I) {
574 if (Base->parents().size() != 1)
575 return false;
576 TempSequence[Rule.Size - 1 - I] = Base->Payload;
577 Base = Base->parents().front();
578 }
579 if (!canReduce(Rule, *RID, TempSequence))
580 return true; // reduction is not available
581 const ForestNode *Parsed =
582 &Params.Forest.createSequence(Rule.Target, *RID, TempSequence);
583 auto NextState = Lang.Table.getGoToState(Base->State, Rule.Target);
584 assert(NextState.has_value() && "goto must succeed after reduce!");
585 Heads->push_back(Params.GSStack.addNode(*NextState, Parsed, {Base}));
586 return true;
587 }
588 };
589
590 } // namespace
591
glrParse(const ParseParams & Params,SymbolID StartSymbol,const Language & Lang)592 const ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol,
593 const Language &Lang) {
594 GLRReduce Reduce(Params, Lang);
595 assert(isNonterminal(StartSymbol) && "Start symbol must be a nonterminal");
596 llvm::ArrayRef<ForestNode> Terminals = Params.Forest.createTerminals(Params.Code);
597 auto &GSS = Params.GSStack;
598
599 StateID StartState = Lang.Table.getStartState(StartSymbol);
600 // Heads correspond to the parse of tokens [0, I), NextHeads to [0, I+1).
601 std::vector<const GSS::Node *> Heads = {GSS.addNode(/*State=*/StartState,
602 /*ForestNode=*/nullptr,
603 {})};
604 std::vector<const GSS::Node *> NextHeads;
605 auto MaybeGC = [&, Roots(std::vector<const GSS::Node *>{}), I(0u)]() mutable {
606 assert(NextHeads.empty() && "Running GC at the wrong time!");
607 if (++I != 20) // Run periodically to balance CPU and memory usage.
608 return;
609 I = 0;
610
611 // We need to copy the list: Roots is consumed by the GC.
612 Roots = Heads;
613 GSS.gc(std::move(Roots));
614 };
615 // Each iteration fully processes a single token.
616 for (unsigned I = 0; I < Terminals.size();) {
617 LLVM_DEBUG(llvm::dbgs() << llvm::formatv(
618 "Next token {0} (id={1})\n",
619 Lang.G.symbolName(Terminals[I].symbol()), Terminals[I].symbol()));
620 // Consume the token.
621 glrShift(Heads, Terminals[I], Params, Lang, NextHeads);
622
623 // If we weren't able to consume the token, try to skip over some tokens
624 // so we can keep parsing.
625 if (NextHeads.empty()) {
626 // FIXME: Heads may not be fully reduced, because our reductions were
627 // constrained by lookahead (but lookahead is meaningless to recovery).
628 glrRecover(Heads, I, Params, Lang, NextHeads);
629 if (NextHeads.empty())
630 // FIXME: Ensure the `_ := start-symbol` rules have a fallback
631 // error-recovery strategy attached. Then this condition can't happen.
632 return Params.Forest.createOpaque(StartSymbol, /*Token::Index=*/0);
633 } else
634 ++I;
635
636 // Form nonterminals containing the token we just consumed.
637 SymbolID Lookahead =
638 I == Terminals.size() ? tokenSymbol(tok::eof) : Terminals[I].symbol();
639 Reduce(NextHeads, Lookahead);
640 // Prepare for the next token.
641 std::swap(Heads, NextHeads);
642 NextHeads.clear();
643 MaybeGC();
644 }
645 LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Reached eof\n"));
646
647 // The parse was successful if we're in state `_ := start-symbol .`
648 auto AcceptState = Lang.Table.getGoToState(StartState, StartSymbol);
649 assert(AcceptState.has_value() && "goto must succeed after start symbol!");
650 auto SearchForAccept = [&](llvm::ArrayRef<const GSS::Node *> Heads) {
651 const ForestNode *Result = nullptr;
652 for (const auto *Head : Heads) {
653 if (Head->State == *AcceptState) {
654 assert(Head->Payload->symbol() == StartSymbol);
655 assert(Result == nullptr && "multiple results!");
656 Result = Head->Payload;
657 }
658 }
659 return Result;
660 };
661 if (auto *Result = SearchForAccept(Heads))
662 return *Result;
663 // Failed to parse the input, attempt to run recovery.
664 // FIXME: this awkwardly repeats the recovery in the loop, when shift fails.
665 // More elegant is to include EOF in the token stream, and make the
666 // augmented rule: `_ := translation-unit EOF`. In this way recovery at EOF
667 // would not be a special case: it show up as a failure to shift the EOF
668 // token.
669 unsigned I = Terminals.size();
670 glrRecover(Heads, I, Params, Lang, NextHeads);
671 Reduce(NextHeads, tokenSymbol(tok::eof));
672 if (auto *Result = SearchForAccept(NextHeads))
673 return *Result;
674
675 // We failed to parse the input, returning an opaque forest node for recovery.
676 // FIXME: as above, we can add fallback error handling so this is impossible.
677 return Params.Forest.createOpaque(StartSymbol, /*Token::Index=*/0);
678 }
679
glrReduce(std::vector<const GSS::Node * > & Heads,SymbolID Lookahead,const ParseParams & Params,const Language & Lang)680 void glrReduce(std::vector<const GSS::Node *> &Heads, SymbolID Lookahead,
681 const ParseParams &Params, const Language &Lang) {
682 // Create a new GLRReduce each time for tests, performance doesn't matter.
683 GLRReduce{Params, Lang}(Heads, Lookahead);
684 }
685
addNode(LRTable::StateID State,const ForestNode * Symbol,llvm::ArrayRef<const Node * > Parents)686 const GSS::Node *GSS::addNode(LRTable::StateID State, const ForestNode *Symbol,
687
688 llvm::ArrayRef<const Node *> Parents) {
689 Node *Result = new (allocate(Parents.size()))
690 Node({State, GCParity, static_cast<uint16_t>(Parents.size())});
691 Alive.push_back(Result);
692 ++NodesCreated;
693 Result->Payload = Symbol;
694 if (!Parents.empty())
695 llvm::copy(Parents, reinterpret_cast<const Node **>(Result + 1));
696 return Result;
697 }
698
allocate(unsigned Parents)699 GSS::Node *GSS::allocate(unsigned Parents) {
700 if (FreeList.size() <= Parents)
701 FreeList.resize(Parents + 1);
702 auto &SizedList = FreeList[Parents];
703 if (!SizedList.empty()) {
704 auto *Result = SizedList.back();
705 SizedList.pop_back();
706 return Result;
707 }
708 return static_cast<Node *>(
709 Arena.Allocate(sizeof(Node) + Parents * sizeof(Node *), alignof(Node)));
710 }
711
destroy(Node * N)712 void GSS::destroy(Node *N) {
713 unsigned ParentCount = N->ParentCount;
714 N->~Node();
715 assert(FreeList.size() > ParentCount && "established on construction!");
716 FreeList[ParentCount].push_back(N);
717 }
718
gc(std::vector<const Node * > && Queue)719 unsigned GSS::gc(std::vector<const Node *> &&Queue) {
720 #ifndef NDEBUG
721 auto ParityMatches = [&](const Node *N) { return N->GCParity == GCParity; };
722 assert("Before GC" && llvm::all_of(Alive, ParityMatches));
723 auto Deferred = llvm::make_scope_exit(
724 [&] { assert("After GC" && llvm::all_of(Alive, ParityMatches)); });
725 assert(llvm::all_of(
726 Queue, [&](const Node *R) { return llvm::is_contained(Alive, R); }));
727 #endif
728 unsigned InitialCount = Alive.size();
729
730 // Mark
731 GCParity = !GCParity;
732 while (!Queue.empty()) {
733 Node *N = const_cast<Node *>(Queue.back()); // Safe: we created these nodes.
734 Queue.pop_back();
735 if (N->GCParity != GCParity) { // Not seen yet
736 N->GCParity = GCParity; // Mark as seen
737 for (const Node *P : N->parents()) // And walk parents
738 Queue.push_back(P);
739 }
740 }
741 // Sweep
742 llvm::erase_if(Alive, [&](Node *N) {
743 if (N->GCParity == GCParity) // Walk reached this node.
744 return false;
745 destroy(N);
746 return true;
747 });
748
749 LLVM_DEBUG(llvm::dbgs() << "GC pruned " << (InitialCount - Alive.size())
750 << "/" << InitialCount << " GSS nodes\n");
751 return InitialCount - Alive.size();
752 }
753
754 } // namespace pseudo
755 } // namespace clang
756