1 //===--- Grammar.h - grammar used by clang pseudoparser  ---------*- C++-*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 //  This file defines base structures for parsing & modeling a grammar for a
10 //  programming language:
11 //
12 //    # This is a fake C++ BNF grammar
13 //    _ := translation-unit
14 //    translation-unit := declaration-seq_opt
15 //    declaration-seq := declaration
16 //    declaration-seq := declaration-seq declaration
17 //
18 //  A grammar formally describes a language, and it is constructed by a set of
19 //  production rules. A rule is of BNF form (AAA := BBB CCC). A symbol is either
20 //  nonterminal or terminal, identified by a SymbolID.
21 //
22 //  Annotations are supported in a syntax form of [key=value]. They specify
23 //  attributes which are associated with either a grammar symbol (on the
24 //  right-hand side of the symbol) or a grammar rule (at the end of the rule
25 //  body).
26 //  Attributes provide a way to inject custom code into the GLR parser. Each
27 //  unique attribute value creates an extension point (identified by ExtensionID
28 //  ), and an extension point corresponds to a piece of native code. For
29 //  example, C++ grammar has a rule:
30 //
31 //   compound_statement := { statement-seq [recover=Brackets] }
32 //
33 //  The `recover` attribute instructs the parser that we should perform error
34 //  recovery if parsing the statement-seq fails. The `Brackets` recovery
35 //  heuristic is implemented in CXX.cpp by binding the ExtensionID for the
36 //  `Recovery` value to a specific C++ function that finds the recovery point.
37 //
38 //  Notions about the BNF grammar:
39 //  - "_" is the start symbol of the augmented grammar;
40 //  - single-line comment is supported, starting with a #
41 //  - A rule describes how a nonterminal (left side of :=) is constructed, and
42 //    it is *per line* in the grammar file
43 //  - Terminals (also called tokens) correspond to the clang::TokenKind; they
44 //    are written in the grammar like "IDENTIFIER", "USING", "+"
45 //  - Nonterminals are specified with "lower-case" names in the grammar; they
46 //    shouldn't be nullable (has an empty sequence)
47 //  - optional symbols are supported (specified with a _opt suffix), and they
48 //    will be eliminated during the grammar parsing stage
49 //
50 //===----------------------------------------------------------------------===//
51 
52 #ifndef CLANG_PSEUDO_GRAMMAR_GRAMMAR_H
53 #define CLANG_PSEUDO_GRAMMAR_GRAMMAR_H
54 
55 #include "clang/Basic/TokenKinds.h"
56 #include "llvm/ADT/ArrayRef.h"
57 #include "llvm/ADT/DenseSet.h"
58 #include "llvm/ADT/Optional.h"
59 #include "llvm/ADT/StringRef.h"
60 #include "llvm/Support/raw_ostream.h"
61 #include <cstdint>
62 #include <vector>
63 
64 namespace clang {
65 namespace pseudo {
66 // A SymbolID uniquely identifies a terminal/nonterminal symbol in a grammar.
67 // nonterminal IDs are indexes into a table of nonterminal symbols.
68 // Terminal IDs correspond to the clang TokenKind enum.
69 using SymbolID = uint16_t;
70 // SymbolID is only 12 bits wide.
71 // There are maximum 2^11 terminals (aka tokens) and 2^11 nonterminals.
72 static constexpr uint16_t SymbolBits = 12;
73 static constexpr uint16_t NumTerminals = tok::NUM_TOKENS;
74 // SymbolIDs with the top bit set are tokens/terminals.
75 static constexpr SymbolID TokenFlag = 1 << (SymbolBits - 1);
isToken(SymbolID ID)76 inline bool isToken(SymbolID ID) { return ID & TokenFlag; }
isNonterminal(SymbolID ID)77 inline bool isNonterminal(SymbolID ID) { return !isToken(ID); }
78 // The terminals are always the clang tok::TokenKind (not all are used).
symbolToToken(SymbolID SID)79 inline tok::TokenKind symbolToToken(SymbolID SID) {
80   assert(isToken(SID));
81   SID &= ~TokenFlag;
82   assert(SID < NumTerminals);
83   return static_cast<tok::TokenKind>(SID);
84 }
tokenSymbol(tok::TokenKind TK)85 inline constexpr SymbolID tokenSymbol(tok::TokenKind TK) {
86   return TokenFlag | static_cast<SymbolID>(TK);
87 }
88 
89 // An extension is a piece of native code specific to a grammar that modifies
90 // the behavior of annotated rules. One ExtensionID is assigned for each unique
91 // attribute value (all attributes share a namespace).
92 using ExtensionID = uint16_t;
93 
94 // A RuleID uniquely identifies a production rule in a grammar.
95 // It is an index into a table of rules.
96 using RuleID = uint16_t;
97 // There are maximum 2^12 rules.
98 static constexpr unsigned RuleBits = 12;
99 
100 // Represent a production rule in the grammar, e.g.
101 //   expression := a b c
102 //   ^Target       ^Sequence
103 struct Rule {
104   Rule(SymbolID Target, llvm::ArrayRef<SymbolID> Seq);
105 
106   // We occupy 4 bits for the sequence, in theory, it can be at most 2^4 tokens
107   // long, however, we're stricter in order to reduce the size, we limit the max
108   // length to 9 (this is the longest sequence in cxx grammar).
109   static constexpr unsigned SizeBits = 4;
110   static constexpr unsigned MaxElements = 9;
111   static_assert(MaxElements < (1 << SizeBits), "Exceeds the maximum limit");
112   static_assert(SizeBits + SymbolBits <= 16,
113                 "Must be able to store symbol ID + size efficiently");
114 
115   // 16 bits for target symbol and size of sequence:
116   // SymbolID : 12 | Size : 4
117   SymbolID Target : SymbolBits;
118   uint8_t Size : SizeBits; // Size of the Sequence
119   SymbolID Sequence[MaxElements];
120 
121   // A guarded rule has extra logic to determine whether the RHS is eligible.
122   bool Guarded = false;
123 
124   // Specifies the index within Sequence eligible for error recovery.
125   // Given stmt := { stmt-seq_opt }, if we fail to parse the stmt-seq then we
126   // should recover by finding the matching brace, and forcing stmt-seq to match
127   // everything between braces.
128   // For now, only a single strategy at a single point is possible.
129   uint8_t RecoveryIndex = -1;
130   ExtensionID Recovery = 0;
131 
seqRule132   llvm::ArrayRef<SymbolID> seq() const {
133     return llvm::ArrayRef<SymbolID>(Sequence, Size);
134   }
135   friend bool operator==(const Rule &L, const Rule &R) {
136     return L.Target == R.Target && L.seq() == R.seq() && L.Guarded == R.Guarded;
137   }
138 };
139 
140 struct GrammarTable;
141 
142 // Grammar that describes a programming language, e.g. C++. It represents the
143 // contents of the specified grammar.
144 // It is a building block for constructing a table-based parser.
145 class Grammar {
146 public:
147   Grammar() = default; // Creates an invalid dummy grammar.
148   explicit Grammar(std::unique_ptr<GrammarTable>);
149 
150   // Parses grammar from a BNF file.
151   // Diagnostics emitted during parsing are stored in Diags.
152   static Grammar parseBNF(llvm::StringRef BNF, std::vector<std::string> &Diags);
153 
154   // Returns the SymbolID of the symbol '_'.
underscore()155   SymbolID underscore() const { return Underscore; };
156 
157   // Returns all rules of the given nonterminal symbol.
158   llvm::ArrayRef<Rule> rulesFor(SymbolID SID) const;
159   const Rule &lookupRule(RuleID RID) const;
160 
161   // Gets symbol (terminal or nonterminal) name.
162   // Terminals have names like "," (kw_comma) or "OPERATOR" (kw_operator).
163   llvm::StringRef symbolName(SymbolID) const;
164 
165   // Gets the mangled name for a terminal/nonterminal.
166   // Compared to names in the grammar,
167   //   nonterminals `ptr-declartor` becomes `ptr_declarator`;
168   //   terminal `,` becomes `comma`;
169   //   terminal `IDENTIFIER` becomes `identifier`;
170   //   terminal `INT` becomes `int`;
171   // NOTE: for nonterminals, the mangled name is the same as the cxx::Symbol
172   // enum class; for terminals, we deliberately stripped the `kw_` prefix in
173   // favor of the simplicity.
174   std::string mangleSymbol(SymbolID) const;
175   // Gets the mangled name for the rule.
176   // E.g. for the grammar rule `ptr-declarator := ptr-operator ptr-declarator`,
177   // it is `ptr_declarator_0ptr_operator_1ptr_declarator`.
178   std::string mangleRule(RuleID) const;
179 
180   // Lookup the SymbolID of the nonterminal symbol by Name.
181   llvm::Optional<SymbolID> findNonterminal(llvm::StringRef Name) const;
182 
183   // Dumps the whole grammar.
184   std::string dump() const;
185   // Dumps a particular rule.
186   std::string dumpRule(RuleID) const;
187   // Dumps all rules of the given nonterminal symbol.
188   std::string dumpRules(SymbolID) const;
189 
table()190   const GrammarTable &table() const { return *T; }
191 
192 private:
193   std::unique_ptr<GrammarTable> T;
194   // The symbol ID of '_'. (In the LR literature, this is the start symbol of
195   // the augmented grammar.)
196   SymbolID Underscore;
197 };
198 // For each nonterminal X, computes the set of terminals that begin strings
199 // derived from X. (Known as FIRST sets in grammar-based parsers).
200 std::vector<llvm::DenseSet<SymbolID>> firstSets(const Grammar &);
201 // For each nonterminal X, computes the set of terminals that could immediately
202 // follow X. (Known as FOLLOW sets in grammar-based parsers).
203 std::vector<llvm::DenseSet<SymbolID>> followSets(const Grammar &);
204 
205 // Storage for the underlying data of the Grammar.
206 // It can be constructed dynamically (from compiling BNF file) or statically
207 // (a compiled data-source).
208 struct GrammarTable {
209   GrammarTable();
210 
211   struct Nonterminal {
212     std::string Name;
213     // Corresponding rules that construct the nonterminal, it is a [Start, End)
214     // index range of the Rules table.
215     struct {
216       RuleID Start;
217       RuleID End;
218     } RuleRange;
219   };
220 
221   // RuleID is an index into this table of rule definitions.
222   //
223   // Rules with the same target symbol (LHS) are grouped into a single range.
224   // The relative order of different target symbols is *not* by SymbolID, but
225   // rather a topological sort: if S := T then the rules producing T have lower
226   // RuleIDs than rules producing S.
227   // (This strange order simplifies the GLR parser: for a given token range, if
228   // we reduce in increasing RuleID order then we need never backtrack --
229   // prerequisite reductions are reached before dependent ones).
230   std::vector<Rule> Rules;
231   // A table of terminals (aka tokens). It corresponds to the clang::Token.
232   // clang::tok::TokenKind is the index of the table.
233   llvm::ArrayRef<std::string> Terminals;
234   // A table of nonterminals, sorted by name.
235   // SymbolID is the index of the table.
236   std::vector<Nonterminal> Nonterminals;
237   // A table of attribute values, sorted by name.
238   // ExtensionID is the index of the table.
239   std::vector<std::string> AttributeValues;
240 };
241 
242 } // namespace pseudo
243 } // namespace clang
244 
245 #endif // CLANG_PSEUDO_GRAMMAR_GRAMMAR_H
246