1 //===- FormatGen.h - Utilities for custom assembly formats ------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains common classes for building custom assembly format parsers
10 // and generators.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #ifndef MLIR_TOOLS_MLIRTBLGEN_FORMATGEN_H_
15 #define MLIR_TOOLS_MLIRTBLGEN_FORMATGEN_H_
16 
17 #include "mlir/Support/LLVM.h"
18 #include "mlir/Support/LogicalResult.h"
19 #include "llvm/ADT/StringRef.h"
20 #include "llvm/ADT/StringSet.h"
21 #include "llvm/Support/Allocator.h"
22 #include "llvm/Support/CommandLine.h"
23 #include "llvm/Support/SMLoc.h"
24 #include <vector>
25 
namespace llvm {
// Only referenced by `&` in this header, so a forward declaration is enough
// and the full SourceMgr definition does not need to be included here.
class SourceMgr;
} // namespace llvm
29 
30 namespace mlir {
31 namespace tblgen {
32 
33 //===----------------------------------------------------------------------===//
34 // FormatToken
35 //===----------------------------------------------------------------------===//
36 
37 /// This class represents a specific token in the input format.
38 class FormatToken {
39 public:
40   /// Basic token kinds.
41   enum Kind {
42     // Markers.
43     eof,
44     error,
45 
46     // Tokens with no info.
47     l_paren,
48     r_paren,
49     caret,
50     colon,
51     comma,
52     equal,
53     less,
54     greater,
55     question,
56     star,
57     pipe,
58 
59     // Keywords.
60     keyword_start,
61     kw_attr_dict,
62     kw_attr_dict_w_keyword,
63     kw_custom,
64     kw_functional_type,
65     kw_oilist,
66     kw_operands,
67     kw_params,
68     kw_qualified,
69     kw_ref,
70     kw_regions,
71     kw_results,
72     kw_struct,
73     kw_successors,
74     kw_type,
75     keyword_end,
76 
77     // String valued tokens.
78     identifier,
79     literal,
80     variable,
81   };
82 
FormatToken(Kind kind,StringRef spelling)83   FormatToken(Kind kind, StringRef spelling) : kind(kind), spelling(spelling) {}
84 
85   /// Return the bytes that make up this token.
getSpelling()86   StringRef getSpelling() const { return spelling; }
87 
88   /// Return the kind of this token.
getKind()89   Kind getKind() const { return kind; }
90 
91   /// Return a location for this token.
92   SMLoc getLoc() const;
93 
94   /// Returns true if the token is of the given kind.
is(Kind kind)95   bool is(Kind kind) { return getKind() == kind; }
96 
97   /// Return if this token is a keyword.
isKeyword()98   bool isKeyword() const {
99     return getKind() > Kind::keyword_start && getKind() < Kind::keyword_end;
100   }
101 
102 private:
103   /// Discriminator that indicates the kind of token this is.
104   Kind kind;
105 
106   /// A reference to the entire token contents; this is always a pointer into
107   /// a memory buffer owned by the source manager.
108   StringRef spelling;
109 };
110 
111 //===----------------------------------------------------------------------===//
112 // FormatLexer
113 //===----------------------------------------------------------------------===//
114 
115 /// This class implements a simple lexer for operation assembly format strings.
116 class FormatLexer {
117 public:
118   FormatLexer(llvm::SourceMgr &mgr, SMLoc loc);
119 
120   /// Lex the next token and return it.
121   FormatToken lexToken();
122 
123   /// Emit an error to the lexer with the given location and message.
124   FormatToken emitError(SMLoc loc, const Twine &msg);
125   FormatToken emitError(const char *loc, const Twine &msg);
126 
127   FormatToken emitErrorAndNote(SMLoc loc, const Twine &msg, const Twine &note);
128 
129 private:
130   /// Return the next character in the stream.
131   int getNextChar();
132 
133   /// Lex an identifier, literal, or variable.
134   FormatToken lexIdentifier(const char *tokStart);
135   FormatToken lexLiteral(const char *tokStart);
136   FormatToken lexVariable(const char *tokStart);
137 
138   /// Create a token with the current pointer and a start pointer.
formToken(FormatToken::Kind kind,const char * tokStart)139   FormatToken formToken(FormatToken::Kind kind, const char *tokStart) {
140     return FormatToken(kind, StringRef(tokStart, curPtr - tokStart));
141   }
142 
143   /// The source manager containing the format string.
144   llvm::SourceMgr &mgr;
145   /// Location of the format string.
146   SMLoc loc;
147   /// Buffer containing the format string.
148   StringRef curBuffer;
149   /// Current pointer in the buffer.
150   const char *curPtr;
151 };
152 
153 //===----------------------------------------------------------------------===//
154 // FormatElement
155 //===----------------------------------------------------------------------===//
156 
/// A single element of a format. This is the root of the format element
/// hierarchy.
///
/// If you squint and take a close look, you can see the outline of a `Format`
/// dialect.
class FormatElement {
public:
  /// The top-level kinds of format elements.
  enum Kind { Literal, Variable, Whitespace, Directive, Optional };

  virtual ~FormatElement();

  /// Support LLVM-style RTTI. Any format element is trivially an instance of
  /// this root class, so the check always succeeds.
  static bool classof(const FormatElement *el) {
    (void)el;
    return true;
  }

  /// Return the top-level kind of this element.
  Kind getKind() const { return kind; }

protected:
  /// Construct an element of the given kind. Only subclasses may do so.
  FormatElement(Kind kind) : kind(kind) {}

private:
  /// Top-level kind discriminator for this element.
  Kind kind;
};
182 
183 /// The base class for all format elements. This class implements common methods
184 /// for LLVM-style RTTI.
185 template <FormatElement::Kind ElementKind>
186 class FormatElementBase : public FormatElement {
187 public:
188   /// Support LLVM-style RTTI.
classof(const FormatElement * el)189   static bool classof(const FormatElement *el) {
190     return ElementKind == el->getKind();
191   }
192 
193 protected:
194   /// Create a format element with the given kind.
FormatElementBase()195   FormatElementBase() : FormatElement(ElementKind) {}
196 };
197 
198 /// This class represents a literal element. A literal is either one of the
199 /// supported punctuation characters (e.g. `(` or `,`) or a string literal (e.g.
200 /// `literal`).
201 class LiteralElement : public FormatElementBase<FormatElement::Literal> {
202 public:
203   /// Create a literal element with the given spelling.
LiteralElement(StringRef spelling)204   explicit LiteralElement(StringRef spelling) : spelling(spelling) {}
205 
206   /// Get the spelling of the literal.
getSpelling()207   StringRef getSpelling() const { return spelling; }
208 
209 private:
210   /// The spelling of the variable, i.e. the string contained within the
211   /// backticks.
212   StringRef spelling;
213 };
214 
215 /// This class represents a variable element. A variable refers to some part of
216 /// the object being parsed, e.g. an attribute or operand on an operation or a
217 /// parameter on an attribute.
218 class VariableElement : public FormatElementBase<FormatElement::Variable> {
219 public:
220   /// These are the kinds of variables.
221   enum Kind { Attribute, Operand, Region, Result, Successor, Parameter };
222 
223   /// Get the kind of variable.
getKind()224   Kind getKind() const { return kind; }
225 
226 protected:
227   /// Create a variable with a kind.
VariableElement(Kind kind)228   VariableElement(Kind kind) : kind(kind) {}
229 
230 private:
231   /// The kind of variable.
232   Kind kind;
233 };
234 
235 /// Base class for variable elements. This class implements common methods for
236 /// LLVM-style RTTI.
237 template <VariableElement::Kind VariableKind>
238 class VariableElementBase : public VariableElement {
239 public:
240   /// An element is of this class if it is a variable and has the same variable
241   /// type.
classof(const FormatElement * el)242   static bool classof(const FormatElement *el) {
243     if (auto *varEl = dyn_cast<VariableElement>(el))
244       return VariableKind == varEl->getKind();
245     return false;
246   }
247 
248 protected:
249   /// Create a variable element with the given variable kind.
VariableElementBase()250   VariableElementBase() : VariableElement(VariableKind) {}
251 };
252 
253 /// This class represents a whitespace element, e.g. a newline or space. It is a
254 /// literal that is printed but never parsed. When the value is empty, i.e. ``,
255 /// a space is elided where one would have been printed automatically.
256 class WhitespaceElement : public FormatElementBase<FormatElement::Whitespace> {
257 public:
258   /// Create a whitespace element.
WhitespaceElement(StringRef value)259   explicit WhitespaceElement(StringRef value) : value(value) {}
260 
261   /// Get the whitespace value.
getValue()262   StringRef getValue() const { return value; }
263 
264 private:
265   /// The value of the whitespace element. Can be empty.
266   StringRef value;
267 };
268 
269 class DirectiveElement : public FormatElementBase<FormatElement::Directive> {
270 public:
271   /// These are the kinds of directives.
272   enum Kind {
273     AttrDict,
274     Custom,
275     FunctionalType,
276     OIList,
277     Operands,
278     Ref,
279     Regions,
280     Results,
281     Successors,
282     Type,
283     Params,
284     Struct
285   };
286 
287   /// Get the directive kind.
getKind()288   Kind getKind() const { return kind; }
289 
290 protected:
291   /// Create a directive element with a kind.
DirectiveElement(Kind kind)292   DirectiveElement(Kind kind) : kind(kind) {}
293 
294 private:
295   /// The directive kind.
296   Kind kind;
297 };
298 
299 /// Base class for directive elements. This class implements common methods for
300 /// LLVM-style RTTI.
301 template <DirectiveElement::Kind DirectiveKind>
302 class DirectiveElementBase : public DirectiveElement {
303 public:
304   /// Create a directive element with the specified kind.
DirectiveElementBase()305   DirectiveElementBase() : DirectiveElement(DirectiveKind) {}
306 
307   /// A format element is of this class if it is a directive element and has the
308   /// same kind.
classof(const FormatElement * el)309   static bool classof(const FormatElement *el) {
310     if (auto *directiveEl = dyn_cast<DirectiveElement>(el))
311       return DirectiveKind == directiveEl->getKind();
312     return false;
313   }
314 };
315 
316 /// This class represents a custom format directive that is implemented by the
317 /// user in C++. The directive accepts a list of arguments that is passed to the
318 /// C++ function.
319 class CustomDirective : public DirectiveElementBase<DirectiveElement::Custom> {
320 public:
321   /// Create a custom directive with a name and list of arguments.
CustomDirective(StringRef name,std::vector<FormatElement * > && arguments)322   CustomDirective(StringRef name, std::vector<FormatElement *> &&arguments)
323       : name(name), arguments(std::move(arguments)) {}
324 
325   /// Get the custom directive name.
getName()326   StringRef getName() const { return name; }
327 
328   /// Get the arguments to the custom directive.
getArguments()329   ArrayRef<FormatElement *> getArguments() const { return arguments; }
330 
331 private:
332   /// The name of the custom directive. The name is used to call two C++
333   /// methods: `parse{name}` and `print{name}` with the given arguments.
334   StringRef name;
335   /// The arguments with which to call the custom functions. These are either
336   /// variables (for which the functions are responsible for populating) or
337   /// references to variables.
338   std::vector<FormatElement *> arguments;
339 };
340 
341 /// This class represents a reference directive. This directive can be used to
342 /// reference but not bind a previously bound variable or format object. Its
343 /// current only use is to pass variables as arguments to the custom directive.
344 class RefDirective : public DirectiveElementBase<DirectiveElement::Ref> {
345 public:
346   /// Create a reference directive with the single referenced child.
RefDirective(FormatElement * arg)347   RefDirective(FormatElement *arg) : arg(arg) {}
348 
349   /// Get the reference argument.
getArg()350   FormatElement *getArg() const { return arg; }
351 
352 private:
353   /// The referenced argument.
354   FormatElement *arg;
355 };
356 
357 /// This class represents a group of elements that are optionally emitted based
358 /// on an optional variable "anchor" and a group of elements that are emitted
359 /// when the anchor element is not present.
360 class OptionalElement : public FormatElementBase<FormatElement::Optional> {
361 public:
362   /// Create an optional group with the given child elements.
OptionalElement(std::vector<FormatElement * > && thenElements,std::vector<FormatElement * > && elseElements,unsigned anchorIndex,unsigned parseStart)363   OptionalElement(std::vector<FormatElement *> &&thenElements,
364                   std::vector<FormatElement *> &&elseElements,
365                   unsigned anchorIndex, unsigned parseStart)
366       : thenElements(std::move(thenElements)),
367         elseElements(std::move(elseElements)), anchorIndex(anchorIndex),
368         parseStart(parseStart) {}
369 
370   /// Return the `then` elements of the optional group.
getThenElements()371   ArrayRef<FormatElement *> getThenElements() const { return thenElements; }
372 
373   /// Return the `else` elements of the optional group.
getElseElements()374   ArrayRef<FormatElement *> getElseElements() const { return elseElements; }
375 
376   /// Return the anchor of the optional group.
getAnchor()377   FormatElement *getAnchor() const { return thenElements[anchorIndex]; }
378 
379   /// Return the index of the first element to be parsed.
getParseStart()380   unsigned getParseStart() const { return parseStart; }
381 
382 private:
383   /// The child elements emitted when the anchor is present.
384   std::vector<FormatElement *> thenElements;
385   /// The child elements emitted when the anchor is not present.
386   std::vector<FormatElement *> elseElements;
387   /// The index of the anchor element of the optional group within
388   /// `thenElements`.
389   unsigned anchorIndex;
390   /// The index of the first element that is parsed in `thenElements`. That is,
391   /// the first non-whitespace element.
392   unsigned parseStart;
393 };
394 
395 //===----------------------------------------------------------------------===//
// FormatParser
397 //===----------------------------------------------------------------------===//
398 
399 /// Base class for a parser that implements an assembly format. This class
400 /// defines a common assembly format syntax and the creation of format elements.
401 /// Subclasses will need to implement parsing for the format elements they
402 /// support.
403 class FormatParser {
404 public:
405   /// Vtable anchor.
406   virtual ~FormatParser();
407 
408   /// Parse the assembly format.
409   FailureOr<std::vector<FormatElement *>> parse();
410 
411 protected:
412   /// The current context of the parser when parsing an element.
413   enum Context {
414     /// The element is being parsed in a "top-level" context, i.e. at the top of
415     /// the format or in an optional group.
416     TopLevelContext,
417     /// The element is being parsed as a custom directive child.
418     CustomDirectiveContext,
419     /// The element is being parsed as a type directive child.
420     TypeDirectiveContext,
421     /// The element is being parsed as a reference directive child.
422     RefDirectiveContext,
423     /// The element is being parsed as a struct directive child.
424     StructDirectiveContext
425   };
426 
427   /// Create a format parser with the given source manager and a location.
FormatParser(llvm::SourceMgr & mgr,llvm::SMLoc loc)428   explicit FormatParser(llvm::SourceMgr &mgr, llvm::SMLoc loc)
429       : lexer(mgr, loc), curToken(lexer.lexToken()) {}
430 
431   /// Allocate and construct a format element.
432   template <typename FormatElementT, typename... Args>
create(Args &&...args)433   FormatElementT *create(Args &&...args) {
434     // FormatElementT *ptr = allocator.Allocate<FormatElementT>();
435     // ::new (ptr) FormatElementT(std::forward<Args>(args)...);
436     // return ptr;
437     auto mem = std::make_unique<FormatElementT>(std::forward<Args>(args)...);
438     FormatElementT *ptr = mem.get();
439     allocator.push_back(std::move(mem));
440     return ptr;
441   }
442 
443   //===--------------------------------------------------------------------===//
444   // Element Parsing
445 
446   /// Parse a single element of any kind.
447   FailureOr<FormatElement *> parseElement(Context ctx);
448   /// Parse a literal.
449   FailureOr<FormatElement *> parseLiteral(Context ctx);
450   /// Parse a variable.
451   FailureOr<FormatElement *> parseVariable(Context ctx);
452   /// Parse a directive.
453   FailureOr<FormatElement *> parseDirective(Context ctx);
454   /// Parse an optional group.
455   FailureOr<FormatElement *> parseOptionalGroup(Context ctx);
456 
457   /// Parse a custom directive.
458   FailureOr<FormatElement *> parseCustomDirective(llvm::SMLoc loc, Context ctx);
459 
460   /// Parse a format-specific variable kind.
461   virtual FailureOr<FormatElement *>
462   parseVariableImpl(llvm::SMLoc loc, StringRef name, Context ctx) = 0;
463   /// Parse a format-specific directive kind.
464   virtual FailureOr<FormatElement *>
465   parseDirectiveImpl(llvm::SMLoc loc, FormatToken::Kind kind, Context ctx) = 0;
466 
467   //===--------------------------------------------------------------------===//
468   // Format Verification
469 
470   /// Verify that the format is well-formed.
471   virtual LogicalResult verify(llvm::SMLoc loc,
472                                ArrayRef<FormatElement *> elements) = 0;
473   /// Verify the arguments to a custom directive.
474   virtual LogicalResult
475   verifyCustomDirectiveArguments(llvm::SMLoc loc,
476                                  ArrayRef<FormatElement *> arguments) = 0;
477   /// Verify the elements of an optional group.
478   virtual LogicalResult
479   verifyOptionalGroupElements(llvm::SMLoc loc,
480                               ArrayRef<FormatElement *> elements,
481                               Optional<unsigned> anchorIndex) = 0;
482 
483   //===--------------------------------------------------------------------===//
484   // Lexer Utilities
485 
486   /// Emit an error at the given location.
emitError(llvm::SMLoc loc,const Twine & msg)487   LogicalResult emitError(llvm::SMLoc loc, const Twine &msg) {
488     lexer.emitError(loc, msg);
489     return failure();
490   }
491 
492   /// Emit an error and a note at the given notation.
emitErrorAndNote(llvm::SMLoc loc,const Twine & msg,const Twine & note)493   LogicalResult emitErrorAndNote(llvm::SMLoc loc, const Twine &msg,
494                                  const Twine &note) {
495     lexer.emitErrorAndNote(loc, msg, note);
496     return failure();
497   }
498 
499   /// Parse a single token of the expected kind.
parseToken(FormatToken::Kind kind,const Twine & msg)500   FailureOr<FormatToken> parseToken(FormatToken::Kind kind, const Twine &msg) {
501     if (!curToken.is(kind))
502       return emitError(curToken.getLoc(), msg);
503     FormatToken tok = curToken;
504     consumeToken();
505     return tok;
506   }
507 
508   /// Advance the lexer to the next token.
consumeToken()509   void consumeToken() {
510     assert(!curToken.is(FormatToken::eof) && !curToken.is(FormatToken::error) &&
511            "shouldn't advance past EOF or errors");
512     curToken = lexer.lexToken();
513   }
514 
515   /// Get the current token.
peekToken()516   FormatToken peekToken() { return curToken; }
517 
518 private:
519   /// The format parser retains ownership of the format elements in a bump
520   /// pointer allocator.
521   // FIXME: FormatElement with `std::vector` need to be converted to use
522   // trailing objects.
523   // llvm::BumpPtrAllocator allocator;
524   std::vector<std::unique_ptr<FormatElement>> allocator;
525   /// The format lexer to use.
526   FormatLexer lexer;
527   /// The current token in the lexer.
528   FormatToken curToken;
529 };
530 
531 //===----------------------------------------------------------------------===//
532 // Utility Functions
533 //===----------------------------------------------------------------------===//
534 
535 /// Whether a space needs to be emitted before a literal. E.g., two keywords
536 /// back-to-back require a space separator, but a keyword followed by '<' does
537 /// not require a space.
538 bool shouldEmitSpaceBefore(StringRef value, bool lastWasPunctuation);
539 
540 /// Returns true if the given string can be formatted as a keyword.
541 bool canFormatStringAsKeyword(StringRef value,
542                               function_ref<void(Twine)> emitError = nullptr);
543 
544 /// Returns true if the given string is valid format literal element.
545 /// If `emitError` is provided, it is invoked with the reason for the failure.
546 bool isValidLiteral(StringRef value,
547                     function_ref<void(Twine)> emitError = nullptr);
548 
549 /// Whether a failure in parsing the assembly format should be a fatal error.
550 extern llvm::cl::opt<bool> formatErrorIsFatal;
551 
552 } // namespace tblgen
553 } // namespace mlir
554 
555 #endif // MLIR_TOOLS_MLIRTBLGEN_FORMATGEN_H_
556