1 //===- FormatGen.h - Utilities for custom assembly formats ------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file contains common classes for building custom assembly format parsers 10 // and generators. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #ifndef MLIR_TOOLS_MLIRTBLGEN_FORMATGEN_H_ 15 #define MLIR_TOOLS_MLIRTBLGEN_FORMATGEN_H_ 16 17 #include "mlir/Support/LLVM.h" 18 #include "mlir/Support/LogicalResult.h" 19 #include "llvm/ADT/StringRef.h" 20 #include "llvm/ADT/StringSet.h" 21 #include "llvm/Support/Allocator.h" 22 #include "llvm/Support/CommandLine.h" 23 #include "llvm/Support/SMLoc.h" 24 #include <vector> 25 26 namespace llvm { 27 class SourceMgr; 28 } // namespace llvm 29 30 namespace mlir { 31 namespace tblgen { 32 33 //===----------------------------------------------------------------------===// 34 // FormatToken 35 //===----------------------------------------------------------------------===// 36 37 /// This class represents a specific token in the input format. 38 class FormatToken { 39 public: 40 /// Basic token kinds. 41 enum Kind { 42 // Markers. 43 eof, 44 error, 45 46 // Tokens with no info. 47 l_paren, 48 r_paren, 49 caret, 50 colon, 51 comma, 52 equal, 53 less, 54 greater, 55 question, 56 star, 57 pipe, 58 59 // Keywords. 60 keyword_start, 61 kw_attr_dict, 62 kw_attr_dict_w_keyword, 63 kw_custom, 64 kw_functional_type, 65 kw_oilist, 66 kw_operands, 67 kw_params, 68 kw_qualified, 69 kw_ref, 70 kw_regions, 71 kw_results, 72 kw_struct, 73 kw_successors, 74 kw_type, 75 keyword_end, 76 77 // String valued tokens. 78 identifier, 79 literal, 80 variable, 81 }; 82 FormatToken(Kind kind,StringRef spelling)83 FormatToken(Kind kind, StringRef spelling) : kind(kind), spelling(spelling) {} 84 85 /// Return the bytes that make up this token. getSpelling()86 StringRef getSpelling() const { return spelling; } 87 88 /// Return the kind of this token. getKind()89 Kind getKind() const { return kind; } 90 91 /// Return a location for this token. 92 SMLoc getLoc() const; 93 94 /// Returns true if the token is of the given kind. is(Kind kind)95 bool is(Kind kind) { return getKind() == kind; } 96 97 /// Return if this token is a keyword. isKeyword()98 bool isKeyword() const { 99 return getKind() > Kind::keyword_start && getKind() < Kind::keyword_end; 100 } 101 102 private: 103 /// Discriminator that indicates the kind of token this is. 104 Kind kind; 105 106 /// A reference to the entire token contents; this is always a pointer into 107 /// a memory buffer owned by the source manager. 108 StringRef spelling; 109 }; 110 111 //===----------------------------------------------------------------------===// 112 // FormatLexer 113 //===----------------------------------------------------------------------===// 114 115 /// This class implements a simple lexer for operation assembly format strings. 116 class FormatLexer { 117 public: 118 FormatLexer(llvm::SourceMgr &mgr, SMLoc loc); 119 120 /// Lex the next token and return it. 121 FormatToken lexToken(); 122 123 /// Emit an error to the lexer with the given location and message. 124 FormatToken emitError(SMLoc loc, const Twine &msg); 125 FormatToken emitError(const char *loc, const Twine &msg); 126 127 FormatToken emitErrorAndNote(SMLoc loc, const Twine &msg, const Twine ¬e); 128 129 private: 130 /// Return the next character in the stream. 131 int getNextChar(); 132 133 /// Lex an identifier, literal, or variable. 134 FormatToken lexIdentifier(const char *tokStart); 135 FormatToken lexLiteral(const char *tokStart); 136 FormatToken lexVariable(const char *tokStart); 137 138 /// Create a token with the current pointer and a start pointer. formToken(FormatToken::Kind kind,const char * tokStart)139 FormatToken formToken(FormatToken::Kind kind, const char *tokStart) { 140 return FormatToken(kind, StringRef(tokStart, curPtr - tokStart)); 141 } 142 143 /// The source manager containing the format string. 144 llvm::SourceMgr &mgr; 145 /// Location of the format string. 146 SMLoc loc; 147 /// Buffer containing the format string. 148 StringRef curBuffer; 149 /// Current pointer in the buffer. 150 const char *curPtr; 151 }; 152 153 //===----------------------------------------------------------------------===// 154 // FormatElement 155 //===----------------------------------------------------------------------===// 156 157 /// This class represents a single format element. 158 /// 159 /// If you squint and take a close look, you can see the outline of a `Format` 160 /// dialect. 161 class FormatElement { 162 public: 163 virtual ~FormatElement(); 164 165 // The top-level kinds of format elements. 166 enum Kind { Literal, Variable, Whitespace, Directive, Optional }; 167 168 /// Support LLVM-style RTTI. classof(const FormatElement * el)169 static bool classof(const FormatElement *el) { return true; } 170 171 /// Get the element kind. getKind()172 Kind getKind() const { return kind; } 173 174 protected: 175 /// Create a format element with the given kind. FormatElement(Kind kind)176 FormatElement(Kind kind) : kind(kind) {} 177 178 private: 179 /// The kind of the element. 180 Kind kind; 181 }; 182 183 /// The base class for all format elements. This class implements common methods 184 /// for LLVM-style RTTI. 185 template <FormatElement::Kind ElementKind> 186 class FormatElementBase : public FormatElement { 187 public: 188 /// Support LLVM-style RTTI. classof(const FormatElement * el)189 static bool classof(const FormatElement *el) { 190 return ElementKind == el->getKind(); 191 } 192 193 protected: 194 /// Create a format element with the given kind. FormatElementBase()195 FormatElementBase() : FormatElement(ElementKind) {} 196 }; 197 198 /// This class represents a literal element. A literal is either one of the 199 /// supported punctuation characters (e.g. `(` or `,`) or a string literal (e.g. 200 /// `literal`). 201 class LiteralElement : public FormatElementBase<FormatElement::Literal> { 202 public: 203 /// Create a literal element with the given spelling. LiteralElement(StringRef spelling)204 explicit LiteralElement(StringRef spelling) : spelling(spelling) {} 205 206 /// Get the spelling of the literal. getSpelling()207 StringRef getSpelling() const { return spelling; } 208 209 private: 210 /// The spelling of the variable, i.e. the string contained within the 211 /// backticks. 212 StringRef spelling; 213 }; 214 215 /// This class represents a variable element. A variable refers to some part of 216 /// the object being parsed, e.g. an attribute or operand on an operation or a 217 /// parameter on an attribute. 218 class VariableElement : public FormatElementBase<FormatElement::Variable> { 219 public: 220 /// These are the kinds of variables. 221 enum Kind { Attribute, Operand, Region, Result, Successor, Parameter }; 222 223 /// Get the kind of variable. getKind()224 Kind getKind() const { return kind; } 225 226 protected: 227 /// Create a variable with a kind. VariableElement(Kind kind)228 VariableElement(Kind kind) : kind(kind) {} 229 230 private: 231 /// The kind of variable. 232 Kind kind; 233 }; 234 235 /// Base class for variable elements. This class implements common methods for 236 /// LLVM-style RTTI. 237 template <VariableElement::Kind VariableKind> 238 class VariableElementBase : public VariableElement { 239 public: 240 /// An element is of this class if it is a variable and has the same variable 241 /// type. classof(const FormatElement * el)242 static bool classof(const FormatElement *el) { 243 if (auto *varEl = dyn_cast<VariableElement>(el)) 244 return VariableKind == varEl->getKind(); 245 return false; 246 } 247 248 protected: 249 /// Create a variable element with the given variable kind. VariableElementBase()250 VariableElementBase() : VariableElement(VariableKind) {} 251 }; 252 253 /// This class represents a whitespace element, e.g. a newline or space. It is a 254 /// literal that is printed but never parsed. When the value is empty, i.e. ``, 255 /// a space is elided where one would have been printed automatically. 256 class WhitespaceElement : public FormatElementBase<FormatElement::Whitespace> { 257 public: 258 /// Create a whitespace element. WhitespaceElement(StringRef value)259 explicit WhitespaceElement(StringRef value) : value(value) {} 260 261 /// Get the whitespace value. getValue()262 StringRef getValue() const { return value; } 263 264 private: 265 /// The value of the whitespace element. Can be empty. 266 StringRef value; 267 }; 268 269 class DirectiveElement : public FormatElementBase<FormatElement::Directive> { 270 public: 271 /// These are the kinds of directives. 272 enum Kind { 273 AttrDict, 274 Custom, 275 FunctionalType, 276 OIList, 277 Operands, 278 Ref, 279 Regions, 280 Results, 281 Successors, 282 Type, 283 Params, 284 Struct 285 }; 286 287 /// Get the directive kind. getKind()288 Kind getKind() const { return kind; } 289 290 protected: 291 /// Create a directive element with a kind. DirectiveElement(Kind kind)292 DirectiveElement(Kind kind) : kind(kind) {} 293 294 private: 295 /// The directive kind. 296 Kind kind; 297 }; 298 299 /// Base class for directive elements. This class implements common methods for 300 /// LLVM-style RTTI. 301 template <DirectiveElement::Kind DirectiveKind> 302 class DirectiveElementBase : public DirectiveElement { 303 public: 304 /// Create a directive element with the specified kind. DirectiveElementBase()305 DirectiveElementBase() : DirectiveElement(DirectiveKind) {} 306 307 /// A format element is of this class if it is a directive element and has the 308 /// same kind. classof(const FormatElement * el)309 static bool classof(const FormatElement *el) { 310 if (auto *directiveEl = dyn_cast<DirectiveElement>(el)) 311 return DirectiveKind == directiveEl->getKind(); 312 return false; 313 } 314 }; 315 316 /// This class represents a custom format directive that is implemented by the 317 /// user in C++. The directive accepts a list of arguments that is passed to the 318 /// C++ function. 319 class CustomDirective : public DirectiveElementBase<DirectiveElement::Custom> { 320 public: 321 /// Create a custom directive with a name and list of arguments. CustomDirective(StringRef name,std::vector<FormatElement * > && arguments)322 CustomDirective(StringRef name, std::vector<FormatElement *> &&arguments) 323 : name(name), arguments(std::move(arguments)) {} 324 325 /// Get the custom directive name. getName()326 StringRef getName() const { return name; } 327 328 /// Get the arguments to the custom directive. getArguments()329 ArrayRef<FormatElement *> getArguments() const { return arguments; } 330 331 private: 332 /// The name of the custom directive. The name is used to call two C++ 333 /// methods: `parse{name}` and `print{name}` with the given arguments. 334 StringRef name; 335 /// The arguments with which to call the custom functions. These are either 336 /// variables (for which the functions are responsible for populating) or 337 /// references to variables. 338 std::vector<FormatElement *> arguments; 339 }; 340 341 /// This class represents a reference directive. This directive can be used to 342 /// reference but not bind a previously bound variable or format object. Its 343 /// current only use is to pass variables as arguments to the custom directive. 344 class RefDirective : public DirectiveElementBase<DirectiveElement::Ref> { 345 public: 346 /// Create a reference directive with the single referenced child. RefDirective(FormatElement * arg)347 RefDirective(FormatElement *arg) : arg(arg) {} 348 349 /// Get the reference argument. getArg()350 FormatElement *getArg() const { return arg; } 351 352 private: 353 /// The referenced argument. 354 FormatElement *arg; 355 }; 356 357 /// This class represents a group of elements that are optionally emitted based 358 /// on an optional variable "anchor" and a group of elements that are emitted 359 /// when the anchor element is not present. 360 class OptionalElement : public FormatElementBase<FormatElement::Optional> { 361 public: 362 /// Create an optional group with the given child elements. OptionalElement(std::vector<FormatElement * > && thenElements,std::vector<FormatElement * > && elseElements,unsigned anchorIndex,unsigned parseStart)363 OptionalElement(std::vector<FormatElement *> &&thenElements, 364 std::vector<FormatElement *> &&elseElements, 365 unsigned anchorIndex, unsigned parseStart) 366 : thenElements(std::move(thenElements)), 367 elseElements(std::move(elseElements)), anchorIndex(anchorIndex), 368 parseStart(parseStart) {} 369 370 /// Return the `then` elements of the optional group. getThenElements()371 ArrayRef<FormatElement *> getThenElements() const { return thenElements; } 372 373 /// Return the `else` elements of the optional group. getElseElements()374 ArrayRef<FormatElement *> getElseElements() const { return elseElements; } 375 376 /// Return the anchor of the optional group. getAnchor()377 FormatElement *getAnchor() const { return thenElements[anchorIndex]; } 378 379 /// Return the index of the first element to be parsed. getParseStart()380 unsigned getParseStart() const { return parseStart; } 381 382 private: 383 /// The child elements emitted when the anchor is present. 384 std::vector<FormatElement *> thenElements; 385 /// The child elements emitted when the anchor is not present. 386 std::vector<FormatElement *> elseElements; 387 /// The index of the anchor element of the optional group within 388 /// `thenElements`. 389 unsigned anchorIndex; 390 /// The index of the first element that is parsed in `thenElements`. That is, 391 /// the first non-whitespace element. 392 unsigned parseStart; 393 }; 394 395 //===----------------------------------------------------------------------===// 396 // FormatParserBase 397 //===----------------------------------------------------------------------===// 398 399 /// Base class for a parser that implements an assembly format. This class 400 /// defines a common assembly format syntax and the creation of format elements. 401 /// Subclasses will need to implement parsing for the format elements they 402 /// support. 403 class FormatParser { 404 public: 405 /// Vtable anchor. 406 virtual ~FormatParser(); 407 408 /// Parse the assembly format. 409 FailureOr<std::vector<FormatElement *>> parse(); 410 411 protected: 412 /// The current context of the parser when parsing an element. 413 enum Context { 414 /// The element is being parsed in a "top-level" context, i.e. at the top of 415 /// the format or in an optional group. 416 TopLevelContext, 417 /// The element is being parsed as a custom directive child. 418 CustomDirectiveContext, 419 /// The element is being parsed as a type directive child. 420 TypeDirectiveContext, 421 /// The element is being parsed as a reference directive child. 422 RefDirectiveContext, 423 /// The element is being parsed as a struct directive child. 424 StructDirectiveContext 425 }; 426 427 /// Create a format parser with the given source manager and a location. FormatParser(llvm::SourceMgr & mgr,llvm::SMLoc loc)428 explicit FormatParser(llvm::SourceMgr &mgr, llvm::SMLoc loc) 429 : lexer(mgr, loc), curToken(lexer.lexToken()) {} 430 431 /// Allocate and construct a format element. 432 template <typename FormatElementT, typename... Args> create(Args &&...args)433 FormatElementT *create(Args &&...args) { 434 // FormatElementT *ptr = allocator.Allocate<FormatElementT>(); 435 // ::new (ptr) FormatElementT(std::forward<Args>(args)...); 436 // return ptr; 437 auto mem = std::make_unique<FormatElementT>(std::forward<Args>(args)...); 438 FormatElementT *ptr = mem.get(); 439 allocator.push_back(std::move(mem)); 440 return ptr; 441 } 442 443 //===--------------------------------------------------------------------===// 444 // Element Parsing 445 446 /// Parse a single element of any kind. 447 FailureOr<FormatElement *> parseElement(Context ctx); 448 /// Parse a literal. 449 FailureOr<FormatElement *> parseLiteral(Context ctx); 450 /// Parse a variable. 451 FailureOr<FormatElement *> parseVariable(Context ctx); 452 /// Parse a directive. 453 FailureOr<FormatElement *> parseDirective(Context ctx); 454 /// Parse an optional group. 455 FailureOr<FormatElement *> parseOptionalGroup(Context ctx); 456 457 /// Parse a custom directive. 458 FailureOr<FormatElement *> parseCustomDirective(llvm::SMLoc loc, Context ctx); 459 460 /// Parse a format-specific variable kind. 461 virtual FailureOr<FormatElement *> 462 parseVariableImpl(llvm::SMLoc loc, StringRef name, Context ctx) = 0; 463 /// Parse a format-specific directive kind. 464 virtual FailureOr<FormatElement *> 465 parseDirectiveImpl(llvm::SMLoc loc, FormatToken::Kind kind, Context ctx) = 0; 466 467 //===--------------------------------------------------------------------===// 468 // Format Verification 469 470 /// Verify that the format is well-formed. 471 virtual LogicalResult verify(llvm::SMLoc loc, 472 ArrayRef<FormatElement *> elements) = 0; 473 /// Verify the arguments to a custom directive. 474 virtual LogicalResult 475 verifyCustomDirectiveArguments(llvm::SMLoc loc, 476 ArrayRef<FormatElement *> arguments) = 0; 477 /// Verify the elements of an optional group. 478 virtual LogicalResult 479 verifyOptionalGroupElements(llvm::SMLoc loc, 480 ArrayRef<FormatElement *> elements, 481 Optional<unsigned> anchorIndex) = 0; 482 483 //===--------------------------------------------------------------------===// 484 // Lexer Utilities 485 486 /// Emit an error at the given location. emitError(llvm::SMLoc loc,const Twine & msg)487 LogicalResult emitError(llvm::SMLoc loc, const Twine &msg) { 488 lexer.emitError(loc, msg); 489 return failure(); 490 } 491 492 /// Emit an error and a note at the given notation. emitErrorAndNote(llvm::SMLoc loc,const Twine & msg,const Twine & note)493 LogicalResult emitErrorAndNote(llvm::SMLoc loc, const Twine &msg, 494 const Twine ¬e) { 495 lexer.emitErrorAndNote(loc, msg, note); 496 return failure(); 497 } 498 499 /// Parse a single token of the expected kind. parseToken(FormatToken::Kind kind,const Twine & msg)500 FailureOr<FormatToken> parseToken(FormatToken::Kind kind, const Twine &msg) { 501 if (!curToken.is(kind)) 502 return emitError(curToken.getLoc(), msg); 503 FormatToken tok = curToken; 504 consumeToken(); 505 return tok; 506 } 507 508 /// Advance the lexer to the next token. consumeToken()509 void consumeToken() { 510 assert(!curToken.is(FormatToken::eof) && !curToken.is(FormatToken::error) && 511 "shouldn't advance past EOF or errors"); 512 curToken = lexer.lexToken(); 513 } 514 515 /// Get the current token. peekToken()516 FormatToken peekToken() { return curToken; } 517 518 private: 519 /// The format parser retains ownership of the format elements in a bump 520 /// pointer allocator. 521 // FIXME: FormatElement with `std::vector` need to be converted to use 522 // trailing objects. 523 // llvm::BumpPtrAllocator allocator; 524 std::vector<std::unique_ptr<FormatElement>> allocator; 525 /// The format lexer to use. 526 FormatLexer lexer; 527 /// The current token in the lexer. 528 FormatToken curToken; 529 }; 530 531 //===----------------------------------------------------------------------===// 532 // Utility Functions 533 //===----------------------------------------------------------------------===// 534 535 /// Whether a space needs to be emitted before a literal. E.g., two keywords 536 /// back-to-back require a space separator, but a keyword followed by '<' does 537 /// not require a space. 538 bool shouldEmitSpaceBefore(StringRef value, bool lastWasPunctuation); 539 540 /// Returns true if the given string can be formatted as a keyword. 541 bool canFormatStringAsKeyword(StringRef value, 542 function_ref<void(Twine)> emitError = nullptr); 543 544 /// Returns true if the given string is valid format literal element. 545 /// If `emitError` is provided, it is invoked with the reason for the failure. 546 bool isValidLiteral(StringRef value, 547 function_ref<void(Twine)> emitError = nullptr); 548 549 /// Whether a failure in parsing the assembly format should be a fatal error. 550 extern llvm::cl::opt<bool> formatErrorIsFatal; 551 552 } // namespace tblgen 553 } // namespace mlir 554 555 #endif // MLIR_TOOLS_MLIRTBLGEN_FORMATGEN_H_ 556