1 //===- FormatGen.cpp - Utilities for custom assembly formats ----*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "FormatGen.h"
10 #include "llvm/ADT/StringSwitch.h"
11 #include "llvm/Support/SourceMgr.h"
12 #include "llvm/TableGen/Error.h"
13 
14 using namespace mlir;
15 using namespace mlir::tblgen;
16 
17 //===----------------------------------------------------------------------===//
18 // FormatToken
19 //===----------------------------------------------------------------------===//
20 
21 llvm::SMLoc FormatToken::getLoc() const {
22   return llvm::SMLoc::getFromPointer(spelling.data());
23 }
24 
25 //===----------------------------------------------------------------------===//
26 // FormatLexer
27 //===----------------------------------------------------------------------===//
28 
/// Construct a lexer over the main file buffer of `mgr`.
///
/// `mgr` is the source manager whose main-file buffer is lexed and through
/// which errors are reported. `loc` is stored and later used as the location of
/// the "in custom assembly format for this operation" note attached to errors.
/// Lexing starts at the first character of the buffer.
FormatLexer::FormatLexer(llvm::SourceMgr &mgr, llvm::SMLoc loc)
    : mgr(mgr), loc(loc),
      // View of the main file's contents; `curPtr` walks through this buffer.
      curBuffer(mgr.getMemoryBuffer(mgr.getMainFileID())->getBuffer()),
      curPtr(curBuffer.begin()) {}
33 
34 FormatToken FormatLexer::emitError(llvm::SMLoc loc, const Twine &msg) {
35   mgr.PrintMessage(loc, llvm::SourceMgr::DK_Error, msg);
36   llvm::SrcMgr.PrintMessage(this->loc, llvm::SourceMgr::DK_Note,
37                             "in custom assembly format for this operation");
38   return formToken(FormatToken::error, loc.getPointer());
39 }
40 
41 FormatToken FormatLexer::emitError(const char *loc, const Twine &msg) {
42   return emitError(llvm::SMLoc::getFromPointer(loc), msg);
43 }
44 
45 FormatToken FormatLexer::emitErrorAndNote(llvm::SMLoc loc, const Twine &msg,
46                                           const Twine &note) {
47   mgr.PrintMessage(loc, llvm::SourceMgr::DK_Error, msg);
48   llvm::SrcMgr.PrintMessage(this->loc, llvm::SourceMgr::DK_Note,
49                             "in custom assembly format for this operation");
50   mgr.PrintMessage(loc, llvm::SourceMgr::DK_Note, note);
51   return formToken(FormatToken::error, loc.getPointer());
52 }
53 
54 int FormatLexer::getNextChar() {
55   char curChar = *curPtr++;
56   switch (curChar) {
57   default:
58     return (unsigned char)curChar;
59   case 0: {
60     // A nul character in the stream is either the end of the current buffer or
61     // a random nul in the file. Disambiguate that here.
62     if (curPtr - 1 != curBuffer.end())
63       return 0;
64 
65     // Otherwise, return end of file.
66     --curPtr;
67     return EOF;
68   }
69   case '\n':
70   case '\r':
71     // Handle the newline character by ignoring it and incrementing the line
72     // count. However, be careful about 'dos style' files with \n\r in them.
73     // Only treat a \n\r or \r\n as a single line.
74     if ((*curPtr == '\n' || (*curPtr == '\r')) && *curPtr != curChar)
75       ++curPtr;
76     return '\n';
77   }
78 }
79 
80 FormatToken FormatLexer::lexToken() {
81   const char *tokStart = curPtr;
82 
83   // This always consumes at least one character.
84   int curChar = getNextChar();
85   switch (curChar) {
86   default:
87     // Handle identifiers: [a-zA-Z_]
88     if (isalpha(curChar) || curChar == '_')
89       return lexIdentifier(tokStart);
90 
91     // Unknown character, emit an error.
92     return emitError(tokStart, "unexpected character");
93   case EOF:
94     // Return EOF denoting the end of lexing.
95     return formToken(FormatToken::eof, tokStart);
96 
97   // Lex punctuation.
98   case '^':
99     return formToken(FormatToken::caret, tokStart);
100   case ':':
101     return formToken(FormatToken::colon, tokStart);
102   case ',':
103     return formToken(FormatToken::comma, tokStart);
104   case '=':
105     return formToken(FormatToken::equal, tokStart);
106   case '<':
107     return formToken(FormatToken::less, tokStart);
108   case '>':
109     return formToken(FormatToken::greater, tokStart);
110   case '?':
111     return formToken(FormatToken::question, tokStart);
112   case '(':
113     return formToken(FormatToken::l_paren, tokStart);
114   case ')':
115     return formToken(FormatToken::r_paren, tokStart);
116   case '*':
117     return formToken(FormatToken::star, tokStart);
118 
119   // Ignore whitespace characters.
120   case 0:
121   case ' ':
122   case '\t':
123   case '\n':
124     return lexToken();
125 
126   case '`':
127     return lexLiteral(tokStart);
128   case '$':
129     return lexVariable(tokStart);
130   }
131 }
132 
133 FormatToken FormatLexer::lexLiteral(const char *tokStart) {
134   assert(curPtr[-1] == '`');
135 
136   // Lex a literal surrounded by ``.
137   while (const char curChar = *curPtr++) {
138     if (curChar == '`')
139       return formToken(FormatToken::literal, tokStart);
140   }
141   return emitError(curPtr - 1, "unexpected end of file in literal");
142 }
143 
144 FormatToken FormatLexer::lexVariable(const char *tokStart) {
145   if (!isalpha(curPtr[0]) && curPtr[0] != '_')
146     return emitError(curPtr - 1, "expected variable name");
147 
148   // Otherwise, consume the rest of the characters.
149   while (isalnum(*curPtr) || *curPtr == '_')
150     ++curPtr;
151   return formToken(FormatToken::variable, tokStart);
152 }
153 
154 FormatToken FormatLexer::lexIdentifier(const char *tokStart) {
155   // Match the rest of the identifier regex: [0-9a-zA-Z_\-]*
156   while (isalnum(*curPtr) || *curPtr == '_' || *curPtr == '-')
157     ++curPtr;
158 
159   // Check to see if this identifier is a keyword.
160   StringRef str(tokStart, curPtr - tokStart);
161   auto kind =
162       StringSwitch<FormatToken::Kind>(str)
163           .Case("attr-dict", FormatToken::kw_attr_dict)
164           .Case("attr-dict-with-keyword", FormatToken::kw_attr_dict_w_keyword)
165           .Case("custom", FormatToken::kw_custom)
166           .Case("functional-type", FormatToken::kw_functional_type)
167           .Case("operands", FormatToken::kw_operands)
168           .Case("params", FormatToken::kw_params)
169           .Case("ref", FormatToken::kw_ref)
170           .Case("regions", FormatToken::kw_regions)
171           .Case("results", FormatToken::kw_results)
172           .Case("struct", FormatToken::kw_struct)
173           .Case("successors", FormatToken::kw_successors)
174           .Case("type", FormatToken::kw_type)
175           .Case("qualified", FormatToken::kw_qualified)
176           .Default(FormatToken::identifier);
177   return FormatToken(kind, str);
178 }
179 
180 //===----------------------------------------------------------------------===//
181 // Utility Functions
182 //===----------------------------------------------------------------------===//
183 
184 bool mlir::tblgen::shouldEmitSpaceBefore(StringRef value,
185                                          bool lastWasPunctuation) {
186   if (value.size() != 1 && value != "->")
187     return true;
188   if (lastWasPunctuation)
189     return !StringRef(">)}],").contains(value.front());
190   return !StringRef("<>(){}[],").contains(value.front());
191 }
192 
193 bool mlir::tblgen::canFormatStringAsKeyword(
194     StringRef value, function_ref<void(Twine)> emitError) {
195   if (!isalpha(value.front()) && value.front() != '_') {
196     if (emitError)
197       emitError("valid keyword starts with a letter or '_'");
198     return false;
199   }
200   if (!llvm::all_of(value.drop_front(), [](char c) {
201         return isalnum(c) || c == '_' || c == '$' || c == '.';
202       })) {
203     if (emitError)
204       emitError(
205           "keywords should contain only alphanum, '_', '$', or '.' characters");
206     return false;
207   }
208   return true;
209 }
210 
211 bool mlir::tblgen::isValidLiteral(StringRef value,
212                                   function_ref<void(Twine)> emitError) {
213   if (value.empty()) {
214     if (emitError)
215       emitError("literal can't be empty");
216     return false;
217   }
218   char front = value.front();
219 
220   // If there is only one character, this must either be punctuation or a
221   // single character bare identifier.
222   if (value.size() == 1) {
223     StringRef bare = "_:,=<>()[]{}?+*";
224     if (isalpha(front) || bare.contains(front))
225       return true;
226     if (emitError)
227       emitError("single character literal must be a letter or one of '" + bare +
228                 "'");
229     return false;
230   }
231   // Check the punctuation that are larger than a single character.
232   if (value == "->")
233     return true;
234 
235   // Otherwise, this must be an identifier.
236   return canFormatStringAsKeyword(value, emitError);
237 }
238 
239 //===----------------------------------------------------------------------===//
240 // Commandline Options
241 //===----------------------------------------------------------------------===//
242 
243 llvm::cl::opt<bool> mlir::tblgen::formatErrorIsFatal(
244     "asmformat-error-is-fatal",
245     llvm::cl::desc("Emit a fatal error if format parsing fails"),
246     llvm::cl::init(true));
247