1 //===-- ResourceScriptToken.cpp ---------------------------------*- C++-*-===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===---------------------------------------------------------------------===//
9 //
10 // This file implements an interface defined in ResourceScriptToken.h.
11 // In particular, it defines an .rc script tokenizer.
12 //
13 //===---------------------------------------------------------------------===//
14 
#include "ResourceScriptToken.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"

#include <algorithm>
#include <cassert>
#include <cctype>
#include <cstdlib>
#include <utility>
23 
24 using namespace llvm;
25 
26 using Kind = RCToken::Kind;
27 
28 // Checks if Representation is a correct description of an RC integer.
29 // It should be a 32-bit unsigned integer, either decimal, octal (0[0-7]+),
30 // or hexadecimal (0x[0-9a-f]+). It might be followed by a single 'L'
31 // character (that is the difference between our representation and
32 // StringRef's one). If Representation is correct, 'true' is returned and
33 // the return value is put back in Num.
34 static bool rcGetAsInteger(StringRef Representation, uint32_t &Num) {
35   size_t Length = Representation.size();
36   if (Length == 0)
37     return false;
38   // Strip the last 'L' if unnecessary.
39   if (std::toupper(Representation.back()) == 'L')
40     Representation = Representation.drop_back(1);
41 
42   return !Representation.getAsInteger<uint32_t>(0, Num);
43 }
44 
// Stores the token's classification and the slice of input text it spans.
RCToken::RCToken(RCToken::Kind RCTokenKind, StringRef Value)
    : TokenKind(RCTokenKind), TokenValue(Value) {}
47 
48 uint32_t RCToken::intValue() const {
49   assert(TokenKind == Kind::Int);
50   // We assume that the token already is a correct integer (checked by
51   // rcGetAsInteger).
52   uint32_t Result;
53   bool IsSuccess = rcGetAsInteger(TokenValue, Result);
54   assert(IsSuccess);
55   (void)IsSuccess;  // Silence the compiler warning when -DNDEBUG flag is on.
56   return Result;
57 }
58 
59 StringRef RCToken::value() const { return TokenValue; }
60 
61 Kind RCToken::kind() const { return TokenKind; }
62 
63 static Error getStringError(const Twine &message) {
64   return make_error<StringError>("Error parsing file: " + message,
65                                  inconvertibleErrorCode());
66 }
67 
68 namespace {
69 
// Produces the stream of RCTokens for a single .rc input. The tokenizer is
// single-use: construct it over the whole input, then call run() once.
class Tokenizer {
public:
  Tokenizer(StringRef Input) : Data(Input), DataLength(Input.size()) {}

  // Tokenizes the whole input; returns the token list or the first error.
  Expected<std::vector<RCToken>> run();

private:
  // All 'advancing' methods return boolean values; if they're equal to false,
  // the stream has ended or failed.
  bool advance(size_t Amount = 1);
  bool skipWhitespaces();

  // Consumes a token. If any problem occurred, a non-empty Error is returned.
  Error consumeToken(const Kind TokenKind);

  // Check if tokenizer is about to read FollowingChars.
  bool willNowRead(StringRef FollowingChars) const;

  // Check if tokenizer can start reading an identifier at current position.
  // The original tool did not specify the rules to determine what is a correct
  // identifier. We assume they should follow the C convention:
  // [a-zA-Z_][a-zA-Z0-9_]*.
  bool canStartIdentifier() const;
  // Check if tokenizer can continue reading an identifier.
  bool canContinueIdentifier() const;

  // Check if tokenizer can start reading an integer.
  // A correct integer always starts with a 0-9 digit,
  // can contain characters 0-9A-Fa-f (digits),
  // Ll (marking the integer is 32-bit), Xx (marking the representation
  // is hexadecimal). As some kind of separator should come after the
  // integer, we can consume the integer until a non-alphanumeric
  // character.
  bool canStartInt() const;
  bool canContinueInt() const;

  // Check if tokenizer can start reading a (possibly L-prefixed) string.
  bool canStartString() const;

  // True when the whole input has been consumed.
  bool streamEof() const;

  // Classify the token that is about to be read from the current position.
  Kind classifyCurrentToken() const;

  // Process the Kind::Identifier token - check if it is
  // an identifier describing a block start or end.
  void processIdentifier(RCToken &token) const;

  StringRef Data;
  size_t DataLength, Pos; // Pos is initialized at the start of run().
};
120 
121 Expected<std::vector<RCToken>> Tokenizer::run() {
122   Pos = 0;
123   std::vector<RCToken> Result;
124 
125   // Consume an optional UTF-8 Byte Order Mark.
126   if (willNowRead("\xef\xbb\xbf"))
127     advance(3);
128 
129   while (!streamEof()) {
130     if (!skipWhitespaces())
131       break;
132 
133     Kind TokenKind = classifyCurrentToken();
134     if (TokenKind == Kind::Invalid)
135       return getStringError("Invalid token found at position " + Twine(Pos));
136 
137     const size_t TokenStart = Pos;
138     if (Error TokenError = consumeToken(TokenKind))
139       return std::move(TokenError);
140 
141     RCToken Token(TokenKind, Data.take_front(Pos).drop_front(TokenStart));
142     if (TokenKind == Kind::Identifier) {
143       processIdentifier(Token);
144     } else if (TokenKind == Kind::Int) {
145       uint32_t TokenInt;
146       if (!rcGetAsInteger(Token.value(), TokenInt)) {
147         // The integer has incorrect format or cannot be represented in
148         // a 32-bit integer.
149         return getStringError("Integer invalid or too large: " +
150                               Token.value().str());
151       }
152     }
153 
154     Result.push_back(Token);
155   }
156 
157   return Result;
158 }
159 
// Advances the position by Amount characters; returns false when the end of
// the stream has been reached.
bool Tokenizer::advance(size_t Amount) {
  Pos += Amount;
  return !streamEof();
}
164 
165 bool Tokenizer::skipWhitespaces() {
166   while (!streamEof() && std::isspace(Data[Pos]))
167     advance();
168   return !streamEof();
169 }
170 
171 Error Tokenizer::consumeToken(const Kind TokenKind) {
172   switch (TokenKind) {
173   // One-character token consumption.
174 #define TOKEN(Name)
175 #define SHORT_TOKEN(Name, Ch) case Kind::Name:
176 #include "ResourceScriptTokenList.h"
177 #undef TOKEN
178 #undef SHORT_TOKEN
179     advance();
180     return Error::success();
181 
182   case Kind::Identifier:
183     while (!streamEof() && canContinueIdentifier())
184       advance();
185     return Error::success();
186 
187   case Kind::Int:
188     while (!streamEof() && canContinueInt())
189       advance();
190     return Error::success();
191 
192   case Kind::String:
193     // Consume the preceding 'L', if there is any.
194     if (std::toupper(Data[Pos]) == 'L')
195       advance();
196     // Consume the double-quote.
197     advance();
198 
199     // Consume the characters until the end of the file, line or string.
200     while (true) {
201       if (streamEof()) {
202         return getStringError("Unterminated string literal.");
203       } else if (Data[Pos] == '"') {
204         // Consume the ending double-quote.
205         advance();
206         return Error::success();
207       } else if (Data[Pos] == '\n') {
208         return getStringError("String literal not terminated in the line.");
209       }
210 
211       advance();
212     }
213 
214   case Kind::Invalid:
215     assert(false && "Cannot consume an invalid token.");
216   }
217 
218   // This silences the compilers which cannot notice that the execution
219   // never reaches here.
220   assert(false);
221 }
222 
223 bool Tokenizer::willNowRead(StringRef FollowingChars) const {
224   return Data.drop_front(Pos).startswith(FollowingChars);
225 }
226 
227 bool Tokenizer::canStartIdentifier() const {
228   assert(!streamEof());
229 
230   const char CurChar = Data[Pos];
231   return std::isalpha(CurChar) || CurChar == '_';
232 }
233 
234 bool Tokenizer::canContinueIdentifier() const {
235   assert(!streamEof());
236   const char CurChar = Data[Pos];
237   return std::isalnum(CurChar) || CurChar == '_';
238 }
239 
240 bool Tokenizer::canStartInt() const {
241   assert(!streamEof());
242   return std::isdigit(Data[Pos]);
243 }
244 
245 bool Tokenizer::canContinueInt() const {
246   assert(!streamEof());
247   return std::isalnum(Data[Pos]);
248 }
249 
250 bool Tokenizer::canStartString() const {
251   return willNowRead("\"") || willNowRead("L\"") || willNowRead("l\"");
252 }
253 
254 bool Tokenizer::streamEof() const { return Pos == DataLength; }
255 
// Classify the token that begins at the current position; returns
// Kind::Invalid when no token can start here.
Kind Tokenizer::classifyCurrentToken() const {
  if (canStartInt())
    return Kind::Int;
  if (canStartString())
    return Kind::String;
  // BEGIN and END are at this point of lexing recognized as identifiers.
  if (canStartIdentifier())
    return Kind::Identifier;

  const char CurChar = Data[Pos];

  switch (CurChar) {
  // One-character token classification, generated from the token list:
  // each SHORT_TOKEN entry maps its character to the corresponding kind.
#define TOKEN(Name)
#define SHORT_TOKEN(Name, Ch)                                                  \
  case Ch:                                                                     \
    return Kind::Name;
#include "ResourceScriptTokenList.h"
#undef TOKEN
#undef SHORT_TOKEN

  default:
    return Kind::Invalid;
  }
}
281 
282 void Tokenizer::processIdentifier(RCToken &Token) const {
283   assert(Token.kind() == Kind::Identifier);
284   StringRef Name = Token.value();
285 
286   if (Name.equals_lower("begin"))
287     Token = RCToken(Kind::BlockBegin, Name);
288   else if (Name.equals_lower("end"))
289     Token = RCToken(Kind::BlockEnd, Name);
290 }
291 
292 } // anonymous namespace
293 
294 namespace llvm {
295 
296 Expected<std::vector<RCToken>> tokenizeRC(StringRef Input) {
297   return Tokenizer(Input).run();
298 }
299 
300 } // namespace llvm
301