//===- ScriptLexer.cpp ----------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines a lexer for the linker script.
//
// The linker script's grammar is not complex but ambiguous due to the
// lack of the formal specification of the language. What we are trying to
// do in this and other files in LLD is to make a "reasonable" linker
// script processor.
//
// Among simplicity, compatibility and efficiency, we put the most
// emphasis on simplicity when we wrote this lexer. Compatibility with the
// GNU linkers is important, but we did not try to clone every tiny corner
// case of their lexers, as even ld.bfd and ld.gold are subtly different
// in various corner cases. We do not care much about efficiency because
// the time spent in parsing linker scripts is usually negligible.
//
// Our grammar of the linker script is LL(2), meaning that it needs at
// most two-token lookahead to parse. The only place we need two-token
// lookahead is labels in version scripts, where we need to parse "local :"
// as if "local:".
//
// Overall, this lexer works fine for most linker scripts. There might
// be room for improving compatibility, but that's probably not at the
// top of our todo list.
//
//===----------------------------------------------------------------------===//
33794366a2SRui Ueyama
34794366a2SRui Ueyama #include "ScriptLexer.h"
35b8a59c8aSBob Haarman #include "lld/Common/ErrorHandler.h"
36794366a2SRui Ueyama #include "llvm/ADT/Twine.h"
3727bb7990SFangrui Song #include "llvm/Support/ErrorHandling.h"
3827bb7990SFangrui Song #include <algorithm>
39794366a2SRui Ueyama
40794366a2SRui Ueyama using namespace llvm;
4107837b8fSFangrui Song using namespace lld;
4207837b8fSFangrui Song using namespace lld::elf;
43794366a2SRui Ueyama
44794366a2SRui Ueyama // Returns a whole line containing the current token.
getLine()45794366a2SRui Ueyama StringRef ScriptLexer::getLine() {
463837f427SRui Ueyama StringRef s = getCurrentMB().getBuffer();
473837f427SRui Ueyama StringRef tok = tokens[pos - 1];
48794366a2SRui Ueyama
493837f427SRui Ueyama size_t pos = s.rfind('\n', tok.data() - s.data());
503837f427SRui Ueyama if (pos != StringRef::npos)
513837f427SRui Ueyama s = s.substr(pos + 1);
523837f427SRui Ueyama return s.substr(0, s.find_first_of("\r\n"));
53794366a2SRui Ueyama }
54794366a2SRui Ueyama
55794366a2SRui Ueyama // Returns 1-based line number of the current token.
getLineNumber()56794366a2SRui Ueyama size_t ScriptLexer::getLineNumber() {
57ac6abc99SFangrui Song if (pos == 0)
58ac6abc99SFangrui Song return 1;
593837f427SRui Ueyama StringRef s = getCurrentMB().getBuffer();
603837f427SRui Ueyama StringRef tok = tokens[pos - 1];
61e3877787SColin Cross const size_t tokOffset = tok.data() - s.data();
62e3877787SColin Cross
63e3877787SColin Cross // For the first token, or when going backwards, start from the beginning of
64e3877787SColin Cross // the buffer. If this token is after the previous token, start from the
65e3877787SColin Cross // previous token.
66e3877787SColin Cross size_t line = 1;
67e3877787SColin Cross size_t start = 0;
68e3877787SColin Cross if (lastLineNumberOffset > 0 && tokOffset >= lastLineNumberOffset) {
69e3877787SColin Cross start = lastLineNumberOffset;
70e3877787SColin Cross line = lastLineNumber;
71e3877787SColin Cross }
72e3877787SColin Cross
73e3877787SColin Cross line += s.substr(start, tokOffset - start).count('\n');
74e3877787SColin Cross
75e3877787SColin Cross // Store the line number of this token for reuse.
76e3877787SColin Cross lastLineNumberOffset = tokOffset;
77e3877787SColin Cross lastLineNumber = line;
78e3877787SColin Cross
79e3877787SColin Cross return line;
80794366a2SRui Ueyama }
81794366a2SRui Ueyama
82794366a2SRui Ueyama // Returns 0-based column number of the current token.
getColumnNumber()83794366a2SRui Ueyama size_t ScriptLexer::getColumnNumber() {
843837f427SRui Ueyama StringRef tok = tokens[pos - 1];
853837f427SRui Ueyama return tok.data() - getLine().data();
86794366a2SRui Ueyama }
87794366a2SRui Ueyama
getCurrentLocation()88794366a2SRui Ueyama std::string ScriptLexer::getCurrentLocation() {
89adcd0268SBenjamin Kramer std::string filename = std::string(getCurrentMB().getBufferIdentifier());
903837f427SRui Ueyama return (filename + ":" + Twine(getLineNumber())).str();
91794366a2SRui Ueyama }
92794366a2SRui Ueyama
ScriptLexer(MemoryBufferRef mb)933837f427SRui Ueyama ScriptLexer::ScriptLexer(MemoryBufferRef mb) { tokenize(mb); }
94794366a2SRui Ueyama
95794366a2SRui Ueyama // We don't want to record cascading errors. Keep only the first one.
setError(const Twine & msg)963837f427SRui Ueyama void ScriptLexer::setError(const Twine &msg) {
97b8a59c8aSBob Haarman if (errorCount())
98794366a2SRui Ueyama return;
99794366a2SRui Ueyama
1003837f427SRui Ueyama std::string s = (getCurrentLocation() + ": " + msg).str();
1013837f427SRui Ueyama if (pos)
1023837f427SRui Ueyama s += "\n>>> " + getLine().str() + "\n>>> " +
103de2d1066SGeorge Rimar std::string(getColumnNumber(), ' ') + "^";
1043837f427SRui Ueyama error(s);
105794366a2SRui Ueyama }
106794366a2SRui Ueyama
107794366a2SRui Ueyama // Split S into linker script tokens.
tokenize(MemoryBufferRef mb)1083837f427SRui Ueyama void ScriptLexer::tokenize(MemoryBufferRef mb) {
1093837f427SRui Ueyama std::vector<StringRef> vec;
1103837f427SRui Ueyama mbs.push_back(mb);
1113837f427SRui Ueyama StringRef s = mb.getBuffer();
1123837f427SRui Ueyama StringRef begin = s;
113794366a2SRui Ueyama
114794366a2SRui Ueyama for (;;) {
1153837f427SRui Ueyama s = skipSpace(s);
1163837f427SRui Ueyama if (s.empty())
117794366a2SRui Ueyama break;
118794366a2SRui Ueyama
119794366a2SRui Ueyama // Quoted token. Note that double-quote characters are parts of a token
120794366a2SRui Ueyama // because, in a glob match context, only unquoted tokens are interpreted
121794366a2SRui Ueyama // as glob patterns. Double-quoted tokens are literal patterns in that
122794366a2SRui Ueyama // context.
1233837f427SRui Ueyama if (s.startswith("\"")) {
1243837f427SRui Ueyama size_t e = s.find("\"", 1);
1253837f427SRui Ueyama if (e == StringRef::npos) {
1263837f427SRui Ueyama StringRef filename = mb.getBufferIdentifier();
1273837f427SRui Ueyama size_t lineno = begin.substr(0, s.data() - begin.data()).count('\n');
1283837f427SRui Ueyama error(filename + ":" + Twine(lineno + 1) + ": unclosed quote");
129794366a2SRui Ueyama return;
130794366a2SRui Ueyama }
131794366a2SRui Ueyama
1323837f427SRui Ueyama vec.push_back(s.take_front(e + 1));
1333837f427SRui Ueyama s = s.substr(e + 1);
134794366a2SRui Ueyama continue;
135794366a2SRui Ueyama }
136794366a2SRui Ueyama
137*0a0effddSFangrui Song // Some operators form separate tokens.
138*0a0effddSFangrui Song if (s.startswith("<<=") || s.startswith(">>=")) {
139*0a0effddSFangrui Song vec.push_back(s.substr(0, 3));
140*0a0effddSFangrui Song s = s.substr(3);
141*0a0effddSFangrui Song continue;
142*0a0effddSFangrui Song }
143*0a0effddSFangrui Song if (s.size() > 1 && ((s[1] == '=' && strchr("*/+-<>&|", s[0])) ||
144*0a0effddSFangrui Song (s[0] == s[1] && strchr("<>&|", s[0])))) {
1453837f427SRui Ueyama vec.push_back(s.substr(0, 2));
1463837f427SRui Ueyama s = s.substr(2);
147c67d6b2dSRui Ueyama continue;
148c67d6b2dSRui Ueyama }
149c67d6b2dSRui Ueyama
150794366a2SRui Ueyama // Unquoted token. This is more relaxed than tokens in C-like language,
151794366a2SRui Ueyama // so that you can write "file-name.cpp" as one bare token, for example.
1523837f427SRui Ueyama size_t pos = s.find_first_not_of(
153794366a2SRui Ueyama "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
154c67d6b2dSRui Ueyama "0123456789_.$/\\~=+[]*?-!^:");
155794366a2SRui Ueyama
156794366a2SRui Ueyama // A character that cannot start a word (which is usually a
157794366a2SRui Ueyama // punctuation) forms a single character token.
1583837f427SRui Ueyama if (pos == 0)
1593837f427SRui Ueyama pos = 1;
1603837f427SRui Ueyama vec.push_back(s.substr(0, pos));
1613837f427SRui Ueyama s = s.substr(pos);
162794366a2SRui Ueyama }
163794366a2SRui Ueyama
1643837f427SRui Ueyama tokens.insert(tokens.begin() + pos, vec.begin(), vec.end());
165794366a2SRui Ueyama }
166794366a2SRui Ueyama
167794366a2SRui Ueyama // Skip leading whitespace characters or comments.
skipSpace(StringRef s)1683837f427SRui Ueyama StringRef ScriptLexer::skipSpace(StringRef s) {
169794366a2SRui Ueyama for (;;) {
1703837f427SRui Ueyama if (s.startswith("/*")) {
1713837f427SRui Ueyama size_t e = s.find("*/", 2);
1723837f427SRui Ueyama if (e == StringRef::npos) {
173ae4279bdSGeorgii Rymar setError("unclosed comment in a linker script");
174794366a2SRui Ueyama return "";
175794366a2SRui Ueyama }
1763837f427SRui Ueyama s = s.substr(e + 2);
177794366a2SRui Ueyama continue;
178794366a2SRui Ueyama }
1793837f427SRui Ueyama if (s.startswith("#")) {
1803837f427SRui Ueyama size_t e = s.find('\n', 1);
1813837f427SRui Ueyama if (e == StringRef::npos)
1823837f427SRui Ueyama e = s.size() - 1;
1833837f427SRui Ueyama s = s.substr(e + 1);
184794366a2SRui Ueyama continue;
185794366a2SRui Ueyama }
1863837f427SRui Ueyama size_t size = s.size();
1873837f427SRui Ueyama s = s.ltrim();
1883837f427SRui Ueyama if (s.size() == size)
1893837f427SRui Ueyama return s;
190794366a2SRui Ueyama }
191794366a2SRui Ueyama }
192794366a2SRui Ueyama
193794366a2SRui Ueyama // An erroneous token is handled as if it were the last token before EOF.
atEOF()1943837f427SRui Ueyama bool ScriptLexer::atEOF() { return errorCount() || tokens.size() == pos; }
195794366a2SRui Ueyama
196731a66aeSRui Ueyama // Split a given string as an expression.
197731a66aeSRui Ueyama // This function returns "3", "*" and "5" for "3*5" for example.
tokenizeExpr(StringRef s)1983837f427SRui Ueyama static std::vector<StringRef> tokenizeExpr(StringRef s) {
19977295c54SFangrui Song StringRef ops = "!~*/+-<>?:="; // List of operators
200731a66aeSRui Ueyama
201731a66aeSRui Ueyama // Quoted strings are literal strings, so we don't want to split it.
2023837f427SRui Ueyama if (s.startswith("\""))
2033837f427SRui Ueyama return {s};
204731a66aeSRui Ueyama
205970e783bSGeorge Rimar // Split S with operators as separators.
2063837f427SRui Ueyama std::vector<StringRef> ret;
2073837f427SRui Ueyama while (!s.empty()) {
2083837f427SRui Ueyama size_t e = s.find_first_of(ops);
209731a66aeSRui Ueyama
210731a66aeSRui Ueyama // No need to split if there is no operator.
2113837f427SRui Ueyama if (e == StringRef::npos) {
2123837f427SRui Ueyama ret.push_back(s);
213731a66aeSRui Ueyama break;
214731a66aeSRui Ueyama }
215731a66aeSRui Ueyama
2167c5fcb35SKazuaki Ishizaki // Get a token before the operator.
2173837f427SRui Ueyama if (e != 0)
2183837f427SRui Ueyama ret.push_back(s.substr(0, e));
219731a66aeSRui Ueyama
2200810f16fSGeorge Rimar // Get the operator as a token.
2210810f16fSGeorge Rimar // Keep !=, ==, >=, <=, << and >> operators as a single tokens.
2223837f427SRui Ueyama if (s.substr(e).startswith("!=") || s.substr(e).startswith("==") ||
2233837f427SRui Ueyama s.substr(e).startswith(">=") || s.substr(e).startswith("<=") ||
2243837f427SRui Ueyama s.substr(e).startswith("<<") || s.substr(e).startswith(">>")) {
2253837f427SRui Ueyama ret.push_back(s.substr(e, 2));
2263837f427SRui Ueyama s = s.substr(e + 2);
2276f1d954eSHafiz Abid Qadeer } else {
2283837f427SRui Ueyama ret.push_back(s.substr(e, 1));
2293837f427SRui Ueyama s = s.substr(e + 1);
230731a66aeSRui Ueyama }
2316f1d954eSHafiz Abid Qadeer }
2323837f427SRui Ueyama return ret;
233731a66aeSRui Ueyama }
234731a66aeSRui Ueyama
235731a66aeSRui Ueyama // In contexts where expressions are expected, the lexer should apply
236731a66aeSRui Ueyama // different tokenization rules than the default one. By default,
237731a66aeSRui Ueyama // arithmetic operator characters are regular characters, but in the
238731a66aeSRui Ueyama // expression context, they should be independent tokens.
239731a66aeSRui Ueyama //
240731a66aeSRui Ueyama // For example, "foo*3" should be tokenized to "foo", "*" and "3" only
241731a66aeSRui Ueyama // in the expression context.
242731a66aeSRui Ueyama //
243731a66aeSRui Ueyama // This function may split the current token into multiple tokens.
maybeSplitExpr()244731a66aeSRui Ueyama void ScriptLexer::maybeSplitExpr() {
2453837f427SRui Ueyama if (!inExpr || errorCount() || atEOF())
246731a66aeSRui Ueyama return;
247731a66aeSRui Ueyama
2483837f427SRui Ueyama std::vector<StringRef> v = tokenizeExpr(tokens[pos]);
2493837f427SRui Ueyama if (v.size() == 1)
250731a66aeSRui Ueyama return;
2513837f427SRui Ueyama tokens.erase(tokens.begin() + pos);
2523837f427SRui Ueyama tokens.insert(tokens.begin() + pos, v.begin(), v.end());
253731a66aeSRui Ueyama }
254731a66aeSRui Ueyama
next()255794366a2SRui Ueyama StringRef ScriptLexer::next() {
256731a66aeSRui Ueyama maybeSplitExpr();
257731a66aeSRui Ueyama
258b8a59c8aSBob Haarman if (errorCount())
259794366a2SRui Ueyama return "";
260794366a2SRui Ueyama if (atEOF()) {
261794366a2SRui Ueyama setError("unexpected EOF");
262794366a2SRui Ueyama return "";
263794366a2SRui Ueyama }
2643837f427SRui Ueyama return tokens[pos++];
265794366a2SRui Ueyama }
266794366a2SRui Ueyama
peek()267f5fce486SRui Ueyama StringRef ScriptLexer::peek() {
2683837f427SRui Ueyama StringRef tok = next();
269b8a59c8aSBob Haarman if (errorCount())
270794366a2SRui Ueyama return "";
2713837f427SRui Ueyama pos = pos - 1;
2723837f427SRui Ueyama return tok;
273794366a2SRui Ueyama }
274794366a2SRui Ueyama
peek2()275a46d08ebSGeorge Rimar StringRef ScriptLexer::peek2() {
276a46d08ebSGeorge Rimar skip();
2773837f427SRui Ueyama StringRef tok = next();
278a46d08ebSGeorge Rimar if (errorCount())
279a46d08ebSGeorge Rimar return "";
2803837f427SRui Ueyama pos = pos - 2;
2813837f427SRui Ueyama return tok;
282a46d08ebSGeorge Rimar }
283a46d08ebSGeorge Rimar
consume(StringRef tok)2843837f427SRui Ueyama bool ScriptLexer::consume(StringRef tok) {
2853837f427SRui Ueyama if (peek() == tok) {
286794366a2SRui Ueyama skip();
287794366a2SRui Ueyama return true;
288794366a2SRui Ueyama }
289794366a2SRui Ueyama return false;
290794366a2SRui Ueyama }
291794366a2SRui Ueyama
292f5fce486SRui Ueyama // Consumes Tok followed by ":". Space is allowed between Tok and ":".
consumeLabel(StringRef tok)2933837f427SRui Ueyama bool ScriptLexer::consumeLabel(StringRef tok) {
2943837f427SRui Ueyama if (consume((tok + ":").str()))
295f5fce486SRui Ueyama return true;
2963837f427SRui Ueyama if (tokens.size() >= pos + 2 && tokens[pos] == tok &&
2973837f427SRui Ueyama tokens[pos + 1] == ":") {
2983837f427SRui Ueyama pos += 2;
299f5fce486SRui Ueyama return true;
300f5fce486SRui Ueyama }
301f5fce486SRui Ueyama return false;
302f5fce486SRui Ueyama }
303f5fce486SRui Ueyama
skip()304794366a2SRui Ueyama void ScriptLexer::skip() { (void)next(); }
305794366a2SRui Ueyama
expect(StringRef expect)3063837f427SRui Ueyama void ScriptLexer::expect(StringRef expect) {
307b8a59c8aSBob Haarman if (errorCount())
308794366a2SRui Ueyama return;
3093837f427SRui Ueyama StringRef tok = next();
3103837f427SRui Ueyama if (tok != expect)
3113837f427SRui Ueyama setError(expect + " expected, but got " + tok);
312794366a2SRui Ueyama }
313794366a2SRui Ueyama
314794366a2SRui Ueyama // Returns true if S encloses T.
encloses(StringRef s,StringRef t)3153837f427SRui Ueyama static bool encloses(StringRef s, StringRef t) {
3163837f427SRui Ueyama return s.bytes_begin() <= t.bytes_begin() && t.bytes_end() <= s.bytes_end();
317794366a2SRui Ueyama }
318794366a2SRui Ueyama
getCurrentMB()319794366a2SRui Ueyama MemoryBufferRef ScriptLexer::getCurrentMB() {
320794366a2SRui Ueyama // Find input buffer containing the current token.
321ac6abc99SFangrui Song assert(!mbs.empty());
322ac6abc99SFangrui Song if (pos == 0)
323ac6abc99SFangrui Song return mbs.back();
3243837f427SRui Ueyama for (MemoryBufferRef mb : mbs)
3253837f427SRui Ueyama if (encloses(mb.getBuffer(), tokens[pos - 1]))
3263837f427SRui Ueyama return mb;
327794366a2SRui Ueyama llvm_unreachable("getCurrentMB: failed to find a token");
328794366a2SRui Ueyama }
329