1 //===- ScriptParser.cpp ---------------------------------------------------===//
2 //
3 //                             The LLVM Linker
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
// This file contains the base parser class for linker scripts and dynamic
// lists.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "ScriptParser.h"
16 #include "Error.h"
17 #include "llvm/ADT/Twine.h"
18 
19 using namespace llvm;
20 using namespace lld;
21 using namespace lld::elf;
22 
23 // Returns the line that the token Tok is in.
24 static StringRef getLine(StringRef Data, StringRef Tok) {
25   size_t Pos = Tok.data() - Data.data();
26   size_t Begin = Data.rfind('\n', Pos);
27   size_t End = Data.find('\n', Pos);
28   Begin = (Begin == StringRef::npos) ? 0 : Begin + 1;
29   if (End == StringRef::npos)
30     End = Data.size();
31   // rtrim for DOS-style newlines.
32   return Data.substr(Begin, End - Begin).rtrim();
33 }
34 
35 static std::pair<size_t, size_t> getPos(StringRef Data, StringRef Tok) {
36   StringRef Line = getLine(Data, Tok);
37   size_t LineNo =
38       StringRef(Data.data(), Tok.data() - Data.data()).count('\n') + 1;
39   return {LineNo, Tok.data() - Line.data()};
40 }
41 
42 ScriptParserBase::ScriptParserBase(MemoryBufferRef MB) { tokenize(MB); }
43 
44 // We don't want to record cascading errors. Keep only the first one.
45 void ScriptParserBase::setError(const Twine &Msg) {
46   if (Error)
47     return;
48 
49   std::pair<size_t, size_t> ErrPos;
50   MemoryBufferRef MB = currentBuffer();
51   std::string Location = MB.getBufferIdentifier();
52   if (Pos) {
53     ErrPos = getPos(MB.getBuffer(), Tokens[Pos - 1]);
54     Location += ":";
55     Location += std::to_string(ErrPos.first);
56   }
57   error(Location + ": " + Msg);
58   if (Pos) {
59     error(Location + ": " + getLine(MB.getBuffer(), Tokens[Pos - 1]));
60     error(Location + ": " + std::string(ErrPos.second, ' ') + "^");
61   }
62 
63   Error = true;
64 }
65 
66 // Split S into linker script tokens.
67 void ScriptParserBase::tokenize(MemoryBufferRef MB) {
68   std::vector<StringRef> Ret;
69   MBs.push_back(MB);
70   StringRef S = MB.getBuffer();
71   StringRef Begin = S;
72   for (;;) {
73     S = skipSpace(S);
74     if (S.empty())
75       break;
76 
77     // Quoted token. Note that double-quote characters are parts of a token
78     // because, in a glob match context, only unquoted tokens are interpreted
79     // as glob patterns. Double-quoted tokens are literal patterns in that
80     // context.
81     if (S.startswith("\"")) {
82       size_t E = S.find("\"", 1);
83       if (E == StringRef::npos) {
84         auto ErrPos = getPos(Begin, S);
85         error(MB.getBufferIdentifier() + ":" + Twine(ErrPos.first) +
86               ": unclosed quote");
87         return;
88       }
89       Ret.push_back(S.take_front(E + 1));
90       S = S.substr(E + 1);
91       continue;
92     }
93 
94     // Unquoted token. This is more relaxed than tokens in C-like language,
95     // so that you can write "file-name.cpp" as one bare token, for example.
96     size_t Pos = S.find_first_not_of(
97         "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
98         "0123456789_.$/\\~=+[]*?-:!<>^");
99 
100     // A character that cannot start a word (which is usually a
101     // punctuation) forms a single character token.
102     if (Pos == 0)
103       Pos = 1;
104     Ret.push_back(S.substr(0, Pos));
105     S = S.substr(Pos);
106   }
107   Tokens.insert(Tokens.begin() + Pos, Ret.begin(), Ret.end());
108 }
109 
110 // Skip leading whitespace characters or comments.
111 StringRef ScriptParserBase::skipSpace(StringRef S) {
112   for (;;) {
113     if (S.startswith("/*")) {
114       size_t E = S.find("*/", 2);
115       if (E == StringRef::npos) {
116         error("unclosed comment in a linker script");
117         return "";
118       }
119       S = S.substr(E + 2);
120       continue;
121     }
122     if (S.startswith("#")) {
123       size_t E = S.find('\n', 1);
124       if (E == StringRef::npos)
125         E = S.size() - 1;
126       S = S.substr(E + 1);
127       continue;
128     }
129     size_t Size = S.size();
130     S = S.ltrim();
131     if (S.size() == Size)
132       return S;
133   }
134 }
135 
136 // An erroneous token is handled as if it were the last token before EOF.
137 bool ScriptParserBase::atEOF() { return Error || Tokens.size() == Pos; }
138 
139 StringRef ScriptParserBase::next() {
140   if (Error)
141     return "";
142   if (atEOF()) {
143     setError("unexpected EOF");
144     return "";
145   }
146   return Tokens[Pos++];
147 }
148 
149 StringRef ScriptParserBase::peek() {
150   StringRef Tok = next();
151   if (Error)
152     return "";
153   --Pos;
154   return Tok;
155 }
156 
157 bool ScriptParserBase::consume(StringRef Tok) {
158   if (peek() == Tok) {
159     skip();
160     return true;
161   }
162   return false;
163 }
164 
165 void ScriptParserBase::skip() { (void)next(); }
166 
167 void ScriptParserBase::expect(StringRef Expect) {
168   if (Error)
169     return;
170   StringRef Tok = next();
171   if (Tok != Expect)
172     setError(Expect + " expected, but got " + Tok);
173 }
174 
175 // Returns true if string 'Bigger' contains string 'Shorter'.
176 static bool containsString(StringRef Bigger, StringRef Shorter) {
177   const char *BiggerEnd = Bigger.data() + Bigger.size();
178   const char *ShorterEnd = Shorter.data() + Shorter.size();
179 
180   return Bigger.data() <= Shorter.data() && BiggerEnd >= ShorterEnd;
181 }
182 
183 MemoryBufferRef ScriptParserBase::currentBuffer() {
184   // Find input buffer containing the current token.
185   assert(!MBs.empty());
186   if (Pos)
187     for (MemoryBufferRef MB : MBs)
188       if (containsString(MB.getBuffer(), Tokens[Pos - 1]))
189         return MB;
190 
191   return MBs.front();
192 }
193