1 //===- InputFiles.cpp -----------------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains functions to parse Mach-O object files. In this comment,
10 // we describe the Mach-O file structure and how we parse it.
11 //
12 // Mach-O is not very different from ELF or COFF. The notion of symbols,
13 // sections and relocations exists in Mach-O as it does in ELF and COFF.
14 //
15 // Perhaps the notion that is new to those who know ELF/COFF is "subsections".
16 // In ELF/COFF, sections are an atomic unit of data copied from input files to
17 // output files. When we merge or garbage-collect sections, we treat each
18 // section as an atomic unit. In Mach-O, that's not the case. Sections can
19 // consist of multiple subsections, and subsections are a unit of merging and
20 // garbage-collecting. Therefore, Mach-O's subsections are more similar to
21 // ELF/COFF's sections than Mach-O's sections are.
22 //
// A section can have multiple symbols. A symbol that does not have the
// N_ALT_ENTRY attribute indicates the beginning of a subsection. Therefore, by
// definition, a symbol is always present at the beginning of each subsection. A
// symbol with the N_ALT_ENTRY attribute does not start a new subsection and can
// point to the middle of a subsection.
28 //
29 // The notion of subsections also affects how relocations are represented in
30 // Mach-O. All references within a section need to be explicitly represented as
31 // relocations if they refer to different subsections, because we obviously need
32 // to fix up addresses if subsections are laid out in an output file differently
33 // than they were in object files. To represent that, Mach-O relocations can
34 // refer to an unnamed location via its address. Scattered relocations (those
35 // with the R_SCATTERED bit set) always refer to unnamed locations.
36 // Non-scattered relocations refer to an unnamed location if r_extern is not set
37 // and r_symbolnum is zero.
38 //
// Apart from the above differences, your knowledge of ELF and COFF carries
// over to Mach-O.
41 //
42 //===----------------------------------------------------------------------===//
43 
44 #include "InputFiles.h"
45 #include "Config.h"
46 #include "DriverUtils.h"
47 #include "ExportTrie.h"
48 #include "InputSection.h"
49 #include "MachOStructs.h"
50 #include "ObjC.h"
51 #include "OutputSection.h"
52 #include "OutputSegment.h"
53 #include "SymbolTable.h"
54 #include "Symbols.h"
55 #include "Target.h"
56 
57 #include "lld/Common/ErrorHandler.h"
58 #include "lld/Common/Memory.h"
59 #include "llvm/ADT/iterator.h"
60 #include "llvm/BinaryFormat/MachO.h"
61 #include "llvm/Support/Endian.h"
62 #include "llvm/Support/MemoryBuffer.h"
63 #include "llvm/Support/Path.h"
64 
65 using namespace llvm;
66 using namespace llvm::MachO;
67 using namespace llvm::support::endian;
68 using namespace llvm::sys;
69 using namespace lld;
70 using namespace lld::macho;
71 
// Definition of the global list of InputFile pointers in namespace lld::macho.
// The code shown in this file never appends to it; presumably the driver
// populates it as inputs are loaded -- TODO confirm against the callers.
std::vector<InputFile *> macho::inputFiles;
73 
74 // Open a given file path and return it as a memory-mapped file.
75 Optional<MemoryBufferRef> macho::readFile(StringRef path) {
76   // Open a file.
77   auto mbOrErr = MemoryBuffer::getFile(path);
78   if (auto ec = mbOrErr.getError()) {
79     error("cannot open " + path + ": " + ec.message());
80     return None;
81   }
82 
83   std::unique_ptr<MemoryBuffer> &mb = *mbOrErr;
84   MemoryBufferRef mbref = mb->getMemBufferRef();
85   make<std::unique_ptr<MemoryBuffer>>(std::move(mb)); // take mb ownership
86 
87   // If this is a regular non-fat file, return it.
88   const char *buf = mbref.getBufferStart();
89   auto *hdr = reinterpret_cast<const MachO::fat_header *>(buf);
90   if (read32be(&hdr->magic) != MachO::FAT_MAGIC)
91     return mbref;
92 
93   // Object files and archive files may be fat files, which contains
94   // multiple real files for different CPU ISAs. Here, we search for a
95   // file that matches with the current link target and returns it as
96   // a MemoryBufferRef.
97   auto *arch = reinterpret_cast<const MachO::fat_arch *>(buf + sizeof(*hdr));
98 
99   for (uint32_t i = 0, n = read32be(&hdr->nfat_arch); i < n; ++i) {
100     if (reinterpret_cast<const char *>(arch + i + 1) >
101         buf + mbref.getBufferSize()) {
102       error(path + ": fat_arch struct extends beyond end of file");
103       return None;
104     }
105 
106     if (read32be(&arch[i].cputype) != target->cpuType ||
107         read32be(&arch[i].cpusubtype) != target->cpuSubtype)
108       continue;
109 
110     uint32_t offset = read32be(&arch[i].offset);
111     uint32_t size = read32be(&arch[i].size);
112     if (offset + size > mbref.getBufferSize())
113       error(path + ": slice extends beyond end of file");
114     return MemoryBufferRef(StringRef(buf + offset, size), path.copy(bAlloc));
115   }
116 
117   error("unable to find matching architecture in " + path);
118   return None;
119 }
120 
121 const load_command *macho::findCommand(const mach_header_64 *hdr,
122                                        uint32_t type) {
123   const uint8_t *p =
124       reinterpret_cast<const uint8_t *>(hdr) + sizeof(mach_header_64);
125 
126   for (uint32_t i = 0, n = hdr->ncmds; i < n; ++i) {
127     auto *cmd = reinterpret_cast<const load_command *>(p);
128     if (cmd->cmd == type)
129       return cmd;
130     p += cmd->cmdsize;
131   }
132   return nullptr;
133 }
134 
135 void InputFile::parseSections(ArrayRef<section_64> sections) {
136   subsections.reserve(sections.size());
137   auto *buf = reinterpret_cast<const uint8_t *>(mb.getBufferStart());
138 
139   for (const section_64 &sec : sections) {
140     InputSection *isec = make<InputSection>();
141     isec->file = this;
142     isec->name =
143         StringRef(sec.sectname, strnlen(sec.sectname, sizeof(sec.sectname)));
144     isec->segname =
145         StringRef(sec.segname, strnlen(sec.segname, sizeof(sec.segname)));
146     isec->data = {isZeroFill(sec.flags) ? nullptr : buf + sec.offset,
147                   static_cast<size_t>(sec.size)};
148     if (sec.align >= 32)
149       error("alignment " + std::to_string(sec.align) + " of section " +
150             isec->name + " is too large");
151     else
152       isec->align = 1 << sec.align;
153     isec->flags = sec.flags;
154     subsections.push_back({{0, isec}});
155   }
156 }
157 
158 // Find the subsection corresponding to the greatest section offset that is <=
159 // that of the given offset.
160 //
161 // offset: an offset relative to the start of the original InputSection (before
162 // any subsection splitting has occurred). It will be updated to represent the
163 // same location as an offset relative to the start of the containing
164 // subsection.
165 static InputSection *findContainingSubsection(SubsectionMap &map,
166                                               uint32_t *offset) {
167   auto it = std::prev(map.upper_bound(*offset));
168   *offset -= it->first;
169   return it->second;
170 }
171 
172 void InputFile::parseRelocations(const section_64 &sec,
173                                  SubsectionMap &subsecMap) {
174   auto *buf = reinterpret_cast<const uint8_t *>(mb.getBufferStart());
175   ArrayRef<any_relocation_info> anyRelInfos(
176       reinterpret_cast<const any_relocation_info *>(buf + sec.reloff),
177       sec.nreloc);
178 
179   for (const any_relocation_info &anyRelInfo : anyRelInfos) {
180     if (anyRelInfo.r_word0 & R_SCATTERED)
181       fatal("TODO: Scattered relocations not supported");
182 
183     auto relInfo = reinterpret_cast<const relocation_info &>(anyRelInfo);
184 
185     Reloc r;
186     r.type = relInfo.r_type;
187     r.pcrel = relInfo.r_pcrel;
188     r.length = relInfo.r_length;
189     uint64_t rawAddend = target->getImplicitAddend(mb, sec, relInfo);
190 
191     if (relInfo.r_extern) {
192       r.referent = symbols[relInfo.r_symbolnum];
193       r.addend = rawAddend;
194     } else {
195       if (relInfo.r_symbolnum == 0 || relInfo.r_symbolnum > subsections.size())
196         fatal("invalid section index in relocation for offset " +
197               std::to_string(r.offset) + " in section " + sec.sectname +
198               " of " + getName());
199 
200       SubsectionMap &referentSubsecMap = subsections[relInfo.r_symbolnum - 1];
201       const section_64 &referentSec = sectionHeaders[relInfo.r_symbolnum - 1];
202       uint32_t referentOffset;
203       if (relInfo.r_pcrel) {
204         // The implicit addend for pcrel section relocations is the pcrel offset
205         // in terms of the addresses in the input file. Here we adjust it so
206         // that it describes the offset from the start of the referent section.
207         // TODO: The offset of 4 is probably not right for ARM64, nor for
208         //       relocations with r_length != 2.
209         referentOffset =
210             sec.addr + relInfo.r_address + 4 + rawAddend - referentSec.addr;
211       } else {
212         // The addend for a non-pcrel relocation is its absolute address.
213         referentOffset = rawAddend - referentSec.addr;
214       }
215       r.referent = findContainingSubsection(referentSubsecMap, &referentOffset);
216       r.addend = referentOffset;
217     }
218 
219     r.offset = relInfo.r_address;
220     InputSection *subsec = findContainingSubsection(subsecMap, &r.offset);
221     subsec->relocs.push_back(r);
222   }
223 }
224 
225 void InputFile::parseSymbols(ArrayRef<structs::nlist_64> nList,
226                              const char *strtab, bool subsectionsViaSymbols) {
227   // resize(), not reserve(), because we are going to create N_ALT_ENTRY symbols
228   // out-of-sequence.
229   symbols.resize(nList.size());
230   std::vector<size_t> altEntrySymIdxs;
231 
232   auto createDefined = [&](const structs::nlist_64 &sym, InputSection *isec,
233                            uint32_t value) -> Symbol * {
234     StringRef name = strtab + sym.n_strx;
235     if (sym.n_type & N_EXT)
236       // Global defined symbol
237       return symtab->addDefined(name, isec, value, sym.n_desc & N_WEAK_DEF);
238     // Local defined symbol
239     return make<Defined>(name, isec, value, sym.n_desc & N_WEAK_DEF,
240                          /*isExternal=*/false);
241   };
242 
243   for (size_t i = 0, n = nList.size(); i < n; ++i) {
244     const structs::nlist_64 &sym = nList[i];
245 
246     if ((sym.n_type & N_TYPE) == N_UNDF) {
247       StringRef name = strtab + sym.n_strx;
248       symbols[i] = sym.n_value == 0
249                        ? symtab->addUndefined(name)
250                        : symtab->addCommon(name, this, sym.n_value,
251                                            1 << GET_COMM_ALIGN(sym.n_desc));
252       continue;
253     }
254 
255     const section_64 &sec = sectionHeaders[sym.n_sect - 1];
256     SubsectionMap &subsecMap = subsections[sym.n_sect - 1];
257     uint64_t offset = sym.n_value - sec.addr;
258 
259     // If the input file does not use subsections-via-symbols, all symbols can
260     // use the same subsection. Otherwise, we must split the sections along
261     // symbol boundaries.
262     if (!subsectionsViaSymbols) {
263       symbols[i] = createDefined(sym, subsecMap[0], offset);
264       continue;
265     }
266 
267     // nList entries aren't necessarily arranged in address order. Therefore,
268     // we can't create alt-entry symbols at this point because a later symbol
269     // may split its section, which may affect which subsection the alt-entry
270     // symbol is assigned to. So we need to handle them in a second pass below.
271     if (sym.n_desc & N_ALT_ENTRY) {
272       altEntrySymIdxs.push_back(i);
273       continue;
274     }
275 
276     // Find the subsection corresponding to the greatest section offset that is
277     // <= that of the current symbol. The subsection that we find either needs
278     // to be used directly or split in two.
279     uint32_t firstSize = offset;
280     InputSection *firstIsec = findContainingSubsection(subsecMap, &firstSize);
281 
282     if (firstSize == 0) {
283       // Alias of an existing symbol, or the first symbol in the section. These
284       // are handled by reusing the existing section.
285       symbols[i] = createDefined(sym, firstIsec, 0);
286       continue;
287     }
288 
289     // We saw a symbol definition at a new offset. Split the section into two
290     // subsections. The new symbol uses the second subsection.
291     auto *secondIsec = make<InputSection>(*firstIsec);
292     secondIsec->data = firstIsec->data.slice(firstSize);
293     firstIsec->data = firstIsec->data.slice(0, firstSize);
294     // TODO: ld64 appears to preserve the original alignment as well as each
295     // subsection's offset from the last aligned address. We should consider
296     // emulating that behavior.
297     secondIsec->align = MinAlign(firstIsec->align, offset);
298 
299     subsecMap[offset] = secondIsec;
300     // By construction, the symbol will be at offset zero in the new section.
301     symbols[i] = createDefined(sym, secondIsec, 0);
302   }
303 
304   for (size_t idx : altEntrySymIdxs) {
305     const structs::nlist_64 &sym = nList[idx];
306     SubsectionMap &subsecMap = subsections[sym.n_sect - 1];
307     uint32_t off = sym.n_value - sectionHeaders[sym.n_sect - 1].addr;
308     InputSection *subsec = findContainingSubsection(subsecMap, &off);
309     symbols[idx] = createDefined(sym, subsec, off);
310   }
311 }
312 
313 OpaqueFile::OpaqueFile(MemoryBufferRef mb, StringRef segName,
314                        StringRef sectName)
315     : InputFile(OpaqueKind, mb) {
316   InputSection *isec = make<InputSection>();
317   isec->file = this;
318   isec->name = sectName.take_front(16);
319   isec->segname = segName.take_front(16);
320   const auto *buf = reinterpret_cast<const uint8_t *>(mb.getBufferStart());
321   isec->data = {buf, mb.getBufferSize()};
322   subsections.push_back({{0, isec}});
323 }
324 
325 ObjFile::ObjFile(MemoryBufferRef mb) : InputFile(ObjKind, mb) {
326   auto *buf = reinterpret_cast<const uint8_t *>(mb.getBufferStart());
327   auto *hdr = reinterpret_cast<const mach_header_64 *>(mb.getBufferStart());
328 
329   if (const load_command *cmd = findCommand(hdr, LC_SEGMENT_64)) {
330     auto *c = reinterpret_cast<const segment_command_64 *>(cmd);
331     sectionHeaders = ArrayRef<section_64>{
332         reinterpret_cast<const section_64 *>(c + 1), c->nsects};
333     parseSections(sectionHeaders);
334   }
335 
336   // TODO: Error on missing LC_SYMTAB?
337   if (const load_command *cmd = findCommand(hdr, LC_SYMTAB)) {
338     auto *c = reinterpret_cast<const symtab_command *>(cmd);
339     ArrayRef<structs::nlist_64> nList(
340         reinterpret_cast<const structs::nlist_64 *>(buf + c->symoff), c->nsyms);
341     const char *strtab = reinterpret_cast<const char *>(buf) + c->stroff;
342     bool subsectionsViaSymbols = hdr->flags & MH_SUBSECTIONS_VIA_SYMBOLS;
343     parseSymbols(nList, strtab, subsectionsViaSymbols);
344   }
345 
346   // The relocations may refer to the symbols, so we parse them after we have
347   // parsed all the symbols.
348   for (size_t i = 0, n = subsections.size(); i < n; ++i)
349     parseRelocations(sectionHeaders[i], subsections[i]);
350 }
351 
352 // The path can point to either a dylib or a .tbd file.
353 static Optional<DylibFile *> loadDylib(StringRef path, DylibFile *umbrella) {
354   Optional<MemoryBufferRef> mbref = readFile(path);
355   if (!mbref) {
356     error("could not read dylib file at " + path);
357     return {};
358   }
359 
360   file_magic magic = identify_magic(mbref->getBuffer());
361   if (magic == file_magic::tapi_file)
362     return makeDylibFromTAPI(*mbref, umbrella);
363   assert(magic == file_magic::macho_dynamically_linked_shared_lib);
364   return make<DylibFile>(*mbref, umbrella);
365 }
366 
// TBD files are parsed into a series of TAPI documents (InterfaceFiles), with
// the first document storing child pointers to the rest of them. When we are
// processing a given TBD file, we store that top-level document here. When
// processing re-exports, we search its children for potentially matching
// documents in the same TBD file. Note that the children themselves don't
// point to further documents, i.e. this is a two-level tree.
//
// ld64 allows a TAPI re-export to reference documents nested within other TBD
// files, but that seems like a strange design, so this is an intentional
// deviation.
//
// The pointer is set by the DylibFile(const InterfaceFile &, ...) constructor
// while it loads its re-exports (only if it was null beforehand) and is reset
// to nullptr by that same constructor afterwards; loadReexport() reads it.
const InterfaceFile *currentTopLevelTapi = nullptr;
378 
379 // Re-exports can either refer to on-disk files, or to documents within .tbd
380 // files.
381 static Optional<DylibFile *> loadReexport(StringRef path, DylibFile *umbrella) {
382   if (path::is_absolute(path, path::Style::posix))
383     for (StringRef root : config->systemLibraryRoots)
384       if (Optional<std::string> dylibPath =
385               resolveDylibPath((root + path).str()))
386         return loadDylib(*dylibPath, umbrella);
387 
388   // TODO: Expand @loader_path, @executable_path etc
389 
390   if (currentTopLevelTapi) {
391     for (InterfaceFile &child :
392          make_pointee_range(currentTopLevelTapi->documents())) {
393       if (path == child.getInstallName())
394         return make<DylibFile>(child, umbrella);
395       assert(child.documents().empty());
396     }
397   }
398 
399   if (Optional<std::string> dylibPath = resolveDylibPath(path))
400     return loadDylib(*dylibPath, umbrella);
401 
402   error("unable to locate re-export with install name " + path);
403   return {};
404 }
405 
406 DylibFile::DylibFile(MemoryBufferRef mb, DylibFile *umbrella)
407     : InputFile(DylibKind, mb) {
408   if (umbrella == nullptr)
409     umbrella = this;
410 
411   auto *buf = reinterpret_cast<const uint8_t *>(mb.getBufferStart());
412   auto *hdr = reinterpret_cast<const mach_header_64 *>(mb.getBufferStart());
413 
414   // Initialize dylibName.
415   if (const load_command *cmd = findCommand(hdr, LC_ID_DYLIB)) {
416     auto *c = reinterpret_cast<const dylib_command *>(cmd);
417     dylibName = reinterpret_cast<const char *>(cmd) + read32le(&c->dylib.name);
418   } else {
419     error("dylib " + getName() + " missing LC_ID_DYLIB load command");
420     return;
421   }
422 
423   // Initialize symbols.
424   // TODO: if a re-exported dylib is public (lives in /usr/lib or
425   // /System/Library/Frameworks), we should bind to its symbols directly
426   // instead of the re-exporting umbrella library.
427   if (const load_command *cmd = findCommand(hdr, LC_DYLD_INFO_ONLY)) {
428     auto *c = reinterpret_cast<const dyld_info_command *>(cmd);
429     parseTrie(buf + c->export_off, c->export_size,
430               [&](const Twine &name, uint64_t flags) {
431                 bool isWeakDef = flags & EXPORT_SYMBOL_FLAGS_WEAK_DEFINITION;
432                 bool isTlv = flags & EXPORT_SYMBOL_FLAGS_KIND_THREAD_LOCAL;
433                 symbols.push_back(symtab->addDylib(saver.save(name), umbrella,
434                                                    isWeakDef, isTlv));
435               });
436   } else {
437     error("LC_DYLD_INFO_ONLY not found in " + getName());
438     return;
439   }
440 
441   if (hdr->flags & MH_NO_REEXPORTED_DYLIBS)
442     return;
443 
444   const uint8_t *p =
445       reinterpret_cast<const uint8_t *>(hdr) + sizeof(mach_header_64);
446   for (uint32_t i = 0, n = hdr->ncmds; i < n; ++i) {
447     auto *cmd = reinterpret_cast<const load_command *>(p);
448     p += cmd->cmdsize;
449     if (cmd->cmd != LC_REEXPORT_DYLIB)
450       continue;
451 
452     auto *c = reinterpret_cast<const dylib_command *>(cmd);
453     StringRef reexportPath =
454         reinterpret_cast<const char *>(c) + read32le(&c->dylib.name);
455     if (Optional<DylibFile *> reexport = loadReexport(reexportPath, umbrella))
456       reexported.push_back(*reexport);
457   }
458 }
459 
460 DylibFile::DylibFile(const InterfaceFile &interface, DylibFile *umbrella)
461     : InputFile(DylibKind, interface) {
462   if (umbrella == nullptr)
463     umbrella = this;
464 
465   dylibName = saver.save(interface.getInstallName());
466   auto addSymbol = [&](const Twine &name) -> void {
467     symbols.push_back(symtab->addDylib(saver.save(name), umbrella,
468                                        /*isWeakDef=*/false,
469                                        /*isTlv=*/false));
470   };
471   // TODO(compnerd) filter out symbols based on the target platform
472   // TODO: handle weak defs, thread locals
473   for (const auto symbol : interface.symbols()) {
474     if (!symbol->getArchitectures().has(config->arch))
475       continue;
476 
477     switch (symbol->getKind()) {
478     case SymbolKind::GlobalSymbol:
479       addSymbol(symbol->getName());
480       break;
481     case SymbolKind::ObjectiveCClass:
482       // XXX ld64 only creates these symbols when -ObjC is passed in. We may
483       // want to emulate that.
484       addSymbol(objc::klass + symbol->getName());
485       addSymbol(objc::metaclass + symbol->getName());
486       break;
487     case SymbolKind::ObjectiveCClassEHType:
488       addSymbol(objc::ehtype + symbol->getName());
489       break;
490     case SymbolKind::ObjectiveCInstanceVariable:
491       addSymbol(objc::ivar + symbol->getName());
492       break;
493     }
494   }
495 
496   bool isTopLevelTapi = false;
497   if (currentTopLevelTapi == nullptr) {
498     currentTopLevelTapi = &interface;
499     isTopLevelTapi = true;
500   }
501 
502   for (InterfaceFileRef intfRef : interface.reexportedLibraries())
503     if (Optional<DylibFile *> reexport =
504             loadReexport(intfRef.getInstallName(), umbrella))
505       reexported.push_back(*reexport);
506 
507   if (isTopLevelTapi)
508     currentTopLevelTapi = nullptr;
509 }
510 
511 ArchiveFile::ArchiveFile(std::unique_ptr<llvm::object::Archive> &&f)
512     : InputFile(ArchiveKind, f->getMemoryBufferRef()), file(std::move(f)) {
513   for (const object::Archive::Symbol &sym : file->symbols())
514     symtab->addLazy(sym.getName(), this, sym);
515 }
516 
517 void ArchiveFile::fetch(const object::Archive::Symbol &sym) {
518   object::Archive::Child c =
519       CHECK(sym.getMember(), toString(this) +
520                                  ": could not get the member for symbol " +
521                                  sym.getName());
522 
523   if (!seen.insert(c.getChildOffset()).second)
524     return;
525 
526   MemoryBufferRef mb =
527       CHECK(c.getMemoryBufferRef(),
528             toString(this) +
529                 ": could not get the buffer for the member defining symbol " +
530                 sym.getName());
531   auto file = make<ObjFile>(mb);
532   symbols.insert(symbols.end(), file->symbols.begin(), file->symbols.end());
533   subsections.insert(subsections.end(), file->subsections.begin(),
534                      file->subsections.end());
535 }
536 
537 // Returns "<internal>" or "baz.o".
538 std::string lld::toString(const InputFile *file) {
539   return file ? std::string(file->getName()) : "<internal>";
540 }
541