1 //===- InputFiles.cpp -----------------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains functions to parse Mach-O object files. In this comment,
10 // we describe the Mach-O file structure and how we parse it.
11 //
12 // Mach-O is not very different from ELF or COFF. The notion of symbols,
13 // sections and relocations exists in Mach-O as it does in ELF and COFF.
14 //
15 // Perhaps the notion that is new to those who know ELF/COFF is "subsections".
16 // In ELF/COFF, sections are an atomic unit of data copied from input files to
17 // output files. When we merge or garbage-collect sections, we treat each
18 // section as an atomic unit. In Mach-O, that's not the case. Sections can
19 // consist of multiple subsections, and subsections are a unit of merging and
20 // garbage-collecting. Therefore, Mach-O's subsections are more similar to
21 // ELF/COFF's sections than Mach-O's sections are.
22 //
23 // A section can have multiple symbols. A symbol that does not have the
24 // N_ALT_ENTRY attribute indicates a beginning of a subsection. Therefore, by
25 // definition, a symbol is always present at the beginning of each subsection. A
26 // symbol with N_ALT_ENTRY attribute does not start a new subsection and can
27 // point to a middle of a subsection.
28 //
29 // The notion of subsections also affects how relocations are represented in
30 // Mach-O. All references within a section need to be explicitly represented as
31 // relocations if they refer to different subsections, because we obviously need
32 // to fix up addresses if subsections are laid out in an output file differently
33 // than they were in object files. To represent that, Mach-O relocations can
34 // refer to an unnamed location via its address. Scattered relocations (those
35 // with the R_SCATTERED bit set) always refer to unnamed locations.
36 // Non-scattered relocations refer to an unnamed location if r_extern is not set
37 // and r_symbolnum is zero.
38 //
// Apart from the above differences, I think you can apply your knowledge of
// ELF and COFF to Mach-O.
41 //
42 //===----------------------------------------------------------------------===//
43 
44 #include "InputFiles.h"
45 #include "Config.h"
46 #include "DriverUtils.h"
47 #include "ExportTrie.h"
48 #include "InputSection.h"
49 #include "MachOStructs.h"
50 #include "ObjC.h"
51 #include "OutputSection.h"
52 #include "OutputSegment.h"
53 #include "SymbolTable.h"
54 #include "Symbols.h"
55 #include "Target.h"
56 
57 #include "lld/Common/ErrorHandler.h"
58 #include "lld/Common/Memory.h"
59 #include "llvm/ADT/iterator.h"
60 #include "llvm/BinaryFormat/MachO.h"
61 #include "llvm/Support/Endian.h"
62 #include "llvm/Support/MemoryBuffer.h"
63 #include "llvm/Support/Path.h"
64 
65 using namespace llvm;
66 using namespace llvm::MachO;
67 using namespace llvm::support::endian;
68 using namespace llvm::sys;
69 using namespace lld;
70 using namespace lld::macho;
71 
// Definition of the global list of input files.
// NOTE(review): nothing in this file appends to it; presumably the driver
// populates it -- confirm against the callers.
std::vector<InputFile *> macho::inputFiles;
73 
74 // Open a given file path and return it as a memory-mapped file.
75 Optional<MemoryBufferRef> macho::readFile(StringRef path) {
76   // Open a file.
77   auto mbOrErr = MemoryBuffer::getFile(path);
78   if (auto ec = mbOrErr.getError()) {
79     error("cannot open " + path + ": " + ec.message());
80     return None;
81   }
82 
83   std::unique_ptr<MemoryBuffer> &mb = *mbOrErr;
84   MemoryBufferRef mbref = mb->getMemBufferRef();
85   make<std::unique_ptr<MemoryBuffer>>(std::move(mb)); // take mb ownership
86 
87   // If this is a regular non-fat file, return it.
88   const char *buf = mbref.getBufferStart();
89   auto *hdr = reinterpret_cast<const MachO::fat_header *>(buf);
90   if (read32be(&hdr->magic) != MachO::FAT_MAGIC)
91     return mbref;
92 
93   // Object files and archive files may be fat files, which contains
94   // multiple real files for different CPU ISAs. Here, we search for a
95   // file that matches with the current link target and returns it as
96   // a MemoryBufferRef.
97   auto *arch = reinterpret_cast<const MachO::fat_arch *>(buf + sizeof(*hdr));
98 
99   for (uint32_t i = 0, n = read32be(&hdr->nfat_arch); i < n; ++i) {
100     if (reinterpret_cast<const char *>(arch + i + 1) >
101         buf + mbref.getBufferSize()) {
102       error(path + ": fat_arch struct extends beyond end of file");
103       return None;
104     }
105 
106     if (read32be(&arch[i].cputype) != target->cpuType ||
107         read32be(&arch[i].cpusubtype) != target->cpuSubtype)
108       continue;
109 
110     uint32_t offset = read32be(&arch[i].offset);
111     uint32_t size = read32be(&arch[i].size);
112     if (offset + size > mbref.getBufferSize())
113       error(path + ": slice extends beyond end of file");
114     return MemoryBufferRef(StringRef(buf + offset, size), path.copy(bAlloc));
115   }
116 
117   error("unable to find matching architecture in " + path);
118   return None;
119 }
120 
121 const load_command *macho::findCommand(const mach_header_64 *hdr,
122                                        uint32_t type) {
123   const uint8_t *p =
124       reinterpret_cast<const uint8_t *>(hdr) + sizeof(mach_header_64);
125 
126   for (uint32_t i = 0, n = hdr->ncmds; i < n; ++i) {
127     auto *cmd = reinterpret_cast<const load_command *>(p);
128     if (cmd->cmd == type)
129       return cmd;
130     p += cmd->cmdsize;
131   }
132   return nullptr;
133 }
134 
135 void InputFile::parseSections(ArrayRef<section_64> sections) {
136   subsections.reserve(sections.size());
137   auto *buf = reinterpret_cast<const uint8_t *>(mb.getBufferStart());
138 
139   for (const section_64 &sec : sections) {
140     InputSection *isec = make<InputSection>();
141     isec->file = this;
142     isec->name =
143         StringRef(sec.sectname, strnlen(sec.sectname, sizeof(sec.sectname)));
144     isec->segname =
145         StringRef(sec.segname, strnlen(sec.segname, sizeof(sec.segname)));
146     isec->data = {isZeroFill(sec.flags) ? nullptr : buf + sec.offset,
147                   static_cast<size_t>(sec.size)};
148     if (sec.align >= 32)
149       error("alignment " + std::to_string(sec.align) + " of section " +
150             isec->name + " is too large");
151     else
152       isec->align = 1 << sec.align;
153     isec->flags = sec.flags;
154     subsections.push_back({{0, isec}});
155   }
156 }
157 
158 // Find the subsection corresponding to the greatest section offset that is <=
159 // that of the given offset.
160 //
161 // offset: an offset relative to the start of the original InputSection (before
162 // any subsection splitting has occurred). It will be updated to represent the
163 // same location as an offset relative to the start of the containing
164 // subsection.
165 static InputSection *findContainingSubsection(SubsectionMap &map,
166                                               uint32_t *offset) {
167   auto it = std::prev(map.upper_bound(*offset));
168   *offset -= it->first;
169   return it->second;
170 }
171 
172 void InputFile::parseRelocations(const section_64 &sec,
173                                  SubsectionMap &subsecMap) {
174   auto *buf = reinterpret_cast<const uint8_t *>(mb.getBufferStart());
175   ArrayRef<any_relocation_info> relInfos(
176       reinterpret_cast<const any_relocation_info *>(buf + sec.reloff),
177       sec.nreloc);
178 
179   for (const any_relocation_info &anyRel : relInfos) {
180     if (anyRel.r_word0 & R_SCATTERED)
181       fatal("TODO: Scattered relocations not supported");
182 
183     auto rel = reinterpret_cast<const relocation_info &>(anyRel);
184 
185     Reloc r;
186     r.type = rel.r_type;
187     r.pcrel = rel.r_pcrel;
188     r.length = rel.r_length;
189     uint64_t rawAddend = target->getImplicitAddend(mb, sec, rel);
190 
191     if (rel.r_extern) {
192       r.target = symbols[rel.r_symbolnum];
193       r.addend = rawAddend;
194     } else {
195       if (rel.r_symbolnum == 0 || rel.r_symbolnum > subsections.size())
196         fatal("invalid section index in relocation for offset " +
197               std::to_string(r.offset) + " in section " + sec.sectname +
198               " of " + getName());
199 
200       SubsectionMap &targetSubsecMap = subsections[rel.r_symbolnum - 1];
201       const section_64 &targetSec = sectionHeaders[rel.r_symbolnum - 1];
202       uint32_t targetOffset;
203       if (rel.r_pcrel) {
204         // The implicit addend for pcrel section relocations is the pcrel offset
205         // in terms of the addresses in the input file. Here we adjust it so
206         // that it describes the offset from the start of the target section.
207         // TODO: The offset of 4 is probably not right for ARM64, nor for
208         //       relocations with r_length != 2.
209         targetOffset =
210             sec.addr + rel.r_address + 4 + rawAddend - targetSec.addr;
211       } else {
212         // The addend for a non-pcrel relocation is its absolute address.
213         targetOffset = rawAddend - targetSec.addr;
214       }
215       r.target = findContainingSubsection(targetSubsecMap, &targetOffset);
216       r.addend = targetOffset;
217     }
218 
219     r.offset = rel.r_address;
220     InputSection *subsec = findContainingSubsection(subsecMap, &r.offset);
221     subsec->relocs.push_back(r);
222   }
223 }
224 
225 void InputFile::parseSymbols(ArrayRef<structs::nlist_64> nList,
226                              const char *strtab, bool subsectionsViaSymbols) {
227   // resize(), not reserve(), because we are going to create N_ALT_ENTRY symbols
228   // out-of-sequence.
229   symbols.resize(nList.size());
230   std::vector<size_t> altEntrySymIdxs;
231 
232   auto createDefined = [&](const structs::nlist_64 &sym, InputSection *isec,
233                            uint32_t value) -> Symbol * {
234     StringRef name = strtab + sym.n_strx;
235     if (sym.n_type & N_EXT)
236       // Global defined symbol
237       return symtab->addDefined(name, isec, value, sym.n_desc & N_WEAK_DEF);
238     // Local defined symbol
239     return make<Defined>(name, isec, value, sym.n_desc & N_WEAK_DEF,
240                          /*isExternal=*/false);
241   };
242 
243   for (size_t i = 0, n = nList.size(); i < n; ++i) {
244     const structs::nlist_64 &sym = nList[i];
245 
246     // Undefined symbol
247     if (!sym.n_sect) {
248       StringRef name = strtab + sym.n_strx;
249       symbols[i] = symtab->addUndefined(name);
250       continue;
251     }
252 
253     const section_64 &sec = sectionHeaders[sym.n_sect - 1];
254     SubsectionMap &subsecMap = subsections[sym.n_sect - 1];
255     uint64_t offset = sym.n_value - sec.addr;
256 
257     // If the input file does not use subsections-via-symbols, all symbols can
258     // use the same subsection. Otherwise, we must split the sections along
259     // symbol boundaries.
260     if (!subsectionsViaSymbols) {
261       symbols[i] = createDefined(sym, subsecMap[0], offset);
262       continue;
263     }
264 
265     // nList entries aren't necessarily arranged in address order. Therefore,
266     // we can't create alt-entry symbols at this point because a later symbol
267     // may split its section, which may affect which subsection the alt-entry
268     // symbol is assigned to. So we need to handle them in a second pass below.
269     if (sym.n_desc & N_ALT_ENTRY) {
270       altEntrySymIdxs.push_back(i);
271       continue;
272     }
273 
274     // Find the subsection corresponding to the greatest section offset that is
275     // <= that of the current symbol. The subsection that we find either needs
276     // to be used directly or split in two.
277     uint32_t firstSize = offset;
278     InputSection *firstIsec = findContainingSubsection(subsecMap, &firstSize);
279 
280     if (firstSize == 0) {
281       // Alias of an existing symbol, or the first symbol in the section. These
282       // are handled by reusing the existing section.
283       symbols[i] = createDefined(sym, firstIsec, 0);
284       continue;
285     }
286 
287     // We saw a symbol definition at a new offset. Split the section into two
288     // subsections. The new symbol uses the second subsection.
289     auto *secondIsec = make<InputSection>(*firstIsec);
290     secondIsec->data = firstIsec->data.slice(firstSize);
291     firstIsec->data = firstIsec->data.slice(0, firstSize);
292     // TODO: ld64 appears to preserve the original alignment as well as each
293     // subsection's offset from the last aligned address. We should consider
294     // emulating that behavior.
295     secondIsec->align = MinAlign(firstIsec->align, offset);
296 
297     subsecMap[offset] = secondIsec;
298     // By construction, the symbol will be at offset zero in the new section.
299     symbols[i] = createDefined(sym, secondIsec, 0);
300   }
301 
302   for (size_t idx : altEntrySymIdxs) {
303     const structs::nlist_64 &sym = nList[idx];
304     SubsectionMap &subsecMap = subsections[sym.n_sect - 1];
305     uint32_t off = sym.n_value - sectionHeaders[sym.n_sect - 1].addr;
306     InputSection *subsec = findContainingSubsection(subsecMap, &off);
307     symbols[idx] = createDefined(sym, subsec, off);
308   }
309 }
310 
311 OpaqueFile::OpaqueFile(MemoryBufferRef mb, StringRef segName,
312                        StringRef sectName)
313     : InputFile(OpaqueKind, mb) {
314   InputSection *isec = make<InputSection>();
315   isec->file = this;
316   isec->name = sectName.take_front(16);
317   isec->segname = segName.take_front(16);
318   const auto *buf = reinterpret_cast<const uint8_t *>(mb.getBufferStart());
319   isec->data = {buf, mb.getBufferSize()};
320   subsections.push_back({{0, isec}});
321 }
322 
323 ObjFile::ObjFile(MemoryBufferRef mb) : InputFile(ObjKind, mb) {
324   auto *buf = reinterpret_cast<const uint8_t *>(mb.getBufferStart());
325   auto *hdr = reinterpret_cast<const mach_header_64 *>(mb.getBufferStart());
326 
327   if (const load_command *cmd = findCommand(hdr, LC_SEGMENT_64)) {
328     auto *c = reinterpret_cast<const segment_command_64 *>(cmd);
329     sectionHeaders = ArrayRef<section_64>{
330         reinterpret_cast<const section_64 *>(c + 1), c->nsects};
331     parseSections(sectionHeaders);
332   }
333 
334   // TODO: Error on missing LC_SYMTAB?
335   if (const load_command *cmd = findCommand(hdr, LC_SYMTAB)) {
336     auto *c = reinterpret_cast<const symtab_command *>(cmd);
337     ArrayRef<structs::nlist_64> nList(
338         reinterpret_cast<const structs::nlist_64 *>(buf + c->symoff), c->nsyms);
339     const char *strtab = reinterpret_cast<const char *>(buf) + c->stroff;
340     bool subsectionsViaSymbols = hdr->flags & MH_SUBSECTIONS_VIA_SYMBOLS;
341     parseSymbols(nList, strtab, subsectionsViaSymbols);
342   }
343 
344   // The relocations may refer to the symbols, so we parse them after we have
345   // parsed all the symbols.
346   for (size_t i = 0, n = subsections.size(); i < n; ++i)
347     parseRelocations(sectionHeaders[i], subsections[i]);
348 }
349 
350 // The path can point to either a dylib or a .tbd file.
351 static Optional<DylibFile *> loadDylib(StringRef path, DylibFile *umbrella) {
352   Optional<MemoryBufferRef> mbref = readFile(path);
353   if (!mbref) {
354     error("could not read dylib file at " + path);
355     return {};
356   }
357 
358   file_magic magic = identify_magic(mbref->getBuffer());
359   if (magic == file_magic::tapi_file)
360     return makeDylibFromTAPI(*mbref, umbrella);
361   assert(magic == file_magic::macho_dynamically_linked_shared_lib);
362   return make<DylibFile>(*mbref, umbrella);
363 }
364 
// TBD files are parsed into a series of TAPI documents (InterfaceFiles), with
// the first document storing child pointers to the rest of them. When we are
// processing a given TBD file, we store that top-level document here. When
// processing re-exports, we search its children for potentially matching
// documents in the same TBD file. Note that the children themselves don't
// point to further documents, i.e. this is a two-level tree.
//
// ld64 allows a TAPI re-export to reference documents nested within other TBD
// files, but that seems like a strange design, so this is an intentional
// deviation.
//
// This is null whenever no TBD file is being processed: the
// DylibFile(const InterfaceFile &, ...) constructor sets it while it loads the
// re-exports of a top-level document and resets it to null afterwards.
const InterfaceFile *currentTopLevelTapi = nullptr;
376 
377 // Re-exports can either refer to on-disk files, or to documents within .tbd
378 // files.
379 static Optional<DylibFile *> loadReexport(StringRef path, DylibFile *umbrella) {
380   if (path::is_absolute(path, path::Style::posix))
381     for (StringRef root : config->systemLibraryRoots)
382       if (Optional<std::string> dylibPath =
383               resolveDylibPath((root + path).str()))
384         return loadDylib(*dylibPath, umbrella);
385 
386   // TODO: Expand @loader_path, @executable_path etc
387 
388   if (currentTopLevelTapi != nullptr) {
389     for (InterfaceFile &child :
390          make_pointee_range(currentTopLevelTapi->documents())) {
391       if (path == child.getInstallName())
392         return make<DylibFile>(child, umbrella);
393       assert(child.documents().empty());
394     }
395   }
396 
397   if (Optional<std::string> dylibPath = resolveDylibPath(path))
398     return loadDylib(*dylibPath, umbrella);
399 
400   error("unable to locate re-export with install name " + path);
401   return {};
402 }
403 
404 DylibFile::DylibFile(MemoryBufferRef mb, DylibFile *umbrella)
405     : InputFile(DylibKind, mb) {
406   if (umbrella == nullptr)
407     umbrella = this;
408 
409   auto *buf = reinterpret_cast<const uint8_t *>(mb.getBufferStart());
410   auto *hdr = reinterpret_cast<const mach_header_64 *>(mb.getBufferStart());
411 
412   // Initialize dylibName.
413   if (const load_command *cmd = findCommand(hdr, LC_ID_DYLIB)) {
414     auto *c = reinterpret_cast<const dylib_command *>(cmd);
415     dylibName = reinterpret_cast<const char *>(cmd) + read32le(&c->dylib.name);
416   } else {
417     error("dylib " + getName() + " missing LC_ID_DYLIB load command");
418     return;
419   }
420 
421   // Initialize symbols.
422   // TODO: if a re-exported dylib is public (lives in /usr/lib or
423   // /System/Library/Frameworks), we should bind to its symbols directly
424   // instead of the re-exporting umbrella library.
425   if (const load_command *cmd = findCommand(hdr, LC_DYLD_INFO_ONLY)) {
426     auto *c = reinterpret_cast<const dyld_info_command *>(cmd);
427     parseTrie(buf + c->export_off, c->export_size,
428               [&](const Twine &name, uint64_t flags) {
429                 bool isWeakDef = flags & EXPORT_SYMBOL_FLAGS_WEAK_DEFINITION;
430                 bool isTlv = flags & EXPORT_SYMBOL_FLAGS_KIND_THREAD_LOCAL;
431                 symbols.push_back(symtab->addDylib(saver.save(name), umbrella,
432                                                    isWeakDef, isTlv));
433               });
434   } else {
435     error("LC_DYLD_INFO_ONLY not found in " + getName());
436     return;
437   }
438 
439   if (hdr->flags & MH_NO_REEXPORTED_DYLIBS)
440     return;
441 
442   const uint8_t *p =
443       reinterpret_cast<const uint8_t *>(hdr) + sizeof(mach_header_64);
444   for (uint32_t i = 0, n = hdr->ncmds; i < n; ++i) {
445     auto *cmd = reinterpret_cast<const load_command *>(p);
446     p += cmd->cmdsize;
447     if (cmd->cmd != LC_REEXPORT_DYLIB)
448       continue;
449 
450     auto *c = reinterpret_cast<const dylib_command *>(cmd);
451     StringRef reexportPath =
452         reinterpret_cast<const char *>(c) + read32le(&c->dylib.name);
453     if (Optional<DylibFile *> reexport = loadReexport(reexportPath, umbrella))
454       reexported.push_back(*reexport);
455   }
456 }
457 
458 DylibFile::DylibFile(const InterfaceFile &interface, DylibFile *umbrella)
459     : InputFile(DylibKind, interface) {
460   if (umbrella == nullptr)
461     umbrella = this;
462 
463   dylibName = saver.save(interface.getInstallName());
464   auto addSymbol = [&](const Twine &name) -> void {
465     symbols.push_back(symtab->addDylib(saver.save(name), umbrella,
466                                        /*isWeakDef=*/false,
467                                        /*isTlv=*/false));
468   };
469   // TODO(compnerd) filter out symbols based on the target platform
470   // TODO: handle weak defs, thread locals
471   for (const auto symbol : interface.symbols()) {
472     if (!symbol->getArchitectures().has(config->arch))
473       continue;
474 
475     switch (symbol->getKind()) {
476     case SymbolKind::GlobalSymbol:
477       addSymbol(symbol->getName());
478       break;
479     case SymbolKind::ObjectiveCClass:
480       // XXX ld64 only creates these symbols when -ObjC is passed in. We may
481       // want to emulate that.
482       addSymbol(objc::klass + symbol->getName());
483       addSymbol(objc::metaclass + symbol->getName());
484       break;
485     case SymbolKind::ObjectiveCClassEHType:
486       addSymbol(objc::ehtype + symbol->getName());
487       break;
488     case SymbolKind::ObjectiveCInstanceVariable:
489       addSymbol(objc::ivar + symbol->getName());
490       break;
491     }
492   }
493 
494   bool isTopLevelTapi = false;
495   if (currentTopLevelTapi == nullptr) {
496     currentTopLevelTapi = &interface;
497     isTopLevelTapi = true;
498   }
499 
500   for (InterfaceFileRef intfRef : interface.reexportedLibraries())
501     if (Optional<DylibFile *> reexport =
502             loadReexport(intfRef.getInstallName(), umbrella))
503       reexported.push_back(*reexport);
504 
505   if (isTopLevelTapi)
506     currentTopLevelTapi = nullptr;
507 }
508 
509 ArchiveFile::ArchiveFile(std::unique_ptr<llvm::object::Archive> &&f)
510     : InputFile(ArchiveKind, f->getMemoryBufferRef()), file(std::move(f)) {
511   for (const object::Archive::Symbol &sym : file->symbols())
512     symtab->addLazy(sym.getName(), this, sym);
513 }
514 
515 void ArchiveFile::fetch(const object::Archive::Symbol &sym) {
516   object::Archive::Child c =
517       CHECK(sym.getMember(), toString(this) +
518                                  ": could not get the member for symbol " +
519                                  sym.getName());
520 
521   if (!seen.insert(c.getChildOffset()).second)
522     return;
523 
524   MemoryBufferRef mb =
525       CHECK(c.getMemoryBufferRef(),
526             toString(this) +
527                 ": could not get the buffer for the member defining symbol " +
528                 sym.getName());
529   auto file = make<ObjFile>(mb);
530   symbols.insert(symbols.end(), file->symbols.begin(), file->symbols.end());
531   subsections.insert(subsections.end(), file->subsections.begin(),
532                      file->subsections.end());
533 }
534 
535 // Returns "<internal>" or "baz.o".
536 std::string lld::toString(const InputFile *file) {
537   return file ? std::string(file->getName()) : "<internal>";
538 }
539