1 //===- InputFiles.cpp -----------------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains functions to parse Mach-O object files. In this comment,
10 // we describe the Mach-O file structure and how we parse it.
11 //
12 // Mach-O is not very different from ELF or COFF. The notion of symbols,
13 // sections and relocations exists in Mach-O as it does in ELF and COFF.
14 //
15 // Perhaps the notion that is new to those who know ELF/COFF is "subsections".
16 // In ELF/COFF, sections are an atomic unit of data copied from input files to
17 // output files. When we merge or garbage-collect sections, we treat each
18 // section as an atomic unit. In Mach-O, that's not the case. Sections can
19 // consist of multiple subsections, and subsections are a unit of merging and
20 // garbage-collecting. Therefore, Mach-O's subsections are more similar to
21 // ELF/COFF's sections than Mach-O's sections are.
22 //
23 // A section can have multiple symbols. A symbol that does not have the
24 // N_ALT_ENTRY attribute indicates a beginning of a subsection. Therefore, by
25 // definition, a symbol is always present at the beginning of each subsection. A
26 // symbol with N_ALT_ENTRY attribute does not start a new subsection and can
27 // point to a middle of a subsection.
28 //
29 // The notion of subsections also affects how relocations are represented in
30 // Mach-O. All references within a section need to be explicitly represented as
31 // relocations if they refer to different subsections, because we obviously need
32 // to fix up addresses if subsections are laid out in an output file differently
33 // than they were in object files. To represent that, Mach-O relocations can
34 // refer to an unnamed location via its address. Scattered relocations (those
35 // with the R_SCATTERED bit set) always refer to unnamed locations.
36 // Non-scattered relocations refer to an unnamed location if r_extern is not set
37 // and r_symbolnum is zero.
38 //
// Aside from the differences described above, knowledge of ELF and COFF
// carries over to Mach-O.
41 //
42 //===----------------------------------------------------------------------===//
43 
44 #include "InputFiles.h"
45 #include "Config.h"
46 #include "Driver.h"
47 #include "Dwarf.h"
48 #include "ExportTrie.h"
49 #include "InputSection.h"
50 #include "MachOStructs.h"
51 #include "ObjC.h"
52 #include "OutputSection.h"
53 #include "OutputSegment.h"
54 #include "SymbolTable.h"
55 #include "Symbols.h"
56 #include "Target.h"
57 
58 #include "lld/Common/DWARF.h"
59 #include "lld/Common/ErrorHandler.h"
60 #include "lld/Common/Memory.h"
61 #include "lld/Common/Reproduce.h"
62 #include "llvm/ADT/iterator.h"
63 #include "llvm/BinaryFormat/MachO.h"
64 #include "llvm/LTO/LTO.h"
65 #include "llvm/Support/Endian.h"
66 #include "llvm/Support/MemoryBuffer.h"
67 #include "llvm/Support/Path.h"
68 #include "llvm/Support/TarWriter.h"
69 #include "llvm/TextAPI/MachO/Architecture.h"
70 
71 using namespace llvm;
72 using namespace llvm::MachO;
73 using namespace llvm::support::endian;
74 using namespace llvm::sys;
75 using namespace lld;
76 using namespace lld::macho;
77 
78 // Returns "<internal>", "foo.a(bar.o)", or "baz.o".
79 std::string lld::toString(const InputFile *f) {
80   if (!f)
81     return "<internal>";
82 
83   // Multiple dylibs can be defined in one .tbd file.
84   if (auto dylibFile = dyn_cast<DylibFile>(f))
85     if (f->getName().endswith(".tbd"))
86       return (f->getName() + "(" + dylibFile->dylibName + ")").str();
87 
88   if (f->archiveName.empty())
89     return std::string(f->getName());
90   return (path::filename(f->archiveName) + "(" + path::filename(f->getName()) +
91           ")")
92       .str();
93 }
94 
95 SetVector<InputFile *> macho::inputFiles;
96 std::unique_ptr<TarWriter> macho::tar;
97 int InputFile::idCount = 0;
98 
99 // Open a given file path and return it as a memory-mapped file.
100 Optional<MemoryBufferRef> macho::readFile(StringRef path) {
101   // Open a file.
102   auto mbOrErr = MemoryBuffer::getFile(path);
103   if (auto ec = mbOrErr.getError()) {
104     error("cannot open " + path + ": " + ec.message());
105     return None;
106   }
107 
108   std::unique_ptr<MemoryBuffer> &mb = *mbOrErr;
109   MemoryBufferRef mbref = mb->getMemBufferRef();
110   make<std::unique_ptr<MemoryBuffer>>(std::move(mb)); // take mb ownership
111 
112   // If this is a regular non-fat file, return it.
113   const char *buf = mbref.getBufferStart();
114   auto *hdr = reinterpret_cast<const MachO::fat_header *>(buf);
115   if (mbref.getBufferSize() < sizeof(uint32_t) ||
116       read32be(&hdr->magic) != MachO::FAT_MAGIC) {
117     if (tar)
118       tar->append(relativeToRoot(path), mbref.getBuffer());
119     return mbref;
120   }
121 
122   // Object files and archive files may be fat files, which contains
123   // multiple real files for different CPU ISAs. Here, we search for a
124   // file that matches with the current link target and returns it as
125   // a MemoryBufferRef.
126   auto *arch = reinterpret_cast<const MachO::fat_arch *>(buf + sizeof(*hdr));
127 
128   for (uint32_t i = 0, n = read32be(&hdr->nfat_arch); i < n; ++i) {
129     if (reinterpret_cast<const char *>(arch + i + 1) >
130         buf + mbref.getBufferSize()) {
131       error(path + ": fat_arch struct extends beyond end of file");
132       return None;
133     }
134 
135     if (read32be(&arch[i].cputype) != target->cpuType ||
136         read32be(&arch[i].cpusubtype) != target->cpuSubtype)
137       continue;
138 
139     uint32_t offset = read32be(&arch[i].offset);
140     uint32_t size = read32be(&arch[i].size);
141     if (offset + size > mbref.getBufferSize())
142       error(path + ": slice extends beyond end of file");
143     if (tar)
144       tar->append(relativeToRoot(path), mbref.getBuffer());
145     return MemoryBufferRef(StringRef(buf + offset, size), path.copy(bAlloc));
146   }
147 
148   error("unable to find matching architecture in " + path);
149   return None;
150 }
151 
152 const load_command *macho::findCommand(const mach_header_64 *hdr,
153                                        uint32_t type) {
154   const uint8_t *p =
155       reinterpret_cast<const uint8_t *>(hdr) + sizeof(mach_header_64);
156 
157   for (uint32_t i = 0, n = hdr->ncmds; i < n; ++i) {
158     auto *cmd = reinterpret_cast<const load_command *>(p);
159     if (cmd->cmd == type)
160       return cmd;
161     p += cmd->cmdsize;
162   }
163   return nullptr;
164 }
165 
166 void ObjFile::parseSections(ArrayRef<section_64> sections) {
167   subsections.reserve(sections.size());
168   auto *buf = reinterpret_cast<const uint8_t *>(mb.getBufferStart());
169 
170   for (const section_64 &sec : sections) {
171     InputSection *isec = make<InputSection>();
172     isec->file = this;
173     isec->name =
174         StringRef(sec.sectname, strnlen(sec.sectname, sizeof(sec.sectname)));
175     isec->segname =
176         StringRef(sec.segname, strnlen(sec.segname, sizeof(sec.segname)));
177     isec->data = {isZeroFill(sec.flags) ? nullptr : buf + sec.offset,
178                   static_cast<size_t>(sec.size)};
179     if (sec.align >= 32)
180       error("alignment " + std::to_string(sec.align) + " of section " +
181             isec->name + " is too large");
182     else
183       isec->align = 1 << sec.align;
184     isec->flags = sec.flags;
185 
186     if (!(isDebugSection(isec->flags) &&
187           isec->segname == segment_names::dwarf)) {
188       subsections.push_back({{0, isec}});
189     } else {
190       // Instead of emitting DWARF sections, we emit STABS symbols to the
191       // object files that contain them. We filter them out early to avoid
192       // parsing their relocations unnecessarily. But we must still push an
193       // empty map to ensure the indices line up for the remaining sections.
194       subsections.push_back({});
195       debugSections.push_back(isec);
196     }
197   }
198 }
199 
200 // Find the subsection corresponding to the greatest section offset that is <=
201 // that of the given offset.
202 //
203 // offset: an offset relative to the start of the original InputSection (before
204 // any subsection splitting has occurred). It will be updated to represent the
205 // same location as an offset relative to the start of the containing
206 // subsection.
207 static InputSection *findContainingSubsection(SubsectionMap &map,
208                                               uint32_t *offset) {
209   auto it = std::prev(map.upper_bound(*offset));
210   *offset -= it->first;
211   return it->second;
212 }
213 
214 static bool validateRelocationInfo(InputFile *file, const section_64 &sec,
215                                    relocation_info rel) {
216   const TargetInfo::RelocAttrs &relocAttrs = target->getRelocAttrs(rel.r_type);
217   bool valid = true;
218   auto message = [relocAttrs, file, sec, rel, &valid](const Twine &diagnostic) {
219     valid = false;
220     return (relocAttrs.name + " relocation " + diagnostic + " at offset " +
221             std::to_string(rel.r_address) + " of " + sec.segname + "," +
222             sec.sectname + " in " + toString(file))
223         .str();
224   };
225 
226   if (!relocAttrs.hasAttr(RelocAttrBits::LOCAL) && !rel.r_extern)
227     error(message("must be extern"));
228   if (relocAttrs.hasAttr(RelocAttrBits::PCREL) != rel.r_pcrel)
229     error(message(Twine("must ") + (rel.r_pcrel ? "not " : "") +
230                   "be PC-relative"));
231   if (isThreadLocalVariables(sec.flags) &&
232       !relocAttrs.hasAttr(RelocAttrBits::UNSIGNED))
233     error(message("not allowed in thread-local section, must be UNSIGNED"));
234   if (rel.r_length < 2 || rel.r_length > 3 ||
235       !relocAttrs.hasAttr(static_cast<RelocAttrBits>(1 << rel.r_length))) {
236     static SmallVector<StringRef, 4> widths{"0", "4", "8", "4 or 8"};
237     error(message("has width " + std::to_string(1 << rel.r_length) +
238                   " bytes, but must be " +
239                   widths[(static_cast<int>(relocAttrs.bits) >> 2) & 3] +
240                   " bytes"));
241   }
242   return valid;
243 }
244 
245 void ObjFile::parseRelocations(const section_64 &sec,
246                                SubsectionMap &subsecMap) {
247   auto *buf = reinterpret_cast<const uint8_t *>(mb.getBufferStart());
248   ArrayRef<relocation_info> relInfos(
249       reinterpret_cast<const relocation_info *>(buf + sec.reloff), sec.nreloc);
250 
251   for (size_t i = 0; i < relInfos.size(); i++) {
252     // Paired relocations serve as Mach-O's method for attaching a
253     // supplemental datum to a primary relocation record. ELF does not
254     // need them because the *_RELOC_RELA records contain the extra
255     // addend field, vs. *_RELOC_REL which omit the addend.
256     //
257     // The {X86_64,ARM64}_RELOC_SUBTRACTOR record holds the subtrahend,
258     // and the paired *_RELOC_UNSIGNED record holds the minuend. The
259     // datum for each is a symbolic address. The result is the offset
260     // between two addresses.
261     //
262     // The ARM64_RELOC_ADDEND record holds the addend, and the paired
263     // ARM64_RELOC_BRANCH26 or ARM64_RELOC_PAGE21/PAGEOFF12 holds the
264     // base symbolic address.
265     //
266     // Note: X86 does not use *_RELOC_ADDEND because it can embed an
267     // addend into the instruction stream. On X86, a relocatable address
268     // field always occupies an entire contiguous sequence of byte(s),
269     // so there is no need to merge opcode bits with address
270     // bits. Therefore, it's easy and convenient to store addends in the
271     // instruction-stream bytes that would otherwise contain zeroes. By
272     // contrast, RISC ISAs such as ARM64 mix opcode bits with with
273     // address bits so that bitwise arithmetic is necessary to extract
274     // and insert them. Storing addends in the instruction stream is
275     // possible, but inconvenient and more costly at link time.
276 
277     uint64_t pairedAddend = 0;
278     relocation_info relInfo = relInfos[i];
279     if (target->hasAttr(relInfo.r_type, RelocAttrBits::ADDEND)) {
280       pairedAddend = SignExtend64<24>(relInfo.r_symbolnum);
281       relInfo = relInfos[++i];
282     }
283     assert(i < relInfos.size());
284     if (!validateRelocationInfo(this, sec, relInfo))
285       continue;
286     if (relInfo.r_address & R_SCATTERED)
287       fatal("TODO: Scattered relocations not supported");
288 
289     Reloc p;
290     if (target->hasAttr(relInfo.r_type, RelocAttrBits::SUBTRAHEND)) {
291       p.type = relInfo.r_type;
292       p.referent = symbols[relInfo.r_symbolnum];
293       relInfo = relInfos[++i];
294       // SUBTRACTOR relocations should always be followed by an UNSIGNED one
295       // indicating the minuend symbol.
296       assert(target->hasAttr(relInfo.r_type, RelocAttrBits::UNSIGNED) &&
297              relInfo.r_extern);
298     }
299     uint64_t embeddedAddend = target->getEmbeddedAddend(mb, sec, relInfo);
300     assert(!(embeddedAddend && pairedAddend));
301     uint64_t totalAddend = pairedAddend + embeddedAddend;
302     Reloc r;
303     r.type = relInfo.r_type;
304     r.pcrel = relInfo.r_pcrel;
305     r.length = relInfo.r_length;
306     r.offset = relInfo.r_address;
307     if (relInfo.r_extern) {
308       r.referent = symbols[relInfo.r_symbolnum];
309       r.addend = totalAddend;
310     } else {
311       SubsectionMap &referentSubsecMap = subsections[relInfo.r_symbolnum - 1];
312       const section_64 &referentSec = sectionHeaders[relInfo.r_symbolnum - 1];
313       uint32_t referentOffset;
314       if (relInfo.r_pcrel) {
315         // The implicit addend for pcrel section relocations is the pcrel offset
316         // in terms of the addresses in the input file. Here we adjust it so
317         // that it describes the offset from the start of the referent section.
318         assert(target->hasAttr(r.type, RelocAttrBits::BYTE4));
319         referentOffset =
320             sec.addr + relInfo.r_address + 4 + totalAddend - referentSec.addr;
321       } else {
322         // The addend for a non-pcrel relocation is its absolute address.
323         referentOffset = totalAddend - referentSec.addr;
324       }
325       r.referent = findContainingSubsection(referentSubsecMap, &referentOffset);
326       r.addend = referentOffset;
327     }
328 
329     InputSection *subsec = findContainingSubsection(subsecMap, &r.offset);
330     if (p.type != GENERIC_RELOC_INVALID)
331       subsec->relocs.push_back(p);
332     subsec->relocs.push_back(r);
333   }
334 }
335 
336 static macho::Symbol *createDefined(const structs::nlist_64 &sym,
337                                     StringRef name, InputSection *isec,
338                                     uint32_t value) {
339   // Symbol scope is determined by sym.n_type & (N_EXT | N_PEXT):
340   // N_EXT: Global symbols
341   // N_EXT | N_PEXT: Linkage unit (think: dylib) scoped
342   // N_PEXT: Does not occur in input files in practice,
343   //         a private extern must be external.
344   // 0: Translation-unit scoped. These are not in the symbol table.
345 
346   if (sym.n_type & (N_EXT | N_PEXT)) {
347     assert((sym.n_type & N_EXT) && "invalid input");
348     return symtab->addDefined(name, isec->file, isec, value,
349                               sym.n_desc & N_WEAK_DEF, sym.n_type & N_PEXT);
350   }
351   return make<Defined>(name, isec->file, isec, value, sym.n_desc & N_WEAK_DEF,
352                        /*isExternal=*/false, /*isPrivateExtern=*/false);
353 }
354 
355 // Absolute symbols are defined symbols that do not have an associated
356 // InputSection. They cannot be weak.
357 static macho::Symbol *createAbsolute(const structs::nlist_64 &sym,
358                                      InputFile *file, StringRef name) {
359   if (sym.n_type & (N_EXT | N_PEXT)) {
360     assert((sym.n_type & N_EXT) && "invalid input");
361     return symtab->addDefined(name, file, nullptr, sym.n_value,
362                               /*isWeakDef=*/false, sym.n_type & N_PEXT);
363   }
364   return make<Defined>(name, file, nullptr, sym.n_value, /*isWeakDef=*/false,
365                        /*isExternal=*/false, /*isPrivateExtern=*/false);
366 }
367 
368 macho::Symbol *ObjFile::parseNonSectionSymbol(const structs::nlist_64 &sym,
369                                               StringRef name) {
370   uint8_t type = sym.n_type & N_TYPE;
371   switch (type) {
372   case N_UNDF:
373     return sym.n_value == 0
374                ? symtab->addUndefined(name, this, sym.n_desc & N_WEAK_REF)
375                : symtab->addCommon(name, this, sym.n_value,
376                                    1 << GET_COMM_ALIGN(sym.n_desc),
377                                    sym.n_type & N_PEXT);
378   case N_ABS:
379     return createAbsolute(sym, this, name);
380   case N_PBUD:
381   case N_INDR:
382     error("TODO: support symbols of type " + std::to_string(type));
383     return nullptr;
384   case N_SECT:
385     llvm_unreachable(
386         "N_SECT symbols should not be passed to parseNonSectionSymbol");
387   default:
388     llvm_unreachable("invalid symbol type");
389   }
390 }
391 
392 void ObjFile::parseSymbols(ArrayRef<structs::nlist_64> nList,
393                            const char *strtab, bool subsectionsViaSymbols) {
394   // resize(), not reserve(), because we are going to create N_ALT_ENTRY symbols
395   // out-of-sequence.
396   symbols.resize(nList.size());
397   std::vector<size_t> altEntrySymIdxs;
398 
399   for (size_t i = 0, n = nList.size(); i < n; ++i) {
400     const structs::nlist_64 &sym = nList[i];
401     StringRef name = strtab + sym.n_strx;
402 
403     if ((sym.n_type & N_TYPE) != N_SECT) {
404       symbols[i] = parseNonSectionSymbol(sym, name);
405       continue;
406     }
407 
408     const section_64 &sec = sectionHeaders[sym.n_sect - 1];
409     SubsectionMap &subsecMap = subsections[sym.n_sect - 1];
410     assert(!subsecMap.empty());
411     uint64_t offset = sym.n_value - sec.addr;
412 
413     // If the input file does not use subsections-via-symbols, all symbols can
414     // use the same subsection. Otherwise, we must split the sections along
415     // symbol boundaries.
416     if (!subsectionsViaSymbols) {
417       symbols[i] = createDefined(sym, name, subsecMap[0], offset);
418       continue;
419     }
420 
421     // nList entries aren't necessarily arranged in address order. Therefore,
422     // we can't create alt-entry symbols at this point because a later symbol
423     // may split its section, which may affect which subsection the alt-entry
424     // symbol is assigned to. So we need to handle them in a second pass below.
425     if (sym.n_desc & N_ALT_ENTRY) {
426       altEntrySymIdxs.push_back(i);
427       continue;
428     }
429 
430     // Find the subsection corresponding to the greatest section offset that is
431     // <= that of the current symbol. The subsection that we find either needs
432     // to be used directly or split in two.
433     uint32_t firstSize = offset;
434     InputSection *firstIsec = findContainingSubsection(subsecMap, &firstSize);
435 
436     if (firstSize == 0) {
437       // Alias of an existing symbol, or the first symbol in the section. These
438       // are handled by reusing the existing section.
439       symbols[i] = createDefined(sym, name, firstIsec, 0);
440       continue;
441     }
442 
443     // We saw a symbol definition at a new offset. Split the section into two
444     // subsections. The new symbol uses the second subsection.
445     auto *secondIsec = make<InputSection>(*firstIsec);
446     secondIsec->data = firstIsec->data.slice(firstSize);
447     firstIsec->data = firstIsec->data.slice(0, firstSize);
448     // TODO: ld64 appears to preserve the original alignment as well as each
449     // subsection's offset from the last aligned address. We should consider
450     // emulating that behavior.
451     secondIsec->align = MinAlign(firstIsec->align, offset);
452 
453     subsecMap[offset] = secondIsec;
454     // By construction, the symbol will be at offset zero in the new section.
455     symbols[i] = createDefined(sym, name, secondIsec, 0);
456   }
457 
458   for (size_t idx : altEntrySymIdxs) {
459     const structs::nlist_64 &sym = nList[idx];
460     StringRef name = strtab + sym.n_strx;
461     SubsectionMap &subsecMap = subsections[sym.n_sect - 1];
462     uint32_t off = sym.n_value - sectionHeaders[sym.n_sect - 1].addr;
463     InputSection *subsec = findContainingSubsection(subsecMap, &off);
464     symbols[idx] = createDefined(sym, name, subsec, off);
465   }
466 }
467 
468 OpaqueFile::OpaqueFile(MemoryBufferRef mb, StringRef segName,
469                        StringRef sectName)
470     : InputFile(OpaqueKind, mb) {
471   InputSection *isec = make<InputSection>();
472   isec->file = this;
473   isec->name = sectName.take_front(16);
474   isec->segname = segName.take_front(16);
475   const auto *buf = reinterpret_cast<const uint8_t *>(mb.getBufferStart());
476   isec->data = {buf, mb.getBufferSize()};
477   subsections.push_back({{0, isec}});
478 }
479 
480 ObjFile::ObjFile(MemoryBufferRef mb, uint32_t modTime, StringRef archiveName)
481     : InputFile(ObjKind, mb), modTime(modTime) {
482   this->archiveName = std::string(archiveName);
483 
484   auto *buf = reinterpret_cast<const uint8_t *>(mb.getBufferStart());
485   auto *hdr = reinterpret_cast<const mach_header_64 *>(mb.getBufferStart());
486 
487   MachO::Architecture arch =
488       MachO::getArchitectureFromCpuType(hdr->cputype, hdr->cpusubtype);
489   if (arch != config->target.Arch) {
490     error(toString(this) + " has architecture " + getArchitectureName(arch) +
491           " which is incompatible with target architecture " +
492           getArchitectureName(config->target.Arch));
493     return;
494   }
495   // TODO: check platform too
496 
497   if (const load_command *cmd = findCommand(hdr, LC_LINKER_OPTION)) {
498     auto *c = reinterpret_cast<const linker_option_command *>(cmd);
499     StringRef data{reinterpret_cast<const char *>(c + 1),
500                    c->cmdsize - sizeof(linker_option_command)};
501     parseLCLinkerOption(this, c->count, data);
502   }
503 
504   if (const load_command *cmd = findCommand(hdr, LC_SEGMENT_64)) {
505     auto *c = reinterpret_cast<const segment_command_64 *>(cmd);
506     sectionHeaders = ArrayRef<section_64>{
507         reinterpret_cast<const section_64 *>(c + 1), c->nsects};
508     parseSections(sectionHeaders);
509   }
510 
511   // TODO: Error on missing LC_SYMTAB?
512   if (const load_command *cmd = findCommand(hdr, LC_SYMTAB)) {
513     auto *c = reinterpret_cast<const symtab_command *>(cmd);
514     ArrayRef<structs::nlist_64> nList(
515         reinterpret_cast<const structs::nlist_64 *>(buf + c->symoff), c->nsyms);
516     const char *strtab = reinterpret_cast<const char *>(buf) + c->stroff;
517     bool subsectionsViaSymbols = hdr->flags & MH_SUBSECTIONS_VIA_SYMBOLS;
518     parseSymbols(nList, strtab, subsectionsViaSymbols);
519   }
520 
521   // The relocations may refer to the symbols, so we parse them after we have
522   // parsed all the symbols.
523   for (size_t i = 0, n = subsections.size(); i < n; ++i)
524     if (!subsections[i].empty())
525       parseRelocations(sectionHeaders[i], subsections[i]);
526 
527   parseDebugInfo();
528 }
529 
530 void ObjFile::parseDebugInfo() {
531   std::unique_ptr<DwarfObject> dObj = DwarfObject::create(this);
532   if (!dObj)
533     return;
534 
535   auto *ctx = make<DWARFContext>(
536       std::move(dObj), "",
537       [&](Error err) {
538         warn(toString(this) + ": " + toString(std::move(err)));
539       },
540       [&](Error warning) {
541         warn(toString(this) + ": " + toString(std::move(warning)));
542       });
543 
544   // TODO: Since object files can contain a lot of DWARF info, we should verify
545   // that we are parsing just the info we need
546   const DWARFContext::compile_unit_range &units = ctx->compile_units();
547   auto it = units.begin();
548   compileUnit = it->get();
549   assert(std::next(it) == units.end());
550 }
551 
552 // The path can point to either a dylib or a .tbd file.
553 static Optional<DylibFile *> loadDylib(StringRef path, DylibFile *umbrella) {
554   Optional<MemoryBufferRef> mbref = readFile(path);
555   if (!mbref) {
556     error("could not read dylib file at " + path);
557     return {};
558   }
559   return loadDylib(*mbref, umbrella);
560 }
561 
562 // TBD files are parsed into a series of TAPI documents (InterfaceFiles), with
563 // the first document storing child pointers to the rest of them. When we are
564 // processing a given TBD file, we store that top-level document in
565 // currentTopLevelTapi. When processing re-exports, we search its children for
566 // potentially matching documents in the same TBD file. Note that the children
567 // themselves don't point to further documents, i.e. this is a two-level tree.
568 //
569 // Re-exports can either refer to on-disk files, or to documents within .tbd
570 // files.
571 static Optional<DylibFile *>
572 findDylib(StringRef path, DylibFile *umbrella,
573           const InterfaceFile *currentTopLevelTapi) {
574   if (path::is_absolute(path, path::Style::posix))
575     for (StringRef root : config->systemLibraryRoots)
576       if (Optional<std::string> dylibPath =
577               resolveDylibPath((root + path).str()))
578         return loadDylib(*dylibPath, umbrella);
579 
580   // TODO: Expand @loader_path, @executable_path, @rpath etc, handle -dylib_path
581 
582   if (currentTopLevelTapi) {
583     for (InterfaceFile &child :
584          make_pointee_range(currentTopLevelTapi->documents())) {
585       assert(child.documents().empty());
586       if (path == child.getInstallName())
587         return make<DylibFile>(child, umbrella);
588     }
589   }
590 
591   if (Optional<std::string> dylibPath = resolveDylibPath(path))
592     return loadDylib(*dylibPath, umbrella);
593 
594   return {};
595 }
596 
597 // If a re-exported dylib is public (lives in /usr/lib or
598 // /System/Library/Frameworks), then it is considered implicitly linked: we
599 // should bind to its symbols directly instead of via the re-exporting umbrella
600 // library.
601 static bool isImplicitlyLinked(StringRef path) {
602   if (!config->implicitDylibs)
603     return false;
604 
605   if (path::parent_path(path) == "/usr/lib")
606     return true;
607 
608   // Match /System/Library/Frameworks/$FOO.framework/**/$FOO
609   if (path.consume_front("/System/Library/Frameworks/")) {
610     StringRef frameworkName = path.take_until([](char c) { return c == '.'; });
611     return path::filename(path) == frameworkName;
612   }
613 
614   return false;
615 }
616 
617 void loadReexport(StringRef path, DylibFile *umbrella,
618                   const InterfaceFile *currentTopLevelTapi) {
619   Optional<DylibFile *> reexport =
620       findDylib(path, umbrella, currentTopLevelTapi);
621   if (!reexport)
622     error("unable to locate re-export with install name " + path);
623   else if (isImplicitlyLinked(path))
624     inputFiles.insert(*reexport);
625 }
626 
627 DylibFile::DylibFile(MemoryBufferRef mb, DylibFile *umbrella,
628                      bool isBundleLoader)
629     : InputFile(DylibKind, mb), refState(RefState::Unreferenced),
630       isBundleLoader(isBundleLoader) {
631   assert(!isBundleLoader || !umbrella);
632   if (umbrella == nullptr)
633     umbrella = this;
634 
635   auto *buf = reinterpret_cast<const uint8_t *>(mb.getBufferStart());
636   auto *hdr = reinterpret_cast<const mach_header_64 *>(mb.getBufferStart());
637 
638   // Initialize dylibName.
639   if (const load_command *cmd = findCommand(hdr, LC_ID_DYLIB)) {
640     auto *c = reinterpret_cast<const dylib_command *>(cmd);
641     currentVersion = read32le(&c->dylib.current_version);
642     compatibilityVersion = read32le(&c->dylib.compatibility_version);
643     dylibName = reinterpret_cast<const char *>(cmd) + read32le(&c->dylib.name);
644   } else if (!isBundleLoader) {
645     // macho_executable and macho_bundle don't have LC_ID_DYLIB,
646     // so it's OK.
647     error("dylib " + toString(this) + " missing LC_ID_DYLIB load command");
648     return;
649   }
650 
651   // Initialize symbols.
652   DylibFile *exportingFile = isImplicitlyLinked(dylibName) ? this : umbrella;
653   if (const load_command *cmd = findCommand(hdr, LC_DYLD_INFO_ONLY)) {
654     auto *c = reinterpret_cast<const dyld_info_command *>(cmd);
655     parseTrie(buf + c->export_off, c->export_size,
656               [&](const Twine &name, uint64_t flags) {
657                 bool isWeakDef = flags & EXPORT_SYMBOL_FLAGS_WEAK_DEFINITION;
658                 bool isTlv = flags & EXPORT_SYMBOL_FLAGS_KIND_THREAD_LOCAL;
659                 symbols.push_back(symtab->addDylib(
660                     saver.save(name), exportingFile, isWeakDef, isTlv));
661               });
662   } else {
663     error("LC_DYLD_INFO_ONLY not found in " + toString(this));
664     return;
665   }
666 
667   const uint8_t *p =
668       reinterpret_cast<const uint8_t *>(hdr) + sizeof(mach_header_64);
669   for (uint32_t i = 0, n = hdr->ncmds; i < n; ++i) {
670     auto *cmd = reinterpret_cast<const load_command *>(p);
671     p += cmd->cmdsize;
672 
673     if (!(hdr->flags & MH_NO_REEXPORTED_DYLIBS) &&
674         cmd->cmd == LC_REEXPORT_DYLIB) {
675       const auto *c = reinterpret_cast<const dylib_command *>(cmd);
676       StringRef reexportPath =
677           reinterpret_cast<const char *>(c) + read32le(&c->dylib.name);
678       loadReexport(reexportPath, exportingFile, nullptr);
679     }
680 
681     // FIXME: What about LC_LOAD_UPWARD_DYLIB, LC_LAZY_LOAD_DYLIB,
682     // LC_LOAD_WEAK_DYLIB, LC_REEXPORT_DYLIB (..are reexports from dylibs with
683     // MH_NO_REEXPORTED_DYLIBS loaded for -flat_namespace)?
684     if (config->namespaceKind == NamespaceKind::flat &&
685         cmd->cmd == LC_LOAD_DYLIB) {
686       const auto *c = reinterpret_cast<const dylib_command *>(cmd);
687       StringRef dylibPath =
688           reinterpret_cast<const char *>(c) + read32le(&c->dylib.name);
689       Optional<DylibFile *> dylib = findDylib(dylibPath, umbrella, nullptr);
690       if (!dylib)
691         error(Twine("unable to locate library '") + dylibPath +
692               "' loaded from '" + toString(this) + "' for -flat_namespace");
693     }
694   }
695 }
696 
697 DylibFile::DylibFile(const InterfaceFile &interface, DylibFile *umbrella,
698                      bool isBundleLoader)
699     : InputFile(DylibKind, interface), refState(RefState::Unreferenced),
700       isBundleLoader(isBundleLoader) {
701   // FIXME: Add test for the missing TBD code path.
702 
703   if (umbrella == nullptr)
704     umbrella = this;
705 
706   dylibName = saver.save(interface.getInstallName());
707   compatibilityVersion = interface.getCompatibilityVersion().rawValue();
708   currentVersion = interface.getCurrentVersion().rawValue();
709 
710   if (!is_contained(interface.targets(), config->target)) {
711     error(toString(this) + " is incompatible with " +
712           std::string(config->target));
713     return;
714   }
715 
716   DylibFile *exportingFile = isImplicitlyLinked(dylibName) ? this : umbrella;
717   auto addSymbol = [&](const Twine &name) -> void {
718     symbols.push_back(symtab->addDylib(saver.save(name), exportingFile,
719                                        /*isWeakDef=*/false,
720                                        /*isTlv=*/false));
721   };
722   // TODO(compnerd) filter out symbols based on the target platform
723   // TODO: handle weak defs, thread locals
724   for (const auto symbol : interface.symbols()) {
725     if (!symbol->getArchitectures().has(config->target.Arch))
726       continue;
727 
728     switch (symbol->getKind()) {
729     case SymbolKind::GlobalSymbol:
730       addSymbol(symbol->getName());
731       break;
732     case SymbolKind::ObjectiveCClass:
733       // XXX ld64 only creates these symbols when -ObjC is passed in. We may
734       // want to emulate that.
735       addSymbol(objc::klass + symbol->getName());
736       addSymbol(objc::metaclass + symbol->getName());
737       break;
738     case SymbolKind::ObjectiveCClassEHType:
739       addSymbol(objc::ehtype + symbol->getName());
740       break;
741     case SymbolKind::ObjectiveCInstanceVariable:
742       addSymbol(objc::ivar + symbol->getName());
743       break;
744     }
745   }
746 
747   const InterfaceFile *topLevel =
748       interface.getParent() == nullptr ? &interface : interface.getParent();
749 
750   for (InterfaceFileRef intfRef : interface.reexportedLibraries()) {
751     auto targets = intfRef.targets();
752     if (is_contained(targets, config->target))
753       loadReexport(intfRef.getInstallName(), exportingFile, topLevel);
754   }
755 }
756 
757 ArchiveFile::ArchiveFile(std::unique_ptr<object::Archive> &&f)
758     : InputFile(ArchiveKind, f->getMemoryBufferRef()), file(std::move(f)) {
759   for (const object::Archive::Symbol &sym : file->symbols())
760     symtab->addLazy(sym.getName(), this, sym);
761 }
762 
763 void ArchiveFile::fetch(const object::Archive::Symbol &sym) {
764   object::Archive::Child c =
765       CHECK(sym.getMember(), toString(this) +
766                                  ": could not get the member for symbol " +
767                                  toMachOString(sym));
768 
769   if (!seen.insert(c.getChildOffset()).second)
770     return;
771 
772   MemoryBufferRef mb =
773       CHECK(c.getMemoryBufferRef(),
774             toString(this) +
775                 ": could not get the buffer for the member defining symbol " +
776                 toMachOString(sym));
777 
778   if (tar && c.getParent()->isThin())
779     tar->append(relativeToRoot(CHECK(c.getFullName(), this)), mb.getBuffer());
780 
781   uint32_t modTime = toTimeT(
782       CHECK(c.getLastModified(), toString(this) +
783                                      ": could not get the modification time "
784                                      "for the member defining symbol " +
785                                      toMachOString(sym)));
786 
787   // `sym` is owned by a LazySym, which will be replace<>() by make<ObjFile>
788   // and become invalid after that call. Copy it to the stack so we can refer
789   // to it later.
790   const object::Archive::Symbol sym_copy = sym;
791 
792   if (Optional<InputFile *> file =
793           loadArchiveMember(mb, modTime, getName(), /*objCOnly=*/false)) {
794     inputFiles.insert(*file);
795     // ld64 doesn't demangle sym here even with -demangle. Match that, so
796     // intentionally no call to toMachOString() here.
797     printArchiveMemberLoad(sym_copy.getName(), *file);
798   }
799 }
800 
801 static macho::Symbol *createBitcodeSymbol(const lto::InputFile::Symbol &objSym,
802                                           BitcodeFile &file) {
803   StringRef name = saver.save(objSym.getName());
804 
805   // TODO: support weak references
806   if (objSym.isUndefined())
807     return symtab->addUndefined(name, &file, /*isWeakRef=*/false);
808 
809   assert(!objSym.isCommon() && "TODO: support common symbols in LTO");
810 
811   // TODO: Write a test demonstrating why computing isPrivateExtern before
812   // LTO compilation is important.
813   bool isPrivateExtern = false;
814   switch (objSym.getVisibility()) {
815   case GlobalValue::HiddenVisibility:
816     isPrivateExtern = true;
817     break;
818   case GlobalValue::ProtectedVisibility:
819     error(name + " has protected visibility, which is not supported by Mach-O");
820     break;
821   case GlobalValue::DefaultVisibility:
822     break;
823   }
824 
825   return symtab->addDefined(name, &file, /*isec=*/nullptr, /*value=*/0,
826                             objSym.isWeak(), isPrivateExtern);
827 }
828 
829 BitcodeFile::BitcodeFile(MemoryBufferRef mbref)
830     : InputFile(BitcodeKind, mbref) {
831   obj = check(lto::InputFile::create(mbref));
832 
833   // Convert LTO Symbols to LLD Symbols in order to perform resolution. The
834   // "winning" symbol will then be marked as Prevailing at LTO compilation
835   // time.
836   for (const lto::InputFile::Symbol &objSym : obj->symbols())
837     symbols.push_back(createBitcodeSymbol(objSym, *this));
838 }
839