1 //===- InputFiles.cpp -----------------------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file contains functions to parse Mach-O object files. In this comment, 10 // we describe the Mach-O file structure and how we parse it. 11 // 12 // Mach-O is not very different from ELF or COFF. The notion of symbols, 13 // sections and relocations exists in Mach-O as it does in ELF and COFF. 14 // 15 // Perhaps the notion that is new to those who know ELF/COFF is "subsections". 16 // In ELF/COFF, sections are an atomic unit of data copied from input files to 17 // output files. When we merge or garbage-collect sections, we treat each 18 // section as an atomic unit. In Mach-O, that's not the case. Sections can 19 // consist of multiple subsections, and subsections are a unit of merging and 20 // garbage-collecting. Therefore, Mach-O's subsections are more similar to 21 // ELF/COFF's sections than Mach-O's sections are. 22 // 23 // A section can have multiple symbols. A symbol that does not have the 24 // N_ALT_ENTRY attribute indicates a beginning of a subsection. Therefore, by 25 // definition, a symbol is always present at the beginning of each subsection. A 26 // symbol with N_ALT_ENTRY attribute does not start a new subsection and can 27 // point to a middle of a subsection. 28 // 29 // The notion of subsections also affects how relocations are represented in 30 // Mach-O. All references within a section need to be explicitly represented as 31 // relocations if they refer to different subsections, because we obviously need 32 // to fix up addresses if subsections are laid out in an output file differently 33 // than they were in object files. To represent that, Mach-O relocations can 34 // refer to an unnamed location via its address. Scattered relocations (those 35 // with the R_SCATTERED bit set) always refer to unnamed locations. 36 // Non-scattered relocations refer to an unnamed location if r_extern is not set 37 // and r_symbolnum is zero. 38 // 39 // Without the above differences, I think you can use your knowledge about ELF 40 // and COFF for Mach-O. 41 // 42 //===----------------------------------------------------------------------===// 43 44 #include "InputFiles.h" 45 #include "Config.h" 46 #include "ExportTrie.h" 47 #include "InputSection.h" 48 #include "OutputSection.h" 49 #include "SymbolTable.h" 50 #include "Symbols.h" 51 #include "Target.h" 52 53 #include "lld/Common/ErrorHandler.h" 54 #include "lld/Common/Memory.h" 55 #include "llvm/BinaryFormat/MachO.h" 56 #include "llvm/Support/Endian.h" 57 #include "llvm/Support/MemoryBuffer.h" 58 #include "llvm/Support/Path.h" 59 60 using namespace llvm; 61 using namespace llvm::MachO; 62 using namespace llvm::support::endian; 63 using namespace llvm::sys; 64 using namespace lld; 65 using namespace lld::macho; 66 67 std::vector<InputFile *> macho::inputFiles; 68 69 // Open a given file path and return it as a memory-mapped file. 70 Optional<MemoryBufferRef> macho::readFile(StringRef path) { 71 // Open a file. 72 auto mbOrErr = MemoryBuffer::getFile(path); 73 if (auto ec = mbOrErr.getError()) { 74 error("cannot open " + path + ": " + ec.message()); 75 return None; 76 } 77 78 std::unique_ptr<MemoryBuffer> &mb = *mbOrErr; 79 MemoryBufferRef mbref = mb->getMemBufferRef(); 80 make<std::unique_ptr<MemoryBuffer>>(std::move(mb)); // take mb ownership 81 82 // If this is a regular non-fat file, return it. 83 const char *buf = mbref.getBufferStart(); 84 auto *hdr = reinterpret_cast<const MachO::fat_header *>(buf); 85 if (read32be(&hdr->magic) != MachO::FAT_MAGIC) 86 return mbref; 87 88 // Object files and archive files may be fat files, which contains 89 // multiple real files for different CPU ISAs. Here, we search for a 90 // file that matches with the current link target and returns it as 91 // a MemoryBufferRef. 92 auto *arch = reinterpret_cast<const MachO::fat_arch *>(buf + sizeof(*hdr)); 93 94 for (uint32_t i = 0, n = read32be(&hdr->nfat_arch); i < n; ++i) { 95 if (reinterpret_cast<const char *>(arch + i + 1) > 96 buf + mbref.getBufferSize()) { 97 error(path + ": fat_arch struct extends beyond end of file"); 98 return None; 99 } 100 101 if (read32be(&arch[i].cputype) != target->cpuType || 102 read32be(&arch[i].cpusubtype) != target->cpuSubtype) 103 continue; 104 105 uint32_t offset = read32be(&arch[i].offset); 106 uint32_t size = read32be(&arch[i].size); 107 if (offset + size > mbref.getBufferSize()) 108 error(path + ": slice extends beyond end of file"); 109 return MemoryBufferRef(StringRef(buf + offset, size), path.copy(bAlloc)); 110 } 111 112 error("unable to find matching architecture in " + path); 113 return None; 114 } 115 116 static const load_command *findCommand(const mach_header_64 *hdr, 117 uint32_t type) { 118 const uint8_t *p = 119 reinterpret_cast<const uint8_t *>(hdr) + sizeof(mach_header_64); 120 121 for (uint32_t i = 0, n = hdr->ncmds; i < n; ++i) { 122 auto *cmd = reinterpret_cast<const load_command *>(p); 123 if (cmd->cmd == type) 124 return cmd; 125 p += cmd->cmdsize; 126 } 127 return nullptr; 128 } 129 130 void InputFile::parseSections(ArrayRef<section_64> sections) { 131 subsections.reserve(sections.size()); 132 auto *buf = reinterpret_cast<const uint8_t *>(mb.getBufferStart()); 133 134 for (const section_64 &sec : sections) { 135 InputSection *isec = make<InputSection>(); 136 isec->file = this; 137 isec->name = StringRef(sec.sectname, strnlen(sec.sectname, 16)); 138 isec->segname = StringRef(sec.segname, strnlen(sec.segname, 16)); 139 isec->data = {buf + sec.offset, static_cast<size_t>(sec.size)}; 140 if (sec.align >= 32) 141 error("alignment " + std::to_string(sec.align) + " of section " + 142 isec->name + " is too large"); 143 else 144 isec->align = 1 << sec.align; 145 isec->flags = sec.flags; 146 subsections.push_back({{0, isec}}); 147 } 148 } 149 150 // Find the subsection corresponding to the greatest section offset that is <= 151 // that of the given offset. 152 // 153 // offset: an offset relative to the start of the original InputSection (before 154 // any subsection splitting has occurred). It will be updated to represent the 155 // same location as an offset relative to the start of the containing 156 // subsection. 157 static InputSection *findContainingSubsection(SubsectionMap &map, 158 uint32_t *offset) { 159 auto it = std::prev(map.upper_bound(*offset)); 160 *offset -= it->first; 161 return it->second; 162 } 163 164 void InputFile::parseRelocations(const section_64 &sec, 165 SubsectionMap &subsecMap) { 166 auto *buf = reinterpret_cast<const uint8_t *>(mb.getBufferStart()); 167 ArrayRef<any_relocation_info> relInfos( 168 reinterpret_cast<const any_relocation_info *>(buf + sec.reloff), 169 sec.nreloc); 170 171 for (const any_relocation_info &anyRel : relInfos) { 172 if (anyRel.r_word0 & R_SCATTERED) 173 fatal("TODO: Scattered relocations not supported"); 174 175 auto rel = reinterpret_cast<const relocation_info &>(anyRel); 176 177 Reloc r; 178 r.type = rel.r_type; 179 r.pcrel = rel.r_pcrel; 180 uint32_t secRelOffset = rel.r_address; 181 uint64_t rawAddend = 182 target->getImplicitAddend(buf + sec.offset + secRelOffset, r.type); 183 184 if (rel.r_extern) { 185 r.target = symbols[rel.r_symbolnum]; 186 r.addend = rawAddend; 187 } else { 188 if (!rel.r_pcrel) 189 fatal("TODO: Only pcrel section relocations are supported"); 190 191 if (rel.r_symbolnum == 0 || rel.r_symbolnum > subsections.size()) 192 fatal("invalid section index in relocation for offset " + 193 std::to_string(r.offset) + " in section " + sec.sectname + 194 " of " + getName()); 195 196 SubsectionMap &targetSubsecMap = subsections[rel.r_symbolnum - 1]; 197 const section_64 &targetSec = sectionHeaders[rel.r_symbolnum - 1]; 198 // The implicit addend for pcrel section relocations is the pcrel offset 199 // in terms of the addresses in the input file. Here we adjust it so that 200 // it describes the offset from the start of the target section. 201 // TODO: Figure out what to do for non-pcrel section relocations. 202 // TODO: The offset of 4 is probably not right for ARM64, nor for 203 // relocations with r_length != 2. 204 uint32_t targetOffset = 205 sec.addr + secRelOffset + 4 + rawAddend - targetSec.addr; 206 r.target = findContainingSubsection(targetSubsecMap, &targetOffset); 207 r.addend = targetOffset; 208 } 209 210 InputSection *subsec = findContainingSubsection(subsecMap, &secRelOffset); 211 r.offset = secRelOffset; 212 subsec->relocs.push_back(r); 213 } 214 } 215 216 void InputFile::parseSymbols(ArrayRef<nlist_64> nList, const char *strtab, 217 bool subsectionsViaSymbols) { 218 // resize(), not reserve(), because we are going to create N_ALT_ENTRY symbols 219 // out-of-sequence. 220 symbols.resize(nList.size()); 221 std::vector<size_t> altEntrySymIdxs; 222 223 auto createDefined = [&](const nlist_64 &sym, InputSection *isec, 224 uint32_t value) -> Symbol * { 225 StringRef name = strtab + sym.n_strx; 226 if (sym.n_type & N_EXT) 227 // Global defined symbol 228 return symtab->addDefined(name, isec, value); 229 else 230 // Local defined symbol 231 return make<Defined>(name, isec, value); 232 }; 233 234 for (size_t i = 0, n = nList.size(); i < n; ++i) { 235 const nlist_64 &sym = nList[i]; 236 237 // Undefined symbol 238 if (!sym.n_sect) { 239 StringRef name = strtab + sym.n_strx; 240 symbols[i] = symtab->addUndefined(name); 241 continue; 242 } 243 244 const section_64 &sec = sectionHeaders[sym.n_sect - 1]; 245 SubsectionMap &subsecMap = subsections[sym.n_sect - 1]; 246 uint64_t offset = sym.n_value - sec.addr; 247 248 // If the input file does not use subsections-via-symbols, all symbols can 249 // use the same subsection. Otherwise, we must split the sections along 250 // symbol boundaries. 251 if (!subsectionsViaSymbols) { 252 symbols[i] = createDefined(sym, subsecMap[0], offset); 253 continue; 254 } 255 256 // nList entries aren't necessarily arranged in address order. Therefore, 257 // we can't create alt-entry symbols at this point because a later symbol 258 // may split its section, which may affect which subsection the alt-entry 259 // symbol is assigned to. So we need to handle them in a second pass below. 260 if (sym.n_desc & N_ALT_ENTRY) { 261 altEntrySymIdxs.push_back(i); 262 continue; 263 } 264 265 // Find the subsection corresponding to the greatest section offset that is 266 // <= that of the current symbol. The subsection that we find either needs 267 // to be used directly or split in two. 268 uint32_t firstSize = offset; 269 InputSection *firstIsec = findContainingSubsection(subsecMap, &firstSize); 270 271 if (firstSize == 0) { 272 // Alias of an existing symbol, or the first symbol in the section. These 273 // are handled by reusing the existing section. 274 symbols[i] = createDefined(sym, firstIsec, 0); 275 continue; 276 } 277 278 // We saw a symbol definition at a new offset. Split the section into two 279 // subsections. The new symbol uses the second subsection. 280 auto *secondIsec = make<InputSection>(*firstIsec); 281 secondIsec->data = firstIsec->data.slice(firstSize); 282 firstIsec->data = firstIsec->data.slice(0, firstSize); 283 // TODO: ld64 appears to preserve the original alignment as well as each 284 // subsection's offset from the last aligned address. We should consider 285 // emulating that behavior. 286 secondIsec->align = MinAlign(firstIsec->align, offset); 287 288 subsecMap[offset] = secondIsec; 289 // By construction, the symbol will be at offset zero in the new section. 290 symbols[i] = createDefined(sym, secondIsec, 0); 291 } 292 293 for (size_t idx : altEntrySymIdxs) { 294 const nlist_64 &sym = nList[idx]; 295 SubsectionMap &subsecMap = subsections[sym.n_sect - 1]; 296 uint32_t off = sym.n_value - sectionHeaders[sym.n_sect - 1].addr; 297 InputSection *subsec = findContainingSubsection(subsecMap, &off); 298 symbols[idx] = createDefined(sym, subsec, off); 299 } 300 } 301 302 ObjFile::ObjFile(MemoryBufferRef mb) : InputFile(ObjKind, mb) { 303 auto *buf = reinterpret_cast<const uint8_t *>(mb.getBufferStart()); 304 auto *hdr = reinterpret_cast<const mach_header_64 *>(mb.getBufferStart()); 305 306 if (const load_command *cmd = findCommand(hdr, LC_SEGMENT_64)) { 307 auto *c = reinterpret_cast<const segment_command_64 *>(cmd); 308 sectionHeaders = ArrayRef<section_64>{ 309 reinterpret_cast<const section_64 *>(c + 1), c->nsects}; 310 parseSections(sectionHeaders); 311 } 312 313 // TODO: Error on missing LC_SYMTAB? 314 if (const load_command *cmd = findCommand(hdr, LC_SYMTAB)) { 315 auto *c = reinterpret_cast<const symtab_command *>(cmd); 316 ArrayRef<nlist_64> nList( 317 reinterpret_cast<const nlist_64 *>(buf + c->symoff), c->nsyms); 318 const char *strtab = reinterpret_cast<const char *>(buf) + c->stroff; 319 bool subsectionsViaSymbols = hdr->flags & MH_SUBSECTIONS_VIA_SYMBOLS; 320 parseSymbols(nList, strtab, subsectionsViaSymbols); 321 } 322 323 // The relocations may refer to the symbols, so we parse them after we have 324 // parsed all the symbols. 325 for (size_t i = 0, n = subsections.size(); i < n; ++i) 326 parseRelocations(sectionHeaders[i], subsections[i]); 327 } 328 329 DylibFile::DylibFile(MemoryBufferRef mb, DylibFile *umbrella) 330 : InputFile(DylibKind, mb) { 331 if (umbrella == nullptr) 332 umbrella = this; 333 334 auto *buf = reinterpret_cast<const uint8_t *>(mb.getBufferStart()); 335 auto *hdr = reinterpret_cast<const mach_header_64 *>(mb.getBufferStart()); 336 337 // Initialize dylibName. 338 if (const load_command *cmd = findCommand(hdr, LC_ID_DYLIB)) { 339 auto *c = reinterpret_cast<const dylib_command *>(cmd); 340 dylibName = reinterpret_cast<const char *>(cmd) + read32le(&c->dylib.name); 341 } else { 342 error("dylib " + getName() + " missing LC_ID_DYLIB load command"); 343 return; 344 } 345 346 // Initialize symbols. 347 if (const load_command *cmd = findCommand(hdr, LC_DYLD_INFO_ONLY)) { 348 auto *c = reinterpret_cast<const dyld_info_command *>(cmd); 349 parseTrie(buf + c->export_off, c->export_size, 350 [&](const Twine &name, uint64_t flags) { 351 symbols.push_back(symtab->addDylib(saver.save(name), umbrella)); 352 }); 353 } else { 354 error("LC_DYLD_INFO_ONLY not found in " + getName()); 355 return; 356 } 357 358 if (hdr->flags & MH_NO_REEXPORTED_DYLIBS) 359 return; 360 361 const uint8_t *p = 362 reinterpret_cast<const uint8_t *>(hdr) + sizeof(mach_header_64); 363 for (uint32_t i = 0, n = hdr->ncmds; i < n; ++i) { 364 auto *cmd = reinterpret_cast<const load_command *>(p); 365 p += cmd->cmdsize; 366 if (cmd->cmd != LC_REEXPORT_DYLIB) 367 continue; 368 369 auto *c = reinterpret_cast<const dylib_command *>(cmd); 370 StringRef reexportPath = 371 reinterpret_cast<const char *>(c) + read32le(&c->dylib.name); 372 // TODO: Expand @loader_path, @executable_path etc in reexportPath 373 Optional<MemoryBufferRef> buffer = readFile(reexportPath); 374 if (!buffer) { 375 error("unable to read re-exported dylib at " + reexportPath); 376 return; 377 } 378 reexported.push_back(make<DylibFile>(*buffer, umbrella)); 379 } 380 } 381 382 DylibFile::DylibFile() : InputFile(DylibKind, MemoryBufferRef()) {} 383 384 DylibFile *DylibFile::createLibSystemMock() { 385 auto *file = make<DylibFile>(); 386 file->mb = MemoryBufferRef("", "/usr/lib/libSystem.B.dylib"); 387 file->dylibName = "/usr/lib/libSystem.B.dylib"; 388 file->symbols.push_back(symtab->addDylib("dyld_stub_binder", file)); 389 return file; 390 } 391 392 ArchiveFile::ArchiveFile(std::unique_ptr<llvm::object::Archive> &&f) 393 : InputFile(ArchiveKind, f->getMemoryBufferRef()), file(std::move(f)) { 394 for (const object::Archive::Symbol &sym : file->symbols()) 395 symtab->addLazy(sym.getName(), this, sym); 396 } 397 398 void ArchiveFile::fetch(const object::Archive::Symbol &sym) { 399 object::Archive::Child c = 400 CHECK(sym.getMember(), toString(this) + 401 ": could not get the member for symbol " + 402 sym.getName()); 403 404 if (!seen.insert(c.getChildOffset()).second) 405 return; 406 407 MemoryBufferRef mb = 408 CHECK(c.getMemoryBufferRef(), 409 toString(this) + 410 ": could not get the buffer for the member defining symbol " + 411 sym.getName()); 412 auto file = make<ObjFile>(mb); 413 symbols.insert(symbols.end(), file->symbols.begin(), file->symbols.end()); 414 subsections.insert(subsections.end(), file->subsections.begin(), 415 file->subsections.end()); 416 } 417 418 // Returns "<internal>" or "baz.o". 419 std::string lld::toString(const InputFile *file) { 420 return file ? std::string(file->getName()) : "<internal>"; 421 } 422