1 //===- InputFiles.cpp -----------------------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file contains functions to parse Mach-O object files. In this comment, 10 // we describe the Mach-O file structure and how we parse it. 11 // 12 // Mach-O is not very different from ELF or COFF. The notion of symbols, 13 // sections and relocations exists in Mach-O as it does in ELF and COFF. 14 // 15 // Perhaps the notion that is new to those who know ELF/COFF is "subsections". 16 // In ELF/COFF, sections are an atomic unit of data copied from input files to 17 // output files. When we merge or garbage-collect sections, we treat each 18 // section as an atomic unit. In Mach-O, that's not the case. Sections can 19 // consist of multiple subsections, and subsections are a unit of merging and 20 // garbage-collecting. Therefore, Mach-O's subsections are more similar to 21 // ELF/COFF's sections than Mach-O's sections are. 22 // 23 // A section can have multiple symbols. A symbol that does not have the 24 // N_ALT_ENTRY attribute indicates a beginning of a subsection. Therefore, by 25 // definition, a symbol is always present at the beginning of each subsection. A 26 // symbol with N_ALT_ENTRY attribute does not start a new subsection and can 27 // point to a middle of a subsection. 28 // 29 // The notion of subsections also affects how relocations are represented in 30 // Mach-O. All references within a section need to be explicitly represented as 31 // relocations if they refer to different subsections, because we obviously need 32 // to fix up addresses if subsections are laid out in an output file differently 33 // than they were in object files. 
To represent that, Mach-O relocations can 34 // refer to an unnamed location via its address. Scattered relocations (those 35 // with the R_SCATTERED bit set) always refer to unnamed locations. 36 // Non-scattered relocations refer to an unnamed location if r_extern is not set 37 // and r_symbolnum is zero. 38 // 39 // Without the above differences, I think you can use your knowledge about ELF 40 // and COFF for Mach-O. 41 // 42 //===----------------------------------------------------------------------===// 43 44 #include "InputFiles.h" 45 #include "Config.h" 46 #include "Driver.h" 47 #include "Dwarf.h" 48 #include "ExportTrie.h" 49 #include "InputSection.h" 50 #include "MachOStructs.h" 51 #include "ObjC.h" 52 #include "OutputSection.h" 53 #include "OutputSegment.h" 54 #include "SymbolTable.h" 55 #include "Symbols.h" 56 #include "SyntheticSections.h" 57 #include "Target.h" 58 59 #include "lld/Common/CommonLinkerContext.h" 60 #include "lld/Common/DWARF.h" 61 #include "lld/Common/Reproduce.h" 62 #include "llvm/ADT/iterator.h" 63 #include "llvm/BinaryFormat/MachO.h" 64 #include "llvm/LTO/LTO.h" 65 #include "llvm/Support/BinaryStreamReader.h" 66 #include "llvm/Support/Endian.h" 67 #include "llvm/Support/MemoryBuffer.h" 68 #include "llvm/Support/Path.h" 69 #include "llvm/Support/TarWriter.h" 70 #include "llvm/Support/TimeProfiler.h" 71 #include "llvm/TextAPI/Architecture.h" 72 #include "llvm/TextAPI/InterfaceFile.h" 73 74 #include <type_traits> 75 76 using namespace llvm; 77 using namespace llvm::MachO; 78 using namespace llvm::support::endian; 79 using namespace llvm::sys; 80 using namespace lld; 81 using namespace lld::macho; 82 83 // Returns "<internal>", "foo.a(bar.o)", or "baz.o". 84 std::string lld::toString(const InputFile *f) { 85 if (!f) 86 return "<internal>"; 87 88 // Multiple dylibs can be defined in one .tbd file. 
89 if (auto dylibFile = dyn_cast<DylibFile>(f)) 90 if (f->getName().endswith(".tbd")) 91 return (f->getName() + "(" + dylibFile->installName + ")").str(); 92 93 if (f->archiveName.empty()) 94 return std::string(f->getName()); 95 return (f->archiveName + "(" + path::filename(f->getName()) + ")").str(); 96 } 97 98 SetVector<InputFile *> macho::inputFiles; 99 std::unique_ptr<TarWriter> macho::tar; 100 int InputFile::idCount = 0; 101 102 static VersionTuple decodeVersion(uint32_t version) { 103 unsigned major = version >> 16; 104 unsigned minor = (version >> 8) & 0xffu; 105 unsigned subMinor = version & 0xffu; 106 return VersionTuple(major, minor, subMinor); 107 } 108 109 static std::vector<PlatformInfo> getPlatformInfos(const InputFile *input) { 110 if (!isa<ObjFile>(input) && !isa<DylibFile>(input)) 111 return {}; 112 113 const char *hdr = input->mb.getBufferStart(); 114 115 std::vector<PlatformInfo> platformInfos; 116 for (auto *cmd : findCommands<build_version_command>(hdr, LC_BUILD_VERSION)) { 117 PlatformInfo info; 118 info.target.Platform = static_cast<PlatformType>(cmd->platform); 119 info.minimum = decodeVersion(cmd->minos); 120 platformInfos.emplace_back(std::move(info)); 121 } 122 for (auto *cmd : findCommands<version_min_command>( 123 hdr, LC_VERSION_MIN_MACOSX, LC_VERSION_MIN_IPHONEOS, 124 LC_VERSION_MIN_TVOS, LC_VERSION_MIN_WATCHOS)) { 125 PlatformInfo info; 126 switch (cmd->cmd) { 127 case LC_VERSION_MIN_MACOSX: 128 info.target.Platform = PLATFORM_MACOS; 129 break; 130 case LC_VERSION_MIN_IPHONEOS: 131 info.target.Platform = PLATFORM_IOS; 132 break; 133 case LC_VERSION_MIN_TVOS: 134 info.target.Platform = PLATFORM_TVOS; 135 break; 136 case LC_VERSION_MIN_WATCHOS: 137 info.target.Platform = PLATFORM_WATCHOS; 138 break; 139 } 140 info.minimum = decodeVersion(cmd->version); 141 platformInfos.emplace_back(std::move(info)); 142 } 143 144 return platformInfos; 145 } 146 147 static bool checkCompatibility(const InputFile *input) { 148 std::vector<PlatformInfo> 
platformInfos = getPlatformInfos(input); 149 if (platformInfos.empty()) 150 return true; 151 152 auto it = find_if(platformInfos, [&](const PlatformInfo &info) { 153 return removeSimulator(info.target.Platform) == 154 removeSimulator(config->platform()); 155 }); 156 if (it == platformInfos.end()) { 157 std::string platformNames; 158 raw_string_ostream os(platformNames); 159 interleave( 160 platformInfos, os, 161 [&](const PlatformInfo &info) { 162 os << getPlatformName(info.target.Platform); 163 }, 164 "/"); 165 error(toString(input) + " has platform " + platformNames + 166 Twine(", which is different from target platform ") + 167 getPlatformName(config->platform())); 168 return false; 169 } 170 171 if (it->minimum > config->platformInfo.minimum) 172 warn(toString(input) + " has version " + it->minimum.getAsString() + 173 ", which is newer than target minimum of " + 174 config->platformInfo.minimum.getAsString()); 175 176 return true; 177 } 178 179 // This cache mostly exists to store system libraries (and .tbds) as they're 180 // loaded, rather than the input archives, which are already cached at a higher 181 // level, and other files like the filelist that are only read once. 182 // Theoretically this caching could be more efficient by hoisting it, but that 183 // would require altering many callers to track the state. 184 DenseMap<CachedHashStringRef, MemoryBufferRef> macho::cachedReads; 185 // Open a given file path and return it as a memory-mapped file. 
186 Optional<MemoryBufferRef> macho::readFile(StringRef path) { 187 CachedHashStringRef key(path); 188 auto entry = cachedReads.find(key); 189 if (entry != cachedReads.end()) 190 return entry->second; 191 192 ErrorOr<std::unique_ptr<MemoryBuffer>> mbOrErr = MemoryBuffer::getFile(path); 193 if (std::error_code ec = mbOrErr.getError()) { 194 error("cannot open " + path + ": " + ec.message()); 195 return None; 196 } 197 198 std::unique_ptr<MemoryBuffer> &mb = *mbOrErr; 199 MemoryBufferRef mbref = mb->getMemBufferRef(); 200 make<std::unique_ptr<MemoryBuffer>>(std::move(mb)); // take mb ownership 201 202 // If this is a regular non-fat file, return it. 203 const char *buf = mbref.getBufferStart(); 204 const auto *hdr = reinterpret_cast<const fat_header *>(buf); 205 if (mbref.getBufferSize() < sizeof(uint32_t) || 206 read32be(&hdr->magic) != FAT_MAGIC) { 207 if (tar) 208 tar->append(relativeToRoot(path), mbref.getBuffer()); 209 return cachedReads[key] = mbref; 210 } 211 212 llvm::BumpPtrAllocator &bAlloc = lld::bAlloc(); 213 214 // Object files and archive files may be fat files, which contain multiple 215 // real files for different CPU ISAs. Here, we search for a file that matches 216 // with the current link target and returns it as a MemoryBufferRef. 
217 const auto *arch = reinterpret_cast<const fat_arch *>(buf + sizeof(*hdr)); 218 219 for (uint32_t i = 0, n = read32be(&hdr->nfat_arch); i < n; ++i) { 220 if (reinterpret_cast<const char *>(arch + i + 1) > 221 buf + mbref.getBufferSize()) { 222 error(path + ": fat_arch struct extends beyond end of file"); 223 return None; 224 } 225 226 if (read32be(&arch[i].cputype) != static_cast<uint32_t>(target->cpuType) || 227 read32be(&arch[i].cpusubtype) != target->cpuSubtype) 228 continue; 229 230 uint32_t offset = read32be(&arch[i].offset); 231 uint32_t size = read32be(&arch[i].size); 232 if (offset + size > mbref.getBufferSize()) 233 error(path + ": slice extends beyond end of file"); 234 if (tar) 235 tar->append(relativeToRoot(path), mbref.getBuffer()); 236 return cachedReads[key] = MemoryBufferRef(StringRef(buf + offset, size), 237 path.copy(bAlloc)); 238 } 239 240 error("unable to find matching architecture in " + path); 241 return None; 242 } 243 244 InputFile::InputFile(Kind kind, const InterfaceFile &interface) 245 : id(idCount++), fileKind(kind), name(saver().save(interface.getPath())) {} 246 247 // Some sections comprise of fixed-size records, so instead of splitting them at 248 // symbol boundaries, we split them based on size. Records are distinct from 249 // literals in that they may contain references to other sections, instead of 250 // being leaf nodes in the InputSection graph. 251 // 252 // Note that "record" is a term I came up with. In contrast, "literal" is a term 253 // used by the Mach-O format. 254 static Optional<size_t> getRecordSize(StringRef segname, StringRef name) { 255 if (name == section_names::compactUnwind) { 256 if (segname == segment_names::ld) 257 return target->wordSize == 8 ? 32 : 20; 258 } 259 if (config->icfLevel == ICFLevel::none) 260 return {}; 261 262 if (name == section_names::cfString && segname == segment_names::data) 263 return target->wordSize == 8 ? 
32 : 16; 264 if (name == section_names::objcClassRefs && segname == segment_names::data) 265 return target->wordSize; 266 return {}; 267 } 268 269 static Error parseCallGraph(ArrayRef<uint8_t> data, 270 std::vector<CallGraphEntry> &callGraph) { 271 TimeTraceScope timeScope("Parsing call graph section"); 272 BinaryStreamReader reader(data, support::little); 273 while (!reader.empty()) { 274 uint32_t fromIndex, toIndex; 275 uint64_t count; 276 if (Error err = reader.readInteger(fromIndex)) 277 return err; 278 if (Error err = reader.readInteger(toIndex)) 279 return err; 280 if (Error err = reader.readInteger(count)) 281 return err; 282 callGraph.emplace_back(fromIndex, toIndex, count); 283 } 284 return Error::success(); 285 } 286 287 // Parse the sequence of sections within a single LC_SEGMENT(_64). 288 // Split each section into subsections. 289 template <class SectionHeader> 290 void ObjFile::parseSections(ArrayRef<SectionHeader> sectionHeaders) { 291 sections.reserve(sectionHeaders.size()); 292 auto *buf = reinterpret_cast<const uint8_t *>(mb.getBufferStart()); 293 294 for (const SectionHeader &sec : sectionHeaders) { 295 StringRef name = 296 StringRef(sec.sectname, strnlen(sec.sectname, sizeof(sec.sectname))); 297 StringRef segname = 298 StringRef(sec.segname, strnlen(sec.segname, sizeof(sec.segname))); 299 sections.push_back(make<Section>(this, segname, name, sec.flags, sec.addr)); 300 if (sec.align >= 32) { 301 error("alignment " + std::to_string(sec.align) + " of section " + name + 302 " is too large"); 303 continue; 304 } 305 const Section §ion = *sections.back(); 306 uint32_t align = 1 << sec.align; 307 ArrayRef<uint8_t> data = {isZeroFill(sec.flags) ? 
nullptr 308 : buf + sec.offset, 309 static_cast<size_t>(sec.size)}; 310 311 auto splitRecords = [&](int recordSize) -> void { 312 if (data.empty()) 313 return; 314 Subsections &subsections = sections.back()->subsections; 315 subsections.reserve(data.size() / recordSize); 316 for (uint64_t off = 0; off < data.size(); off += recordSize) { 317 auto *isec = make<ConcatInputSection>( 318 section, data.slice(off, recordSize), align); 319 subsections.push_back({off, isec}); 320 } 321 }; 322 323 if (sectionType(sec.flags) == S_CSTRING_LITERALS || 324 (config->dedupLiterals && isWordLiteralSection(sec.flags))) { 325 if (sec.nreloc && config->dedupLiterals) 326 fatal(toString(this) + " contains relocations in " + sec.segname + "," + 327 sec.sectname + 328 ", so LLD cannot deduplicate literals. Try re-running without " 329 "--deduplicate-literals."); 330 331 InputSection *isec; 332 if (sectionType(sec.flags) == S_CSTRING_LITERALS) { 333 isec = make<CStringInputSection>(section, data, align); 334 // FIXME: parallelize this? 335 cast<CStringInputSection>(isec)->splitIntoPieces(); 336 } else { 337 isec = make<WordLiteralInputSection>(section, data, align); 338 } 339 sections.back()->subsections.push_back({0, isec}); 340 } else if (auto recordSize = getRecordSize(segname, name)) { 341 splitRecords(*recordSize); 342 if (name == section_names::compactUnwind) 343 compactUnwindSection = sections.back(); 344 } else if (segname == segment_names::llvm) { 345 if (config->callGraphProfileSort && name == section_names::cgProfile) 346 checkError(parseCallGraph(data, callGraph)); 347 // ld64 does not appear to emit contents from sections within the __LLVM 348 // segment. Symbols within those sections point to bitcode metadata 349 // instead of actual symbols. Global symbols within those sections could 350 // have the same name without causing duplicate symbol errors. To avoid 351 // spurious duplicate symbol errors, we do not parse these sections. 
352 // TODO: Evaluate whether the bitcode metadata is needed. 353 } else { 354 auto *isec = make<ConcatInputSection>(section, data, align); 355 if (isDebugSection(isec->getFlags()) && 356 isec->getSegName() == segment_names::dwarf) { 357 // Instead of emitting DWARF sections, we emit STABS symbols to the 358 // object files that contain them. We filter them out early to avoid 359 // parsing their relocations unnecessarily. 360 debugSections.push_back(isec); 361 } else { 362 sections.back()->subsections.push_back({0, isec}); 363 } 364 } 365 } 366 } 367 368 // Find the subsection corresponding to the greatest section offset that is <= 369 // that of the given offset. 370 // 371 // offset: an offset relative to the start of the original InputSection (before 372 // any subsection splitting has occurred). It will be updated to represent the 373 // same location as an offset relative to the start of the containing 374 // subsection. 375 template <class T> 376 static InputSection *findContainingSubsection(const Section §ion, 377 T *offset) { 378 static_assert(std::is_same<uint64_t, T>::value || 379 std::is_same<uint32_t, T>::value, 380 "unexpected type for offset"); 381 auto it = std::prev(llvm::upper_bound( 382 section.subsections, *offset, 383 [](uint64_t value, Subsection subsec) { return value < subsec.offset; })); 384 *offset -= it->offset; 385 return it->isec; 386 } 387 388 // Find a symbol at offset `off` within `isec`. 389 static Defined *findSymbolAtOffset(const ConcatInputSection *isec, 390 uint64_t off) { 391 auto it = llvm::lower_bound(isec->symbols, off, [](Defined *d, uint64_t off) { 392 return d->value < off; 393 }); 394 // The offset should point at the exact address of a symbol (with no addend.) 
395 if (it == isec->symbols.end() || (*it)->value != off) { 396 assert(isec->wasCoalesced); 397 return nullptr; 398 } 399 return *it; 400 } 401 402 template <class SectionHeader> 403 static bool validateRelocationInfo(InputFile *file, const SectionHeader &sec, 404 relocation_info rel) { 405 const RelocAttrs &relocAttrs = target->getRelocAttrs(rel.r_type); 406 bool valid = true; 407 auto message = [relocAttrs, file, sec, rel, &valid](const Twine &diagnostic) { 408 valid = false; 409 return (relocAttrs.name + " relocation " + diagnostic + " at offset " + 410 std::to_string(rel.r_address) + " of " + sec.segname + "," + 411 sec.sectname + " in " + toString(file)) 412 .str(); 413 }; 414 415 if (!relocAttrs.hasAttr(RelocAttrBits::LOCAL) && !rel.r_extern) 416 error(message("must be extern")); 417 if (relocAttrs.hasAttr(RelocAttrBits::PCREL) != rel.r_pcrel) 418 error(message(Twine("must ") + (rel.r_pcrel ? "not " : "") + 419 "be PC-relative")); 420 if (isThreadLocalVariables(sec.flags) && 421 !relocAttrs.hasAttr(RelocAttrBits::UNSIGNED)) 422 error(message("not allowed in thread-local section, must be UNSIGNED")); 423 if (rel.r_length < 2 || rel.r_length > 3 || 424 !relocAttrs.hasAttr(static_cast<RelocAttrBits>(1 << rel.r_length))) { 425 static SmallVector<StringRef, 4> widths{"0", "4", "8", "4 or 8"}; 426 error(message("has width " + std::to_string(1 << rel.r_length) + 427 " bytes, but must be " + 428 widths[(static_cast<int>(relocAttrs.bits) >> 2) & 3] + 429 " bytes")); 430 } 431 return valid; 432 } 433 434 template <class SectionHeader> 435 void ObjFile::parseRelocations(ArrayRef<SectionHeader> sectionHeaders, 436 const SectionHeader &sec, 437 Section §ion) { 438 auto *buf = reinterpret_cast<const uint8_t *>(mb.getBufferStart()); 439 ArrayRef<relocation_info> relInfos( 440 reinterpret_cast<const relocation_info *>(buf + sec.reloff), sec.nreloc); 441 442 Subsections &subsections = section.subsections; 443 auto subsecIt = subsections.rbegin(); 444 for (size_t i = 0; i < 
relInfos.size(); i++) { 445 // Paired relocations serve as Mach-O's method for attaching a 446 // supplemental datum to a primary relocation record. ELF does not 447 // need them because the *_RELOC_RELA records contain the extra 448 // addend field, vs. *_RELOC_REL which omit the addend. 449 // 450 // The {X86_64,ARM64}_RELOC_SUBTRACTOR record holds the subtrahend, 451 // and the paired *_RELOC_UNSIGNED record holds the minuend. The 452 // datum for each is a symbolic address. The result is the offset 453 // between two addresses. 454 // 455 // The ARM64_RELOC_ADDEND record holds the addend, and the paired 456 // ARM64_RELOC_BRANCH26 or ARM64_RELOC_PAGE21/PAGEOFF12 holds the 457 // base symbolic address. 458 // 459 // Note: X86 does not use *_RELOC_ADDEND because it can embed an 460 // addend into the instruction stream. On X86, a relocatable address 461 // field always occupies an entire contiguous sequence of byte(s), 462 // so there is no need to merge opcode bits with address 463 // bits. Therefore, it's easy and convenient to store addends in the 464 // instruction-stream bytes that would otherwise contain zeroes. By 465 // contrast, RISC ISAs such as ARM64 mix opcode bits with with 466 // address bits so that bitwise arithmetic is necessary to extract 467 // and insert them. Storing addends in the instruction stream is 468 // possible, but inconvenient and more costly at link time. 469 470 relocation_info relInfo = relInfos[i]; 471 bool isSubtrahend = 472 target->hasAttr(relInfo.r_type, RelocAttrBits::SUBTRAHEND); 473 if (isSubtrahend && StringRef(sec.sectname) == section_names::ehFrame) { 474 // __TEXT,__eh_frame only has symbols and SUBTRACTOR relocs when ld64 -r 475 // adds local "EH_Frame1" and "func.eh". Ignore them because they have 476 // gone unused by Mac OS since Snow Leopard (10.6), vintage 2009. 
477 ++i; 478 continue; 479 } 480 int64_t pairedAddend = 0; 481 if (target->hasAttr(relInfo.r_type, RelocAttrBits::ADDEND)) { 482 pairedAddend = SignExtend64<24>(relInfo.r_symbolnum); 483 relInfo = relInfos[++i]; 484 } 485 assert(i < relInfos.size()); 486 if (!validateRelocationInfo(this, sec, relInfo)) 487 continue; 488 if (relInfo.r_address & R_SCATTERED) 489 fatal("TODO: Scattered relocations not supported"); 490 491 int64_t embeddedAddend = target->getEmbeddedAddend(mb, sec.offset, relInfo); 492 assert(!(embeddedAddend && pairedAddend)); 493 int64_t totalAddend = pairedAddend + embeddedAddend; 494 Reloc r; 495 r.type = relInfo.r_type; 496 r.pcrel = relInfo.r_pcrel; 497 r.length = relInfo.r_length; 498 r.offset = relInfo.r_address; 499 if (relInfo.r_extern) { 500 r.referent = symbols[relInfo.r_symbolnum]; 501 r.addend = isSubtrahend ? 0 : totalAddend; 502 } else { 503 assert(!isSubtrahend); 504 const SectionHeader &referentSecHead = 505 sectionHeaders[relInfo.r_symbolnum - 1]; 506 uint64_t referentOffset; 507 if (relInfo.r_pcrel) { 508 // The implicit addend for pcrel section relocations is the pcrel offset 509 // in terms of the addresses in the input file. Here we adjust it so 510 // that it describes the offset from the start of the referent section. 511 // FIXME This logic was written around x86_64 behavior -- ARM64 doesn't 512 // have pcrel section relocations. We may want to factor this out into 513 // the arch-specific .cpp file. 514 assert(target->hasAttr(r.type, RelocAttrBits::BYTE4)); 515 referentOffset = sec.addr + relInfo.r_address + 4 + totalAddend - 516 referentSecHead.addr; 517 } else { 518 // The addend for a non-pcrel relocation is its absolute address. 519 referentOffset = totalAddend - referentSecHead.addr; 520 } 521 r.referent = findContainingSubsection(*sections[relInfo.r_symbolnum - 1], 522 &referentOffset); 523 r.addend = referentOffset; 524 } 525 526 // Find the subsection that this relocation belongs to. 
527 // Though not required by the Mach-O format, clang and gcc seem to emit 528 // relocations in order, so let's take advantage of it. However, ld64 emits 529 // unsorted relocations (in `-r` mode), so we have a fallback for that 530 // uncommon case. 531 InputSection *subsec; 532 while (subsecIt != subsections.rend() && subsecIt->offset > r.offset) 533 ++subsecIt; 534 if (subsecIt == subsections.rend() || 535 subsecIt->offset + subsecIt->isec->getSize() <= r.offset) { 536 subsec = findContainingSubsection(section, &r.offset); 537 // Now that we know the relocs are unsorted, avoid trying the 'fast path' 538 // for the other relocations. 539 subsecIt = subsections.rend(); 540 } else { 541 subsec = subsecIt->isec; 542 r.offset -= subsecIt->offset; 543 } 544 subsec->relocs.push_back(r); 545 546 if (isSubtrahend) { 547 relocation_info minuendInfo = relInfos[++i]; 548 // SUBTRACTOR relocations should always be followed by an UNSIGNED one 549 // attached to the same address. 550 assert(target->hasAttr(minuendInfo.r_type, RelocAttrBits::UNSIGNED) && 551 relInfo.r_address == minuendInfo.r_address); 552 Reloc p; 553 p.type = minuendInfo.r_type; 554 if (minuendInfo.r_extern) { 555 p.referent = symbols[minuendInfo.r_symbolnum]; 556 p.addend = totalAddend; 557 } else { 558 uint64_t referentOffset = 559 totalAddend - sectionHeaders[minuendInfo.r_symbolnum - 1].addr; 560 p.referent = findContainingSubsection( 561 *sections[minuendInfo.r_symbolnum - 1], &referentOffset); 562 p.addend = referentOffset; 563 } 564 subsec->relocs.push_back(p); 565 } 566 } 567 } 568 569 template <class NList> 570 static macho::Symbol *createDefined(const NList &sym, StringRef name, 571 InputSection *isec, uint64_t value, 572 uint64_t size) { 573 // Symbol scope is determined by sym.n_type & (N_EXT | N_PEXT): 574 // N_EXT: Global symbols. These go in the symbol table during the link, 575 // and also in the export table of the output so that the dynamic 576 // linker sees them. 
577 // N_EXT | N_PEXT: Linkage unit (think: dylib) scoped. These go in the 578 // symbol table during the link so that duplicates are 579 // either reported (for non-weak symbols) or merged 580 // (for weak symbols), but they do not go in the export 581 // table of the output. 582 // N_PEXT: llvm-mc does not emit these, but `ld -r` (wherein ld64 emits 583 // object files) may produce them. LLD does not yet support -r. 584 // These are translation-unit scoped, identical to the `0` case. 585 // 0: Translation-unit scoped. These are not in the symbol table during 586 // link, and not in the export table of the output either. 587 bool isWeakDefCanBeHidden = 588 (sym.n_desc & (N_WEAK_DEF | N_WEAK_REF)) == (N_WEAK_DEF | N_WEAK_REF); 589 590 if (sym.n_type & N_EXT) { 591 bool isPrivateExtern = sym.n_type & N_PEXT; 592 // lld's behavior for merging symbols is slightly different from ld64: 593 // ld64 picks the winning symbol based on several criteria (see 594 // pickBetweenRegularAtoms() in ld64's SymbolTable.cpp), while lld 595 // just merges metadata and keeps the contents of the first symbol 596 // with that name (see SymbolTable::addDefined). For: 597 // * inline function F in a TU built with -fvisibility-inlines-hidden 598 // * and inline function F in another TU built without that flag 599 // ld64 will pick the one from the file built without 600 // -fvisibility-inlines-hidden. 601 // lld will instead pick the one listed first on the link command line and 602 // give it visibility as if the function was built without 603 // -fvisibility-inlines-hidden. 604 // If both functions have the same contents, this will have the same 605 // behavior. If not, it won't, but the input had an ODR violation in 606 // that case. 
607 // 608 // Similarly, merging a symbol 609 // that's isPrivateExtern and not isWeakDefCanBeHidden with one 610 // that's not isPrivateExtern but isWeakDefCanBeHidden technically 611 // should produce one 612 // that's not isPrivateExtern but isWeakDefCanBeHidden. That matters 613 // with ld64's semantics, because it means the non-private-extern 614 // definition will continue to take priority if more private extern 615 // definitions are encountered. With lld's semantics there's no observable 616 // difference between a symbol that's isWeakDefCanBeHidden(autohide) or one 617 // that's privateExtern -- neither makes it into the dynamic symbol table, 618 // unless the autohide symbol is explicitly exported. 619 // But if a symbol is both privateExtern and autohide then it can't 620 // be exported. 621 // So we nullify the autohide flag when privateExtern is present 622 // and promote the symbol to privateExtern when it is not already. 623 if (isWeakDefCanBeHidden && isPrivateExtern) 624 isWeakDefCanBeHidden = false; 625 else if (isWeakDefCanBeHidden) 626 isPrivateExtern = true; 627 return symtab->addDefined( 628 name, isec->getFile(), isec, value, size, sym.n_desc & N_WEAK_DEF, 629 isPrivateExtern, sym.n_desc & N_ARM_THUMB_DEF, 630 sym.n_desc & REFERENCED_DYNAMICALLY, sym.n_desc & N_NO_DEAD_STRIP, 631 isWeakDefCanBeHidden); 632 } 633 assert(!isWeakDefCanBeHidden && 634 "weak_def_can_be_hidden on already-hidden symbol?"); 635 return make<Defined>( 636 name, isec->getFile(), isec, value, size, sym.n_desc & N_WEAK_DEF, 637 /*isExternal=*/false, /*isPrivateExtern=*/false, 638 sym.n_desc & N_ARM_THUMB_DEF, sym.n_desc & REFERENCED_DYNAMICALLY, 639 sym.n_desc & N_NO_DEAD_STRIP); 640 } 641 642 // Absolute symbols are defined symbols that do not have an associated 643 // InputSection. They cannot be weak. 
644 template <class NList> 645 static macho::Symbol *createAbsolute(const NList &sym, InputFile *file, 646 StringRef name) { 647 if (sym.n_type & N_EXT) { 648 return symtab->addDefined( 649 name, file, nullptr, sym.n_value, /*size=*/0, 650 /*isWeakDef=*/false, sym.n_type & N_PEXT, sym.n_desc & N_ARM_THUMB_DEF, 651 /*isReferencedDynamically=*/false, sym.n_desc & N_NO_DEAD_STRIP, 652 /*isWeakDefCanBeHidden=*/false); 653 } 654 return make<Defined>(name, file, nullptr, sym.n_value, /*size=*/0, 655 /*isWeakDef=*/false, 656 /*isExternal=*/false, /*isPrivateExtern=*/false, 657 sym.n_desc & N_ARM_THUMB_DEF, 658 /*isReferencedDynamically=*/false, 659 sym.n_desc & N_NO_DEAD_STRIP); 660 } 661 662 template <class NList> 663 macho::Symbol *ObjFile::parseNonSectionSymbol(const NList &sym, 664 StringRef name) { 665 uint8_t type = sym.n_type & N_TYPE; 666 switch (type) { 667 case N_UNDF: 668 return sym.n_value == 0 669 ? symtab->addUndefined(name, this, sym.n_desc & N_WEAK_REF) 670 : symtab->addCommon(name, this, sym.n_value, 671 1 << GET_COMM_ALIGN(sym.n_desc), 672 sym.n_type & N_PEXT); 673 case N_ABS: 674 return createAbsolute(sym, this, name); 675 case N_PBUD: 676 case N_INDR: 677 error("TODO: support symbols of type " + std::to_string(type)); 678 return nullptr; 679 case N_SECT: 680 llvm_unreachable( 681 "N_SECT symbols should not be passed to parseNonSectionSymbol"); 682 default: 683 llvm_unreachable("invalid symbol type"); 684 } 685 } 686 687 template <class NList> static bool isUndef(const NList &sym) { 688 return (sym.n_type & N_TYPE) == N_UNDF && sym.n_value == 0; 689 } 690 691 template <class LP> 692 void ObjFile::parseSymbols(ArrayRef<typename LP::section> sectionHeaders, 693 ArrayRef<typename LP::nlist> nList, 694 const char *strtab, bool subsectionsViaSymbols) { 695 using NList = typename LP::nlist; 696 697 // Groups indices of the symbols by the sections that contain them. 
698 std::vector<std::vector<uint32_t>> symbolsBySection(sections.size()); 699 symbols.resize(nList.size()); 700 SmallVector<unsigned, 32> undefineds; 701 for (uint32_t i = 0; i < nList.size(); ++i) { 702 const NList &sym = nList[i]; 703 704 // Ignore debug symbols for now. 705 // FIXME: may need special handling. 706 if (sym.n_type & N_STAB) 707 continue; 708 709 StringRef name = strtab + sym.n_strx; 710 if ((sym.n_type & N_TYPE) == N_SECT) { 711 Subsections &subsections = sections[sym.n_sect - 1]->subsections; 712 // parseSections() may have chosen not to parse this section. 713 if (subsections.empty()) 714 continue; 715 symbolsBySection[sym.n_sect - 1].push_back(i); 716 } else if (isUndef(sym)) { 717 undefineds.push_back(i); 718 } else { 719 symbols[i] = parseNonSectionSymbol(sym, name); 720 } 721 } 722 723 for (size_t i = 0; i < sections.size(); ++i) { 724 Subsections &subsections = sections[i]->subsections; 725 if (subsections.empty()) 726 continue; 727 InputSection *lastIsec = subsections.back().isec; 728 if (lastIsec->getName() == section_names::ehFrame) { 729 // __TEXT,__eh_frame only has symbols and SUBTRACTOR relocs when ld64 -r 730 // adds local "EH_Frame1" and "func.eh". Ignore them because they have 731 // gone unused by Mac OS since Snow Leopard (10.6), vintage 2009. 732 continue; 733 } 734 std::vector<uint32_t> &symbolIndices = symbolsBySection[i]; 735 uint64_t sectionAddr = sectionHeaders[i].addr; 736 uint32_t sectionAlign = 1u << sectionHeaders[i].align; 737 738 // Record-based sections have already been split into subsections during 739 // parseSections(), so we simply need to match Symbols to the corresponding 740 // subsection here. 
741 if (getRecordSize(lastIsec->getSegName(), lastIsec->getName())) { 742 for (size_t j = 0; j < symbolIndices.size(); ++j) { 743 uint32_t symIndex = symbolIndices[j]; 744 const NList &sym = nList[symIndex]; 745 StringRef name = strtab + sym.n_strx; 746 uint64_t symbolOffset = sym.n_value - sectionAddr; 747 InputSection *isec = 748 findContainingSubsection(*sections[i], &symbolOffset); 749 if (symbolOffset != 0) { 750 error(toString(lastIsec) + ": symbol " + name + 751 " at misaligned offset"); 752 continue; 753 } 754 symbols[symIndex] = createDefined(sym, name, isec, 0, isec->getSize()); 755 } 756 continue; 757 } 758 759 // Calculate symbol sizes and create subsections by splitting the sections 760 // along symbol boundaries. 761 // We populate subsections by repeatedly splitting the last (highest 762 // address) subsection. 763 llvm::stable_sort(symbolIndices, [&](uint32_t lhs, uint32_t rhs) { 764 return nList[lhs].n_value < nList[rhs].n_value; 765 }); 766 for (size_t j = 0; j < symbolIndices.size(); ++j) { 767 uint32_t symIndex = symbolIndices[j]; 768 const NList &sym = nList[symIndex]; 769 StringRef name = strtab + sym.n_strx; 770 Subsection &subsec = subsections.back(); 771 InputSection *isec = subsec.isec; 772 773 uint64_t subsecAddr = sectionAddr + subsec.offset; 774 size_t symbolOffset = sym.n_value - subsecAddr; 775 uint64_t symbolSize = 776 j + 1 < symbolIndices.size() 777 ? nList[symbolIndices[j + 1]].n_value - sym.n_value 778 : isec->data.size() - symbolOffset; 779 // There are 4 cases where we do not need to create a new subsection: 780 // 1. If the input file does not use subsections-via-symbols. 781 // 2. Multiple symbols at the same address only induce one subsection. 782 // (The symbolOffset == 0 check covers both this case as well as 783 // the first loop iteration.) 784 // 3. Alternative entry points do not induce new subsections. 785 // 4. If we have a literal section (e.g. __cstring and __literal4). 
786 if (!subsectionsViaSymbols || symbolOffset == 0 || 787 sym.n_desc & N_ALT_ENTRY || !isa<ConcatInputSection>(isec)) { 788 symbols[symIndex] = 789 createDefined(sym, name, isec, symbolOffset, symbolSize); 790 continue; 791 } 792 auto *concatIsec = cast<ConcatInputSection>(isec); 793 794 auto *nextIsec = make<ConcatInputSection>(*concatIsec); 795 nextIsec->wasCoalesced = false; 796 if (isZeroFill(isec->getFlags())) { 797 // Zero-fill sections have NULL data.data() non-zero data.size() 798 nextIsec->data = {nullptr, isec->data.size() - symbolOffset}; 799 isec->data = {nullptr, symbolOffset}; 800 } else { 801 nextIsec->data = isec->data.slice(symbolOffset); 802 isec->data = isec->data.slice(0, symbolOffset); 803 } 804 805 // By construction, the symbol will be at offset zero in the new 806 // subsection. 807 symbols[symIndex] = 808 createDefined(sym, name, nextIsec, /*value=*/0, symbolSize); 809 // TODO: ld64 appears to preserve the original alignment as well as each 810 // subsection's offset from the last aligned address. We should consider 811 // emulating that behavior. 812 nextIsec->align = MinAlign(sectionAlign, sym.n_value); 813 subsections.push_back({sym.n_value - sectionAddr, nextIsec}); 814 } 815 } 816 817 // Undefined symbols can trigger recursive fetch from Archives due to 818 // LazySymbols. Process defined symbols first so that the relative order 819 // between a defined symbol and an undefined symbol does not change the 820 // symbol resolution behavior. In addition, a set of interconnected symbols 821 // will all be resolved to the same file, instead of being resolved to 822 // different files. 
823 for (unsigned i : undefineds) { 824 const NList &sym = nList[i]; 825 StringRef name = strtab + sym.n_strx; 826 symbols[i] = parseNonSectionSymbol(sym, name); 827 } 828 } 829 830 OpaqueFile::OpaqueFile(MemoryBufferRef mb, StringRef segName, 831 StringRef sectName) 832 : InputFile(OpaqueKind, mb) { 833 const auto *buf = reinterpret_cast<const uint8_t *>(mb.getBufferStart()); 834 ArrayRef<uint8_t> data = {buf, mb.getBufferSize()}; 835 sections.push_back(make<Section>(/*file=*/this, segName.take_front(16), 836 sectName.take_front(16), 837 /*flags=*/0, /*addr=*/0)); 838 Section §ion = *sections.back(); 839 ConcatInputSection *isec = make<ConcatInputSection>(section, data); 840 isec->live = true; 841 section.subsections.push_back({0, isec}); 842 } 843 844 ObjFile::ObjFile(MemoryBufferRef mb, uint32_t modTime, StringRef archiveName, 845 bool lazy) 846 : InputFile(ObjKind, mb, lazy), modTime(modTime) { 847 this->archiveName = std::string(archiveName); 848 if (lazy) { 849 if (target->wordSize == 8) 850 parseLazy<LP64>(); 851 else 852 parseLazy<ILP32>(); 853 } else { 854 if (target->wordSize == 8) 855 parse<LP64>(); 856 else 857 parse<ILP32>(); 858 } 859 } 860 861 template <class LP> void ObjFile::parse() { 862 using Header = typename LP::mach_header; 863 using SegmentCommand = typename LP::segment_command; 864 using SectionHeader = typename LP::section; 865 using NList = typename LP::nlist; 866 867 auto *buf = reinterpret_cast<const uint8_t *>(mb.getBufferStart()); 868 auto *hdr = reinterpret_cast<const Header *>(mb.getBufferStart()); 869 870 Architecture arch = getArchitectureFromCpuType(hdr->cputype, hdr->cpusubtype); 871 if (arch != config->arch()) { 872 auto msg = config->errorForArchMismatch 873 ? 
static_cast<void (*)(const Twine &)>(error) 874 : warn; 875 msg(toString(this) + " has architecture " + getArchitectureName(arch) + 876 " which is incompatible with target architecture " + 877 getArchitectureName(config->arch())); 878 return; 879 } 880 881 if (!checkCompatibility(this)) 882 return; 883 884 for (auto *cmd : findCommands<linker_option_command>(hdr, LC_LINKER_OPTION)) { 885 StringRef data{reinterpret_cast<const char *>(cmd + 1), 886 cmd->cmdsize - sizeof(linker_option_command)}; 887 parseLCLinkerOption(this, cmd->count, data); 888 } 889 890 ArrayRef<SectionHeader> sectionHeaders; 891 if (const load_command *cmd = findCommand(hdr, LP::segmentLCType)) { 892 auto *c = reinterpret_cast<const SegmentCommand *>(cmd); 893 sectionHeaders = ArrayRef<SectionHeader>{ 894 reinterpret_cast<const SectionHeader *>(c + 1), c->nsects}; 895 parseSections(sectionHeaders); 896 } 897 898 // TODO: Error on missing LC_SYMTAB? 899 if (const load_command *cmd = findCommand(hdr, LC_SYMTAB)) { 900 auto *c = reinterpret_cast<const symtab_command *>(cmd); 901 ArrayRef<NList> nList(reinterpret_cast<const NList *>(buf + c->symoff), 902 c->nsyms); 903 const char *strtab = reinterpret_cast<const char *>(buf) + c->stroff; 904 bool subsectionsViaSymbols = hdr->flags & MH_SUBSECTIONS_VIA_SYMBOLS; 905 parseSymbols<LP>(sectionHeaders, nList, strtab, subsectionsViaSymbols); 906 } 907 908 // The relocations may refer to the symbols, so we parse them after we have 909 // parsed all the symbols. 
910 for (size_t i = 0, n = sections.size(); i < n; ++i) 911 if (!sections[i]->subsections.empty()) 912 parseRelocations(sectionHeaders, sectionHeaders[i], *sections[i]); 913 914 parseDebugInfo(); 915 if (compactUnwindSection) 916 registerCompactUnwind(); 917 } 918 919 template <class LP> void ObjFile::parseLazy() { 920 using Header = typename LP::mach_header; 921 using NList = typename LP::nlist; 922 923 auto *buf = reinterpret_cast<const uint8_t *>(mb.getBufferStart()); 924 auto *hdr = reinterpret_cast<const Header *>(mb.getBufferStart()); 925 const load_command *cmd = findCommand(hdr, LC_SYMTAB); 926 if (!cmd) 927 return; 928 auto *c = reinterpret_cast<const symtab_command *>(cmd); 929 ArrayRef<NList> nList(reinterpret_cast<const NList *>(buf + c->symoff), 930 c->nsyms); 931 const char *strtab = reinterpret_cast<const char *>(buf) + c->stroff; 932 symbols.resize(nList.size()); 933 for (auto it : llvm::enumerate(nList)) { 934 const NList &sym = it.value(); 935 if ((sym.n_type & N_EXT) && !isUndef(sym)) { 936 // TODO: Bound checking 937 StringRef name = strtab + sym.n_strx; 938 symbols[it.index()] = symtab->addLazyObject(name, *this); 939 if (!lazy) 940 break; 941 } 942 } 943 } 944 945 void ObjFile::parseDebugInfo() { 946 std::unique_ptr<DwarfObject> dObj = DwarfObject::create(this); 947 if (!dObj) 948 return; 949 950 auto *ctx = make<DWARFContext>( 951 std::move(dObj), "", 952 [&](Error err) { 953 warn(toString(this) + ": " + toString(std::move(err))); 954 }, 955 [&](Error warning) { 956 warn(toString(this) + ": " + toString(std::move(warning))); 957 }); 958 959 // TODO: Since object files can contain a lot of DWARF info, we should verify 960 // that we are parsing just the info we need 961 const DWARFContext::compile_unit_range &units = ctx->compile_units(); 962 // FIXME: There can be more than one compile unit per object file. See 963 // PR48637. 
964 auto it = units.begin(); 965 compileUnit = it->get(); 966 } 967 968 ArrayRef<data_in_code_entry> ObjFile::getDataInCode() const { 969 const auto *buf = reinterpret_cast<const uint8_t *>(mb.getBufferStart()); 970 const load_command *cmd = findCommand(buf, LC_DATA_IN_CODE); 971 if (!cmd) 972 return {}; 973 const auto *c = reinterpret_cast<const linkedit_data_command *>(cmd); 974 return {reinterpret_cast<const data_in_code_entry *>(buf + c->dataoff), 975 c->datasize / sizeof(data_in_code_entry)}; 976 } 977 978 // Create pointers from symbols to their associated compact unwind entries. 979 void ObjFile::registerCompactUnwind() { 980 for (const Subsection &subsection : compactUnwindSection->subsections) { 981 ConcatInputSection *isec = cast<ConcatInputSection>(subsection.isec); 982 // Hack!! Since each CUE contains a different function address, if ICF 983 // operated naively and compared the entire contents of each CUE, entries 984 // with identical unwind info but belonging to different functions would 985 // never be considered equivalent. To work around this problem, we slice 986 // away the function address here. (Note that we do not adjust the offsets 987 // of the corresponding relocations.) We rely on `relocateCompactUnwind()` 988 // to correctly handle these truncated input sections. 989 isec->data = isec->data.slice(target->wordSize); 990 991 ConcatInputSection *referentIsec; 992 for (auto it = isec->relocs.begin(); it != isec->relocs.end();) { 993 Reloc &r = *it; 994 // CUE::functionAddress is at offset 0. Skip personality & LSDA relocs. 995 if (r.offset != 0) { 996 ++it; 997 continue; 998 } 999 uint64_t add = r.addend; 1000 if (auto *sym = cast_or_null<Defined>(r.referent.dyn_cast<Symbol *>())) { 1001 // Check whether the symbol defined in this file is the prevailing one. 1002 // Skip if it is e.g. a weak def that didn't prevail. 
1003 if (sym->getFile() != this) { 1004 ++it; 1005 continue; 1006 } 1007 add += sym->value; 1008 referentIsec = cast<ConcatInputSection>(sym->isec); 1009 } else { 1010 referentIsec = 1011 cast<ConcatInputSection>(r.referent.dyn_cast<InputSection *>()); 1012 } 1013 if (referentIsec->getSegName() != segment_names::text) 1014 error(isec->getLocation(r.offset) + " references section " + 1015 referentIsec->getName() + " which is not in segment __TEXT"); 1016 // The functionAddress relocations are typically section relocations. 1017 // However, unwind info operates on a per-symbol basis, so we search for 1018 // the function symbol here. 1019 Defined *d = findSymbolAtOffset(referentIsec, add); 1020 if (!d) { 1021 ++it; 1022 continue; 1023 } 1024 d->unwindEntry = isec; 1025 // Since we've sliced away the functionAddress, we should remove the 1026 // corresponding relocation too. Given that clang emits relocations in 1027 // reverse order of address, this relocation should be at the end of the 1028 // vector for most of our input object files, so this is typically an O(1) 1029 // operation. 1030 it = isec->relocs.erase(it); 1031 } 1032 } 1033 } 1034 1035 // The path can point to either a dylib or a .tbd file. 1036 static DylibFile *loadDylib(StringRef path, DylibFile *umbrella) { 1037 Optional<MemoryBufferRef> mbref = readFile(path); 1038 if (!mbref) { 1039 error("could not read dylib file at " + path); 1040 return nullptr; 1041 } 1042 return loadDylib(*mbref, umbrella); 1043 } 1044 1045 // TBD files are parsed into a series of TAPI documents (InterfaceFiles), with 1046 // the first document storing child pointers to the rest of them. When we are 1047 // processing a given TBD file, we store that top-level document in 1048 // currentTopLevelTapi. When processing re-exports, we search its children for 1049 // potentially matching documents in the same TBD file. Note that the children 1050 // themselves don't point to further documents, i.e. this is a two-level tree. 
//
// Re-exports can either refer to on-disk files, or to documents within .tbd
// files.
//
// Returns the loaded dylib for the install name `path`, or nullptr if none of
// the search steps below resolves it. `path` is a by-value StringRef and is
// rewritten locally while prefixes such as @loader_path/ are expanded.
static DylibFile *findDylib(StringRef path, DylibFile *umbrella,
                            const InterfaceFile *currentTopLevelTapi) {
  // Search order:
  // 1. Install name basename in -F / -L directories.
  {
    StringRef stem = path::stem(path);
    SmallString<128> frameworkName;
    // Builds "<stem>.framework/<stem>"; an install name ending in that is
    // treated as a framework.
    path::append(frameworkName, path::Style::posix, stem + ".framework", stem);
    bool isFramework = path.endswith(frameworkName);
    if (isFramework) {
      for (StringRef dir : config->frameworkSearchPaths) {
        SmallString<128> candidate = dir;
        path::append(candidate, frameworkName);
        if (Optional<StringRef> dylibPath = resolveDylibPath(candidate.str()))
          return loadDylib(*dylibPath, umbrella);
      }
    } else if (Optional<StringRef> dylibPath = findPathCombination(
                   stem, config->librarySearchPaths, {".tbd", ".dylib"}))
      return loadDylib(*dylibPath, umbrella);
  }

  // 2. As absolute path.
  if (path::is_absolute(path, path::Style::posix))
    for (StringRef root : config->systemLibraryRoots)
      if (Optional<StringRef> dylibPath = resolveDylibPath((root + path).str()))
        return loadDylib(*dylibPath, umbrella);

  // 3. As relative path.

  // TODO: Handle -dylib_file

  // Replace @executable_path, @loader_path, @rpath prefixes in install name.
  // newPath backs the rewritten `path`, so it must outlive every later use of
  // `path` in this function.
  SmallString<128> newPath;
  if (config->outputType == MH_EXECUTE &&
      path.consume_front("@executable_path/")) {
    // ld64 allows overriding this with the undocumented flag -executable_path.
    // lld doesn't currently implement that flag.
    // FIXME: Consider using finalOutput instead of outputFile.
    path::append(newPath, path::parent_path(config->outputFile), path);
    path = newPath;
  } else if (path.consume_front("@loader_path/")) {
    // Resolve relative to the directory containing the umbrella file.
    fs::real_path(umbrella->getName(), newPath);
    path::remove_filename(newPath);
    path::append(newPath, path);
    path = newPath;
  } else if (path.startswith("@rpath/")) {
    // Try each of the umbrella's rpaths in turn; the first one that resolves
    // wins.
    for (StringRef rpath : umbrella->rpaths) {
      newPath.clear();
      if (rpath.consume_front("@loader_path/")) {
        fs::real_path(umbrella->getName(), newPath);
        path::remove_filename(newPath);
      }
      path::append(newPath, rpath, path.drop_front(strlen("@rpath/")));
      if (Optional<StringRef> dylibPath = resolveDylibPath(newPath.str()))
        return loadDylib(*dylibPath, umbrella);
    }
  }

  // FIXME: Should this be further up?
  if (currentTopLevelTapi) {
    // Look for a sibling document in the same TBD file whose install name
    // matches.
    for (InterfaceFile &child :
         make_pointee_range(currentTopLevelTapi->documents())) {
      assert(child.documents().empty());
      if (path == child.getInstallName()) {
        auto file = make<DylibFile>(child, umbrella);
        file->parseReexports(child);
        return file;
      }
    }
  }

  if (Optional<StringRef> dylibPath = resolveDylibPath(path))
    return loadDylib(*dylibPath, umbrella);

  return nullptr;
}

// If a re-exported dylib is public (lives in /usr/lib or
// /System/Library/Frameworks), then it is considered implicitly linked: we
// should bind to its symbols directly instead of via the re-exporting umbrella
// library.
1135 static bool isImplicitlyLinked(StringRef path) { 1136 if (!config->implicitDylibs) 1137 return false; 1138 1139 if (path::parent_path(path) == "/usr/lib") 1140 return true; 1141 1142 // Match /System/Library/Frameworks/$FOO.framework/**/$FOO 1143 if (path.consume_front("/System/Library/Frameworks/")) { 1144 StringRef frameworkName = path.take_until([](char c) { return c == '.'; }); 1145 return path::filename(path) == frameworkName; 1146 } 1147 1148 return false; 1149 } 1150 1151 static void loadReexport(StringRef path, DylibFile *umbrella, 1152 const InterfaceFile *currentTopLevelTapi) { 1153 DylibFile *reexport = findDylib(path, umbrella, currentTopLevelTapi); 1154 if (!reexport) 1155 error("unable to locate re-export with install name " + path); 1156 } 1157 1158 DylibFile::DylibFile(MemoryBufferRef mb, DylibFile *umbrella, 1159 bool isBundleLoader) 1160 : InputFile(DylibKind, mb), refState(RefState::Unreferenced), 1161 isBundleLoader(isBundleLoader) { 1162 assert(!isBundleLoader || !umbrella); 1163 if (umbrella == nullptr) 1164 umbrella = this; 1165 this->umbrella = umbrella; 1166 1167 auto *buf = reinterpret_cast<const uint8_t *>(mb.getBufferStart()); 1168 auto *hdr = reinterpret_cast<const mach_header *>(mb.getBufferStart()); 1169 1170 // Initialize installName. 1171 if (const load_command *cmd = findCommand(hdr, LC_ID_DYLIB)) { 1172 auto *c = reinterpret_cast<const dylib_command *>(cmd); 1173 currentVersion = read32le(&c->dylib.current_version); 1174 compatibilityVersion = read32le(&c->dylib.compatibility_version); 1175 installName = 1176 reinterpret_cast<const char *>(cmd) + read32le(&c->dylib.name); 1177 } else if (!isBundleLoader) { 1178 // macho_executable and macho_bundle don't have LC_ID_DYLIB, 1179 // so it's OK. 
1180 error("dylib " + toString(this) + " missing LC_ID_DYLIB load command"); 1181 return; 1182 } 1183 1184 if (config->printEachFile) 1185 message(toString(this)); 1186 inputFiles.insert(this); 1187 1188 deadStrippable = hdr->flags & MH_DEAD_STRIPPABLE_DYLIB; 1189 1190 if (!checkCompatibility(this)) 1191 return; 1192 1193 checkAppExtensionSafety(hdr->flags & MH_APP_EXTENSION_SAFE); 1194 1195 for (auto *cmd : findCommands<rpath_command>(hdr, LC_RPATH)) { 1196 StringRef rpath{reinterpret_cast<const char *>(cmd) + cmd->path}; 1197 rpaths.push_back(rpath); 1198 } 1199 1200 // Initialize symbols. 1201 exportingFile = isImplicitlyLinked(installName) ? this : this->umbrella; 1202 if (const load_command *cmd = findCommand(hdr, LC_DYLD_INFO_ONLY)) { 1203 auto *c = reinterpret_cast<const dyld_info_command *>(cmd); 1204 struct TrieEntry { 1205 StringRef name; 1206 uint64_t flags; 1207 }; 1208 1209 std::vector<TrieEntry> entries; 1210 // Find all the $ld$* symbols to process first. 1211 parseTrie(buf + c->export_off, c->export_size, 1212 [&](const Twine &name, uint64_t flags) { 1213 StringRef savedName = saver().save(name); 1214 if (handleLDSymbol(savedName)) 1215 return; 1216 entries.push_back({savedName, flags}); 1217 }); 1218 1219 // Process the "normal" symbols. 
1220 for (TrieEntry &entry : entries) { 1221 if (exportingFile->hiddenSymbols.contains( 1222 CachedHashStringRef(entry.name))) 1223 continue; 1224 1225 bool isWeakDef = entry.flags & EXPORT_SYMBOL_FLAGS_WEAK_DEFINITION; 1226 bool isTlv = entry.flags & EXPORT_SYMBOL_FLAGS_KIND_THREAD_LOCAL; 1227 1228 symbols.push_back( 1229 symtab->addDylib(entry.name, exportingFile, isWeakDef, isTlv)); 1230 } 1231 1232 } else { 1233 error("LC_DYLD_INFO_ONLY not found in " + toString(this)); 1234 return; 1235 } 1236 } 1237 1238 void DylibFile::parseLoadCommands(MemoryBufferRef mb) { 1239 auto *hdr = reinterpret_cast<const mach_header *>(mb.getBufferStart()); 1240 const uint8_t *p = reinterpret_cast<const uint8_t *>(mb.getBufferStart()) + 1241 target->headerSize; 1242 for (uint32_t i = 0, n = hdr->ncmds; i < n; ++i) { 1243 auto *cmd = reinterpret_cast<const load_command *>(p); 1244 p += cmd->cmdsize; 1245 1246 if (!(hdr->flags & MH_NO_REEXPORTED_DYLIBS) && 1247 cmd->cmd == LC_REEXPORT_DYLIB) { 1248 const auto *c = reinterpret_cast<const dylib_command *>(cmd); 1249 StringRef reexportPath = 1250 reinterpret_cast<const char *>(c) + read32le(&c->dylib.name); 1251 loadReexport(reexportPath, exportingFile, nullptr); 1252 } 1253 1254 // FIXME: What about LC_LOAD_UPWARD_DYLIB, LC_LAZY_LOAD_DYLIB, 1255 // LC_LOAD_WEAK_DYLIB, LC_REEXPORT_DYLIB (..are reexports from dylibs with 1256 // MH_NO_REEXPORTED_DYLIBS loaded for -flat_namespace)? 
// Under -flat_namespace, also load every library named by LC_LOAD_DYLIB.
    if (config->namespaceKind == NamespaceKind::flat &&
        cmd->cmd == LC_LOAD_DYLIB) {
      const auto *c = reinterpret_cast<const dylib_command *>(cmd);
      // dylib.name is an offset from the start of the load command.
      StringRef dylibPath =
          reinterpret_cast<const char *>(c) + read32le(&c->dylib.name);
      DylibFile *dylib = findDylib(dylibPath, umbrella, nullptr);
      if (!dylib)
        error(Twine("unable to locate library '") + dylibPath +
              "' loaded from '" + toString(this) + "' for -flat_namespace");
    }
  }
}

// Some versions of XCode ship with .tbd files that don't have the right
// platform settings.
// Install names listed here bypass the platform/target check below.
constexpr std::array<StringRef, 4> skipPlatformChecks{
    "/usr/lib/system/libsystem_kernel.dylib",
    "/usr/lib/system/libsystem_platform.dylib",
    "/usr/lib/system/libsystem_pthread.dylib",
    "/usr/lib/system/libcompiler_rt.dylib"};

// Builds a DylibFile from a TAPI document (.tbd): takes the install name and
// versions from the interface, registers the file as an input, checks that
// the interface supports the configured target, and adds its symbols.
// A file without an umbrella acts as its own umbrella.
DylibFile::DylibFile(const InterfaceFile &interface, DylibFile *umbrella,
                     bool isBundleLoader)
    : InputFile(DylibKind, interface), refState(RefState::Unreferenced),
      isBundleLoader(isBundleLoader) {
  // FIXME: Add test for the missing TBD code path.

  if (umbrella == nullptr)
    umbrella = this;
  this->umbrella = umbrella;

  installName = saver().save(interface.getInstallName());
  compatibilityVersion = interface.getCompatibilityVersion().rawValue();
  currentVersion = interface.getCurrentVersion().rawValue();

  if (config->printEachFile)
    message(toString(this));
  inputFiles.insert(this);

  // Reject interfaces that do not list the configured target, unless the
  // install name is exempt from the platform check.
  if (!is_contained(skipPlatformChecks, installName) &&
      !is_contained(interface.targets(), config->platformInfo.target)) {
    error(toString(this) + " is incompatible with " +
          std::string(config->platformInfo.target));
    return;
  }

  checkAppExtensionSafety(interface.isApplicationExtensionSafe());

  exportingFile = isImplicitlyLinked(installName) ? this : umbrella;
  // Adds one dylib symbol under `name`, unless that name is hidden on the
  // exporting file.
  auto addSymbol = [&](const Twine &name) -> void {
    StringRef savedName = saver().save(name);
    if (exportingFile->hiddenSymbols.contains(CachedHashStringRef(savedName)))
      return;

    symbols.push_back(symtab->addDylib(savedName, exportingFile,
                                       /*isWeakDef=*/false,
                                       /*isTlv=*/false));
  };

  // First pass: let handleLDSymbol() consume the $ld$ symbols and collect the
  // remaining symbols for the configured architecture. They are added only in
  // the second pass, after every $ld$ symbol has been processed.
  std::vector<const llvm::MachO::Symbol *> normalSymbols;
  normalSymbols.reserve(interface.symbolsCount());
  for (const auto *symbol : interface.symbols()) {
    if (!symbol->getArchitectures().has(config->arch()))
      continue;
    if (handleLDSymbol(symbol->getName()))
      continue;

    switch (symbol->getKind()) {
    case SymbolKind::GlobalSymbol:               // Fallthrough
    case SymbolKind::ObjectiveCClass:            // Fallthrough
    case SymbolKind::ObjectiveCClassEHType:      // Fallthrough
    case SymbolKind::ObjectiveCInstanceVariable: // Fallthrough
      normalSymbols.push_back(symbol);
    }
  }

  // TODO(compnerd) filter out symbols based on the target platform
  // TODO: handle weak defs, thread locals
  for (const auto *symbol : normalSymbols) {
    switch (symbol->getKind()) {
    case SymbolKind::GlobalSymbol:
      addSymbol(symbol->getName());
      break;
    case SymbolKind::ObjectiveCClass:
      // XXX ld64 only creates these symbols when -ObjC is passed in. We may
      // want to emulate that.
      addSymbol(objc::klass + symbol->getName());
      addSymbol(objc::metaclass + symbol->getName());
      break;
    case SymbolKind::ObjectiveCClassEHType:
      addSymbol(objc::ehtype + symbol->getName());
      break;
    case SymbolKind::ObjectiveCInstanceVariable:
      addSymbol(objc::ivar + symbol->getName());
      break;
    }
  }
}

// Loads every library re-exported by `interface` whose targets include the
// configured target (or whose install name is exempt from the platform
// check). Lookups are given the top-level document of the TBD file so that
// sibling documents can be matched.
void DylibFile::parseReexports(const InterfaceFile &interface) {
  const InterfaceFile *topLevel =
      interface.getParent() == nullptr ? &interface : interface.getParent();
  for (const InterfaceFileRef &intfRef : interface.reexportedLibraries()) {
    InterfaceFile::const_target_range targets = intfRef.targets();
    if (is_contained(skipPlatformChecks, intfRef.getInstallName()) ||
        is_contained(targets, config->platformInfo.target))
      loadReexport(intfRef.getInstallName(), exportingFile, topLevel);
  }
}

// $ld$ symbols modify the properties/behavior of the library (e.g. its install
// name, compatibility version or hide/add symbols) for specific target
// versions.
// Returns true for every name starting with "$ld$", including names whose
// action is not one of the three handled below; returns false otherwise.
bool DylibFile::handleLDSymbol(StringRef originalName) {
  if (!originalName.startswith("$ld$"))
    return false;

  StringRef action;
  StringRef name;
  // Split "$ld$<action>$<rest>" into the action and the remainder.
  std::tie(action, name) = originalName.drop_front(strlen("$ld$")).split('$');
  if (action == "previous")
    handleLDPreviousSymbol(name, originalName);
  else if (action == "install_name")
    handleLDInstallNameSymbol(name, originalName);
  else if (action == "hide")
    handleLDHideSymbol(name, originalName);
  return true;
}

// Handles a "$ld$previous$..." symbol. `name` is the part after the action;
// `originalName` is the full symbol name and is used only in diagnostics.
void DylibFile::handleLDPreviousSymbol(StringRef name, StringRef originalName) {
  // originalName: $ld$ previous $ <installname> $ <compatversion> $
  // <platformstr> $ <startversion> $ <endversion> $ <symbol-name> $
  StringRef installName;
  StringRef compatVersion;
  StringRef platformStr;
  StringRef startVersion;
  StringRef endVersion;
  StringRef symbolName;
  StringRef rest;

  // Peel off one '$'-separated field at a time.
  std::tie(installName, name) = name.split('$');
  std::tie(compatVersion, name) = name.split('$');
  std::tie(platformStr, name) = name.split('$');
  std::tie(startVersion, name) = name.split('$');
  std::tie(endVersion, name) = name.split('$');
  std::tie(symbolName, rest) = name.split('$');
  // TODO: ld64 contains some logic for non-empty symbolName as well.
1404 if (!symbolName.empty()) 1405 return; 1406 unsigned platform; 1407 if (platformStr.getAsInteger(10, platform) || 1408 platform != static_cast<unsigned>(config->platform())) 1409 return; 1410 1411 VersionTuple start; 1412 if (start.tryParse(startVersion)) { 1413 warn("failed to parse start version, symbol '" + originalName + 1414 "' ignored"); 1415 return; 1416 } 1417 VersionTuple end; 1418 if (end.tryParse(endVersion)) { 1419 warn("failed to parse end version, symbol '" + originalName + "' ignored"); 1420 return; 1421 } 1422 if (config->platformInfo.minimum < start || 1423 config->platformInfo.minimum >= end) 1424 return; 1425 1426 this->installName = saver().save(installName); 1427 1428 if (!compatVersion.empty()) { 1429 VersionTuple cVersion; 1430 if (cVersion.tryParse(compatVersion)) { 1431 warn("failed to parse compatibility version, symbol '" + originalName + 1432 "' ignored"); 1433 return; 1434 } 1435 compatibilityVersion = encodeVersion(cVersion); 1436 } 1437 } 1438 1439 void DylibFile::handleLDInstallNameSymbol(StringRef name, 1440 StringRef originalName) { 1441 // originalName: $ld$ install_name $ os<version> $ install_name 1442 StringRef condition, installName; 1443 std::tie(condition, installName) = name.split('$'); 1444 VersionTuple version; 1445 if (!condition.consume_front("os") || version.tryParse(condition)) 1446 warn("failed to parse os version, symbol '" + originalName + "' ignored"); 1447 else if (version == config->platformInfo.minimum) 1448 this->installName = saver().save(installName); 1449 } 1450 1451 void DylibFile::handleLDHideSymbol(StringRef name, StringRef originalName) { 1452 StringRef symbolName; 1453 bool shouldHide = true; 1454 if (name.startswith("os")) { 1455 // If it's hidden based on versions. 
1456 name = name.drop_front(2); 1457 StringRef minVersion; 1458 std::tie(minVersion, symbolName) = name.split('$'); 1459 VersionTuple versionTup; 1460 if (versionTup.tryParse(minVersion)) { 1461 warn("Failed to parse hidden version, symbol `" + originalName + 1462 "` ignored."); 1463 return; 1464 } 1465 shouldHide = versionTup == config->platformInfo.minimum; 1466 } else { 1467 symbolName = name; 1468 } 1469 1470 if (shouldHide) 1471 exportingFile->hiddenSymbols.insert(CachedHashStringRef(symbolName)); 1472 } 1473 1474 void DylibFile::checkAppExtensionSafety(bool dylibIsAppExtensionSafe) const { 1475 if (config->applicationExtension && !dylibIsAppExtensionSafe) 1476 warn("using '-application_extension' with unsafe dylib: " + toString(this)); 1477 } 1478 1479 ArchiveFile::ArchiveFile(std::unique_ptr<object::Archive> &&f) 1480 : InputFile(ArchiveKind, f->getMemoryBufferRef()), file(std::move(f)) {} 1481 1482 void ArchiveFile::addLazySymbols() { 1483 for (const object::Archive::Symbol &sym : file->symbols()) 1484 symtab->addLazyArchive(sym.getName(), this, sym); 1485 } 1486 1487 static Expected<InputFile *> loadArchiveMember(MemoryBufferRef mb, 1488 uint32_t modTime, 1489 StringRef archiveName, 1490 uint64_t offsetInArchive) { 1491 if (config->zeroModTime) 1492 modTime = 0; 1493 1494 switch (identify_magic(mb.getBuffer())) { 1495 case file_magic::macho_object: 1496 return make<ObjFile>(mb, modTime, archiveName); 1497 case file_magic::bitcode: 1498 return make<BitcodeFile>(mb, archiveName, offsetInArchive); 1499 default: 1500 return createStringError(inconvertibleErrorCode(), 1501 mb.getBufferIdentifier() + 1502 " has unhandled file type"); 1503 } 1504 } 1505 1506 Error ArchiveFile::fetch(const object::Archive::Child &c, StringRef reason) { 1507 if (!seen.insert(c.getChildOffset()).second) 1508 return Error::success(); 1509 1510 Expected<MemoryBufferRef> mb = c.getMemoryBufferRef(); 1511 if (!mb) 1512 return mb.takeError(); 1513 1514 // Thin archives refer to .o files, 
so --reproduce needs the .o files too. 1515 if (tar && c.getParent()->isThin()) 1516 tar->append(relativeToRoot(CHECK(c.getFullName(), this)), mb->getBuffer()); 1517 1518 Expected<TimePoint<std::chrono::seconds>> modTime = c.getLastModified(); 1519 if (!modTime) 1520 return modTime.takeError(); 1521 1522 Expected<InputFile *> file = 1523 loadArchiveMember(*mb, toTimeT(*modTime), getName(), c.getChildOffset()); 1524 1525 if (!file) 1526 return file.takeError(); 1527 1528 inputFiles.insert(*file); 1529 printArchiveMemberLoad(reason, *file); 1530 return Error::success(); 1531 } 1532 1533 void ArchiveFile::fetch(const object::Archive::Symbol &sym) { 1534 object::Archive::Child c = 1535 CHECK(sym.getMember(), toString(this) + 1536 ": could not get the member defining symbol " + 1537 toMachOString(sym)); 1538 1539 // `sym` is owned by a LazySym, which will be replace<>()d by make<ObjFile> 1540 // and become invalid after that call. Copy it to the stack so we can refer 1541 // to it later. 1542 const object::Archive::Symbol symCopy = sym; 1543 1544 // ld64 doesn't demangle sym here even with -demangle. 1545 // Match that: intentionally don't call toMachOString(). 1546 if (Error e = fetch(c, symCopy.getName())) 1547 error(toString(this) + ": could not get the member defining symbol " + 1548 toMachOString(symCopy) + ": " + toString(std::move(e))); 1549 } 1550 1551 static macho::Symbol *createBitcodeSymbol(const lto::InputFile::Symbol &objSym, 1552 BitcodeFile &file) { 1553 StringRef name = saver().save(objSym.getName()); 1554 1555 if (objSym.isUndefined()) 1556 return symtab->addUndefined(name, &file, /*isWeakRef=*/objSym.isWeak()); 1557 1558 // TODO: Write a test demonstrating why computing isPrivateExtern before 1559 // LTO compilation is important. 
1560 bool isPrivateExtern = false; 1561 switch (objSym.getVisibility()) { 1562 case GlobalValue::HiddenVisibility: 1563 isPrivateExtern = true; 1564 break; 1565 case GlobalValue::ProtectedVisibility: 1566 error(name + " has protected visibility, which is not supported by Mach-O"); 1567 break; 1568 case GlobalValue::DefaultVisibility: 1569 break; 1570 } 1571 isPrivateExtern = isPrivateExtern || objSym.canBeOmittedFromSymbolTable(); 1572 1573 if (objSym.isCommon()) 1574 return symtab->addCommon(name, &file, objSym.getCommonSize(), 1575 objSym.getCommonAlignment(), isPrivateExtern); 1576 1577 return symtab->addDefined(name, &file, /*isec=*/nullptr, /*value=*/0, 1578 /*size=*/0, objSym.isWeak(), isPrivateExtern, 1579 /*isThumb=*/false, 1580 /*isReferencedDynamically=*/false, 1581 /*noDeadStrip=*/false, 1582 /*isWeakDefCanBeHidden=*/false); 1583 } 1584 1585 BitcodeFile::BitcodeFile(MemoryBufferRef mb, StringRef archiveName, 1586 uint64_t offsetInArchive, bool lazy) 1587 : InputFile(BitcodeKind, mb, lazy) { 1588 this->archiveName = std::string(archiveName); 1589 std::string path = mb.getBufferIdentifier().str(); 1590 // ThinLTO assumes that all MemoryBufferRefs given to it have a unique 1591 // name. If two members with the same name are provided, this causes a 1592 // collision and ThinLTO can't proceed. 1593 // So, we append the archive name to disambiguate two members with the same 1594 // name from multiple different archives, and offset within the archive to 1595 // disambiguate two members of the same name from a single archive. 1596 MemoryBufferRef mbref(mb.getBuffer(), 1597 saver().save(archiveName.empty() 1598 ? path 1599 : archiveName + 1600 sys::path::filename(path) + 1601 utostr(offsetInArchive))); 1602 1603 obj = check(lto::InputFile::create(mbref)); 1604 if (lazy) 1605 parseLazy(); 1606 else 1607 parse(); 1608 } 1609 1610 void BitcodeFile::parse() { 1611 // Convert LTO Symbols to LLD Symbols in order to perform resolution. 
The 1612 // "winning" symbol will then be marked as Prevailing at LTO compilation 1613 // time. 1614 symbols.clear(); 1615 for (const lto::InputFile::Symbol &objSym : obj->symbols()) 1616 symbols.push_back(createBitcodeSymbol(objSym, *this)); 1617 } 1618 1619 void BitcodeFile::parseLazy() { 1620 symbols.resize(obj->symbols().size()); 1621 for (auto it : llvm::enumerate(obj->symbols())) { 1622 const lto::InputFile::Symbol &objSym = it.value(); 1623 if (!objSym.isUndefined()) { 1624 symbols[it.index()] = 1625 symtab->addLazyObject(saver().save(objSym.getName()), *this); 1626 if (!lazy) 1627 break; 1628 } 1629 } 1630 } 1631 1632 void macho::extract(InputFile &file, StringRef reason) { 1633 assert(file.lazy); 1634 file.lazy = false; 1635 printArchiveMemberLoad(reason, &file); 1636 if (auto *bitcode = dyn_cast<BitcodeFile>(&file)) { 1637 bitcode->parse(); 1638 } else { 1639 auto &f = cast<ObjFile>(file); 1640 if (target->wordSize == 8) 1641 f.parse<LP64>(); 1642 else 1643 f.parse<ILP32>(); 1644 } 1645 } 1646 1647 template void ObjFile::parse<LP64>(); 1648