//===- DbiStreamBuilder.cpp - PDB Dbi Stream Creation -----------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // The data structures defined in this file are based on the reference // implementation which is available at // https://github.com/Microsoft/microsoft-pdb/blob/master/PDB/dbi/gsi.cpp // //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h" #include "llvm/ADT/DenseSet.h" #include "llvm/DebugInfo/CodeView/RecordName.h" #include "llvm/DebugInfo/CodeView/SymbolDeserializer.h" #include "llvm/DebugInfo/CodeView/SymbolRecord.h" #include "llvm/DebugInfo/CodeView/SymbolSerializer.h" #include "llvm/DebugInfo/MSF/MSFBuilder.h" #include "llvm/DebugInfo/MSF/MSFCommon.h" #include "llvm/DebugInfo/MSF/MappedBlockStream.h" #include "llvm/DebugInfo/PDB/Native/GlobalsStream.h" #include "llvm/DebugInfo/PDB/Native/Hash.h" #include "llvm/Support/BinaryItemStream.h" #include "llvm/Support/BinaryStreamWriter.h" #include "llvm/Support/Parallel.h" #include "llvm/Support/xxhash.h" #include #include using namespace llvm; using namespace llvm::msf; using namespace llvm::pdb; using namespace llvm::codeview; struct llvm::pdb::GSIHashStreamBuilder { struct SymbolDenseMapInfo { static inline CVSymbol getEmptyKey() { static CVSymbol Empty; return Empty; } static inline CVSymbol getTombstoneKey() { static CVSymbol Tombstone( DenseMapInfo>::getTombstoneKey()); return Tombstone; } static unsigned getHashValue(const CVSymbol &Val) { return xxHash64(Val.RecordData); } static bool isEqual(const CVSymbol &LHS, const CVSymbol &RHS) { return LHS.RecordData == RHS.RecordData; } }; std::vector Records; llvm::DenseSet SymbolHashes; std::vector HashRecords; std::array HashBitmap; std::vector HashBuckets; uint32_t calculateSerializedLength() const; uint32_t calculateRecordByteSize() const; Error commit(BinaryStreamWriter &Writer); void finalizeBuckets(uint32_t RecordZeroOffset); // Finalize public symbol buckets. void finalizeBuckets(uint32_t RecordZeroOffset, std::vector &&Publics); template void addSymbol(const T &Symbol, MSFBuilder &Msf) { T Copy(Symbol); addSymbol(SymbolSerializer::writeOneSymbol(Copy, Msf.getAllocator(), CodeViewContainer::Pdb)); } void addSymbol(const CVSymbol &Symbol); }; void GSIHashStreamBuilder::addSymbol(const codeview::CVSymbol &Symbol) { // Ignore duplicate typedefs and constants. if (Symbol.kind() == S_UDT || Symbol.kind() == S_CONSTANT) { auto Iter = SymbolHashes.insert(Symbol); if (!Iter.second) return; } Records.push_back(Symbol); } namespace { LLVM_PACKED_START struct PublicSym32Layout { RecordPrefix Prefix; PublicSym32Header Pub; // char Name[]; }; LLVM_PACKED_END } // namespace // Calculate how much memory this public needs when serialized. static uint32_t sizeOfPublic(const BulkPublic &Pub) { uint32_t NameLen = Pub.NameLen; NameLen = std::min(NameLen, uint32_t(MaxRecordLength - sizeof(PublicSym32Layout) - 1)); return alignTo(sizeof(PublicSym32Layout) + NameLen + 1, 4); } static CVSymbol serializePublic(uint8_t *Mem, const BulkPublic &Pub) { // Assume the caller has allocated sizeOfPublic bytes. uint32_t NameLen = std::min( Pub.NameLen, uint32_t(MaxRecordLength - sizeof(PublicSym32Layout) - 1)); size_t Size = alignTo(sizeof(PublicSym32Layout) + NameLen + 1, 4); assert(Size == sizeOfPublic(Pub)); auto *FixedMem = reinterpret_cast(Mem); FixedMem->Prefix.RecordKind = static_cast(codeview::S_PUB32); FixedMem->Prefix.RecordLen = static_cast(Size - 2); FixedMem->Pub.Flags = Pub.Flags; FixedMem->Pub.Offset = Pub.Offset; FixedMem->Pub.Segment = Pub.U.Segment; char *NameMem = reinterpret_cast(FixedMem + 1); memcpy(NameMem, Pub.Name, NameLen); // Zero the null terminator and remaining bytes. memset(&NameMem[NameLen], 0, Size - sizeof(PublicSym32Layout) - NameLen); return CVSymbol(makeArrayRef(reinterpret_cast(Mem), Size)); } uint32_t GSIHashStreamBuilder::calculateSerializedLength() const { uint32_t Size = sizeof(GSIHashHeader); Size += HashRecords.size() * sizeof(PSHashRecord); Size += HashBitmap.size() * sizeof(uint32_t); Size += HashBuckets.size() * sizeof(uint32_t); return Size; } uint32_t GSIHashStreamBuilder::calculateRecordByteSize() const { uint32_t Size = 0; for (const auto &Sym : Records) Size += Sym.length(); return Size; } Error GSIHashStreamBuilder::commit(BinaryStreamWriter &Writer) { GSIHashHeader Header; Header.VerSignature = GSIHashHeader::HdrSignature; Header.VerHdr = GSIHashHeader::HdrVersion; Header.HrSize = HashRecords.size() * sizeof(PSHashRecord); Header.NumBuckets = HashBitmap.size() * 4 + HashBuckets.size() * 4; if (auto EC = Writer.writeObject(Header)) return EC; if (auto EC = Writer.writeArray(makeArrayRef(HashRecords))) return EC; if (auto EC = Writer.writeArray(makeArrayRef(HashBitmap))) return EC; if (auto EC = Writer.writeArray(makeArrayRef(HashBuckets))) return EC; return Error::success(); } static bool isAsciiString(StringRef S) { return llvm::all_of(S, [](char C) { return unsigned(C) < 0x80; }); } // See `caseInsensitiveComparePchPchCchCch` in gsi.cpp static int gsiRecordCmp(StringRef S1, StringRef S2) { size_t LS = S1.size(); size_t RS = S2.size(); // Shorter strings always compare less than longer strings. if (LS != RS) return LS - RS; // If either string contains non ascii characters, memcmp them. if (LLVM_UNLIKELY(!isAsciiString(S1) || !isAsciiString(S2))) return memcmp(S1.data(), S2.data(), LS); // Both strings are ascii, perform a case-insenstive comparison. return S1.compare_lower(S2.data()); } void GSIHashStreamBuilder::finalizeBuckets(uint32_t RecordZeroOffset) { // Build up a list of globals to be bucketed. This repurposes the BulkPublic // struct with different meanings for the fields to avoid reallocating a new // vector during public symbol table hash construction. std::vector Globals; Globals.resize(Records.size()); uint32_t SymOffset = RecordZeroOffset; for (size_t I = 0, E = Records.size(); I < E; ++I) { StringRef Name = getSymbolName(Records[I]); Globals[I].Name = Name.data(); Globals[I].NameLen = Name.size(); Globals[I].SymOffset = SymOffset; SymOffset += Records[I].length(); } finalizeBuckets(RecordZeroOffset, std::move(Globals)); } void GSIHashStreamBuilder::finalizeBuckets(uint32_t RecordZeroOffset, std::vector &&Globals) { // Hash every name in parallel. The Segment field is no longer needed, so // store the BucketIdx in a union. parallelForEachN(0, Globals.size(), [&](size_t I) { Globals[I].U.BucketIdx = hashStringV1(Globals[I].Name) % IPHR_HASH; }); // Parallel sort by bucket index, then name within the buckets. Within the // buckets, sort each bucket by memcmp of the symbol's name. It's important // that we use the same sorting algorithm as is used by the reference // implementation to ensure that the search for a record within a bucket can // properly early-out when it detects the record won't be found. The // algorithm used here corredsponds to the function // caseInsensitiveComparePchPchCchCch in the reference implementation. auto BucketCmp = [](const BulkPublic &L, const BulkPublic &R) { if (L.U.BucketIdx != R.U.BucketIdx) return L.U.BucketIdx < R.U.BucketIdx; int Cmp = gsiRecordCmp(L.getName(), R.getName()); if (Cmp != 0) return Cmp < 0; // This comparison is necessary to make the sorting stable in the presence // of two static globals with the same name. The easiest way to observe // this is with S_LDATA32 records. return L.SymOffset < R.SymOffset; }; parallelSort(Globals, BucketCmp); // Zero out the bucket index bitmap. for (ulittle32_t &Word : HashBitmap) Word = 0; // Compute the three tables: the hash records in bucket and chain order, the // bucket presence bitmap, and the bucket chain start offsets. HashRecords.reserve(Globals.size()); uint32_t LastBucketIdx = ~0U; for (const BulkPublic &Global : Globals) { // If this is a new bucket, add it to the bitmap and the start offset map. uint32_t BucketIdx = Global.U.BucketIdx; if (LastBucketIdx != BucketIdx) { HashBitmap[BucketIdx / 32] |= 1U << (BucketIdx % 32); // Calculate what the offset of the first hash record in the chain would // be if it were inflated to contain 32-bit pointers. On a 32-bit system, // each record would be 12 bytes. See HROffsetCalc in gsi.h. const int SizeOfHROffsetCalc = 12; ulittle32_t ChainStartOff = ulittle32_t(HashRecords.size() * SizeOfHROffsetCalc); HashBuckets.push_back(ChainStartOff); LastBucketIdx = BucketIdx; } // Create the hash record. Add one when writing symbol offsets to disk. // See GSI1::fixSymRecs. Always use a refcount of 1 for now. PSHashRecord HRec; HRec.Off = Global.SymOffset + 1; HRec.CRef = 1; HashRecords.push_back(HRec); } } GSIStreamBuilder::GSIStreamBuilder(msf::MSFBuilder &Msf) : Msf(Msf), PSH(std::make_unique()), GSH(std::make_unique()) {} GSIStreamBuilder::~GSIStreamBuilder() {} uint32_t GSIStreamBuilder::calculatePublicsHashStreamSize() const { uint32_t Size = 0; Size += sizeof(PublicsStreamHeader); Size += PSH->calculateSerializedLength(); Size += PubAddrMap.size() * sizeof(uint32_t); // AddrMap // FIXME: Add thunk map and section offsets for incremental linking. return Size; } uint32_t GSIStreamBuilder::calculateGlobalsHashStreamSize() const { return GSH->calculateSerializedLength(); } Error GSIStreamBuilder::finalizeMsfLayout() { // First we write public symbol records, then we write global symbol records. uint32_t PublicsSize = PSH->calculateRecordByteSize(); uint32_t GlobalsSize = GSH->calculateRecordByteSize(); GSH->finalizeBuckets(PublicsSize); Expected Idx = Msf.addStream(calculateGlobalsHashStreamSize()); if (!Idx) return Idx.takeError(); GlobalsStreamIndex = *Idx; Idx = Msf.addStream(calculatePublicsHashStreamSize()); if (!Idx) return Idx.takeError(); PublicsStreamIndex = *Idx; uint32_t RecordBytes = PublicsSize + GlobalsSize; Idx = Msf.addStream(RecordBytes); if (!Idx) return Idx.takeError(); RecordStreamIndex = *Idx; return Error::success(); } void GSIStreamBuilder::addPublicSymbols(std::vector &&Publics) { // Sort the symbols by name. PDBs contain lots of symbols, so use parallelism. parallelSort(Publics, [](const BulkPublic &L, const BulkPublic &R) { return L.getName() < R.getName(); }); // Assign offsets and allocate one contiguous block of memory for all public // symbols. uint32_t SymOffset = 0; for (BulkPublic &Pub : Publics) { Pub.SymOffset = SymOffset; SymOffset += sizeOfPublic(Pub); } uint8_t *Mem = reinterpret_cast(Msf.getAllocator().Allocate(SymOffset, 4)); // Instead of storing individual CVSymbol records, store them as one giant // buffer. // FIXME: This is kind of a hack. This makes Records.size() wrong, and we have // to account for that elsewhere. PSH->Records.push_back(CVSymbol(makeArrayRef(Mem, SymOffset))); // Serialize them in parallel. parallelForEachN(0, Publics.size(), [&](size_t I) { const BulkPublic &Pub = Publics[I]; serializePublic(Mem + Pub.SymOffset, Pub); }); // Re-sort the publics by address so we can build the address map. We no // longer need the original ordering. auto AddrCmp = [](const BulkPublic &L, const BulkPublic &R) { if (L.U.Segment != R.U.Segment) return L.U.Segment < R.U.Segment; if (L.Offset != R.Offset) return L.Offset < R.Offset; // parallelSort is unstable, so we have to do name comparison to ensure // that two names for the same location come out in a determinstic order. return L.getName() < R.getName(); }; parallelSort(Publics, AddrCmp); // Fill in the symbol offsets in the appropriate order. PubAddrMap.reserve(Publics.size()); for (const BulkPublic &Pub : Publics) PubAddrMap.push_back(ulittle32_t(Pub.SymOffset)); // Finalize public symbol buckets immediately after they have been added. // They should all be warm in the cache at this point, so go ahead and do it // now. PSH->finalizeBuckets(0, std::move(Publics)); } void GSIStreamBuilder::addGlobalSymbol(const ProcRefSym &Sym) { GSH->addSymbol(Sym, Msf); } void GSIStreamBuilder::addGlobalSymbol(const DataSym &Sym) { GSH->addSymbol(Sym, Msf); } void GSIStreamBuilder::addGlobalSymbol(const ConstantSym &Sym) { GSH->addSymbol(Sym, Msf); } void GSIStreamBuilder::addGlobalSymbol(const codeview::CVSymbol &Sym) { GSH->addSymbol(Sym); } static Error writeRecords(BinaryStreamWriter &Writer, ArrayRef Records) { BinaryItemStream ItemStream(support::endianness::little); ItemStream.setItems(Records); BinaryStreamRef RecordsRef(ItemStream); return Writer.writeStreamRef(RecordsRef); } Error GSIStreamBuilder::commitSymbolRecordStream( WritableBinaryStreamRef Stream) { BinaryStreamWriter Writer(Stream); // Write public symbol records first, followed by global symbol records. This // must match the order that we assume in finalizeMsfLayout when computing // PSHZero and GSHZero. if (auto EC = writeRecords(Writer, PSH->Records)) return EC; if (auto EC = writeRecords(Writer, GSH->Records)) return EC; return Error::success(); } Error GSIStreamBuilder::commitPublicsHashStream( WritableBinaryStreamRef Stream) { BinaryStreamWriter Writer(Stream); PublicsStreamHeader Header; // FIXME: Fill these in. They are for incremental linking. Header.SymHash = PSH->calculateSerializedLength(); Header.AddrMap = PubAddrMap.size() * 4; Header.NumThunks = 0; Header.SizeOfThunk = 0; Header.ISectThunkTable = 0; memset(Header.Padding, 0, sizeof(Header.Padding)); Header.OffThunkTable = 0; Header.NumSections = 0; if (auto EC = Writer.writeObject(Header)) return EC; if (auto EC = PSH->commit(Writer)) return EC; if (auto EC = Writer.writeArray(makeArrayRef(PubAddrMap))) return EC; return Error::success(); } Error GSIStreamBuilder::commitGlobalsHashStream( WritableBinaryStreamRef Stream) { BinaryStreamWriter Writer(Stream); return GSH->commit(Writer); } Error GSIStreamBuilder::commit(const msf::MSFLayout &Layout, WritableBinaryStreamRef Buffer) { auto GS = WritableMappedBlockStream::createIndexedStream( Layout, Buffer, getGlobalsStreamIndex(), Msf.getAllocator()); auto PS = WritableMappedBlockStream::createIndexedStream( Layout, Buffer, getPublicsStreamIndex(), Msf.getAllocator()); auto PRS = WritableMappedBlockStream::createIndexedStream( Layout, Buffer, getRecordStreamIndex(), Msf.getAllocator()); if (auto EC = commitSymbolRecordStream(*PRS)) return EC; if (auto EC = commitGlobalsHashStream(*GS)) return EC; if (auto EC = commitPublicsHashStream(*PS)) return EC; return Error::success(); }