1 //===- DbiStreamBuilder.cpp - PDB Dbi Stream Creation -----------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h" 10 11 #include "llvm/ADT/DenseSet.h" 12 #include "llvm/DebugInfo/CodeView/RecordName.h" 13 #include "llvm/DebugInfo/CodeView/SymbolDeserializer.h" 14 #include "llvm/DebugInfo/CodeView/SymbolRecord.h" 15 #include "llvm/DebugInfo/CodeView/SymbolSerializer.h" 16 #include "llvm/DebugInfo/MSF/MSFBuilder.h" 17 #include "llvm/DebugInfo/MSF/MSFCommon.h" 18 #include "llvm/DebugInfo/MSF/MappedBlockStream.h" 19 #include "llvm/DebugInfo/PDB/Native/GlobalsStream.h" 20 #include "llvm/DebugInfo/PDB/Native/Hash.h" 21 #include "llvm/Support/BinaryItemStream.h" 22 #include "llvm/Support/BinaryStreamWriter.h" 23 #include "llvm/Support/xxhash.h" 24 #include <algorithm> 25 #include <vector> 26 27 using namespace llvm; 28 using namespace llvm::msf; 29 using namespace llvm::pdb; 30 using namespace llvm::codeview; 31 32 struct llvm::pdb::GSIHashStreamBuilder { 33 struct SymbolDenseMapInfo { 34 static inline CVSymbol getEmptyKey() { 35 static CVSymbol Empty; 36 return Empty; 37 } 38 static inline CVSymbol getTombstoneKey() { 39 static CVSymbol Tombstone( 40 DenseMapInfo<ArrayRef<uint8_t>>::getTombstoneKey()); 41 return Tombstone; 42 } 43 static unsigned getHashValue(const CVSymbol &Val) { 44 return xxHash64(Val.RecordData); 45 } 46 static bool isEqual(const CVSymbol &LHS, const CVSymbol &RHS) { 47 return LHS.RecordData == RHS.RecordData; 48 } 49 }; 50 51 std::vector<CVSymbol> Records; 52 llvm::DenseSet<CVSymbol, SymbolDenseMapInfo> SymbolHashes; 53 std::vector<PSHashRecord> HashRecords; 54 std::array<support::ulittle32_t, (IPHR_HASH + 32) / 32> HashBitmap; 55 std::vector<support::ulittle32_t> HashBuckets; 56 57 uint32_t calculateSerializedLength() const; 58 uint32_t calculateRecordByteSize() const; 59 Error commit(BinaryStreamWriter &Writer); 60 void finalizeBuckets(uint32_t RecordZeroOffset); 61 62 template <typename T> void addSymbol(const T &Symbol, MSFBuilder &Msf) { 63 T Copy(Symbol); 64 addSymbol(SymbolSerializer::writeOneSymbol(Copy, Msf.getAllocator(), 65 CodeViewContainer::Pdb)); 66 } 67 void addSymbol(const CVSymbol &Symbol) { 68 if (Symbol.kind() == S_UDT || Symbol.kind() == S_CONSTANT) { 69 auto Iter = SymbolHashes.insert(Symbol); 70 if (!Iter.second) 71 return; 72 } 73 74 Records.push_back(Symbol); 75 } 76 }; 77 78 uint32_t GSIHashStreamBuilder::calculateSerializedLength() const { 79 uint32_t Size = sizeof(GSIHashHeader); 80 Size += HashRecords.size() * sizeof(PSHashRecord); 81 Size += HashBitmap.size() * sizeof(uint32_t); 82 Size += HashBuckets.size() * sizeof(uint32_t); 83 return Size; 84 } 85 86 uint32_t GSIHashStreamBuilder::calculateRecordByteSize() const { 87 uint32_t Size = 0; 88 for (const auto &Sym : Records) 89 Size += Sym.length(); 90 return Size; 91 } 92 93 Error GSIHashStreamBuilder::commit(BinaryStreamWriter &Writer) { 94 GSIHashHeader Header; 95 Header.VerSignature = GSIHashHeader::HdrSignature; 96 Header.VerHdr = GSIHashHeader::HdrVersion; 97 Header.HrSize = HashRecords.size() * sizeof(PSHashRecord); 98 Header.NumBuckets = HashBitmap.size() * 4 + HashBuckets.size() * 4; 99 100 if (auto EC = Writer.writeObject(Header)) 101 return EC; 102 103 if (auto EC = Writer.writeArray(makeArrayRef(HashRecords))) 104 return EC; 105 if (auto EC = Writer.writeArray(makeArrayRef(HashBitmap))) 106 return EC; 107 if (auto EC = Writer.writeArray(makeArrayRef(HashBuckets))) 108 return EC; 109 return Error::success(); 110 } 111 112 static bool isAsciiString(StringRef S) { 113 return llvm::all_of(S, [](char C) { return unsigned(C) < 0x80; }); 114 } 115 116 // See `caseInsensitiveComparePchPchCchCch` in gsi.cpp 117 static bool gsiRecordLess(StringRef S1, StringRef S2) { 118 size_t LS = S1.size(); 119 size_t RS = S2.size(); 120 // Shorter strings always compare less than longer strings. 121 if (LS != RS) 122 return LS < RS; 123 124 // If either string contains non ascii characters, memcmp them. 125 if (LLVM_UNLIKELY(!isAsciiString(S1) || !isAsciiString(S2))) 126 return memcmp(S1.data(), S2.data(), LS) < 0; 127 128 // Both strings are ascii, perform a case-insenstive comparison. 129 return S1.compare_lower(S2.data()) < 0; 130 } 131 132 void GSIHashStreamBuilder::finalizeBuckets(uint32_t RecordZeroOffset) { 133 std::array<std::vector<std::pair<StringRef, PSHashRecord>>, IPHR_HASH + 1> 134 TmpBuckets; 135 uint32_t SymOffset = RecordZeroOffset; 136 for (const CVSymbol &Sym : Records) { 137 PSHashRecord HR; 138 // Add one when writing symbol offsets to disk. See GSI1::fixSymRecs. 139 HR.Off = SymOffset + 1; 140 HR.CRef = 1; // Always use a refcount of 1. 141 142 // Hash the name to figure out which bucket this goes into. 143 StringRef Name = getSymbolName(Sym); 144 size_t BucketIdx = hashStringV1(Name) % IPHR_HASH; 145 TmpBuckets[BucketIdx].push_back(std::make_pair(Name, HR)); 146 SymOffset += Sym.length(); 147 } 148 149 // Compute the three tables: the hash records in bucket and chain order, the 150 // bucket presence bitmap, and the bucket chain start offsets. 151 HashRecords.reserve(Records.size()); 152 for (ulittle32_t &Word : HashBitmap) 153 Word = 0; 154 for (size_t BucketIdx = 0; BucketIdx < IPHR_HASH + 1; ++BucketIdx) { 155 auto &Bucket = TmpBuckets[BucketIdx]; 156 if (Bucket.empty()) 157 continue; 158 HashBitmap[BucketIdx / 32] |= 1U << (BucketIdx % 32); 159 160 // Calculate what the offset of the first hash record in the chain would 161 // be if it were inflated to contain 32-bit pointers. On a 32-bit system, 162 // each record would be 12 bytes. See HROffsetCalc in gsi.h. 163 const int SizeOfHROffsetCalc = 12; 164 ulittle32_t ChainStartOff = 165 ulittle32_t(HashRecords.size() * SizeOfHROffsetCalc); 166 HashBuckets.push_back(ChainStartOff); 167 168 // Sort each bucket by memcmp of the symbol's name. It's important that 169 // we use the same sorting algorithm as is used by the reference 170 // implementation to ensure that the search for a record within a bucket 171 // can properly early-out when it detects the record won't be found. The 172 // algorithm used here corredsponds to the function 173 // caseInsensitiveComparePchPchCchCch in the reference implementation. 174 llvm::sort(Bucket, [](const std::pair<StringRef, PSHashRecord> &Left, 175 const std::pair<StringRef, PSHashRecord> &Right) { 176 return gsiRecordLess(Left.first, Right.first); 177 }); 178 179 for (const auto &Entry : Bucket) 180 HashRecords.push_back(Entry.second); 181 } 182 } 183 184 GSIStreamBuilder::GSIStreamBuilder(msf::MSFBuilder &Msf) 185 : Msf(Msf), PSH(std::make_unique<GSIHashStreamBuilder>()), 186 GSH(std::make_unique<GSIHashStreamBuilder>()) {} 187 188 GSIStreamBuilder::~GSIStreamBuilder() {} 189 190 uint32_t GSIStreamBuilder::calculatePublicsHashStreamSize() const { 191 uint32_t Size = 0; 192 Size += sizeof(PublicsStreamHeader); 193 Size += PSH->calculateSerializedLength(); 194 Size += PSH->Records.size() * sizeof(uint32_t); // AddrMap 195 // FIXME: Add thunk map and section offsets for incremental linking. 196 197 return Size; 198 } 199 200 uint32_t GSIStreamBuilder::calculateGlobalsHashStreamSize() const { 201 return GSH->calculateSerializedLength(); 202 } 203 204 Error GSIStreamBuilder::finalizeMsfLayout() { 205 // First we write public symbol records, then we write global symbol records. 206 uint32_t PSHZero = 0; 207 uint32_t GSHZero = PSH->calculateRecordByteSize(); 208 209 PSH->finalizeBuckets(PSHZero); 210 GSH->finalizeBuckets(GSHZero); 211 212 Expected<uint32_t> Idx = Msf.addStream(calculateGlobalsHashStreamSize()); 213 if (!Idx) 214 return Idx.takeError(); 215 GlobalsStreamIndex = *Idx; 216 217 Idx = Msf.addStream(calculatePublicsHashStreamSize()); 218 if (!Idx) 219 return Idx.takeError(); 220 PublicsStreamIndex = *Idx; 221 222 uint32_t RecordBytes = 223 GSH->calculateRecordByteSize() + PSH->calculateRecordByteSize(); 224 225 Idx = Msf.addStream(RecordBytes); 226 if (!Idx) 227 return Idx.takeError(); 228 RecordStreamIndex = *Idx; 229 return Error::success(); 230 } 231 232 static StringRef extractPubSym(const CVSymbol *Sym, uint16_t &Seg, 233 uint32_t &Offset) { 234 ArrayRef<uint8_t> Buf = Sym->content(); 235 assert(Buf.size() > sizeof(PublicSym32Header)); 236 const auto *Hdr = reinterpret_cast<const PublicSym32Header *>(Buf.data()); 237 Buf = Buf.drop_front(sizeof(PublicSym32Header)); 238 Seg = Hdr->Segment; 239 Offset = Hdr->Offset; 240 // Don't worry about finding the null terminator, since the strings will be 241 // compared later. 242 return StringRef(reinterpret_cast<const char *>(Buf.data()), Buf.size()); 243 } 244 245 static bool comparePubSymByAddrAndName(const CVSymbol *LS, const CVSymbol *RS) { 246 uint16_t LSeg, RSeg; 247 uint32_t LOff, ROff; 248 StringRef LName, RName; 249 LName = extractPubSym(LS, LSeg, LOff); 250 RName = extractPubSym(RS, RSeg, ROff); 251 if (LSeg != RSeg) 252 return LSeg < RSeg; 253 if (LOff != ROff) 254 return LOff < ROff; 255 return LName < RName; 256 } 257 258 /// Compute the address map. The address map is an array of symbol offsets 259 /// sorted so that it can be binary searched by address. 260 static std::vector<ulittle32_t> computeAddrMap(ArrayRef<CVSymbol> Records) { 261 // Make a vector of pointers to the symbols so we can sort it by address. 262 // Also gather the symbol offsets while we're at it. 263 264 std::vector<const CVSymbol *> PublicsByAddr; 265 std::vector<uint32_t> SymOffsets; 266 PublicsByAddr.reserve(Records.size()); 267 SymOffsets.reserve(Records.size()); 268 269 uint32_t SymOffset = 0; 270 for (const CVSymbol &Sym : Records) { 271 assert(Sym.kind() == SymbolKind::S_PUB32); 272 PublicsByAddr.push_back(&Sym); 273 SymOffsets.push_back(SymOffset); 274 SymOffset += Sym.length(); 275 } 276 llvm::stable_sort(PublicsByAddr, comparePubSymByAddrAndName); 277 278 // Fill in the symbol offsets in the appropriate order. 279 std::vector<ulittle32_t> AddrMap; 280 AddrMap.reserve(Records.size()); 281 for (const CVSymbol *Sym : PublicsByAddr) { 282 ptrdiff_t Idx = std::distance(Records.data(), Sym); 283 assert(Idx >= 0 && size_t(Idx) < Records.size()); 284 AddrMap.push_back(ulittle32_t(SymOffsets[Idx])); 285 } 286 return AddrMap; 287 } 288 289 void GSIStreamBuilder::addPublicSymbol(const PublicSym32 &Pub) { 290 PSH->addSymbol(Pub, Msf); 291 } 292 293 void GSIStreamBuilder::addGlobalSymbol(const ProcRefSym &Sym) { 294 GSH->addSymbol(Sym, Msf); 295 } 296 297 void GSIStreamBuilder::addGlobalSymbol(const DataSym &Sym) { 298 GSH->addSymbol(Sym, Msf); 299 } 300 301 void GSIStreamBuilder::addGlobalSymbol(const ConstantSym &Sym) { 302 GSH->addSymbol(Sym, Msf); 303 } 304 305 void GSIStreamBuilder::addGlobalSymbol(const codeview::CVSymbol &Sym) { 306 GSH->addSymbol(Sym); 307 } 308 309 static Error writeRecords(BinaryStreamWriter &Writer, 310 ArrayRef<CVSymbol> Records) { 311 BinaryItemStream<CVSymbol> ItemStream(support::endianness::little); 312 ItemStream.setItems(Records); 313 BinaryStreamRef RecordsRef(ItemStream); 314 return Writer.writeStreamRef(RecordsRef); 315 } 316 317 Error GSIStreamBuilder::commitSymbolRecordStream( 318 WritableBinaryStreamRef Stream) { 319 BinaryStreamWriter Writer(Stream); 320 321 // Write public symbol records first, followed by global symbol records. This 322 // must match the order that we assume in finalizeMsfLayout when computing 323 // PSHZero and GSHZero. 324 if (auto EC = writeRecords(Writer, PSH->Records)) 325 return EC; 326 if (auto EC = writeRecords(Writer, GSH->Records)) 327 return EC; 328 329 return Error::success(); 330 } 331 332 Error GSIStreamBuilder::commitPublicsHashStream( 333 WritableBinaryStreamRef Stream) { 334 BinaryStreamWriter Writer(Stream); 335 PublicsStreamHeader Header; 336 337 // FIXME: Fill these in. They are for incremental linking. 338 Header.SymHash = PSH->calculateSerializedLength(); 339 Header.AddrMap = PSH->Records.size() * 4; 340 Header.NumThunks = 0; 341 Header.SizeOfThunk = 0; 342 Header.ISectThunkTable = 0; 343 memset(Header.Padding, 0, sizeof(Header.Padding)); 344 Header.OffThunkTable = 0; 345 Header.NumSections = 0; 346 if (auto EC = Writer.writeObject(Header)) 347 return EC; 348 349 if (auto EC = PSH->commit(Writer)) 350 return EC; 351 352 std::vector<ulittle32_t> AddrMap = computeAddrMap(PSH->Records); 353 if (auto EC = Writer.writeArray(makeArrayRef(AddrMap))) 354 return EC; 355 356 return Error::success(); 357 } 358 359 Error GSIStreamBuilder::commitGlobalsHashStream( 360 WritableBinaryStreamRef Stream) { 361 BinaryStreamWriter Writer(Stream); 362 return GSH->commit(Writer); 363 } 364 365 Error GSIStreamBuilder::commit(const msf::MSFLayout &Layout, 366 WritableBinaryStreamRef Buffer) { 367 auto GS = WritableMappedBlockStream::createIndexedStream( 368 Layout, Buffer, getGlobalsStreamIndex(), Msf.getAllocator()); 369 auto PS = WritableMappedBlockStream::createIndexedStream( 370 Layout, Buffer, getPublicsStreamIndex(), Msf.getAllocator()); 371 auto PRS = WritableMappedBlockStream::createIndexedStream( 372 Layout, Buffer, getRecordStreamIndex(), Msf.getAllocator()); 373 374 if (auto EC = commitSymbolRecordStream(*PRS)) 375 return EC; 376 if (auto EC = commitGlobalsHashStream(*GS)) 377 return EC; 378 if (auto EC = commitPublicsHashStream(*PS)) 379 return EC; 380 return Error::success(); 381 } 382