1 //===- DbiStreamBuilder.cpp - PDB Dbi Stream Creation -----------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // The data structures defined in this file are based on the reference 10 // implementation which is available at 11 // https://github.com/Microsoft/microsoft-pdb/blob/master/PDB/dbi/gsi.cpp 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h" 16 17 #include "llvm/ADT/DenseSet.h" 18 #include "llvm/DebugInfo/CodeView/RecordName.h" 19 #include "llvm/DebugInfo/CodeView/SymbolDeserializer.h" 20 #include "llvm/DebugInfo/CodeView/SymbolRecord.h" 21 #include "llvm/DebugInfo/CodeView/SymbolSerializer.h" 22 #include "llvm/DebugInfo/MSF/MSFBuilder.h" 23 #include "llvm/DebugInfo/MSF/MSFCommon.h" 24 #include "llvm/DebugInfo/MSF/MappedBlockStream.h" 25 #include "llvm/DebugInfo/PDB/Native/GlobalsStream.h" 26 #include "llvm/DebugInfo/PDB/Native/Hash.h" 27 #include "llvm/Support/BinaryItemStream.h" 28 #include "llvm/Support/BinaryStreamWriter.h" 29 #include "llvm/Support/Parallel.h" 30 #include "llvm/Support/xxhash.h" 31 #include <algorithm> 32 #include <vector> 33 34 using namespace llvm; 35 using namespace llvm::msf; 36 using namespace llvm::pdb; 37 using namespace llvm::codeview; 38 39 struct llvm::pdb::GSIHashStreamBuilder { 40 struct SymbolDenseMapInfo { 41 static inline CVSymbol getEmptyKey() { 42 static CVSymbol Empty; 43 return Empty; 44 } 45 static inline CVSymbol getTombstoneKey() { 46 static CVSymbol Tombstone( 47 DenseMapInfo<ArrayRef<uint8_t>>::getTombstoneKey()); 48 return Tombstone; 49 } 50 static unsigned getHashValue(const CVSymbol &Val) { 51 return xxHash64(Val.RecordData); 52 } 53 static bool isEqual(const CVSymbol &LHS, const CVSymbol &RHS) { 54 return LHS.RecordData == RHS.RecordData; 55 } 56 }; 57 58 std::vector<CVSymbol> Records; 59 llvm::DenseSet<CVSymbol, SymbolDenseMapInfo> SymbolHashes; 60 std::vector<PSHashRecord> HashRecords; 61 std::array<support::ulittle32_t, (IPHR_HASH + 32) / 32> HashBitmap; 62 std::vector<support::ulittle32_t> HashBuckets; 63 64 uint32_t calculateSerializedLength() const; 65 uint32_t calculateRecordByteSize() const; 66 Error commit(BinaryStreamWriter &Writer); 67 68 void finalizeBuckets(uint32_t RecordZeroOffset); 69 70 // Finalize public symbol buckets. 71 void finalizeBuckets(uint32_t RecordZeroOffset, 72 std::vector<BulkPublic> &&Publics); 73 74 template <typename T> void addSymbol(const T &Symbol, MSFBuilder &Msf) { 75 T Copy(Symbol); 76 addSymbol(SymbolSerializer::writeOneSymbol(Copy, Msf.getAllocator(), 77 CodeViewContainer::Pdb)); 78 } 79 void addSymbol(const CVSymbol &Symbol); 80 }; 81 82 void GSIHashStreamBuilder::addSymbol(const codeview::CVSymbol &Symbol) { 83 // Ignore duplicate typedefs and constants. 84 if (Symbol.kind() == S_UDT || Symbol.kind() == S_CONSTANT) { 85 auto Iter = SymbolHashes.insert(Symbol); 86 if (!Iter.second) 87 return; 88 } 89 Records.push_back(Symbol); 90 } 91 92 namespace { 93 LLVM_PACKED_START 94 struct PublicSym32Layout { 95 RecordPrefix Prefix; 96 PublicSym32Header Pub; 97 // char Name[]; 98 }; 99 LLVM_PACKED_END 100 } // namespace 101 102 // Calculate how much memory this public needs when serialized. 103 static uint32_t sizeOfPublic(const BulkPublic &Pub) { 104 uint32_t NameLen = Pub.NameLen; 105 NameLen = std::min(NameLen, 106 uint32_t(MaxRecordLength - sizeof(PublicSym32Layout) - 1)); 107 return alignTo(sizeof(PublicSym32Layout) + NameLen + 1, 4); 108 } 109 110 static CVSymbol serializePublic(uint8_t *Mem, const BulkPublic &Pub) { 111 // Assume the caller has allocated sizeOfPublic bytes. 112 uint32_t NameLen = std::min( 113 Pub.NameLen, uint32_t(MaxRecordLength - sizeof(PublicSym32Layout) - 1)); 114 size_t Size = alignTo(sizeof(PublicSym32Layout) + NameLen + 1, 4); 115 assert(Size == sizeOfPublic(Pub)); 116 auto *FixedMem = reinterpret_cast<PublicSym32Layout *>(Mem); 117 FixedMem->Prefix.RecordKind = static_cast<uint16_t>(codeview::S_PUB32); 118 FixedMem->Prefix.RecordLen = static_cast<uint16_t>(Size - 2); 119 FixedMem->Pub.Flags = Pub.Flags; 120 FixedMem->Pub.Offset = Pub.Offset; 121 FixedMem->Pub.Segment = Pub.U.Segment; 122 char *NameMem = reinterpret_cast<char *>(FixedMem + 1); 123 memcpy(NameMem, Pub.Name, NameLen); 124 // Zero the null terminator and remaining bytes. 125 memset(&NameMem[NameLen], 0, Size - sizeof(PublicSym32Layout) - NameLen); 126 return CVSymbol(makeArrayRef(reinterpret_cast<uint8_t *>(Mem), Size)); 127 } 128 129 uint32_t GSIHashStreamBuilder::calculateSerializedLength() const { 130 uint32_t Size = sizeof(GSIHashHeader); 131 Size += HashRecords.size() * sizeof(PSHashRecord); 132 Size += HashBitmap.size() * sizeof(uint32_t); 133 Size += HashBuckets.size() * sizeof(uint32_t); 134 return Size; 135 } 136 137 uint32_t GSIHashStreamBuilder::calculateRecordByteSize() const { 138 uint32_t Size = 0; 139 for (const auto &Sym : Records) 140 Size += Sym.length(); 141 return Size; 142 } 143 144 Error GSIHashStreamBuilder::commit(BinaryStreamWriter &Writer) { 145 GSIHashHeader Header; 146 Header.VerSignature = GSIHashHeader::HdrSignature; 147 Header.VerHdr = GSIHashHeader::HdrVersion; 148 Header.HrSize = HashRecords.size() * sizeof(PSHashRecord); 149 Header.NumBuckets = HashBitmap.size() * 4 + HashBuckets.size() * 4; 150 151 if (auto EC = Writer.writeObject(Header)) 152 return EC; 153 154 if (auto EC = Writer.writeArray(makeArrayRef(HashRecords))) 155 return EC; 156 if (auto EC = Writer.writeArray(makeArrayRef(HashBitmap))) 157 return EC; 158 if (auto EC = Writer.writeArray(makeArrayRef(HashBuckets))) 159 return EC; 160 return Error::success(); 161 } 162 163 static bool isAsciiString(StringRef S) { 164 return llvm::all_of(S, [](char C) { return unsigned(C) < 0x80; }); 165 } 166 167 // See `caseInsensitiveComparePchPchCchCch` in gsi.cpp 168 static int gsiRecordCmp(StringRef S1, StringRef S2) { 169 size_t LS = S1.size(); 170 size_t RS = S2.size(); 171 // Shorter strings always compare less than longer strings. 172 if (LS != RS) 173 return LS - RS; 174 175 // If either string contains non ascii characters, memcmp them. 176 if (LLVM_UNLIKELY(!isAsciiString(S1) || !isAsciiString(S2))) 177 return memcmp(S1.data(), S2.data(), LS); 178 179 // Both strings are ascii, perform a case-insenstive comparison. 180 return S1.compare_lower(S2.data()); 181 } 182 183 void GSIHashStreamBuilder::finalizeBuckets(uint32_t RecordZeroOffset) { 184 // Build up a list of globals to be bucketed. This repurposes the BulkPublic 185 // struct with different meanings for the fields to avoid reallocating a new 186 // vector during public symbol table hash construction. 187 std::vector<BulkPublic> Globals; 188 Globals.resize(Records.size()); 189 uint32_t SymOffset = RecordZeroOffset; 190 for (size_t I = 0, E = Records.size(); I < E; ++I) { 191 StringRef Name = getSymbolName(Records[I]); 192 Globals[I].Name = Name.data(); 193 Globals[I].NameLen = Name.size(); 194 Globals[I].SymOffset = SymOffset; 195 SymOffset += Records[I].length(); 196 } 197 198 finalizeBuckets(RecordZeroOffset, std::move(Globals)); 199 } 200 201 void GSIHashStreamBuilder::finalizeBuckets(uint32_t RecordZeroOffset, 202 std::vector<BulkPublic> &&Globals) { 203 // Hash every name in parallel. The Segment field is no longer needed, so 204 // store the BucketIdx in a union. 205 parallelForEachN(0, Globals.size(), [&](size_t I) { 206 Globals[I].U.BucketIdx = hashStringV1(Globals[I].Name) % IPHR_HASH; 207 }); 208 209 // Parallel sort by bucket index, then name within the buckets. Within the 210 // buckets, sort each bucket by memcmp of the symbol's name. It's important 211 // that we use the same sorting algorithm as is used by the reference 212 // implementation to ensure that the search for a record within a bucket can 213 // properly early-out when it detects the record won't be found. The 214 // algorithm used here corredsponds to the function 215 // caseInsensitiveComparePchPchCchCch in the reference implementation. 216 auto BucketCmp = [](const BulkPublic &L, const BulkPublic &R) { 217 if (L.U.BucketIdx != R.U.BucketIdx) 218 return L.U.BucketIdx < R.U.BucketIdx; 219 int Cmp = gsiRecordCmp(L.getName(), R.getName()); 220 if (Cmp != 0) 221 return Cmp < 0; 222 // This comparison is necessary to make the sorting stable in the presence 223 // of two static globals with the same name. The easiest way to observe 224 // this is with S_LDATA32 records. 225 return L.SymOffset < R.SymOffset; 226 }; 227 parallelSort(Globals, BucketCmp); 228 229 // Zero out the bucket index bitmap. 230 for (ulittle32_t &Word : HashBitmap) 231 Word = 0; 232 233 // Compute the three tables: the hash records in bucket and chain order, the 234 // bucket presence bitmap, and the bucket chain start offsets. 235 HashRecords.reserve(Globals.size()); 236 uint32_t LastBucketIdx = ~0U; 237 for (const BulkPublic &Global : Globals) { 238 // If this is a new bucket, add it to the bitmap and the start offset map. 239 uint32_t BucketIdx = Global.U.BucketIdx; 240 if (LastBucketIdx != BucketIdx) { 241 HashBitmap[BucketIdx / 32] |= 1U << (BucketIdx % 32); 242 243 // Calculate what the offset of the first hash record in the chain would 244 // be if it were inflated to contain 32-bit pointers. On a 32-bit system, 245 // each record would be 12 bytes. See HROffsetCalc in gsi.h. 246 const int SizeOfHROffsetCalc = 12; 247 ulittle32_t ChainStartOff = 248 ulittle32_t(HashRecords.size() * SizeOfHROffsetCalc); 249 HashBuckets.push_back(ChainStartOff); 250 LastBucketIdx = BucketIdx; 251 } 252 253 // Create the hash record. Add one when writing symbol offsets to disk. 254 // See GSI1::fixSymRecs. Always use a refcount of 1 for now. 255 PSHashRecord HRec; 256 HRec.Off = Global.SymOffset + 1; 257 HRec.CRef = 1; 258 HashRecords.push_back(HRec); 259 } 260 } 261 262 GSIStreamBuilder::GSIStreamBuilder(msf::MSFBuilder &Msf) 263 : Msf(Msf), PSH(std::make_unique<GSIHashStreamBuilder>()), 264 GSH(std::make_unique<GSIHashStreamBuilder>()) {} 265 266 GSIStreamBuilder::~GSIStreamBuilder() {} 267 268 uint32_t GSIStreamBuilder::calculatePublicsHashStreamSize() const { 269 uint32_t Size = 0; 270 Size += sizeof(PublicsStreamHeader); 271 Size += PSH->calculateSerializedLength(); 272 Size += PubAddrMap.size() * sizeof(uint32_t); // AddrMap 273 // FIXME: Add thunk map and section offsets for incremental linking. 274 275 return Size; 276 } 277 278 uint32_t GSIStreamBuilder::calculateGlobalsHashStreamSize() const { 279 return GSH->calculateSerializedLength(); 280 } 281 282 Error GSIStreamBuilder::finalizeMsfLayout() { 283 // First we write public symbol records, then we write global symbol records. 284 uint32_t PublicsSize = PSH->calculateRecordByteSize(); 285 uint32_t GlobalsSize = GSH->calculateRecordByteSize(); 286 GSH->finalizeBuckets(PublicsSize); 287 288 Expected<uint32_t> Idx = Msf.addStream(calculateGlobalsHashStreamSize()); 289 if (!Idx) 290 return Idx.takeError(); 291 GlobalsStreamIndex = *Idx; 292 293 Idx = Msf.addStream(calculatePublicsHashStreamSize()); 294 if (!Idx) 295 return Idx.takeError(); 296 PublicsStreamIndex = *Idx; 297 298 uint32_t RecordBytes = PublicsSize + GlobalsSize; 299 300 Idx = Msf.addStream(RecordBytes); 301 if (!Idx) 302 return Idx.takeError(); 303 RecordStreamIndex = *Idx; 304 return Error::success(); 305 } 306 307 void GSIStreamBuilder::addPublicSymbols(std::vector<BulkPublic> &&Publics) { 308 // Sort the symbols by name. PDBs contain lots of symbols, so use parallelism. 309 parallelSort(Publics, [](const BulkPublic &L, const BulkPublic &R) { 310 return L.getName() < R.getName(); 311 }); 312 313 // Assign offsets and allocate one contiguous block of memory for all public 314 // symbols. 315 uint32_t SymOffset = 0; 316 for (BulkPublic &Pub : Publics) { 317 Pub.SymOffset = SymOffset; 318 SymOffset += sizeOfPublic(Pub); 319 } 320 uint8_t *Mem = 321 reinterpret_cast<uint8_t *>(Msf.getAllocator().Allocate(SymOffset, 4)); 322 323 // Instead of storing individual CVSymbol records, store them as one giant 324 // buffer. 325 // FIXME: This is kind of a hack. This makes Records.size() wrong, and we have 326 // to account for that elsewhere. 327 PSH->Records.push_back(CVSymbol(makeArrayRef(Mem, SymOffset))); 328 329 // Serialize them in parallel. 330 parallelForEachN(0, Publics.size(), [&](size_t I) { 331 const BulkPublic &Pub = Publics[I]; 332 serializePublic(Mem + Pub.SymOffset, Pub); 333 }); 334 335 // Re-sort the publics by address so we can build the address map. We no 336 // longer need the original ordering. 337 auto AddrCmp = [](const BulkPublic &L, const BulkPublic &R) { 338 if (L.U.Segment != R.U.Segment) 339 return L.U.Segment < R.U.Segment; 340 if (L.Offset != R.Offset) 341 return L.Offset < R.Offset; 342 // parallelSort is unstable, so we have to do name comparison to ensure 343 // that two names for the same location come out in a determinstic order. 344 return L.getName() < R.getName(); 345 }; 346 parallelSort(Publics, AddrCmp); 347 348 // Fill in the symbol offsets in the appropriate order. 349 PubAddrMap.reserve(Publics.size()); 350 for (const BulkPublic &Pub : Publics) 351 PubAddrMap.push_back(ulittle32_t(Pub.SymOffset)); 352 353 // Finalize public symbol buckets immediately after they have been added. 354 // They should all be warm in the cache at this point, so go ahead and do it 355 // now. 356 PSH->finalizeBuckets(0, std::move(Publics)); 357 } 358 359 void GSIStreamBuilder::addGlobalSymbol(const ProcRefSym &Sym) { 360 GSH->addSymbol(Sym, Msf); 361 } 362 363 void GSIStreamBuilder::addGlobalSymbol(const DataSym &Sym) { 364 GSH->addSymbol(Sym, Msf); 365 } 366 367 void GSIStreamBuilder::addGlobalSymbol(const ConstantSym &Sym) { 368 GSH->addSymbol(Sym, Msf); 369 } 370 371 void GSIStreamBuilder::addGlobalSymbol(const codeview::CVSymbol &Sym) { 372 GSH->addSymbol(Sym); 373 } 374 375 static Error writeRecords(BinaryStreamWriter &Writer, 376 ArrayRef<CVSymbol> Records) { 377 BinaryItemStream<CVSymbol> ItemStream(support::endianness::little); 378 ItemStream.setItems(Records); 379 BinaryStreamRef RecordsRef(ItemStream); 380 return Writer.writeStreamRef(RecordsRef); 381 } 382 383 Error GSIStreamBuilder::commitSymbolRecordStream( 384 WritableBinaryStreamRef Stream) { 385 BinaryStreamWriter Writer(Stream); 386 387 // Write public symbol records first, followed by global symbol records. This 388 // must match the order that we assume in finalizeMsfLayout when computing 389 // PSHZero and GSHZero. 390 if (auto EC = writeRecords(Writer, PSH->Records)) 391 return EC; 392 if (auto EC = writeRecords(Writer, GSH->Records)) 393 return EC; 394 395 return Error::success(); 396 } 397 398 Error GSIStreamBuilder::commitPublicsHashStream( 399 WritableBinaryStreamRef Stream) { 400 BinaryStreamWriter Writer(Stream); 401 PublicsStreamHeader Header; 402 403 // FIXME: Fill these in. They are for incremental linking. 404 Header.SymHash = PSH->calculateSerializedLength(); 405 Header.AddrMap = PubAddrMap.size() * 4; 406 Header.NumThunks = 0; 407 Header.SizeOfThunk = 0; 408 Header.ISectThunkTable = 0; 409 memset(Header.Padding, 0, sizeof(Header.Padding)); 410 Header.OffThunkTable = 0; 411 Header.NumSections = 0; 412 if (auto EC = Writer.writeObject(Header)) 413 return EC; 414 415 if (auto EC = PSH->commit(Writer)) 416 return EC; 417 418 if (auto EC = Writer.writeArray(makeArrayRef(PubAddrMap))) 419 return EC; 420 421 return Error::success(); 422 } 423 424 Error GSIStreamBuilder::commitGlobalsHashStream( 425 WritableBinaryStreamRef Stream) { 426 BinaryStreamWriter Writer(Stream); 427 return GSH->commit(Writer); 428 } 429 430 Error GSIStreamBuilder::commit(const msf::MSFLayout &Layout, 431 WritableBinaryStreamRef Buffer) { 432 auto GS = WritableMappedBlockStream::createIndexedStream( 433 Layout, Buffer, getGlobalsStreamIndex(), Msf.getAllocator()); 434 auto PS = WritableMappedBlockStream::createIndexedStream( 435 Layout, Buffer, getPublicsStreamIndex(), Msf.getAllocator()); 436 auto PRS = WritableMappedBlockStream::createIndexedStream( 437 Layout, Buffer, getRecordStreamIndex(), Msf.getAllocator()); 438 439 if (auto EC = commitSymbolRecordStream(*PRS)) 440 return EC; 441 if (auto EC = commitGlobalsHashStream(*GS)) 442 return EC; 443 if (auto EC = commitPublicsHashStream(*PS)) 444 return EC; 445 return Error::success(); 446 } 447