1 //===- DbiStreamBuilder.cpp - PDB Dbi Stream Creation -----------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // The data structures defined in this file are based on the reference
10 // implementation which is available at
11 // https://github.com/Microsoft/microsoft-pdb/blob/master/PDB/dbi/gsi.cpp
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h"
16 
17 #include "llvm/ADT/DenseSet.h"
18 #include "llvm/DebugInfo/CodeView/RecordName.h"
19 #include "llvm/DebugInfo/CodeView/SymbolDeserializer.h"
20 #include "llvm/DebugInfo/CodeView/SymbolRecord.h"
21 #include "llvm/DebugInfo/CodeView/SymbolSerializer.h"
22 #include "llvm/DebugInfo/MSF/MSFBuilder.h"
23 #include "llvm/DebugInfo/MSF/MSFCommon.h"
24 #include "llvm/DebugInfo/MSF/MappedBlockStream.h"
25 #include "llvm/DebugInfo/PDB/Native/GlobalsStream.h"
26 #include "llvm/DebugInfo/PDB/Native/Hash.h"
27 #include "llvm/Support/BinaryItemStream.h"
28 #include "llvm/Support/BinaryStreamWriter.h"
29 #include "llvm/Support/Parallel.h"
30 #include "llvm/Support/xxhash.h"
31 #include <algorithm>
32 #include <vector>
33 
34 using namespace llvm;
35 using namespace llvm::msf;
36 using namespace llvm::pdb;
37 using namespace llvm::codeview;
38 
39 struct llvm::pdb::GSIHashStreamBuilder {
40   struct SymbolDenseMapInfo {
41     static inline CVSymbol getEmptyKey() {
42       static CVSymbol Empty;
43       return Empty;
44     }
45     static inline CVSymbol getTombstoneKey() {
46       static CVSymbol Tombstone(
47           DenseMapInfo<ArrayRef<uint8_t>>::getTombstoneKey());
48       return Tombstone;
49     }
50     static unsigned getHashValue(const CVSymbol &Val) {
51       return xxHash64(Val.RecordData);
52     }
53     static bool isEqual(const CVSymbol &LHS, const CVSymbol &RHS) {
54       return LHS.RecordData == RHS.RecordData;
55     }
56   };
57 
58   std::vector<CVSymbol> Records;
59   llvm::DenseSet<CVSymbol, SymbolDenseMapInfo> SymbolHashes;
60   std::vector<PSHashRecord> HashRecords;
61   std::array<support::ulittle32_t, (IPHR_HASH + 32) / 32> HashBitmap;
62   std::vector<support::ulittle32_t> HashBuckets;
63 
64   uint32_t calculateSerializedLength() const;
65   uint32_t calculateRecordByteSize() const;
66   Error commit(BinaryStreamWriter &Writer);
67 
68   void finalizeBuckets(uint32_t RecordZeroOffset);
69 
70   // Finalize public symbol buckets.
71   void finalizeBuckets(uint32_t RecordZeroOffset,
72                        std::vector<BulkPublic> &&Publics);
73 
74   template <typename T> void addSymbol(const T &Symbol, MSFBuilder &Msf) {
75     T Copy(Symbol);
76     addSymbol(SymbolSerializer::writeOneSymbol(Copy, Msf.getAllocator(),
77                                                CodeViewContainer::Pdb));
78   }
79   void addSymbol(const CVSymbol &Symbol);
80 };
81 
82 void GSIHashStreamBuilder::addSymbol(const codeview::CVSymbol &Symbol) {
83   // Ignore duplicate typedefs and constants.
84   if (Symbol.kind() == S_UDT || Symbol.kind() == S_CONSTANT) {
85     auto Iter = SymbolHashes.insert(Symbol);
86     if (!Iter.second)
87       return;
88   }
89   Records.push_back(Symbol);
90 }
91 
92 namespace {
93 LLVM_PACKED_START
94 struct PublicSym32Layout {
95   RecordPrefix Prefix;
96   PublicSym32Header Pub;
97   // char Name[];
98 };
99 LLVM_PACKED_END
100 } // namespace
101 
102 // Calculate how much memory this public needs when serialized.
103 static uint32_t sizeOfPublic(const BulkPublic &Pub) {
104   uint32_t NameLen = Pub.NameLen;
105   NameLen = std::min(NameLen,
106                      uint32_t(MaxRecordLength - sizeof(PublicSym32Layout) - 1));
107   return alignTo(sizeof(PublicSym32Layout) + NameLen + 1, 4);
108 }
109 
110 static CVSymbol serializePublic(uint8_t *Mem, const BulkPublic &Pub) {
111   // Assume the caller has allocated sizeOfPublic bytes.
112   uint32_t NameLen = std::min(
113       Pub.NameLen, uint32_t(MaxRecordLength - sizeof(PublicSym32Layout) - 1));
114   size_t Size = alignTo(sizeof(PublicSym32Layout) + NameLen + 1, 4);
115   assert(Size == sizeOfPublic(Pub));
116   auto *FixedMem = reinterpret_cast<PublicSym32Layout *>(Mem);
117   FixedMem->Prefix.RecordKind = static_cast<uint16_t>(codeview::S_PUB32);
118   FixedMem->Prefix.RecordLen = static_cast<uint16_t>(Size - 2);
119   FixedMem->Pub.Flags = Pub.Flags;
120   FixedMem->Pub.Offset = Pub.Offset;
121   FixedMem->Pub.Segment = Pub.U.Segment;
122   char *NameMem = reinterpret_cast<char *>(FixedMem + 1);
123   memcpy(NameMem, Pub.Name, NameLen);
124   // Zero the null terminator and remaining bytes.
125   memset(&NameMem[NameLen], 0, Size - sizeof(PublicSym32Layout) - NameLen);
126   return CVSymbol(makeArrayRef(reinterpret_cast<uint8_t *>(Mem), Size));
127 }
128 
129 uint32_t GSIHashStreamBuilder::calculateSerializedLength() const {
130   uint32_t Size = sizeof(GSIHashHeader);
131   Size += HashRecords.size() * sizeof(PSHashRecord);
132   Size += HashBitmap.size() * sizeof(uint32_t);
133   Size += HashBuckets.size() * sizeof(uint32_t);
134   return Size;
135 }
136 
137 uint32_t GSIHashStreamBuilder::calculateRecordByteSize() const {
138   uint32_t Size = 0;
139   for (const auto &Sym : Records)
140     Size += Sym.length();
141   return Size;
142 }
143 
144 Error GSIHashStreamBuilder::commit(BinaryStreamWriter &Writer) {
145   GSIHashHeader Header;
146   Header.VerSignature = GSIHashHeader::HdrSignature;
147   Header.VerHdr = GSIHashHeader::HdrVersion;
148   Header.HrSize = HashRecords.size() * sizeof(PSHashRecord);
149   Header.NumBuckets = HashBitmap.size() * 4 + HashBuckets.size() * 4;
150 
151   if (auto EC = Writer.writeObject(Header))
152     return EC;
153 
154   if (auto EC = Writer.writeArray(makeArrayRef(HashRecords)))
155     return EC;
156   if (auto EC = Writer.writeArray(makeArrayRef(HashBitmap)))
157     return EC;
158   if (auto EC = Writer.writeArray(makeArrayRef(HashBuckets)))
159     return EC;
160   return Error::success();
161 }
162 
163 static bool isAsciiString(StringRef S) {
164   return llvm::all_of(S, [](char C) { return unsigned(C) < 0x80; });
165 }
166 
167 // See `caseInsensitiveComparePchPchCchCch` in gsi.cpp
168 static int gsiRecordCmp(StringRef S1, StringRef S2) {
169   size_t LS = S1.size();
170   size_t RS = S2.size();
171   // Shorter strings always compare less than longer strings.
172   if (LS != RS)
173     return LS - RS;
174 
175   // If either string contains non ascii characters, memcmp them.
176   if (LLVM_UNLIKELY(!isAsciiString(S1) || !isAsciiString(S2)))
177     return memcmp(S1.data(), S2.data(), LS);
178 
179   // Both strings are ascii, perform a case-insenstive comparison.
180   return S1.compare_lower(S2.data());
181 }
182 
183 void GSIHashStreamBuilder::finalizeBuckets(uint32_t RecordZeroOffset) {
184   // Build up a list of globals to be bucketed. This repurposes the BulkPublic
185   // struct with different meanings for the fields to avoid reallocating a new
186   // vector during public symbol table hash construction.
187   std::vector<BulkPublic> Globals;
188   Globals.resize(Records.size());
189   uint32_t SymOffset = RecordZeroOffset;
190   for (size_t I = 0, E = Records.size(); I < E; ++I) {
191     StringRef Name = getSymbolName(Records[I]);
192     Globals[I].Name = Name.data();
193     Globals[I].NameLen = Name.size();
194     Globals[I].SymOffset = SymOffset;
195     SymOffset += Records[I].length();
196   }
197 
198   finalizeBuckets(RecordZeroOffset, std::move(Globals));
199 }
200 
201 void GSIHashStreamBuilder::finalizeBuckets(uint32_t RecordZeroOffset,
202                                            std::vector<BulkPublic> &&Globals) {
203   // Hash every name in parallel. The Segment field is no longer needed, so
204   // store the BucketIdx in a union.
205   parallelForEachN(0, Globals.size(), [&](size_t I) {
206     Globals[I].U.BucketIdx = hashStringV1(Globals[I].Name) % IPHR_HASH;
207   });
208 
209   // Parallel sort by bucket index, then name within the buckets. Within the
210   // buckets, sort each bucket by memcmp of the symbol's name.  It's important
211   // that we use the same sorting algorithm as is used by the reference
212   // implementation to ensure that the search for a record within a bucket can
213   // properly early-out when it detects the record won't be found.  The
214   // algorithm used here corredsponds to the function
215   // caseInsensitiveComparePchPchCchCch in the reference implementation.
216   auto BucketCmp = [](const BulkPublic &L, const BulkPublic &R) {
217     if (L.U.BucketIdx != R.U.BucketIdx)
218       return L.U.BucketIdx < R.U.BucketIdx;
219     int Cmp = gsiRecordCmp(L.getName(), R.getName());
220     if (Cmp != 0)
221       return Cmp < 0;
222     // This comparison is necessary to make the sorting stable in the presence
223     // of two static globals with the same name. The easiest way to observe
224     // this is with S_LDATA32 records.
225     return L.SymOffset < R.SymOffset;
226   };
227   parallelSort(Globals, BucketCmp);
228 
229   // Zero out the bucket index bitmap.
230   for (ulittle32_t &Word : HashBitmap)
231     Word = 0;
232 
233   // Compute the three tables: the hash records in bucket and chain order, the
234   // bucket presence bitmap, and the bucket chain start offsets.
235   HashRecords.reserve(Globals.size());
236   uint32_t LastBucketIdx = ~0U;
237   for (const BulkPublic &Global : Globals) {
238     // If this is a new bucket, add it to the bitmap and the start offset map.
239     uint32_t BucketIdx = Global.U.BucketIdx;
240     if (LastBucketIdx != BucketIdx) {
241       HashBitmap[BucketIdx / 32] |= 1U << (BucketIdx % 32);
242 
243       // Calculate what the offset of the first hash record in the chain would
244       // be if it were inflated to contain 32-bit pointers. On a 32-bit system,
245       // each record would be 12 bytes. See HROffsetCalc in gsi.h.
246       const int SizeOfHROffsetCalc = 12;
247       ulittle32_t ChainStartOff =
248           ulittle32_t(HashRecords.size() * SizeOfHROffsetCalc);
249       HashBuckets.push_back(ChainStartOff);
250       LastBucketIdx = BucketIdx;
251     }
252 
253     // Create the hash record. Add one when writing symbol offsets to disk.
254     // See GSI1::fixSymRecs. Always use a refcount of 1 for now.
255     PSHashRecord HRec;
256     HRec.Off = Global.SymOffset + 1;
257     HRec.CRef = 1;
258     HashRecords.push_back(HRec);
259   }
260 }
261 
262 GSIStreamBuilder::GSIStreamBuilder(msf::MSFBuilder &Msf)
263     : Msf(Msf), PSH(std::make_unique<GSIHashStreamBuilder>()),
264       GSH(std::make_unique<GSIHashStreamBuilder>()) {}
265 
266 GSIStreamBuilder::~GSIStreamBuilder() {}
267 
268 uint32_t GSIStreamBuilder::calculatePublicsHashStreamSize() const {
269   uint32_t Size = 0;
270   Size += sizeof(PublicsStreamHeader);
271   Size += PSH->calculateSerializedLength();
272   Size += PubAddrMap.size() * sizeof(uint32_t); // AddrMap
273   // FIXME: Add thunk map and section offsets for incremental linking.
274 
275   return Size;
276 }
277 
278 uint32_t GSIStreamBuilder::calculateGlobalsHashStreamSize() const {
279   return GSH->calculateSerializedLength();
280 }
281 
282 Error GSIStreamBuilder::finalizeMsfLayout() {
283   // First we write public symbol records, then we write global symbol records.
284   uint32_t PublicsSize = PSH->calculateRecordByteSize();
285   uint32_t GlobalsSize = GSH->calculateRecordByteSize();
286   GSH->finalizeBuckets(PublicsSize);
287 
288   Expected<uint32_t> Idx = Msf.addStream(calculateGlobalsHashStreamSize());
289   if (!Idx)
290     return Idx.takeError();
291   GlobalsStreamIndex = *Idx;
292 
293   Idx = Msf.addStream(calculatePublicsHashStreamSize());
294   if (!Idx)
295     return Idx.takeError();
296   PublicsStreamIndex = *Idx;
297 
298   uint32_t RecordBytes = PublicsSize + GlobalsSize;
299 
300   Idx = Msf.addStream(RecordBytes);
301   if (!Idx)
302     return Idx.takeError();
303   RecordStreamIndex = *Idx;
304   return Error::success();
305 }
306 
307 void GSIStreamBuilder::addPublicSymbols(std::vector<BulkPublic> &&Publics) {
308   // Sort the symbols by name. PDBs contain lots of symbols, so use parallelism.
309   parallelSort(Publics, [](const BulkPublic &L, const BulkPublic &R) {
310     return L.getName() < R.getName();
311   });
312 
313   // Assign offsets and allocate one contiguous block of memory for all public
314   // symbols.
315   uint32_t SymOffset = 0;
316   for (BulkPublic &Pub : Publics) {
317     Pub.SymOffset = SymOffset;
318     SymOffset += sizeOfPublic(Pub);
319   }
320   uint8_t *Mem =
321       reinterpret_cast<uint8_t *>(Msf.getAllocator().Allocate(SymOffset, 4));
322 
323   // Instead of storing individual CVSymbol records, store them as one giant
324   // buffer.
325   // FIXME: This is kind of a hack. This makes Records.size() wrong, and we have
326   // to account for that elsewhere.
327   PSH->Records.push_back(CVSymbol(makeArrayRef(Mem, SymOffset)));
328 
329   // Serialize them in parallel.
330   parallelForEachN(0, Publics.size(), [&](size_t I) {
331     const BulkPublic &Pub = Publics[I];
332     serializePublic(Mem + Pub.SymOffset, Pub);
333   });
334 
335   // Re-sort the publics by address so we can build the address map. We no
336   // longer need the original ordering.
337   auto AddrCmp = [](const BulkPublic &L, const BulkPublic &R) {
338     if (L.U.Segment != R.U.Segment)
339       return L.U.Segment < R.U.Segment;
340     if (L.Offset != R.Offset)
341       return L.Offset < R.Offset;
342     // parallelSort is unstable, so we have to do name comparison to ensure
343     // that two names for the same location come out in a determinstic order.
344     return L.getName() < R.getName();
345   };
346   parallelSort(Publics, AddrCmp);
347 
348   // Fill in the symbol offsets in the appropriate order.
349   PubAddrMap.reserve(Publics.size());
350   for (const BulkPublic &Pub : Publics)
351     PubAddrMap.push_back(ulittle32_t(Pub.SymOffset));
352 
353   // Finalize public symbol buckets immediately after they have been added.
354   // They should all be warm in the cache at this point, so go ahead and do it
355   // now.
356   PSH->finalizeBuckets(0, std::move(Publics));
357 }
358 
359 void GSIStreamBuilder::addGlobalSymbol(const ProcRefSym &Sym) {
360   GSH->addSymbol(Sym, Msf);
361 }
362 
363 void GSIStreamBuilder::addGlobalSymbol(const DataSym &Sym) {
364   GSH->addSymbol(Sym, Msf);
365 }
366 
367 void GSIStreamBuilder::addGlobalSymbol(const ConstantSym &Sym) {
368   GSH->addSymbol(Sym, Msf);
369 }
370 
371 void GSIStreamBuilder::addGlobalSymbol(const codeview::CVSymbol &Sym) {
372   GSH->addSymbol(Sym);
373 }
374 
375 static Error writeRecords(BinaryStreamWriter &Writer,
376                           ArrayRef<CVSymbol> Records) {
377   BinaryItemStream<CVSymbol> ItemStream(support::endianness::little);
378   ItemStream.setItems(Records);
379   BinaryStreamRef RecordsRef(ItemStream);
380   return Writer.writeStreamRef(RecordsRef);
381 }
382 
383 Error GSIStreamBuilder::commitSymbolRecordStream(
384     WritableBinaryStreamRef Stream) {
385   BinaryStreamWriter Writer(Stream);
386 
387   // Write public symbol records first, followed by global symbol records.  This
388   // must match the order that we assume in finalizeMsfLayout when computing
389   // PSHZero and GSHZero.
390   if (auto EC = writeRecords(Writer, PSH->Records))
391     return EC;
392   if (auto EC = writeRecords(Writer, GSH->Records))
393     return EC;
394 
395   return Error::success();
396 }
397 
398 Error GSIStreamBuilder::commitPublicsHashStream(
399     WritableBinaryStreamRef Stream) {
400   BinaryStreamWriter Writer(Stream);
401   PublicsStreamHeader Header;
402 
403   // FIXME: Fill these in. They are for incremental linking.
404   Header.SymHash = PSH->calculateSerializedLength();
405   Header.AddrMap = PubAddrMap.size() * 4;
406   Header.NumThunks = 0;
407   Header.SizeOfThunk = 0;
408   Header.ISectThunkTable = 0;
409   memset(Header.Padding, 0, sizeof(Header.Padding));
410   Header.OffThunkTable = 0;
411   Header.NumSections = 0;
412   if (auto EC = Writer.writeObject(Header))
413     return EC;
414 
415   if (auto EC = PSH->commit(Writer))
416     return EC;
417 
418   if (auto EC = Writer.writeArray(makeArrayRef(PubAddrMap)))
419     return EC;
420 
421   return Error::success();
422 }
423 
424 Error GSIStreamBuilder::commitGlobalsHashStream(
425     WritableBinaryStreamRef Stream) {
426   BinaryStreamWriter Writer(Stream);
427   return GSH->commit(Writer);
428 }
429 
430 Error GSIStreamBuilder::commit(const msf::MSFLayout &Layout,
431                                WritableBinaryStreamRef Buffer) {
432   auto GS = WritableMappedBlockStream::createIndexedStream(
433       Layout, Buffer, getGlobalsStreamIndex(), Msf.getAllocator());
434   auto PS = WritableMappedBlockStream::createIndexedStream(
435       Layout, Buffer, getPublicsStreamIndex(), Msf.getAllocator());
436   auto PRS = WritableMappedBlockStream::createIndexedStream(
437       Layout, Buffer, getRecordStreamIndex(), Msf.getAllocator());
438 
439   if (auto EC = commitSymbolRecordStream(*PRS))
440     return EC;
441   if (auto EC = commitGlobalsHashStream(*GS))
442     return EC;
443   if (auto EC = commitPublicsHashStream(*PS))
444     return EC;
445   return Error::success();
446 }
447