1 //===- DbiStreamBuilder.cpp - PDB Dbi Stream Creation -----------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h"
10 
11 #include "llvm/ADT/DenseSet.h"
12 #include "llvm/DebugInfo/CodeView/RecordName.h"
13 #include "llvm/DebugInfo/CodeView/SymbolDeserializer.h"
14 #include "llvm/DebugInfo/CodeView/SymbolRecord.h"
15 #include "llvm/DebugInfo/CodeView/SymbolSerializer.h"
16 #include "llvm/DebugInfo/MSF/MSFBuilder.h"
17 #include "llvm/DebugInfo/MSF/MSFCommon.h"
18 #include "llvm/DebugInfo/MSF/MappedBlockStream.h"
19 #include "llvm/DebugInfo/PDB/Native/GlobalsStream.h"
20 #include "llvm/DebugInfo/PDB/Native/Hash.h"
21 #include "llvm/Support/BinaryItemStream.h"
22 #include "llvm/Support/BinaryStreamWriter.h"
23 #include "llvm/Support/xxhash.h"
24 #include <algorithm>
25 #include <vector>
26 
27 using namespace llvm;
28 using namespace llvm::msf;
29 using namespace llvm::pdb;
30 using namespace llvm::codeview;
31 
32 struct llvm::pdb::GSIHashStreamBuilder {
33   struct SymbolDenseMapInfo {
34     static inline CVSymbol getEmptyKey() {
35       static CVSymbol Empty;
36       return Empty;
37     }
38     static inline CVSymbol getTombstoneKey() {
39       static CVSymbol Tombstone(
40           DenseMapInfo<ArrayRef<uint8_t>>::getTombstoneKey());
41       return Tombstone;
42     }
43     static unsigned getHashValue(const CVSymbol &Val) {
44       return xxHash64(Val.RecordData);
45     }
46     static bool isEqual(const CVSymbol &LHS, const CVSymbol &RHS) {
47       return LHS.RecordData == RHS.RecordData;
48     }
49   };
50 
51   std::vector<CVSymbol> Records;
52   llvm::DenseSet<CVSymbol, SymbolDenseMapInfo> SymbolHashes;
53   std::vector<PSHashRecord> HashRecords;
54   std::array<support::ulittle32_t, (IPHR_HASH + 32) / 32> HashBitmap;
55   std::vector<support::ulittle32_t> HashBuckets;
56 
57   uint32_t calculateSerializedLength() const;
58   uint32_t calculateRecordByteSize() const;
59   Error commit(BinaryStreamWriter &Writer);
60   void finalizeBuckets(uint32_t RecordZeroOffset);
61 
62   template <typename T> void addSymbol(const T &Symbol, MSFBuilder &Msf) {
63     T Copy(Symbol);
64     addSymbol(SymbolSerializer::writeOneSymbol(Copy, Msf.getAllocator(),
65                                                CodeViewContainer::Pdb));
66   }
67   void addSymbol(const CVSymbol &Symbol) {
68     if (Symbol.kind() == S_UDT || Symbol.kind() == S_CONSTANT) {
69       auto Iter = SymbolHashes.insert(Symbol);
70       if (!Iter.second)
71         return;
72     }
73 
74     Records.push_back(Symbol);
75   }
76 };
77 
78 uint32_t GSIHashStreamBuilder::calculateSerializedLength() const {
79   uint32_t Size = sizeof(GSIHashHeader);
80   Size += HashRecords.size() * sizeof(PSHashRecord);
81   Size += HashBitmap.size() * sizeof(uint32_t);
82   Size += HashBuckets.size() * sizeof(uint32_t);
83   return Size;
84 }
85 
86 uint32_t GSIHashStreamBuilder::calculateRecordByteSize() const {
87   uint32_t Size = 0;
88   for (const auto &Sym : Records)
89     Size += Sym.length();
90   return Size;
91 }
92 
93 Error GSIHashStreamBuilder::commit(BinaryStreamWriter &Writer) {
94   GSIHashHeader Header;
95   Header.VerSignature = GSIHashHeader::HdrSignature;
96   Header.VerHdr = GSIHashHeader::HdrVersion;
97   Header.HrSize = HashRecords.size() * sizeof(PSHashRecord);
98   Header.NumBuckets = HashBitmap.size() * 4 + HashBuckets.size() * 4;
99 
100   if (auto EC = Writer.writeObject(Header))
101     return EC;
102 
103   if (auto EC = Writer.writeArray(makeArrayRef(HashRecords)))
104     return EC;
105   if (auto EC = Writer.writeArray(makeArrayRef(HashBitmap)))
106     return EC;
107   if (auto EC = Writer.writeArray(makeArrayRef(HashBuckets)))
108     return EC;
109   return Error::success();
110 }
111 
112 static bool isAsciiString(StringRef S) {
113   return llvm::all_of(S, [](char C) { return unsigned(C) < 0x80; });
114 }
115 
116 // See `caseInsensitiveComparePchPchCchCch` in gsi.cpp
117 static bool gsiRecordLess(StringRef S1, StringRef S2) {
118   size_t LS = S1.size();
119   size_t RS = S2.size();
120   // Shorter strings always compare less than longer strings.
121   if (LS != RS)
122     return LS < RS;
123 
124   // If either string contains non ascii characters, memcmp them.
125   if (LLVM_UNLIKELY(!isAsciiString(S1) || !isAsciiString(S2)))
126     return memcmp(S1.data(), S2.data(), LS) < 0;
127 
128   // Both strings are ascii, perform a case-insenstive comparison.
129   return S1.compare_lower(S2.data()) < 0;
130 }
131 
132 void GSIHashStreamBuilder::finalizeBuckets(uint32_t RecordZeroOffset) {
133   std::array<std::vector<std::pair<StringRef, PSHashRecord>>, IPHR_HASH + 1>
134       TmpBuckets;
135   uint32_t SymOffset = RecordZeroOffset;
136   for (const CVSymbol &Sym : Records) {
137     PSHashRecord HR;
138     // Add one when writing symbol offsets to disk. See GSI1::fixSymRecs.
139     HR.Off = SymOffset + 1;
140     HR.CRef = 1; // Always use a refcount of 1.
141 
142     // Hash the name to figure out which bucket this goes into.
143     StringRef Name = getSymbolName(Sym);
144     size_t BucketIdx = hashStringV1(Name) % IPHR_HASH;
145     TmpBuckets[BucketIdx].push_back(std::make_pair(Name, HR));
146     SymOffset += Sym.length();
147   }
148 
149   // Compute the three tables: the hash records in bucket and chain order, the
150   // bucket presence bitmap, and the bucket chain start offsets.
151   HashRecords.reserve(Records.size());
152   for (ulittle32_t &Word : HashBitmap)
153     Word = 0;
154   for (size_t BucketIdx = 0; BucketIdx < IPHR_HASH + 1; ++BucketIdx) {
155     auto &Bucket = TmpBuckets[BucketIdx];
156     if (Bucket.empty())
157       continue;
158     HashBitmap[BucketIdx / 32] |= 1U << (BucketIdx % 32);
159 
160     // Calculate what the offset of the first hash record in the chain would
161     // be if it were inflated to contain 32-bit pointers. On a 32-bit system,
162     // each record would be 12 bytes. See HROffsetCalc in gsi.h.
163     const int SizeOfHROffsetCalc = 12;
164     ulittle32_t ChainStartOff =
165         ulittle32_t(HashRecords.size() * SizeOfHROffsetCalc);
166     HashBuckets.push_back(ChainStartOff);
167 
168     // Sort each bucket by memcmp of the symbol's name.  It's important that
169     // we use the same sorting algorithm as is used by the reference
170     // implementation to ensure that the search for a record within a bucket
171     // can properly early-out when it detects the record won't be found.  The
172     // algorithm used here corredsponds to the function
173     // caseInsensitiveComparePchPchCchCch in the reference implementation.
174     llvm::sort(Bucket, [](const std::pair<StringRef, PSHashRecord> &Left,
175                           const std::pair<StringRef, PSHashRecord> &Right) {
176       return gsiRecordLess(Left.first, Right.first);
177     });
178 
179     for (const auto &Entry : Bucket)
180       HashRecords.push_back(Entry.second);
181   }
182 }
183 
184 GSIStreamBuilder::GSIStreamBuilder(msf::MSFBuilder &Msf)
185     : Msf(Msf), PSH(std::make_unique<GSIHashStreamBuilder>()),
186       GSH(std::make_unique<GSIHashStreamBuilder>()) {}
187 
188 GSIStreamBuilder::~GSIStreamBuilder() {}
189 
190 uint32_t GSIStreamBuilder::calculatePublicsHashStreamSize() const {
191   uint32_t Size = 0;
192   Size += sizeof(PublicsStreamHeader);
193   Size += PSH->calculateSerializedLength();
194   Size += PSH->Records.size() * sizeof(uint32_t); // AddrMap
195   // FIXME: Add thunk map and section offsets for incremental linking.
196 
197   return Size;
198 }
199 
200 uint32_t GSIStreamBuilder::calculateGlobalsHashStreamSize() const {
201   return GSH->calculateSerializedLength();
202 }
203 
204 Error GSIStreamBuilder::finalizeMsfLayout() {
205   // First we write public symbol records, then we write global symbol records.
206   uint32_t PSHZero = 0;
207   uint32_t GSHZero = PSH->calculateRecordByteSize();
208 
209   PSH->finalizeBuckets(PSHZero);
210   GSH->finalizeBuckets(GSHZero);
211 
212   Expected<uint32_t> Idx = Msf.addStream(calculateGlobalsHashStreamSize());
213   if (!Idx)
214     return Idx.takeError();
215   GlobalsStreamIndex = *Idx;
216 
217   Idx = Msf.addStream(calculatePublicsHashStreamSize());
218   if (!Idx)
219     return Idx.takeError();
220   PublicsStreamIndex = *Idx;
221 
222   uint32_t RecordBytes =
223       GSH->calculateRecordByteSize() + PSH->calculateRecordByteSize();
224 
225   Idx = Msf.addStream(RecordBytes);
226   if (!Idx)
227     return Idx.takeError();
228   RecordStreamIndex = *Idx;
229   return Error::success();
230 }
231 
232 static StringRef extractPubSym(const CVSymbol *Sym, uint16_t &Seg,
233                                uint32_t &Offset) {
234   ArrayRef<uint8_t> Buf = Sym->content();
235   assert(Buf.size() > sizeof(PublicSym32Header));
236   const auto *Hdr = reinterpret_cast<const PublicSym32Header *>(Buf.data());
237   Buf = Buf.drop_front(sizeof(PublicSym32Header));
238   Seg = Hdr->Segment;
239   Offset = Hdr->Offset;
240   // Don't worry about finding the null terminator, since the strings will be
241   // compared later.
242   return StringRef(reinterpret_cast<const char *>(Buf.data()), Buf.size());
243 }
244 
245 static bool comparePubSymByAddrAndName(const CVSymbol *LS, const CVSymbol *RS) {
246   uint16_t LSeg, RSeg;
247   uint32_t LOff, ROff;
248   StringRef LName, RName;
249   LName = extractPubSym(LS, LSeg, LOff);
250   RName = extractPubSym(RS, RSeg, ROff);
251   if (LSeg != RSeg)
252     return LSeg < RSeg;
253   if (LOff != ROff)
254     return LOff < ROff;
255   return LName < RName;
256 }
257 
258 /// Compute the address map. The address map is an array of symbol offsets
259 /// sorted so that it can be binary searched by address.
260 static std::vector<ulittle32_t> computeAddrMap(ArrayRef<CVSymbol> Records) {
261   // Make a vector of pointers to the symbols so we can sort it by address.
262   // Also gather the symbol offsets while we're at it.
263 
264   std::vector<const CVSymbol *> PublicsByAddr;
265   std::vector<uint32_t> SymOffsets;
266   PublicsByAddr.reserve(Records.size());
267   SymOffsets.reserve(Records.size());
268 
269   uint32_t SymOffset = 0;
270   for (const CVSymbol &Sym : Records) {
271     assert(Sym.kind() == SymbolKind::S_PUB32);
272     PublicsByAddr.push_back(&Sym);
273     SymOffsets.push_back(SymOffset);
274     SymOffset += Sym.length();
275   }
276   llvm::stable_sort(PublicsByAddr, comparePubSymByAddrAndName);
277 
278   // Fill in the symbol offsets in the appropriate order.
279   std::vector<ulittle32_t> AddrMap;
280   AddrMap.reserve(Records.size());
281   for (const CVSymbol *Sym : PublicsByAddr) {
282     ptrdiff_t Idx = std::distance(Records.data(), Sym);
283     assert(Idx >= 0 && size_t(Idx) < Records.size());
284     AddrMap.push_back(ulittle32_t(SymOffsets[Idx]));
285   }
286   return AddrMap;
287 }
288 
289 void GSIStreamBuilder::addPublicSymbol(const PublicSym32 &Pub) {
290   PSH->addSymbol(Pub, Msf);
291 }
292 
293 void GSIStreamBuilder::addGlobalSymbol(const ProcRefSym &Sym) {
294   GSH->addSymbol(Sym, Msf);
295 }
296 
297 void GSIStreamBuilder::addGlobalSymbol(const DataSym &Sym) {
298   GSH->addSymbol(Sym, Msf);
299 }
300 
301 void GSIStreamBuilder::addGlobalSymbol(const ConstantSym &Sym) {
302   GSH->addSymbol(Sym, Msf);
303 }
304 
305 void GSIStreamBuilder::addGlobalSymbol(const codeview::CVSymbol &Sym) {
306   GSH->addSymbol(Sym);
307 }
308 
309 static Error writeRecords(BinaryStreamWriter &Writer,
310                           ArrayRef<CVSymbol> Records) {
311   BinaryItemStream<CVSymbol> ItemStream(support::endianness::little);
312   ItemStream.setItems(Records);
313   BinaryStreamRef RecordsRef(ItemStream);
314   return Writer.writeStreamRef(RecordsRef);
315 }
316 
317 Error GSIStreamBuilder::commitSymbolRecordStream(
318     WritableBinaryStreamRef Stream) {
319   BinaryStreamWriter Writer(Stream);
320 
321   // Write public symbol records first, followed by global symbol records.  This
322   // must match the order that we assume in finalizeMsfLayout when computing
323   // PSHZero and GSHZero.
324   if (auto EC = writeRecords(Writer, PSH->Records))
325     return EC;
326   if (auto EC = writeRecords(Writer, GSH->Records))
327     return EC;
328 
329   return Error::success();
330 }
331 
332 Error GSIStreamBuilder::commitPublicsHashStream(
333     WritableBinaryStreamRef Stream) {
334   BinaryStreamWriter Writer(Stream);
335   PublicsStreamHeader Header;
336 
337   // FIXME: Fill these in. They are for incremental linking.
338   Header.SymHash = PSH->calculateSerializedLength();
339   Header.AddrMap = PSH->Records.size() * 4;
340   Header.NumThunks = 0;
341   Header.SizeOfThunk = 0;
342   Header.ISectThunkTable = 0;
343   memset(Header.Padding, 0, sizeof(Header.Padding));
344   Header.OffThunkTable = 0;
345   Header.NumSections = 0;
346   if (auto EC = Writer.writeObject(Header))
347     return EC;
348 
349   if (auto EC = PSH->commit(Writer))
350     return EC;
351 
352   std::vector<ulittle32_t> AddrMap = computeAddrMap(PSH->Records);
353   if (auto EC = Writer.writeArray(makeArrayRef(AddrMap)))
354     return EC;
355 
356   return Error::success();
357 }
358 
359 Error GSIStreamBuilder::commitGlobalsHashStream(
360     WritableBinaryStreamRef Stream) {
361   BinaryStreamWriter Writer(Stream);
362   return GSH->commit(Writer);
363 }
364 
365 Error GSIStreamBuilder::commit(const msf::MSFLayout &Layout,
366                                WritableBinaryStreamRef Buffer) {
367   auto GS = WritableMappedBlockStream::createIndexedStream(
368       Layout, Buffer, getGlobalsStreamIndex(), Msf.getAllocator());
369   auto PS = WritableMappedBlockStream::createIndexedStream(
370       Layout, Buffer, getPublicsStreamIndex(), Msf.getAllocator());
371   auto PRS = WritableMappedBlockStream::createIndexedStream(
372       Layout, Buffer, getRecordStreamIndex(), Msf.getAllocator());
373 
374   if (auto EC = commitSymbolRecordStream(*PRS))
375     return EC;
376   if (auto EC = commitGlobalsHashStream(*GS))
377     return EC;
378   if (auto EC = commitPublicsHashStream(*PS))
379     return EC;
380   return Error::success();
381 }
382