1 //===-- Serialization.cpp - Binary serialization of index data ------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "Serialization.h"
10 #include "Headers.h"
11 #include "RIFF.h"
12 #include "index/MemIndex.h"
13 #include "index/SymbolLocation.h"
14 #include "index/SymbolOrigin.h"
15 #include "index/dex/Dex.h"
16 #include "support/Logger.h"
17 #include "support/Trace.h"
18 #include "clang/Tooling/CompilationDatabase.h"
19 #include "llvm/ADT/StringRef.h"
20 #include "llvm/Support/Compiler.h"
21 #include "llvm/Support/Compression.h"
22 #include "llvm/Support/Endian.h"
23 #include "llvm/Support/Error.h"
24 #include "llvm/Support/raw_ostream.h"
25 #include <cstdint>
26 #include <vector>
27 
28 namespace clang {
29 namespace clangd {
30 namespace {
31 
32 // IO PRIMITIVES
33 // We use little-endian 32 bit ints, sometimes with variable-length encoding.
34 //
35 // Variable-length int encoding (varint) uses the bottom 7 bits of each byte
36 // to encode the number, and the top bit to indicate whether more bytes follow.
37 // e.g. 9a 2f means [0x1a and keep reading, 0x2f and stop].
38 // This represents 0x1a | 0x2f<<7 = 6042.
39 // A 32-bit integer takes 1-5 bytes to encode; small numbers are more compact.
40 
41 // Reads binary data from a StringRef, and keeps track of position.
42 class Reader {
43   const char *Begin, *End;
44   bool Err = false;
45 
46 public:
Reader(llvm::StringRef Data)47   Reader(llvm::StringRef Data) : Begin(Data.begin()), End(Data.end()) {}
48   // The "error" bit is set by reading past EOF or reading invalid data.
49   // When in an error state, reads may return zero values: callers should check.
err() const50   bool err() const { return Err; }
51   // Did we read all the data, or encounter an error?
eof() const52   bool eof() const { return Begin == End || Err; }
53   // All the data we didn't read yet.
rest() const54   llvm::StringRef rest() const { return llvm::StringRef(Begin, End - Begin); }
55 
consume8()56   uint8_t consume8() {
57     if (LLVM_UNLIKELY(Begin == End)) {
58       Err = true;
59       return 0;
60     }
61     return *Begin++;
62   }
63 
consume32()64   uint32_t consume32() {
65     if (LLVM_UNLIKELY(Begin + 4 > End)) {
66       Err = true;
67       return 0;
68     }
69     auto Ret = llvm::support::endian::read32le(Begin);
70     Begin += 4;
71     return Ret;
72   }
73 
consume(int N)74   llvm::StringRef consume(int N) {
75     if (LLVM_UNLIKELY(Begin + N > End)) {
76       Err = true;
77       return llvm::StringRef();
78     }
79     llvm::StringRef Ret(Begin, N);
80     Begin += N;
81     return Ret;
82   }
83 
consumeVar()84   uint32_t consumeVar() {
85     constexpr static uint8_t More = 1 << 7;
86 
87     // Use a 32 bit unsigned here to prevent promotion to signed int (unless int
88     // is wider than 32 bits).
89     uint32_t B = consume8();
90     if (LLVM_LIKELY(!(B & More)))
91       return B;
92     uint32_t Val = B & ~More;
93     for (int Shift = 7; B & More && Shift < 32; Shift += 7) {
94       B = consume8();
95       // 5th byte of a varint can only have lowest 4 bits set.
96       assert((Shift != 28 || B == (B & 0x0f)) && "Invalid varint encoding");
97       Val |= (B & ~More) << Shift;
98     }
99     return Val;
100   }
101 
consumeString(llvm::ArrayRef<llvm::StringRef> Strings)102   llvm::StringRef consumeString(llvm::ArrayRef<llvm::StringRef> Strings) {
103     auto StringIndex = consumeVar();
104     if (LLVM_UNLIKELY(StringIndex >= Strings.size())) {
105       Err = true;
106       return llvm::StringRef();
107     }
108     return Strings[StringIndex];
109   }
110 
consumeID()111   SymbolID consumeID() {
112     llvm::StringRef Raw = consume(SymbolID::RawSize); // short if truncated.
113     return LLVM_UNLIKELY(err()) ? SymbolID() : SymbolID::fromRaw(Raw);
114   }
115 
116   // Read a varint (as consumeVar) and resize the container accordingly.
117   // If the size is invalid, return false and mark an error.
118   // (The caller should abort in this case).
consumeSize(T & Container)119   template <typename T> LLVM_NODISCARD bool consumeSize(T &Container) {
120     auto Size = consumeVar();
121     // Conservatively assume each element is at least one byte.
122     if (Size > (size_t)(End - Begin)) {
123       Err = true;
124       return false;
125     }
126     Container.resize(Size);
127     return true;
128   }
129 };
130 
write32(uint32_t I,llvm::raw_ostream & OS)131 void write32(uint32_t I, llvm::raw_ostream &OS) {
132   char Buf[4];
133   llvm::support::endian::write32le(Buf, I);
134   OS.write(Buf, sizeof(Buf));
135 }
136 
writeVar(uint32_t I,llvm::raw_ostream & OS)137 void writeVar(uint32_t I, llvm::raw_ostream &OS) {
138   constexpr static uint8_t More = 1 << 7;
139   if (LLVM_LIKELY(I < 1 << 7)) {
140     OS.write(I);
141     return;
142   }
143   for (;;) {
144     OS.write(I | More);
145     I >>= 7;
146     if (I < 1 << 7) {
147       OS.write(I);
148       return;
149     }
150   }
151 }
152 
153 // STRING TABLE ENCODING
154 // Index data has many string fields, and many strings are identical.
155 // We store each string once, and refer to them by index.
156 //
157 // The string table's format is:
158 //   - UncompressedSize : uint32 (or 0 for no compression)
159 //   - CompressedData   : byte[CompressedSize]
160 //
161 // CompressedData is a zlib-compressed byte[UncompressedSize].
162 // It contains a sequence of null-terminated strings, e.g. "foo\0bar\0".
163 // These are sorted to improve compression.
164 
165 // Maps each string to a canonical representation.
166 // Strings remain owned externally (e.g. by SymbolSlab).
167 class StringTableOut {
168   llvm::DenseSet<llvm::StringRef> Unique;
169   std::vector<llvm::StringRef> Sorted;
170   // Since strings are interned, look up can be by pointer.
171   llvm::DenseMap<std::pair<const char *, size_t>, unsigned> Index;
172 
173 public:
StringTableOut()174   StringTableOut() {
175     // Ensure there's at least one string in the table.
176     // Table size zero is reserved to indicate no compression.
177     Unique.insert("");
178   }
179   // Add a string to the table. Overwrites S if an identical string exists.
intern(llvm::StringRef & S)180   void intern(llvm::StringRef &S) { S = *Unique.insert(S).first; };
181   // Finalize the table and write it to OS. No more strings may be added.
finalize(llvm::raw_ostream & OS)182   void finalize(llvm::raw_ostream &OS) {
183     Sorted = {Unique.begin(), Unique.end()};
184     llvm::sort(Sorted);
185     for (unsigned I = 0; I < Sorted.size(); ++I)
186       Index.try_emplace({Sorted[I].data(), Sorted[I].size()}, I);
187 
188     std::string RawTable;
189     for (llvm::StringRef S : Sorted) {
190       RawTable.append(std::string(S));
191       RawTable.push_back(0);
192     }
193     if (llvm::compression::zlib::isAvailable()) {
194       llvm::SmallVector<uint8_t, 0> Compressed;
195       llvm::compression::zlib::compress(llvm::arrayRefFromStringRef(RawTable),
196                                         Compressed);
197       write32(RawTable.size(), OS);
198       OS << llvm::toStringRef(Compressed);
199     } else {
200       write32(0, OS); // No compression.
201       OS << RawTable;
202     }
203   }
204   // Get the ID of an string, which must be interned. Table must be finalized.
index(llvm::StringRef S) const205   unsigned index(llvm::StringRef S) const {
206     assert(!Sorted.empty() && "table not finalized");
207     assert(Index.count({S.data(), S.size()}) && "string not interned");
208     return Index.find({S.data(), S.size()})->second;
209   }
210 };
211 
212 struct StringTableIn {
213   llvm::BumpPtrAllocator Arena;
214   std::vector<llvm::StringRef> Strings;
215 };
216 
readStringTable(llvm::StringRef Data)217 llvm::Expected<StringTableIn> readStringTable(llvm::StringRef Data) {
218   Reader R(Data);
219   size_t UncompressedSize = R.consume32();
220   if (R.err())
221     return error("Truncated string table");
222 
223   llvm::StringRef Uncompressed;
224   llvm::SmallVector<uint8_t, 0> UncompressedStorage;
225   if (UncompressedSize == 0) // No compression
226     Uncompressed = R.rest();
227   else if (llvm::compression::zlib::isAvailable()) {
228     // Don't allocate a massive buffer if UncompressedSize was corrupted
229     // This is effective for sharded index, but not big monolithic ones, as
230     // once compressed size reaches 4MB nothing can be ruled out.
231     // Theoretical max ratio from https://zlib.net/zlib_tech.html
232     constexpr int MaxCompressionRatio = 1032;
233     if (UncompressedSize / MaxCompressionRatio > R.rest().size())
234       return error("Bad stri table: uncompress {0} -> {1} bytes is implausible",
235                    R.rest().size(), UncompressedSize);
236 
237     if (llvm::Error E = llvm::compression::zlib::uncompress(
238             llvm::arrayRefFromStringRef(R.rest()), UncompressedStorage,
239             UncompressedSize))
240       return std::move(E);
241     Uncompressed = toStringRef(UncompressedStorage);
242   } else
243     return error("Compressed string table, but zlib is unavailable");
244 
245   StringTableIn Table;
246   llvm::StringSaver Saver(Table.Arena);
247   R = Reader(Uncompressed);
248   for (Reader R(Uncompressed); !R.eof();) {
249     auto Len = R.rest().find(0);
250     if (Len == llvm::StringRef::npos)
251       return error("Bad string table: not null terminated");
252     Table.Strings.push_back(Saver.save(R.consume(Len)));
253     R.consume8();
254   }
255   if (R.err())
256     return error("Truncated string table");
257   return std::move(Table);
258 }
259 
260 // SYMBOL ENCODING
261 // Each field of clangd::Symbol is encoded in turn (see implementation).
262 //  - StringRef fields encode as varint (index into the string table)
263 //  - enums encode as the underlying type
264 //  - most numbers encode as varint
265 
writeLocation(const SymbolLocation & Loc,const StringTableOut & Strings,llvm::raw_ostream & OS)266 void writeLocation(const SymbolLocation &Loc, const StringTableOut &Strings,
267                    llvm::raw_ostream &OS) {
268   writeVar(Strings.index(Loc.FileURI), OS);
269   for (const auto &Endpoint : {Loc.Start, Loc.End}) {
270     writeVar(Endpoint.line(), OS);
271     writeVar(Endpoint.column(), OS);
272   }
273 }
274 
readLocation(Reader & Data,llvm::ArrayRef<llvm::StringRef> Strings)275 SymbolLocation readLocation(Reader &Data,
276                             llvm::ArrayRef<llvm::StringRef> Strings) {
277   SymbolLocation Loc;
278   Loc.FileURI = Data.consumeString(Strings).data();
279   for (auto *Endpoint : {&Loc.Start, &Loc.End}) {
280     Endpoint->setLine(Data.consumeVar());
281     Endpoint->setColumn(Data.consumeVar());
282   }
283   return Loc;
284 }
285 
readIncludeGraphNode(Reader & Data,llvm::ArrayRef<llvm::StringRef> Strings)286 IncludeGraphNode readIncludeGraphNode(Reader &Data,
287                                       llvm::ArrayRef<llvm::StringRef> Strings) {
288   IncludeGraphNode IGN;
289   IGN.Flags = static_cast<IncludeGraphNode::SourceFlag>(Data.consume8());
290   IGN.URI = Data.consumeString(Strings);
291   llvm::StringRef Digest = Data.consume(IGN.Digest.size());
292   std::copy(Digest.bytes_begin(), Digest.bytes_end(), IGN.Digest.begin());
293   if (!Data.consumeSize(IGN.DirectIncludes))
294     return IGN;
295   for (llvm::StringRef &Include : IGN.DirectIncludes)
296     Include = Data.consumeString(Strings);
297   return IGN;
298 }
299 
writeIncludeGraphNode(const IncludeGraphNode & IGN,const StringTableOut & Strings,llvm::raw_ostream & OS)300 void writeIncludeGraphNode(const IncludeGraphNode &IGN,
301                            const StringTableOut &Strings,
302                            llvm::raw_ostream &OS) {
303   OS.write(static_cast<uint8_t>(IGN.Flags));
304   writeVar(Strings.index(IGN.URI), OS);
305   llvm::StringRef Hash(reinterpret_cast<const char *>(IGN.Digest.data()),
306                        IGN.Digest.size());
307   OS << Hash;
308   writeVar(IGN.DirectIncludes.size(), OS);
309   for (llvm::StringRef Include : IGN.DirectIncludes)
310     writeVar(Strings.index(Include), OS);
311 }
312 
writeSymbol(const Symbol & Sym,const StringTableOut & Strings,llvm::raw_ostream & OS)313 void writeSymbol(const Symbol &Sym, const StringTableOut &Strings,
314                  llvm::raw_ostream &OS) {
315   OS << Sym.ID.raw(); // TODO: once we start writing xrefs and posting lists,
316                       // symbol IDs should probably be in a string table.
317   OS.write(static_cast<uint8_t>(Sym.SymInfo.Kind));
318   OS.write(static_cast<uint8_t>(Sym.SymInfo.Lang));
319   writeVar(Strings.index(Sym.Name), OS);
320   writeVar(Strings.index(Sym.Scope), OS);
321   writeVar(Strings.index(Sym.TemplateSpecializationArgs), OS);
322   writeLocation(Sym.Definition, Strings, OS);
323   writeLocation(Sym.CanonicalDeclaration, Strings, OS);
324   writeVar(Sym.References, OS);
325   OS.write(static_cast<uint8_t>(Sym.Flags));
326   writeVar(Strings.index(Sym.Signature), OS);
327   writeVar(Strings.index(Sym.CompletionSnippetSuffix), OS);
328   writeVar(Strings.index(Sym.Documentation), OS);
329   writeVar(Strings.index(Sym.ReturnType), OS);
330   writeVar(Strings.index(Sym.Type), OS);
331 
332   auto WriteInclude = [&](const Symbol::IncludeHeaderWithReferences &Include) {
333     writeVar(Strings.index(Include.IncludeHeader), OS);
334     writeVar(Include.References, OS);
335   };
336   writeVar(Sym.IncludeHeaders.size(), OS);
337   for (const auto &Include : Sym.IncludeHeaders)
338     WriteInclude(Include);
339 }
340 
readSymbol(Reader & Data,llvm::ArrayRef<llvm::StringRef> Strings,SymbolOrigin Origin)341 Symbol readSymbol(Reader &Data, llvm::ArrayRef<llvm::StringRef> Strings,
342                   SymbolOrigin Origin) {
343   Symbol Sym;
344   Sym.ID = Data.consumeID();
345   Sym.SymInfo.Kind = static_cast<index::SymbolKind>(Data.consume8());
346   Sym.SymInfo.Lang = static_cast<index::SymbolLanguage>(Data.consume8());
347   Sym.Name = Data.consumeString(Strings);
348   Sym.Scope = Data.consumeString(Strings);
349   Sym.TemplateSpecializationArgs = Data.consumeString(Strings);
350   Sym.Definition = readLocation(Data, Strings);
351   Sym.CanonicalDeclaration = readLocation(Data, Strings);
352   Sym.References = Data.consumeVar();
353   Sym.Flags = static_cast<Symbol::SymbolFlag>(Data.consume8());
354   Sym.Origin = Origin;
355   Sym.Signature = Data.consumeString(Strings);
356   Sym.CompletionSnippetSuffix = Data.consumeString(Strings);
357   Sym.Documentation = Data.consumeString(Strings);
358   Sym.ReturnType = Data.consumeString(Strings);
359   Sym.Type = Data.consumeString(Strings);
360   if (!Data.consumeSize(Sym.IncludeHeaders))
361     return Sym;
362   for (auto &I : Sym.IncludeHeaders) {
363     I.IncludeHeader = Data.consumeString(Strings);
364     I.References = Data.consumeVar();
365   }
366   return Sym;
367 }
368 
369 // REFS ENCODING
370 // A refs section has data grouped by Symbol. Each symbol has:
371 //  - SymbolID: 8 bytes
372 //  - NumRefs: varint
373 //  - Ref[NumRefs]
374 // Fields of Ref are encoded in turn, see implementation.
375 
writeRefs(const SymbolID & ID,llvm::ArrayRef<Ref> Refs,const StringTableOut & Strings,llvm::raw_ostream & OS)376 void writeRefs(const SymbolID &ID, llvm::ArrayRef<Ref> Refs,
377                const StringTableOut &Strings, llvm::raw_ostream &OS) {
378   OS << ID.raw();
379   writeVar(Refs.size(), OS);
380   for (const auto &Ref : Refs) {
381     OS.write(static_cast<unsigned char>(Ref.Kind));
382     writeLocation(Ref.Location, Strings, OS);
383     OS << Ref.Container.raw();
384   }
385 }
386 
387 std::pair<SymbolID, std::vector<Ref>>
readRefs(Reader & Data,llvm::ArrayRef<llvm::StringRef> Strings)388 readRefs(Reader &Data, llvm::ArrayRef<llvm::StringRef> Strings) {
389   std::pair<SymbolID, std::vector<Ref>> Result;
390   Result.first = Data.consumeID();
391   if (!Data.consumeSize(Result.second))
392     return Result;
393   for (auto &Ref : Result.second) {
394     Ref.Kind = static_cast<RefKind>(Data.consume8());
395     Ref.Location = readLocation(Data, Strings);
396     Ref.Container = Data.consumeID();
397   }
398   return Result;
399 }
400 
401 // RELATIONS ENCODING
402 // A relations section is a flat list of relations. Each relation has:
403 //  - SymbolID (subject): 8 bytes
404 //  - relation kind (predicate): 1 byte
405 //  - SymbolID (object): 8 bytes
406 // In the future, we might prefer a packed representation if the need arises.
407 
writeRelation(const Relation & R,llvm::raw_ostream & OS)408 void writeRelation(const Relation &R, llvm::raw_ostream &OS) {
409   OS << R.Subject.raw();
410   OS.write(static_cast<uint8_t>(R.Predicate));
411   OS << R.Object.raw();
412 }
413 
readRelation(Reader & Data)414 Relation readRelation(Reader &Data) {
415   SymbolID Subject = Data.consumeID();
416   RelationKind Predicate = static_cast<RelationKind>(Data.consume8());
417   SymbolID Object = Data.consumeID();
418   return {Subject, Predicate, Object};
419 }
420 
421 struct InternedCompileCommand {
422   llvm::StringRef Directory;
423   std::vector<llvm::StringRef> CommandLine;
424 };
425 
writeCompileCommand(const InternedCompileCommand & Cmd,const StringTableOut & Strings,llvm::raw_ostream & CmdOS)426 void writeCompileCommand(const InternedCompileCommand &Cmd,
427                          const StringTableOut &Strings,
428                          llvm::raw_ostream &CmdOS) {
429   writeVar(Strings.index(Cmd.Directory), CmdOS);
430   writeVar(Cmd.CommandLine.size(), CmdOS);
431   for (llvm::StringRef C : Cmd.CommandLine)
432     writeVar(Strings.index(C), CmdOS);
433 }
434 
435 InternedCompileCommand
readCompileCommand(Reader CmdReader,llvm::ArrayRef<llvm::StringRef> Strings)436 readCompileCommand(Reader CmdReader, llvm::ArrayRef<llvm::StringRef> Strings) {
437   InternedCompileCommand Cmd;
438   Cmd.Directory = CmdReader.consumeString(Strings);
439   if (!CmdReader.consumeSize(Cmd.CommandLine))
440     return Cmd;
441   for (llvm::StringRef &C : Cmd.CommandLine)
442     C = CmdReader.consumeString(Strings);
443   return Cmd;
444 }
445 
446 // FILE ENCODING
447 // A file is a RIFF chunk with type 'CdIx'.
448 // It contains the sections:
449 //   - meta: version number
450 //   - srcs: information related to include graph
451 //   - stri: string table
452 //   - symb: symbols
453 //   - refs: references to symbols
454 
455 // The current versioning scheme is simple - non-current versions are rejected.
456 // If you make a breaking change, bump this version number to invalidate stored
457 // data. Later we may want to support some backward compatibility.
458 constexpr static uint32_t Version = 17;
459 
readRIFF(llvm::StringRef Data,SymbolOrigin Origin)460 llvm::Expected<IndexFileIn> readRIFF(llvm::StringRef Data,
461                                      SymbolOrigin Origin) {
462   auto RIFF = riff::readFile(Data);
463   if (!RIFF)
464     return RIFF.takeError();
465   if (RIFF->Type != riff::fourCC("CdIx"))
466     return error("wrong RIFF filetype: {0}", riff::fourCCStr(RIFF->Type));
467   llvm::StringMap<llvm::StringRef> Chunks;
468   for (const auto &Chunk : RIFF->Chunks)
469     Chunks.try_emplace(llvm::StringRef(Chunk.ID.data(), Chunk.ID.size()),
470                        Chunk.Data);
471 
472   if (!Chunks.count("meta"))
473     return error("missing meta chunk");
474   Reader Meta(Chunks.lookup("meta"));
475   auto SeenVersion = Meta.consume32();
476   if (SeenVersion != Version)
477     return error("wrong version: want {0}, got {1}", Version, SeenVersion);
478 
479   // meta chunk is checked above, as we prefer the "version mismatch" error.
480   for (llvm::StringRef RequiredChunk : {"stri"})
481     if (!Chunks.count(RequiredChunk))
482       return error("missing required chunk {0}", RequiredChunk);
483 
484   auto Strings = readStringTable(Chunks.lookup("stri"));
485   if (!Strings)
486     return Strings.takeError();
487 
488   IndexFileIn Result;
489   if (Chunks.count("srcs")) {
490     Reader SrcsReader(Chunks.lookup("srcs"));
491     Result.Sources.emplace();
492     while (!SrcsReader.eof()) {
493       auto IGN = readIncludeGraphNode(SrcsReader, Strings->Strings);
494       auto Entry = Result.Sources->try_emplace(IGN.URI).first;
495       Entry->getValue() = std::move(IGN);
496       // We change all the strings inside the structure to point at the keys in
497       // the map, since it is the only copy of the string that's going to live.
498       Entry->getValue().URI = Entry->getKey();
499       for (auto &Include : Entry->getValue().DirectIncludes)
500         Include = Result.Sources->try_emplace(Include).first->getKey();
501     }
502     if (SrcsReader.err())
503       return error("malformed or truncated include uri");
504   }
505 
506   if (Chunks.count("symb")) {
507     Reader SymbolReader(Chunks.lookup("symb"));
508     SymbolSlab::Builder Symbols;
509     while (!SymbolReader.eof())
510       Symbols.insert(readSymbol(SymbolReader, Strings->Strings, Origin));
511     if (SymbolReader.err())
512       return error("malformed or truncated symbol");
513     Result.Symbols = std::move(Symbols).build();
514   }
515   if (Chunks.count("refs")) {
516     Reader RefsReader(Chunks.lookup("refs"));
517     RefSlab::Builder Refs;
518     while (!RefsReader.eof()) {
519       auto RefsBundle = readRefs(RefsReader, Strings->Strings);
520       for (const auto &Ref : RefsBundle.second) // FIXME: bulk insert?
521         Refs.insert(RefsBundle.first, Ref);
522     }
523     if (RefsReader.err())
524       return error("malformed or truncated refs");
525     Result.Refs = std::move(Refs).build();
526   }
527   if (Chunks.count("rela")) {
528     Reader RelationsReader(Chunks.lookup("rela"));
529     RelationSlab::Builder Relations;
530     while (!RelationsReader.eof())
531       Relations.insert(readRelation(RelationsReader));
532     if (RelationsReader.err())
533       return error("malformed or truncated relations");
534     Result.Relations = std::move(Relations).build();
535   }
536   if (Chunks.count("cmdl")) {
537     Reader CmdReader(Chunks.lookup("cmdl"));
538     InternedCompileCommand Cmd =
539         readCompileCommand(CmdReader, Strings->Strings);
540     if (CmdReader.err())
541       return error("malformed or truncated commandline section");
542     Result.Cmd.emplace();
543     Result.Cmd->Directory = std::string(Cmd.Directory);
544     Result.Cmd->CommandLine.reserve(Cmd.CommandLine.size());
545     for (llvm::StringRef C : Cmd.CommandLine)
546       Result.Cmd->CommandLine.emplace_back(C);
547   }
548   return std::move(Result);
549 }
550 
551 template <class Callback>
visitStrings(IncludeGraphNode & IGN,const Callback & CB)552 void visitStrings(IncludeGraphNode &IGN, const Callback &CB) {
553   CB(IGN.URI);
554   for (llvm::StringRef &Include : IGN.DirectIncludes)
555     CB(Include);
556 }
557 
writeRIFF(const IndexFileOut & Data,llvm::raw_ostream & OS)558 void writeRIFF(const IndexFileOut &Data, llvm::raw_ostream &OS) {
559   assert(Data.Symbols && "An index file without symbols makes no sense!");
560   riff::File RIFF;
561   RIFF.Type = riff::fourCC("CdIx");
562 
563   llvm::SmallString<4> Meta;
564   {
565     llvm::raw_svector_ostream MetaOS(Meta);
566     write32(Version, MetaOS);
567   }
568   RIFF.Chunks.push_back({riff::fourCC("meta"), Meta});
569 
570   StringTableOut Strings;
571   std::vector<Symbol> Symbols;
572   for (const auto &Sym : *Data.Symbols) {
573     Symbols.emplace_back(Sym);
574     visitStrings(Symbols.back(),
575                  [&](llvm::StringRef &S) { Strings.intern(S); });
576   }
577   std::vector<IncludeGraphNode> Sources;
578   if (Data.Sources)
579     for (const auto &Source : *Data.Sources) {
580       Sources.push_back(Source.getValue());
581       visitStrings(Sources.back(),
582                    [&](llvm::StringRef &S) { Strings.intern(S); });
583     }
584 
585   std::vector<std::pair<SymbolID, std::vector<Ref>>> Refs;
586   if (Data.Refs) {
587     for (const auto &Sym : *Data.Refs) {
588       Refs.emplace_back(Sym);
589       for (auto &Ref : Refs.back().second) {
590         llvm::StringRef File = Ref.Location.FileURI;
591         Strings.intern(File);
592         Ref.Location.FileURI = File.data();
593       }
594     }
595   }
596 
597   std::vector<Relation> Relations;
598   if (Data.Relations) {
599     for (const auto &Relation : *Data.Relations) {
600       Relations.emplace_back(Relation);
601       // No strings to be interned in relations.
602     }
603   }
604 
605   InternedCompileCommand InternedCmd;
606   if (Data.Cmd) {
607     InternedCmd.CommandLine.reserve(Data.Cmd->CommandLine.size());
608     InternedCmd.Directory = Data.Cmd->Directory;
609     Strings.intern(InternedCmd.Directory);
610     for (llvm::StringRef C : Data.Cmd->CommandLine) {
611       InternedCmd.CommandLine.emplace_back(C);
612       Strings.intern(InternedCmd.CommandLine.back());
613     }
614   }
615 
616   std::string StringSection;
617   {
618     llvm::raw_string_ostream StringOS(StringSection);
619     Strings.finalize(StringOS);
620   }
621   RIFF.Chunks.push_back({riff::fourCC("stri"), StringSection});
622 
623   std::string SymbolSection;
624   {
625     llvm::raw_string_ostream SymbolOS(SymbolSection);
626     for (const auto &Sym : Symbols)
627       writeSymbol(Sym, Strings, SymbolOS);
628   }
629   RIFF.Chunks.push_back({riff::fourCC("symb"), SymbolSection});
630 
631   std::string RefsSection;
632   if (Data.Refs) {
633     {
634       llvm::raw_string_ostream RefsOS(RefsSection);
635       for (const auto &Sym : Refs)
636         writeRefs(Sym.first, Sym.second, Strings, RefsOS);
637     }
638     RIFF.Chunks.push_back({riff::fourCC("refs"), RefsSection});
639   }
640 
641   std::string RelationSection;
642   if (Data.Relations) {
643     {
644       llvm::raw_string_ostream RelationOS{RelationSection};
645       for (const auto &Relation : Relations)
646         writeRelation(Relation, RelationOS);
647     }
648     RIFF.Chunks.push_back({riff::fourCC("rela"), RelationSection});
649   }
650 
651   std::string SrcsSection;
652   {
653     {
654       llvm::raw_string_ostream SrcsOS(SrcsSection);
655       for (const auto &SF : Sources)
656         writeIncludeGraphNode(SF, Strings, SrcsOS);
657     }
658     RIFF.Chunks.push_back({riff::fourCC("srcs"), SrcsSection});
659   }
660 
661   std::string CmdlSection;
662   if (Data.Cmd) {
663     {
664       llvm::raw_string_ostream CmdOS(CmdlSection);
665       writeCompileCommand(InternedCmd, Strings, CmdOS);
666     }
667     RIFF.Chunks.push_back({riff::fourCC("cmdl"), CmdlSection});
668   }
669 
670   OS << RIFF;
671 }
672 
673 } // namespace
674 
675 // Defined in YAMLSerialization.cpp.
676 void writeYAML(const IndexFileOut &, llvm::raw_ostream &);
677 llvm::Expected<IndexFileIn> readYAML(llvm::StringRef, SymbolOrigin Origin);
678 
operator <<(llvm::raw_ostream & OS,const IndexFileOut & O)679 llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const IndexFileOut &O) {
680   switch (O.Format) {
681   case IndexFileFormat::RIFF:
682     writeRIFF(O, OS);
683     break;
684   case IndexFileFormat::YAML:
685     writeYAML(O, OS);
686     break;
687   }
688   return OS;
689 }
690 
readIndexFile(llvm::StringRef Data,SymbolOrigin Origin)691 llvm::Expected<IndexFileIn> readIndexFile(llvm::StringRef Data,
692                                           SymbolOrigin Origin) {
693   if (Data.startswith("RIFF")) {
694     return readRIFF(Data, Origin);
695   }
696   if (auto YAMLContents = readYAML(Data, Origin)) {
697     return std::move(*YAMLContents);
698   } else {
699     return error("Not a RIFF file and failed to parse as YAML: {0}",
700                  YAMLContents.takeError());
701   }
702 }
703 
loadIndex(llvm::StringRef SymbolFilename,SymbolOrigin Origin,bool UseDex)704 std::unique_ptr<SymbolIndex> loadIndex(llvm::StringRef SymbolFilename,
705                                        SymbolOrigin Origin, bool UseDex) {
706   trace::Span OverallTracer("LoadIndex");
707   auto Buffer = llvm::MemoryBuffer::getFile(SymbolFilename);
708   if (!Buffer) {
709     elog("Can't open {0}: {1}", SymbolFilename, Buffer.getError().message());
710     return nullptr;
711   }
712 
713   SymbolSlab Symbols;
714   RefSlab Refs;
715   RelationSlab Relations;
716   {
717     trace::Span Tracer("ParseIndex");
718     if (auto I = readIndexFile(Buffer->get()->getBuffer(), Origin)) {
719       if (I->Symbols)
720         Symbols = std::move(*I->Symbols);
721       if (I->Refs)
722         Refs = std::move(*I->Refs);
723       if (I->Relations)
724         Relations = std::move(*I->Relations);
725     } else {
726       elog("Bad index file: {0}", I.takeError());
727       return nullptr;
728     }
729   }
730 
731   size_t NumSym = Symbols.size();
732   size_t NumRefs = Refs.numRefs();
733   size_t NumRelations = Relations.size();
734 
735   trace::Span Tracer("BuildIndex");
736   auto Index = UseDex ? dex::Dex::build(std::move(Symbols), std::move(Refs),
737                                         std::move(Relations))
738                       : MemIndex::build(std::move(Symbols), std::move(Refs),
739                                         std::move(Relations));
740   vlog("Loaded {0} from {1} with estimated memory usage {2} bytes\n"
741        "  - number of symbols: {3}\n"
742        "  - number of refs: {4}\n"
743        "  - number of relations: {5}",
744        UseDex ? "Dex" : "MemIndex", SymbolFilename,
745        Index->estimateMemoryUsage(), NumSym, NumRefs, NumRelations);
746   return Index;
747 }
748 
749 } // namespace clangd
750 } // namespace clang
751