1 //===- bolt/Profile/DataAggregator.h - Perf data aggregator -----*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This family of functions reads profile data written by perf record,
10 // aggregates it and then writes it back to an output file.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #ifndef BOLT_PROFILE_DATA_AGGREGATOR_H
15 #define BOLT_PROFILE_DATA_AGGREGATOR_H
16 
17 #include "bolt/Profile/DataReader.h"
18 #include "llvm/ADT/StringRef.h"
19 #include "llvm/Support/Error.h"
20 #include "llvm/Support/Program.h"
21 #include <unordered_map>
22 
23 namespace llvm {
24 namespace bolt {
25 
26 class BinaryFunction;
27 class BinaryContext;
28 class BoltAddressTranslation;
29 
30 /// DataAggregator inherits all parsing logic from DataReader as well as
31 /// its data structures used to represent aggregated profile data in memory.
32 ///
33 /// The aggregator works by dispatching two separate perf-script jobs that
34 /// read perf samples and perf task annotations. Later, we read the output
35 /// files to extract information about which PID was used for this binary.
36 /// With the PID, we filter the samples and extract all LBR entries.
37 ///
38 /// To aggregate LBR entries, we rely on a BinaryFunction map to locate the
39 /// original function where the event happened. Then, we convert a raw address
40 /// to an offset relative to the start of this function and aggregate branch
41 /// information for each function.
42 ///
43 /// This must be coordinated with RewriteInstance so we have BinaryFunctions in
44 /// State::Disassembled. After this state, BinaryFunction will drop the
45 /// instruction map with original addresses we rely on to validate the traces
46 /// found in the LBR.
47 ///
48 /// The last step is to write the aggregated data to disk in the output file
49 /// specified by the user.
50 class DataAggregator : public DataReader {
51 public:
DataAggregator(StringRef Filename)52   explicit DataAggregator(StringRef Filename) : DataReader(Filename) {
53     start();
54   }
55 
56   ~DataAggregator();
57 
getReaderName()58   StringRef getReaderName() const override { return "perf data aggregator"; }
59 
isTrustedSource()60   bool isTrustedSource() const override { return true; }
61 
62   Error preprocessProfile(BinaryContext &BC) override;
63 
readProfilePreCFG(BinaryContext & BC)64   Error readProfilePreCFG(BinaryContext &BC) override {
65     return Error::success();
66   }
67 
68   Error readProfile(BinaryContext &BC) override;
69 
70   bool mayHaveProfileData(const BinaryFunction &BF) override;
71 
72   /// Set Bolt Address Translation Table when processing samples collected in
73   /// bolted binaries
setBAT(BoltAddressTranslation * B)74   void setBAT(BoltAddressTranslation *B) override { BAT = B; }
75 
76   /// Check whether \p FileName is a perf.data file
77   static bool checkPerfDataMagic(StringRef FileName);
78 
79 private:
80   struct PerfBranchSample {
81     SmallVector<LBREntry, 32> LBR;
82     uint64_t PC;
83   };
84 
85   struct PerfBasicSample {
86     StringRef EventName;
87     uint64_t PC;
88   };
89 
90   struct PerfMemSample {
91     uint64_t PC;
92     uint64_t Addr;
93   };
94 
95   /// Used for parsing specific pre-aggregated input files.
96   struct AggregatedLBREntry {
97     enum Type : char { BRANCH = 0, FT, FT_EXTERNAL_ORIGIN };
98     Location From;
99     Location To;
100     uint64_t Count;
101     uint64_t Mispreds;
102     Type EntryType;
103   };
104 
105   struct Trace {
106     uint64_t From;
107     uint64_t To;
TraceTrace108     Trace(uint64_t From, uint64_t To) : From(From), To(To) {}
109     bool operator==(const Trace &Other) const {
110       return From == Other.From && To == Other.To;
111     }
112   };
113 
114   struct TraceHash {
operatorTraceHash115     size_t operator()(const Trace &L) const {
116       return std::hash<uint64_t>()(L.From << 32 | L.To);
117     }
118   };
119 
120   struct FTInfo {
121     uint64_t InternCount{0};
122     uint64_t ExternCount{0};
123   };
124 
125   struct BranchInfo {
126     uint64_t TakenCount{0};
127     uint64_t MispredCount{0};
128   };
129 
130   /// Intermediate storage for profile data. We save the results of parsing
131   /// and use them later for processing and assigning profile.
132   std::unordered_map<Trace, BranchInfo, TraceHash> BranchLBRs;
133   std::unordered_map<Trace, FTInfo, TraceHash> FallthroughLBRs;
134   std::vector<AggregatedLBREntry> AggregatedLBRs;
135   std::unordered_map<uint64_t, uint64_t> BasicSamples;
136   std::vector<PerfMemSample> MemSamples;
137 
clear(T & Container)138   template <typename T> void clear(T &Container) {
139     T TempContainer;
140     TempContainer.swap(Container);
141   }
142 
143   /// Perf utility full path name
144   std::string PerfPath;
145 
146   /// Perf process spawning bookkeeping
147   struct PerfProcessInfo {
148     bool IsFinished{false};
149     sys::ProcessInfo PI;
150     SmallVector<char, 256> StdoutPath;
151     SmallVector<char, 256> StderrPath;
152   };
153 
154   /// Process info for spawned processes
155   PerfProcessInfo MainEventsPPI;
156   PerfProcessInfo MemEventsPPI;
157   PerfProcessInfo MMapEventsPPI;
158   PerfProcessInfo TaskEventsPPI;
159 
160   /// Kernel VM starts at fixed based address
161   /// https://www.kernel.org/doc/Documentation/x86/x86_64/mm.txt
162   static constexpr uint64_t KernelBaseAddr = 0xffff800000000000;
163 
164   /// Current list of created temporary files
165   std::vector<std::string> TempFiles;
166 
167   /// Name of the binary with matching build-id from perf.data if different
168   /// from the file name in BC.
169   std::string BuildIDBinaryName;
170 
171   /// Memory map info for a single file as recorded in perf.data
172   struct MMapInfo {
173     uint64_t BaseAddress{0}; /// Base address of the mapped binary.
174     uint64_t MMapAddress{0}; /// Address of the executable segment.
175     uint64_t Size{0};        /// Size of the mapping.
176     uint64_t Offset{0};      /// File offset of the mapped segment.
177     int32_t PID{-1};         /// Process ID.
178     bool Forked{false};      /// Was the process forked?
179     uint64_t Time{0ULL};     /// Time in micro seconds.
180   };
181 
182   /// Per-PID map info for the binary
183   std::unordered_map<uint64_t, MMapInfo> BinaryMMapInfo;
184 
185   /// Fork event info
186   struct ForkInfo {
187     int32_t ParentPID;
188     int32_t ChildPID;
189     uint64_t Time{0ULL};
190   };
191 
192   /// References to core BOLT data structures
193   BinaryContext *BC{nullptr};
194 
195   BoltAddressTranslation *BAT{nullptr};
196 
197   /// Update function execution profile with a recorded trace.
198   /// A trace is region of code executed between two LBR entries supplied in
199   /// execution order.
200   ///
201   /// Return true if the trace is valid, false otherwise.
202   bool recordTrace(
203       BinaryFunction &BF, const LBREntry &First, const LBREntry &Second,
204       uint64_t Count = 1,
205       SmallVector<std::pair<uint64_t, uint64_t>, 16> *Branches = nullptr) const;
206 
207   /// Return a vector of offsets corresponding to a trace in a function
208   /// (see recordTrace() above).
209   Optional<SmallVector<std::pair<uint64_t, uint64_t>, 16>>
210   getFallthroughsInTrace(BinaryFunction &BF, const LBREntry &First,
211                          const LBREntry &Second, uint64_t Count = 1) const;
212 
213   /// Record external entry into the function \p BF.
214   ///
215   /// Return true if the entry is valid, false otherwise.
216   bool recordEntry(BinaryFunction &BF, uint64_t To, bool Mispred,
217                    uint64_t Count = 1) const;
218 
219   /// Record exit from the function \p BF via a call or return.
220   ///
221   /// Return true if the exit point is valid, false otherwise.
222   bool recordExit(BinaryFunction &BF, uint64_t From, bool Mispred,
223                   uint64_t Count = 1) const;
224 
225   /// Aggregation statistics
226   uint64_t NumInvalidTraces{0};
227   uint64_t NumLongRangeTraces{0};
228   uint64_t NumColdSamples{0};
229 
230   /// Looks into system PATH for Linux Perf and set up the aggregator to use it
231   void findPerfExecutable();
232 
233   /// Launch a perf subprocess with given args and save output for later
234   /// parsing.
235   void launchPerfProcess(StringRef Name, PerfProcessInfo &PPI,
236                          const char *ArgsString, bool Wait);
237 
238   /// Delete all temporary files created to hold the output generated by spawned
239   /// subprocesses during the aggregation job
240   void deleteTempFiles();
241 
242   // Semantic pass helpers
243 
244   /// Look up which function contains an address by using out map of
245   /// disassembled BinaryFunctions
246   BinaryFunction *getBinaryFunctionContainingAddress(uint64_t Address) const;
247 
248   /// Retrieve the location name to be used for samples recorded in \p Func.
249   /// If doing BAT translation, link cold parts to the hot part  names (used by
250   /// the original binary).  \p Count specifies how many samples were recorded
251   /// at that location, so we can tally total activity in cold areas if we are
252   /// dealing with profiling data collected in a bolted binary. For LBRs,
253   /// \p Count should only be used for the source of the branch to avoid
254   /// counting cold activity twice (one for source and another for destination).
255   StringRef getLocationName(BinaryFunction &Func, uint64_t Count);
256 
257   /// Semantic actions - parser hooks to interpret parsed perf samples
258   /// Register a sample (non-LBR mode), i.e. a new hit at \p Address
259   bool doSample(BinaryFunction &Func, const uint64_t Address, uint64_t Count);
260 
261   /// Register an intraprocedural branch \p Branch.
262   bool doIntraBranch(BinaryFunction &Func, uint64_t From, uint64_t To,
263                      uint64_t Count, uint64_t Mispreds);
264 
265   /// Register an interprocedural branch from \p FromFunc to \p ToFunc with
266   /// offsets \p From and \p To, respectively.
267   bool doInterBranch(BinaryFunction *FromFunc, BinaryFunction *ToFunc,
268                      uint64_t From, uint64_t To, uint64_t Count,
269                      uint64_t Mispreds);
270 
271   /// Register a \p Branch.
272   bool doBranch(uint64_t From, uint64_t To, uint64_t Count, uint64_t Mispreds);
273 
274   /// Register a trace between two LBR entries supplied in execution order.
275   bool doTrace(const LBREntry &First, const LBREntry &Second,
276                uint64_t Count = 1);
277 
278   /// Parser helpers
279   /// Return false if we exhausted our parser buffer and finished parsing
280   /// everything
hasData()281   bool hasData() const { return !ParsingBuf.empty(); }
282 
283   /// Print heat map based on LBR samples.
284   std::error_code printLBRHeatMap();
285 
286   /// Parse a single perf sample containing a PID associated with a sequence of
287   /// LBR entries. If the PID does not correspond to the binary we are looking
288   /// for, return std::errc::no_such_process. If other parsing errors occur,
289   /// return the error. Otherwise, return the parsed sample.
290   ErrorOr<PerfBranchSample> parseBranchSample();
291 
292   /// Parse a single perf sample containing a PID associated with an event name
293   /// and a PC
294   ErrorOr<PerfBasicSample> parseBasicSample();
295 
296   /// Parse a single perf sample containing a PID associated with an IP and
297   /// address.
298   ErrorOr<PerfMemSample> parseMemSample();
299 
300   /// Parse pre-aggregated LBR samples created by an external tool
301   ErrorOr<AggregatedLBREntry> parseAggregatedLBREntry();
302 
303   /// Parse either buildid:offset or just offset, representing a location in the
304   /// binary. Used exclusevely for pre-aggregated LBR samples.
305   ErrorOr<Location> parseLocationOrOffset();
306 
307   /// Check if a field separator is the next char to parse and, if yes, consume
308   /// it and return true
309   bool checkAndConsumeFS();
310 
311   /// Consume the entire line
312   void consumeRestOfLine();
313 
314   /// Parse a single LBR entry as output by perf script -Fbrstack
315   ErrorOr<LBREntry> parseLBREntry();
316 
317   /// Parse and pre-aggregate branch events.
318   std::error_code parseBranchEvents();
319 
320   /// Process all branch events.
321   void processBranchEvents();
322 
323   /// This member function supports generating data for AutoFDO LLVM tools.
324   std::error_code writeAutoFDOData(StringRef OutputFilename);
325 
326   /// Parse the full output generated by perf script to report non-LBR samples.
327   std::error_code parseBasicEvents();
328 
329   /// Process non-LBR events.
330   void processBasicEvents();
331 
332   /// Parse the full output generated by perf script to report memory events.
333   std::error_code parseMemEvents();
334 
335   /// Process parsed memory events profile.
336   void processMemEvents();
337 
338   /// Parse a single line of a PERF_RECORD_MMAP2 event looking for a mapping
339   /// between the binary name and its memory layout in a process with a given
340   /// PID.
341   /// On success return a <FileName, MMapInfo> pair.
342   ErrorOr<std::pair<StringRef, MMapInfo>> parseMMapEvent();
343 
344   /// Parse PERF_RECORD_FORK event.
345   Optional<ForkInfo> parseForkEvent();
346 
347   /// Parse 'PERF_RECORD_COMM exec'. Don't consume the string.
348   Optional<int32_t> parseCommExecEvent();
349 
350   /// Parse the full output generated by `perf script --show-mmap-events`
351   /// to generate mapping between binary files and their memory mappings for
352   /// all PIDs.
353   std::error_code parseMMapEvents();
354 
355   /// Parse output of `perf script --show-task-events`, and forked processes
356   /// to the set of tracked PIDs.
357   std::error_code parseTaskEvents();
358 
359   /// Parse a single pair of binary full path and associated build-id
360   Optional<std::pair<StringRef, StringRef>> parseNameBuildIDPair();
361 
362   /// Coordinate reading and parsing of pre-aggregated file
363   ///
364   /// The regular perf2bolt aggregation job is to read perf output directly.
365   /// However, if the data is coming from a database instead of perf, one could
366   /// write a query to produce a pre-aggregated file. This function deals with
367   /// this case.
368   ///
369   /// The pre-aggregated file contains aggregated LBR data, but without binary
370   /// knowledge. BOLT will parse it and, using information from the disassembled
371   /// binary, augment it with fall-through edge frequency information. After
372   /// this step is finished, this data can be either written to disk to be
373   /// consumed by BOLT later, or can be used by BOLT immediately if kept in
374   /// memory.
375   ///
376   /// File format syntax:
377   /// {B|F|f} [<start_id>:]<start_offset> [<end_id>:]<end_offset> <count>
378   ///       [<mispred_count>]
379   ///
380   /// B - indicates an aggregated branch
381   /// F - an aggregated fall-through
382   /// f - an aggregated fall-through with external origin - used to disambiguate
383   ///       between a return hitting a basic block head and a regular internal
384   ///       jump to the block
385   ///
386   /// <start_id> - build id of the object containing the start address. We can
387   /// skip it for the main binary and use "X" for an unknown object. This will
388   /// save some space and facilitate human parsing.
389   ///
390   /// <start_offset> - hex offset from the object base load address (0 for the
391   /// main executable unless it's PIE) to the start address.
392   ///
393   /// <end_id>, <end_offset> - same for the end address.
394   ///
395   /// <count> - total aggregated count of the branch or a fall-through.
396   ///
397   /// <mispred_count> - the number of times the branch was mispredicted.
398   /// Omitted for fall-throughs.
399   ///
400   /// Example:
401   /// F 41be50 41be50 3
402   /// F 41be90 41be90 4
403   /// B 4b1942 39b57f0 3 0
404   /// B 4b196f 4b19e0 2 0
405   void parsePreAggregated();
406 
407   /// Parse the full output of pre-aggregated LBR samples generated by
408   /// an external tool.
409   std::error_code parsePreAggregatedLBRSamples();
410 
411   /// Process parsed pre-aggregated data.
412   void processPreAggregated();
413 
414   /// If \p Address falls into the binary address space based on memory
415   /// mapping info \p MMI, then adjust it for further processing by subtracting
416   /// the base load address. External addresses, i.e. addresses that do not
417   /// correspond to the binary allocated address space, are adjusted to avoid
418   /// conflicts.
adjustAddress(uint64_t & Address,const MMapInfo & MMI)419   void adjustAddress(uint64_t &Address, const MMapInfo &MMI) const {
420     if (Address >= MMI.MMapAddress && Address < MMI.MMapAddress + MMI.Size) {
421       Address -= MMI.BaseAddress;
422     } else if (Address < MMI.Size) {
423       // Make sure the address is not treated as belonging to the binary.
424       Address = (-1ULL);
425     }
426   }
427 
428   /// Adjust addresses in \p LBR entry.
adjustLBR(LBREntry & LBR,const MMapInfo & MMI)429   void adjustLBR(LBREntry &LBR, const MMapInfo &MMI) const {
430     adjustAddress(LBR.From, MMI);
431     adjustAddress(LBR.To, MMI);
432   }
433 
434   /// Ignore kernel/user transition LBR if requested
435   bool ignoreKernelInterrupt(LBREntry &LBR) const;
436 
437   /// Populate functions in \p BC with profile.
438   void processProfile(BinaryContext &BC);
439 
440   /// Start an aggregation job asynchronously.
441   void start();
442 
443   /// Returns true if this aggregation job is using a translation table to
444   /// remap samples collected on binaries already processed by BOLT.
usesBAT()445   bool usesBAT() const { return BAT; }
446 
447   /// Force all subprocesses to stop and cancel aggregation
448   void abort();
449 
450   /// Dump data structures into a file readable by llvm-bolt
451   std::error_code writeAggregatedFile(StringRef OutputFilename) const;
452 
453   /// Filter out binaries based on PID
454   void filterBinaryMMapInfo();
455 
456   /// If we have a build-id available for the input file, use it to assist
457   /// matching profile to a binary.
458   ///
459   /// If the binary name changed after profile collection, use build-id
460   /// to get the proper name in perf data when build-ids are available.
461   /// If \p FileBuildID has no match, then issue an error and exit.
462   void processFileBuildID(StringRef FileBuildID);
463 
464   /// Debugging dump methods
465   void dump() const;
466   void dump(const LBREntry &LBR) const;
467   void dump(const PerfBranchSample &Sample) const;
468   void dump(const PerfMemSample &Sample) const;
469 
470 public:
471   /// If perf.data was collected without build ids, the buildid-list may contain
472   /// incomplete entries. Return true if the buffer containing
473   /// "perf buildid-list" output has only valid entries and is non- empty.
474   /// Return false otherwise.
475   bool hasAllBuildIDs();
476 
477   /// Parse the output generated by "perf buildid-list" to extract build-ids
478   /// and return a file name matching a given \p FileBuildID.
479   Optional<StringRef> getFileNameForBuildID(StringRef FileBuildID);
480 };
481 } // namespace bolt
482 } // namespace llvm
483 
484 #endif
485