//===-- Target.cpp ----------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "../Target.h"

#include "../Error.h"
#include "../ParallelSnippetGenerator.h"
#include "../SerialSnippetGenerator.h"
#include "../SnippetGenerator.h"
#include "MCTargetDesc/X86BaseInfo.h"
#include "MCTargetDesc/X86MCTargetDesc.h"
#include "X86.h"
#include "X86Counter.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/Support/Errc.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/Host.h"

#include <memory>
#include <string>
#include <vector>
#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
#include <immintrin.h>
#include <intrin.h>
#endif
#if defined(__x86_64__) && defined(_MSC_VER)
#include <float.h> // For _clearfp in ~X86SavedState().
#endif

namespace llvm {
namespace exegesis {

static cl::OptionCategory
    BenchmarkOptions("llvm-exegesis benchmark x86-options");

// If a positive value is specified, we are going to use the LBR in
// latency-mode.
//
// Note:
//  -  A small value is preferred, but too low a value could result in
//     throttling.
//  -  A prime number is preferred to avoid always skipping certain blocks.
//
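// Illustrative invocation (flags as defined by llvm-exegesis; 199 is an
// arbitrary small prime):
//   llvm-exegesis -mode=latency -repetition-mode=loop \
//     -x86-lbr-sample-period=199 -opcode-name=ADD64rr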
static cl::opt<unsigned> LbrSamplingPeriod(
    "x86-lbr-sample-period",
    cl::desc("The sample period (nbranches/sample), used for LBR sampling"),
    cl::cat(BenchmarkOptions), cl::init(0));

// FIXME: Validate that repetition-mode is loop if LBR is requested.

// Returns a non-null reason if we cannot handle the memory references in this
// instruction.
static const char *isInvalidMemoryInstr(const Instruction &Instr) {
  switch (Instr.Description.TSFlags & X86II::FormMask) {
  default:
    return "Unknown FormMask value";
  // These have no memory access.
  case X86II::Pseudo:
  case X86II::RawFrm:
  case X86II::AddCCFrm:
  case X86II::PrefixByte:
  case X86II::MRMDestReg:
  case X86II::MRMSrcReg:
  case X86II::MRMSrcReg4VOp3:
  case X86II::MRMSrcRegOp4:
  case X86II::MRMSrcRegCC:
  case X86II::MRMXrCC:
  case X86II::MRMr0:
  case X86II::MRMXr:
  case X86II::MRM0r:
  case X86II::MRM1r:
  case X86II::MRM2r:
  case X86II::MRM3r:
  case X86II::MRM4r:
  case X86II::MRM5r:
  case X86II::MRM6r:
  case X86II::MRM7r:
  case X86II::MRM0X:
  case X86II::MRM1X:
  case X86II::MRM2X:
  case X86II::MRM3X:
  case X86II::MRM4X:
  case X86II::MRM5X:
  case X86II::MRM6X:
  case X86II::MRM7X:
  case X86II::MRM_C0:
  case X86II::MRM_C1:
  case X86II::MRM_C2:
  case X86II::MRM_C3:
  case X86II::MRM_C4:
  case X86II::MRM_C5:
  case X86II::MRM_C6:
  case X86II::MRM_C7:
  case X86II::MRM_C8:
  case X86II::MRM_C9:
  case X86II::MRM_CA:
  case X86II::MRM_CB:
  case X86II::MRM_CC:
  case X86II::MRM_CD:
  case X86II::MRM_CE:
  case X86II::MRM_CF:
  case X86II::MRM_D0:
  case X86II::MRM_D1:
  case X86II::MRM_D2:
  case X86II::MRM_D3:
  case X86II::MRM_D4:
  case X86II::MRM_D5:
  case X86II::MRM_D6:
  case X86II::MRM_D7:
  case X86II::MRM_D8:
  case X86II::MRM_D9:
  case X86II::MRM_DA:
  case X86II::MRM_DB:
  case X86II::MRM_DC:
  case X86II::MRM_DD:
  case X86II::MRM_DE:
  case X86II::MRM_DF:
  case X86II::MRM_E0:
  case X86II::MRM_E1:
  case X86II::MRM_E2:
  case X86II::MRM_E3:
  case X86II::MRM_E4:
  case X86II::MRM_E5:
  case X86II::MRM_E6:
  case X86II::MRM_E7:
  case X86II::MRM_E8:
  case X86II::MRM_E9:
  case X86II::MRM_EA:
  case X86II::MRM_EB:
  case X86II::MRM_EC:
  case X86II::MRM_ED:
  case X86II::MRM_EE:
  case X86II::MRM_EF:
  case X86II::MRM_F0:
  case X86II::MRM_F1:
  case X86II::MRM_F2:
  case X86II::MRM_F3:
  case X86II::MRM_F4:
  case X86II::MRM_F5:
  case X86II::MRM_F6:
  case X86II::MRM_F7:
  case X86II::MRM_F8:
  case X86II::MRM_F9:
  case X86II::MRM_FA:
  case X86II::MRM_FB:
  case X86II::MRM_FC:
  case X86II::MRM_FD:
  case X86II::MRM_FE:
  case X86II::MRM_FF:
  case X86II::RawFrmImm8:
    return nullptr;
  case X86II::AddRegFrm:
    return (Instr.Description.Opcode == X86::POP16r ||
            Instr.Description.Opcode == X86::POP32r ||
            Instr.Description.Opcode == X86::PUSH16r ||
            Instr.Description.Opcode == X86::PUSH32r)
               ? "unsupported opcode: unsupported memory access"
               : nullptr;
  // These access memory and are handled.
  case X86II::MRMDestMem:
  case X86II::MRMSrcMem:
  case X86II::MRMSrcMem4VOp3:
  case X86II::MRMSrcMemOp4:
  case X86II::MRMSrcMemCC:
  case X86II::MRMXmCC:
  case X86II::MRMXm:
  case X86II::MRM0m:
  case X86II::MRM1m:
  case X86II::MRM2m:
  case X86II::MRM3m:
  case X86II::MRM4m:
  case X86II::MRM5m:
  case X86II::MRM6m:
  case X86II::MRM7m:
    return nullptr;
  // These access memory and are not handled yet.
  case X86II::RawFrmImm16:
  case X86II::RawFrmMemOffs:
  case X86II::RawFrmSrc:
  case X86II::RawFrmDst:
  case X86II::RawFrmDstSrc:
    return "unsupported opcode: non uniform memory access";
  }
}

// If the opcode is invalid, returns a pointer to a string literal indicating
// the reason. nullptr indicates a valid opcode.
static const char *isInvalidOpcode(const Instruction &Instr) {
  const auto OpcodeName = Instr.Name;
  if ((Instr.Description.TSFlags & X86II::FormMask) == X86II::Pseudo)
    return "unsupported opcode: pseudo instruction";
  if ((OpcodeName.startswith("POP") && !OpcodeName.startswith("POPCNT")) ||
      OpcodeName.startswith("PUSH") || OpcodeName.startswith("ADJCALLSTACK") ||
      OpcodeName.startswith("LEAVE"))
    return "unsupported opcode: Push/Pop/AdjCallStack/Leave";
  switch (Instr.Description.Opcode) {
  case X86::LFS16rm:
  case X86::LFS32rm:
  case X86::LFS64rm:
  case X86::LGS16rm:
  case X86::LGS32rm:
  case X86::LGS64rm:
  case X86::LSS16rm:
  case X86::LSS32rm:
  case X86::LSS64rm:
  case X86::SYSENTER:
    return "unsupported opcode";
  default:
    break;
  }
  if (const auto reason = isInvalidMemoryInstr(Instr))
    return reason;
  // We do not handle instructions with OPERAND_PCREL.
  for (const Operand &Op : Instr.Operands)
    if (Op.isExplicit() &&
        Op.getExplicitOperandInfo().OperandType == MCOI::OPERAND_PCREL)
      return "unsupported opcode: PC relative operand";
  // We do not handle second-form X87 instructions. We only handle first-form
  // ones (_Fp); see the comment in X86InstrFPStack.td.
  for (const Operand &Op : Instr.Operands)
    if (Op.isReg() && Op.isExplicit() &&
        Op.getExplicitOperandInfo().RegClass == X86::RSTRegClassID)
      return "unsupported second-form X87 instruction";
  return nullptr;
}

static unsigned getX86FPFlags(const Instruction &Instr) {
  return Instr.Description.TSFlags & X86II::FPTypeMask;
}

// Helper to fill a memory operand with a value.
static void setMemOp(InstructionTemplate &IT, int OpIdx,
                     const MCOperand &OpVal) {
  const auto Op = IT.getInstr().Operands[OpIdx];
  assert(Op.isExplicit() && "invalid memory pattern");
  IT.getValueFor(Op) = OpVal;
}

// Common (latency, uops) code for LEA templates. `RestrictDestRegs` restricts
// the set of candidate LEA destination registers, given the addressing base
// and index registers.
static Expected<std::vector<CodeTemplate>> generateLEATemplatesCommon(
    const Instruction &Instr, const BitVector &ForbiddenRegisters,
    const LLVMState &State, const SnippetGenerator::Options &Opts,
    std::function<void(unsigned, unsigned, BitVector &CandidateDestRegs)>
        RestrictDestRegs) {
  assert(Instr.Operands.size() == 6 && "invalid LEA");
  assert(X86II::getMemoryOperandNo(Instr.Description.TSFlags) == 1 &&
         "invalid LEA");

  constexpr const int kDestOp = 0;
  constexpr const int kBaseOp = 1;
  constexpr const int kIndexOp = 3;
  auto PossibleDestRegs =
      Instr.Operands[kDestOp].getRegisterAliasing().sourceBits();
  remove(PossibleDestRegs, ForbiddenRegisters);
  auto PossibleBaseRegs =
      Instr.Operands[kBaseOp].getRegisterAliasing().sourceBits();
  remove(PossibleBaseRegs, ForbiddenRegisters);
  auto PossibleIndexRegs =
      Instr.Operands[kIndexOp].getRegisterAliasing().sourceBits();
  remove(PossibleIndexRegs, ForbiddenRegisters);

  const auto &RegInfo = State.getRegInfo();
  std::vector<CodeTemplate> Result;
  for (const unsigned BaseReg : PossibleBaseRegs.set_bits()) {
    for (const unsigned IndexReg : PossibleIndexRegs.set_bits()) {
      for (int LogScale = 0; LogScale <= 3; ++LogScale) {
        // FIXME: Add an option for controlling how we explore immediates.
        for (const int Disp : {0, 42}) {
          InstructionTemplate IT(&Instr);
          const int64_t Scale = 1ull << LogScale;
          setMemOp(IT, 1, MCOperand::createReg(BaseReg));
          setMemOp(IT, 2, MCOperand::createImm(Scale));
          setMemOp(IT, 3, MCOperand::createReg(IndexReg));
          setMemOp(IT, 4, MCOperand::createImm(Disp));
          // SegmentReg must be 0 for LEA.
          setMemOp(IT, 5, MCOperand::createReg(0));

          // Output reg candidates are selected by the caller.
          auto PossibleDestRegsNow = PossibleDestRegs;
          RestrictDestRegs(BaseReg, IndexReg, PossibleDestRegsNow);
          assert(PossibleDestRegsNow.set_bits().begin() !=
                     PossibleDestRegsNow.set_bits().end() &&
                 "no remaining registers");
          setMemOp(
              IT, 0,
              MCOperand::createReg(*PossibleDestRegsNow.set_bits().begin()));

          CodeTemplate CT;
          CT.Instructions.push_back(std::move(IT));
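          // Config is a human-readable description of the addressing pattern,
          // e.g. "42(%RAX, %RSI, 4)" for Disp 42, base RAX, index RSI and
          // scale 4.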
          CT.Config = formatv("{3}(%{0}, %{1}, {2})", RegInfo.getName(BaseReg),
                              RegInfo.getName(IndexReg), Scale, Disp)
                          .str();
          Result.push_back(std::move(CT));
          if (Result.size() >= Opts.MaxConfigsPerOpcode)
            return std::move(Result);
        }
      }
    }
  }

  return std::move(Result);
}

namespace {
class X86SerialSnippetGenerator : public SerialSnippetGenerator {
public:
  using SerialSnippetGenerator::SerialSnippetGenerator;

  Expected<std::vector<CodeTemplate>>
  generateCodeTemplates(InstructionTemplate Variant,
                        const BitVector &ForbiddenRegisters) const override;
};
} // namespace

Expected<std::vector<CodeTemplate>>
X86SerialSnippetGenerator::generateCodeTemplates(
    InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const {
  const Instruction &Instr = Variant.getInstr();

  if (const auto reason = isInvalidOpcode(Instr))
    return make_error<Failure>(reason);

  // LEA gets special attention.
  const auto Opcode = Instr.Description.getOpcode();
  if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r) {
    return generateLEATemplatesCommon(
        Instr, ForbiddenRegisters, State, Opts,
        [this](unsigned BaseReg, unsigned IndexReg,
               BitVector &CandidateDestRegs) {
          // We just select a destination register that aliases the base
          // register.
          CandidateDestRegs &=
              State.getRATC().getRegister(BaseReg).aliasedBits();
        });
  }

  if (Instr.hasMemoryOperands())
    return make_error<Failure>(
        "unsupported memory operand in latency measurements");

  switch (getX86FPFlags(Instr)) {
  case X86II::NotFP:
    return SerialSnippetGenerator::generateCodeTemplates(Variant,
                                                         ForbiddenRegisters);
  case X86II::ZeroArgFP:
  case X86II::OneArgFP:
  case X86II::SpecialFP:
  case X86II::CompareFP:
  case X86II::CondMovFP:
    return make_error<Failure>("Unsupported x87 Instruction");
  case X86II::OneArgFPRW:
  case X86II::TwoArgFP:
    // These are instructions like
    //   - `ST(0) = fsqrt(ST(0))` (OneArgFPRW)
    //   - `ST(0) = ST(0) + ST(i)` (TwoArgFP)
    // They are intrinsically serial and do not modify the state of the stack.
    return generateSelfAliasingCodeTemplates(Variant);
  default:
    llvm_unreachable("Unknown FP Type!");
  }
}

namespace {
class X86ParallelSnippetGenerator : public ParallelSnippetGenerator {
public:
  using ParallelSnippetGenerator::ParallelSnippetGenerator;

  Expected<std::vector<CodeTemplate>>
  generateCodeTemplates(InstructionTemplate Variant,
                        const BitVector &ForbiddenRegisters) const override;
};

} // namespace

Expected<std::vector<CodeTemplate>>
X86ParallelSnippetGenerator::generateCodeTemplates(
    InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const {
  const Instruction &Instr = Variant.getInstr();

  if (const auto reason = isInvalidOpcode(Instr))
    return make_error<Failure>(reason);

  // LEA gets special attention.
  const auto Opcode = Instr.Description.getOpcode();
  if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r) {
    return generateLEATemplatesCommon(
        Instr, ForbiddenRegisters, State, Opts,
        [this](unsigned BaseReg, unsigned IndexReg,
               BitVector &CandidateDestRegs) {
          // Any destination register that is not used for addressing is fine.
          remove(CandidateDestRegs,
                 State.getRATC().getRegister(BaseReg).aliasedBits());
          remove(CandidateDestRegs,
                 State.getRATC().getRegister(IndexReg).aliasedBits());
        });
  }

  switch (getX86FPFlags(Instr)) {
  case X86II::NotFP:
    return ParallelSnippetGenerator::generateCodeTemplates(Variant,
                                                           ForbiddenRegisters);
  case X86II::ZeroArgFP:
  case X86II::OneArgFP:
  case X86II::SpecialFP:
    return make_error<Failure>("Unsupported x87 Instruction");
  case X86II::OneArgFPRW:
  case X86II::TwoArgFP:
    // These are instructions like
    //   - `ST(0) = fsqrt(ST(0))` (OneArgFPRW)
    //   - `ST(0) = ST(0) + ST(i)` (TwoArgFP)
    // They are intrinsically serial and do not modify the state of the stack.
    // We generate the same code for latency and uops.
    return generateSelfAliasingCodeTemplates(Variant);
  case X86II::CompareFP:
  case X86II::CondMovFP:
    // We can compute uops for any FP instruction that does not grow or shrink
    // the stack (either do not touch the stack or push as much as they pop).
    return generateUnconstrainedCodeTemplates(
        Variant, "instruction does not grow/shrink the FP stack");
  default:
    llvm_unreachable("Unknown FP Type!");
  }
}
static unsigned getLoadImmediateOpcode(unsigned RegBitWidth) {
  switch (RegBitWidth) {
  case 8:
    return X86::MOV8ri;
  case 16:
    return X86::MOV16ri;
  case 32:
    return X86::MOV32ri;
  case 64:
    return X86::MOV64ri;
  }
  llvm_unreachable("Invalid Value Width");
}

// Generates an instruction to load an immediate value into a register.
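// For example, loadImmediate(X86::EAX, 32, APInt(32, 42)) produces
// `movl $42, %eax`.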
static MCInst loadImmediate(unsigned Reg, unsigned RegBitWidth,
                            const APInt &Value) {
  if (Value.getBitWidth() > RegBitWidth)
    llvm_unreachable("Value must fit in the Register");
  return MCInstBuilder(getLoadImmediateOpcode(RegBitWidth))
      .addReg(Reg)
      .addImm(Value.getZExtValue());
}

// Allocates scratch memory on the stack.
static MCInst allocateStackSpace(unsigned Bytes) {
  return MCInstBuilder(X86::SUB64ri8)
      .addReg(X86::RSP)
      .addReg(X86::RSP)
      .addImm(Bytes);
}

// Fills scratch memory at offset `OffsetBytes` with value `Imm`.
static MCInst fillStackSpace(unsigned MovOpcode, unsigned OffsetBytes,
                             uint64_t Imm) {
  return MCInstBuilder(MovOpcode)
      // Address = ESP
      .addReg(X86::RSP)    // BaseReg
      .addImm(1)           // ScaleAmt
      .addReg(0)           // IndexReg
      .addImm(OffsetBytes) // Disp
      .addReg(0)           // Segment
      // Immediate.
      .addImm(Imm);
}

// Loads scratch memory into register `Reg` using opcode `RMOpcode`.
static MCInst loadToReg(unsigned Reg, unsigned RMOpcode) {
  return MCInstBuilder(RMOpcode)
      .addReg(Reg)
      // Address = ESP
      .addReg(X86::RSP) // BaseReg
      .addImm(1)        // ScaleAmt
      .addReg(0)        // IndexReg
      .addImm(0)        // Disp
      .addReg(0);       // Segment
}

// Releases scratch memory.
static MCInst releaseStackSpace(unsigned Bytes) {
  return MCInstBuilder(X86::ADD64ri8)
      .addReg(X86::RSP)
      .addReg(X86::RSP)
      .addImm(Bytes);
}

// Reserves some space on the stack, fills it with the content of the provided
// constant, and provides methods to load the stack value into a register.
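// Typical usage (as in setRegTo below):
//   ConstantInliner CI(Value);
//   return CI.loadAndFinalize(Reg, 128, X86::MOVDQUrm);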
namespace {
struct ConstantInliner {
  explicit ConstantInliner(const APInt &Constant) : Constant_(Constant) {}

  std::vector<MCInst> loadAndFinalize(unsigned Reg, unsigned RegBitWidth,
                                      unsigned Opcode);

  std::vector<MCInst> loadX87STAndFinalize(unsigned Reg);

  std::vector<MCInst> loadX87FPAndFinalize(unsigned Reg);

  std::vector<MCInst> popFlagAndFinalize();

  std::vector<MCInst> loadImplicitRegAndFinalize(unsigned Opcode,
                                                 unsigned Value);

private:
  ConstantInliner &add(const MCInst &Inst) {
    Instructions.push_back(Inst);
    return *this;
  }

  void initStack(unsigned Bytes);

  static constexpr const unsigned kF80Bytes = 10; // 80 bits.

  APInt Constant_;
  std::vector<MCInst> Instructions;
};
} // namespace

std::vector<MCInst> ConstantInliner::loadAndFinalize(unsigned Reg,
                                                     unsigned RegBitWidth,
                                                     unsigned Opcode) {
  assert((RegBitWidth & 7) == 0 && "RegBitWidth must be a multiple of 8 bits");
  initStack(RegBitWidth / 8);
  add(loadToReg(Reg, Opcode));
  add(releaseStackSpace(RegBitWidth / 8));
  return std::move(Instructions);
}

std::vector<MCInst> ConstantInliner::loadX87STAndFinalize(unsigned Reg) {
  initStack(kF80Bytes);
  add(MCInstBuilder(X86::LD_F80m)
          // Address = ESP
          .addReg(X86::RSP) // BaseReg
          .addImm(1)        // ScaleAmt
          .addReg(0)        // IndexReg
          .addImm(0)        // Disp
          .addReg(0));      // Segment
  if (Reg != X86::ST0)
    add(MCInstBuilder(X86::ST_Frr).addReg(Reg));
  add(releaseStackSpace(kF80Bytes));
  return std::move(Instructions);
}

std::vector<MCInst> ConstantInliner::loadX87FPAndFinalize(unsigned Reg) {
  initStack(kF80Bytes);
  add(MCInstBuilder(X86::LD_Fp80m)
          .addReg(Reg)
          // Address = ESP
          .addReg(X86::RSP) // BaseReg
          .addImm(1)        // ScaleAmt
          .addReg(0)        // IndexReg
          .addImm(0)        // Disp
          .addReg(0));      // Segment
  add(releaseStackSpace(kF80Bytes));
  return std::move(Instructions);
}

std::vector<MCInst> ConstantInliner::popFlagAndFinalize() {
  initStack(8);
  add(MCInstBuilder(X86::POPF64));
  return std::move(Instructions);
}

std::vector<MCInst>
ConstantInliner::loadImplicitRegAndFinalize(unsigned Opcode, unsigned Value) {
  add(allocateStackSpace(4));
  add(fillStackSpace(X86::MOV32mi, 0, Value)); // Mask all FP exceptions
  add(MCInstBuilder(Opcode)
          // Address = ESP
          .addReg(X86::RSP) // BaseReg
          .addImm(1)        // ScaleAmt
          .addReg(0)        // IndexReg
          .addImm(0)        // Disp
          .addReg(0));      // Segment
  add(releaseStackSpace(4));
  return std::move(Instructions);
}

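// Writes the constant to freshly allocated stack space using the widest
// immediate stores available; e.g. the 10 bytes of an 80-bit x87 constant are
// written as two 4-byte MOV32mi stores followed by one 2-byte MOV16mi store.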
void ConstantInliner::initStack(unsigned Bytes) {
  assert(Constant_.getBitWidth() <= Bytes * 8 &&
         "Value does not have the correct size");
  const APInt WideConstant = Constant_.getBitWidth() < Bytes * 8
                                 ? Constant_.sext(Bytes * 8)
                                 : Constant_;
  add(allocateStackSpace(Bytes));
  size_t ByteOffset = 0;
  for (; Bytes - ByteOffset >= 4; ByteOffset += 4)
    add(fillStackSpace(
        X86::MOV32mi, ByteOffset,
        WideConstant.extractBits(32, ByteOffset * 8).getZExtValue()));
  if (Bytes - ByteOffset >= 2) {
    add(fillStackSpace(
        X86::MOV16mi, ByteOffset,
        WideConstant.extractBits(16, ByteOffset * 8).getZExtValue()));
    ByteOffset += 2;
  }
  if (Bytes - ByteOffset >= 1)
    add(fillStackSpace(
        X86::MOV8mi, ByteOffset,
        WideConstant.extractBits(8, ByteOffset * 8).getZExtValue()));
}

#include "X86GenExegesis.inc"

namespace {

class X86SavedState : public ExegesisTarget::SavedState {
public:
  X86SavedState() {
#ifdef __x86_64__
# if defined(_MSC_VER)
    _fxsave64(FPState);
    Eflags = __readeflags();
# elif defined(__GNUC__)
    __builtin_ia32_fxsave64(FPState);
    Eflags = __builtin_ia32_readeflags_u64();
# endif
#else
    llvm_unreachable("X86 exegesis running on non-X86 target");
#endif
  }

  ~X86SavedState() {
    // Restoring the X87 state does not flush pending exceptions, so make sure
    // those exceptions are flushed now.
#ifdef __x86_64__
# if defined(_MSC_VER)
    _clearfp();
    _fxrstor64(FPState);
    __writeeflags(Eflags);
# elif defined(__GNUC__)
    asm volatile("fwait");
    __builtin_ia32_fxrstor64(FPState);
    __builtin_ia32_writeeflags_u64(Eflags);
# endif
#else
    llvm_unreachable("X86 exegesis running on non-X86 target");
#endif
  }

private:
#ifdef __x86_64__
  alignas(16) char FPState[512];
  uint64_t Eflags;
#endif
};

class ExegesisX86Target : public ExegesisTarget {
public:
  ExegesisX86Target() : ExegesisTarget(X86CpuPfmCounters) {}

  Expected<std::unique_ptr<pfm::Counter>>
  createCounter(StringRef CounterName, const LLVMState &State) const override {
    // If LbrSamplingPeriod was provided, then ignore the
    // CounterName because we only have one for LBR.
    if (LbrSamplingPeriod > 0) {
      // We can't use LBR without HAVE_LIBPFM, LIBPFM_HAS_FIELD_CYCLES, and
      // __linux__ (for now).
#if defined(HAVE_LIBPFM) && defined(LIBPFM_HAS_FIELD_CYCLES) &&                \
    defined(__linux__)
      return std::make_unique<X86LbrCounter>(
          X86LbrPerfEvent(LbrSamplingPeriod));
#else
      return llvm::make_error<llvm::StringError>(
          "LBR counter requested without HAVE_LIBPFM, LIBPFM_HAS_FIELD_CYCLES, "
          "or running on Linux.",
          llvm::errc::invalid_argument);
#endif
    }
    return ExegesisTarget::createCounter(CounterName, State);
  }

private:
  void addTargetSpecificPasses(PassManagerBase &PM) const override;

  unsigned getScratchMemoryRegister(const Triple &TT) const override;

  unsigned getLoopCounterRegister(const Triple &) const override;

  unsigned getMaxMemoryAccessSize() const override { return 64; }

  Error randomizeTargetMCOperand(const Instruction &Instr, const Variable &Var,
                                 MCOperand &AssignedValue,
                                 const BitVector &ForbiddenRegs) const override;

  void fillMemoryOperands(InstructionTemplate &IT, unsigned Reg,
                          unsigned Offset) const override;

  void decrementLoopCounterAndJump(MachineBasicBlock &MBB,
                                   MachineBasicBlock &TargetMBB,
                                   const MCInstrInfo &MII) const override;

  std::vector<MCInst> setRegTo(const MCSubtargetInfo &STI, unsigned Reg,
                               const APInt &Value) const override;

  ArrayRef<unsigned> getUnavailableRegisters() const override {
    return makeArrayRef(kUnavailableRegisters,
                        sizeof(kUnavailableRegisters) /
                            sizeof(kUnavailableRegisters[0]));
  }

  bool allowAsBackToBack(const Instruction &Instr) const override {
    const unsigned Opcode = Instr.Description.Opcode;
    return !isInvalidOpcode(Instr) && Opcode != X86::LEA64r &&
           Opcode != X86::LEA64_32r && Opcode != X86::LEA16r;
  }

  std::vector<InstructionTemplate>
  generateInstructionVariants(const Instruction &Instr,
                              unsigned MaxConfigsPerOpcode) const override;

  std::unique_ptr<SnippetGenerator> createSerialSnippetGenerator(
      const LLVMState &State,
      const SnippetGenerator::Options &Opts) const override {
    return std::make_unique<X86SerialSnippetGenerator>(State, Opts);
  }

  std::unique_ptr<SnippetGenerator> createParallelSnippetGenerator(
      const LLVMState &State,
      const SnippetGenerator::Options &Opts) const override {
    return std::make_unique<X86ParallelSnippetGenerator>(State, Opts);
  }

  bool matchesArch(Triple::ArchType Arch) const override {
    return Arch == Triple::x86_64 || Arch == Triple::x86;
  }

  Error checkFeatureSupport() const override {
    // LBR is the only feature we conditionally support now.
    // So if LBR is not requested, then we should be able to run the benchmarks.
    if (LbrSamplingPeriod == 0)
      return Error::success();

#if defined(__linux__) && defined(HAVE_LIBPFM) &&                              \
    defined(LIBPFM_HAS_FIELD_CYCLES)
      // FIXME: Fix this.
      // https://bugs.llvm.org/show_bug.cgi?id=48918
      // For now, only do the check if we see an Intel machine, because the
      // counter uses some Intel-specific magic and it could get confused and
      // think an AMD machine actually has LBR support.
#if defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) ||            \
    defined(_M_X64)
    using namespace sys::detail::x86;

    if (getVendorSignature() == VendorSignatures::GENUINE_INTEL)
      // Even if the kernel supports it, the hardware may not have it.
      return X86LbrCounter::checkLbrSupport();
#else
    llvm_unreachable("Running X86 exegesis on non-X86 target");
#endif
#endif
    return llvm::make_error<llvm::StringError>(
        "LBR not supported on this kernel and/or platform",
        llvm::errc::not_supported);
  }

  std::unique_ptr<SavedState> withSavedState() const override {
    return std::make_unique<X86SavedState>();
  }

  static const unsigned kUnavailableRegisters[4];
};

// We disable a few registers that cannot be encoded on instructions with a REX
// prefix.
const unsigned ExegesisX86Target::kUnavailableRegisters[4] = {X86::AH, X86::BH,
                                                              X86::CH, X86::DH};

// We're using one of R8-R15 because these registers are never hardcoded in
// instructions (e.g. MOVS writes to EDI, ESI, EDX), so they have fewer
// conflicts.
constexpr const unsigned kLoopCounterReg = X86::R8;

} // namespace

void ExegesisX86Target::addTargetSpecificPasses(PassManagerBase &PM) const {
  // Lowers FP pseudo-instructions, e.g. ABS_Fp32 -> ABS_F.
  PM.add(createX86FloatingPointStackifierPass());
}

unsigned ExegesisX86Target::getScratchMemoryRegister(const Triple &TT) const {
  if (!TT.isArch64Bit()) {
    // FIXME: This would require popping from the stack, so we would have to
    // add some additional setup code.
    return 0;
  }
  return TT.isOSWindows() ? X86::RCX : X86::RDI;
}

unsigned ExegesisX86Target::getLoopCounterRegister(const Triple &TT) const {
  if (!TT.isArch64Bit()) {
    return 0;
  }
  return kLoopCounterReg;
}

Error ExegesisX86Target::randomizeTargetMCOperand(
    const Instruction &Instr, const Variable &Var, MCOperand &AssignedValue,
    const BitVector &ForbiddenRegs) const {
  const Operand &Op = Instr.getPrimaryOperand(Var);
  switch (Op.getExplicitOperandInfo().OperandType) {
  case X86::OperandType::OPERAND_ROUNDING_CONTROL:
    AssignedValue =
        MCOperand::createImm(randomIndex(X86::STATIC_ROUNDING::TO_ZERO));
    return Error::success();
  default:
    break;
  }
  return make_error<Failure>(
      Twine("unimplemented operand type ")
          .concat(Twine(Op.getExplicitOperandInfo().OperandType)));
}

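// Fills in the five memory operands (BaseReg, ScaleAmt, IndexReg, Disp,
// Segment) of a memory-accessing instruction so that it addresses
// [Reg + Offset].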
void ExegesisX86Target::fillMemoryOperands(InstructionTemplate &IT,
                                           unsigned Reg,
                                           unsigned Offset) const {
  assert(!isInvalidMemoryInstr(IT.getInstr()) &&
         "fillMemoryOperands requires a valid memory instruction");
  int MemOpIdx = X86II::getMemoryOperandNo(IT.getInstr().Description.TSFlags);
  assert(MemOpIdx >= 0 && "invalid memory operand index");
  // getMemoryOperandNo() ignores tied operands, so we have to add them back.
  MemOpIdx += X86II::getOperandBias(IT.getInstr().Description);
  setMemOp(IT, MemOpIdx + 0, MCOperand::createReg(Reg));    // BaseReg
  setMemOp(IT, MemOpIdx + 1, MCOperand::createImm(1));      // ScaleAmt
  setMemOp(IT, MemOpIdx + 2, MCOperand::createReg(0));      // IndexReg
  setMemOp(IT, MemOpIdx + 3, MCOperand::createImm(Offset)); // Disp
  setMemOp(IT, MemOpIdx + 4, MCOperand::createReg(0));      // Segment
}

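// Decrements kLoopCounterReg and jumps back to TargetMBB while it is non-zero,
// i.e. `addq $-1, %r8` followed by `jne TargetMBB`; the ADD sets ZF, so no
// separate compare instruction is needed.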
void ExegesisX86Target::decrementLoopCounterAndJump(
    MachineBasicBlock &MBB, MachineBasicBlock &TargetMBB,
    const MCInstrInfo &MII) const {
  BuildMI(&MBB, DebugLoc(), MII.get(X86::ADD64ri8))
      .addDef(kLoopCounterReg)
      .addUse(kLoopCounterReg)
      .addImm(-1);
  BuildMI(&MBB, DebugLoc(), MII.get(X86::JCC_1))
      .addMBB(&TargetMBB)
      .addImm(X86::COND_NE);
}

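// Materializes `Value` into `Reg`, dispatching on the register class: GPRs are
// set with a direct move-immediate; vector, x87 and status registers (EFLAGS,
// MXCSR, FPCW) are loaded through a stack-allocated copy of the constant (see
// ConstantInliner above).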
std::vector<MCInst> ExegesisX86Target::setRegTo(const MCSubtargetInfo &STI,
                                                unsigned Reg,
                                                const APInt &Value) const {
  if (X86::GR8RegClass.contains(Reg))
    return {loadImmediate(Reg, 8, Value)};
  if (X86::GR16RegClass.contains(Reg))
    return {loadImmediate(Reg, 16, Value)};
  if (X86::GR32RegClass.contains(Reg))
    return {loadImmediate(Reg, 32, Value)};
  if (X86::GR64RegClass.contains(Reg))
    return {loadImmediate(Reg, 64, Value)};
  ConstantInliner CI(Value);
  if (X86::VR64RegClass.contains(Reg))
    return CI.loadAndFinalize(Reg, 64, X86::MMX_MOVQ64rm);
  if (X86::VR128XRegClass.contains(Reg)) {
    if (STI.getFeatureBits()[X86::FeatureAVX512])
      return CI.loadAndFinalize(Reg, 128, X86::VMOVDQU32Z128rm);
    if (STI.getFeatureBits()[X86::FeatureAVX])
      return CI.loadAndFinalize(Reg, 128, X86::VMOVDQUrm);
    return CI.loadAndFinalize(Reg, 128, X86::MOVDQUrm);
  }
  if (X86::VR256XRegClass.contains(Reg)) {
    if (STI.getFeatureBits()[X86::FeatureAVX512])
      return CI.loadAndFinalize(Reg, 256, X86::VMOVDQU32Z256rm);
    if (STI.getFeatureBits()[X86::FeatureAVX])
      return CI.loadAndFinalize(Reg, 256, X86::VMOVDQUYrm);
  }
  if (X86::VR512RegClass.contains(Reg))
    if (STI.getFeatureBits()[X86::FeatureAVX512])
      return CI.loadAndFinalize(Reg, 512, X86::VMOVDQU32Zrm);
  if (X86::RSTRegClass.contains(Reg)) {
    return CI.loadX87STAndFinalize(Reg);
  }
  if (X86::RFP32RegClass.contains(Reg) || X86::RFP64RegClass.contains(Reg) ||
      X86::RFP80RegClass.contains(Reg)) {
    return CI.loadX87FPAndFinalize(Reg);
  }
  if (Reg == X86::EFLAGS)
    return CI.popFlagAndFinalize();
  if (Reg == X86::MXCSR)
    return CI.loadImplicitRegAndFinalize(
        STI.getFeatureBits()[X86::FeatureAVX] ? X86::VLDMXCSR : X86::LDMXCSR,
        0x1f80);
  if (Reg == X86::FPCW)
    return CI.loadImplicitRegAndFinalize(X86::FLDCW16m, 0x37f);
  return {}; // Not yet implemented.
}

// Instructions can have variable operands, and we may want to see how
// different operands affect performance. So for each operand position,
// precompute all the possible choices we might care about, and greedily
// generate all the possible combinations of choices.
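// For example, an instruction with an OPERAND_COND_CODE operand (such as
// CMOVcc) yields one variant per condition code, capped at
// MaxConfigsPerOpcode.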
std::vector<InstructionTemplate> ExegesisX86Target::generateInstructionVariants(
    const Instruction &Instr, unsigned MaxConfigsPerOpcode) const {
  bool Exploration = false;
  SmallVector<SmallVector<MCOperand, 1>, 4> VariableChoices;
  VariableChoices.resize(Instr.Variables.size());
  for (auto I : llvm::zip(Instr.Variables, VariableChoices)) {
    const Variable &Var = std::get<0>(I);
    SmallVectorImpl<MCOperand> &Choices = std::get<1>(I);

    switch (Instr.getPrimaryOperand(Var).getExplicitOperandInfo().OperandType) {
    default:
      // We don't wish to explicitly explore this variable.
      Choices.emplace_back(); // But add invalid MCOperand to simplify logic.
      continue;
    case X86::OperandType::OPERAND_COND_CODE: {
      Exploration = true;
      auto CondCodes = enum_seq_inclusive(X86::CondCode::COND_O,
                                          X86::CondCode::LAST_VALID_COND,
                                          force_iteration_on_noniterable_enum);
      Choices.reserve(CondCodes.size());
      for (int CondCode : CondCodes)
        Choices.emplace_back(MCOperand::createImm(CondCode));
      break;
    }
    }
  }

  // If we don't wish to explore any variables, defer to the baseline method.
  if (!Exploration)
    return ExegesisTarget::generateInstructionVariants(Instr,
                                                       MaxConfigsPerOpcode);

  std::vector<InstructionTemplate> Variants;
  size_t NumVariants;
  CombinationGenerator<MCOperand, decltype(VariableChoices)::value_type, 4> G(
      VariableChoices);

  // How many operand combinations can we produce, within the limit?
  NumVariants = std::min(G.numCombinations(), (size_t)MaxConfigsPerOpcode);
  // And actually produce all the wanted operand combinations.
  Variants.reserve(NumVariants);
  G.generate([&](ArrayRef<MCOperand> State) -> bool {
    Variants.emplace_back(&Instr);
    Variants.back().setVariableValues(State);
    // Did we run out of space for variants?
    return Variants.size() >= NumVariants;
  });

  assert(Variants.size() == NumVariants &&
         Variants.size() <= MaxConfigsPerOpcode &&
         "Should not produce too many variants");
  return Variants;
}

static ExegesisTarget *getTheExegesisX86Target() {
  static ExegesisX86Target Target;
  return &Target;
}

void InitializeX86ExegesisTarget() {
  ExegesisTarget::registerTarget(getTheExegesisX86Target());
}

} // namespace exegesis
} // namespace llvm