//===-- Target.cpp ----------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "../Target.h"

#include "../Error.h"
#include "../ParallelSnippetGenerator.h"
#include "../SerialSnippetGenerator.h"
#include "../SnippetGenerator.h"
#include "MCTargetDesc/X86BaseInfo.h"
#include "MCTargetDesc/X86MCTargetDesc.h"
#include "X86.h"
#include "X86Counter.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/Support/Errc.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/Host.h"

#include <memory>
#include <string>
#include <vector>
#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
#include <immintrin.h>
#include <intrin.h>
#endif
#if defined(__x86_64__) && defined(_MSC_VER)
#include <float.h> // For _clearfp in ~X86SavedState().
#endif

namespace llvm {
namespace exegesis {

static cl::OptionCategory
    BenchmarkOptions("llvm-exegesis benchmark x86-options");

// If a positive value is specified, we are going to use the LBR in
// latency-mode.
//
// Note:
//  - A small value is preferred, but too low a value could result in
//    throttling.
//  - A prime number is preferred to avoid always skipping certain blocks.
//
static cl::opt<unsigned> LbrSamplingPeriod(
    "x86-lbr-sample-period",
    cl::desc("The sample period (nbranches/sample), used for LBR sampling"),
    cl::cat(BenchmarkOptions), cl::init(0));

// FIXME: Validate that repetition-mode is loop if LBR is requested.

// Returns a non-null reason if we cannot handle the memory references in this
// instruction.
static const char *isInvalidMemoryInstr(const Instruction &Instr) {
  switch (Instr.Description.TSFlags & X86II::FormMask) {
  default:
    return "Unknown FormMask value";
  // These have no memory access.
  case X86II::Pseudo:
  case X86II::RawFrm:
  case X86II::AddCCFrm:
  case X86II::PrefixByte:
  case X86II::MRMDestReg:
  case X86II::MRMSrcReg:
  case X86II::MRMSrcReg4VOp3:
  case X86II::MRMSrcRegOp4:
  case X86II::MRMSrcRegCC:
  case X86II::MRMXrCC:
  case X86II::MRMr0:
  case X86II::MRMXr:
  case X86II::MRM0r:
  case X86II::MRM1r:
  case X86II::MRM2r:
  case X86II::MRM3r:
  case X86II::MRM4r:
  case X86II::MRM5r:
  case X86II::MRM6r:
  case X86II::MRM7r:
  case X86II::MRM0X:
  case X86II::MRM1X:
  case X86II::MRM2X:
  case X86II::MRM3X:
  case X86II::MRM4X:
  case X86II::MRM5X:
  case X86II::MRM6X:
  case X86II::MRM7X:
  case X86II::MRM_C0:
  case X86II::MRM_C1:
  case X86II::MRM_C2:
  case X86II::MRM_C3:
  case X86II::MRM_C4:
  case X86II::MRM_C5:
  case X86II::MRM_C6:
  case X86II::MRM_C7:
  case X86II::MRM_C8:
  case X86II::MRM_C9:
  case X86II::MRM_CA:
  case X86II::MRM_CB:
  case X86II::MRM_CC:
  case X86II::MRM_CD:
  case X86II::MRM_CE:
  case X86II::MRM_CF:
  case X86II::MRM_D0:
  case X86II::MRM_D1:
  case X86II::MRM_D2:
  case X86II::MRM_D3:
  case X86II::MRM_D4:
  case X86II::MRM_D5:
  case X86II::MRM_D6:
  case X86II::MRM_D7:
  case X86II::MRM_D8:
  case X86II::MRM_D9:
  case X86II::MRM_DA:
  case X86II::MRM_DB:
  case X86II::MRM_DC:
  case X86II::MRM_DD:
  case X86II::MRM_DE:
  case X86II::MRM_DF:
  case X86II::MRM_E0:
  case X86II::MRM_E1:
  case X86II::MRM_E2:
  case X86II::MRM_E3:
  case X86II::MRM_E4:
  case X86II::MRM_E5:
  case X86II::MRM_E6:
  case X86II::MRM_E7:
  case X86II::MRM_E8:
  case X86II::MRM_E9:
  case X86II::MRM_EA:
  case X86II::MRM_EB:
  case X86II::MRM_EC:
  case X86II::MRM_ED:
  case X86II::MRM_EE:
  case X86II::MRM_EF:
  case X86II::MRM_F0:
  case X86II::MRM_F1:
  case X86II::MRM_F2:
  case X86II::MRM_F3:
  case X86II::MRM_F4:
  case X86II::MRM_F5:
  case X86II::MRM_F6:
  case X86II::MRM_F7:
  case X86II::MRM_F8:
  case X86II::MRM_F9:
  case X86II::MRM_FA:
  case X86II::MRM_FB:
  case X86II::MRM_FC:
  case X86II::MRM_FD:
  case X86II::MRM_FE:
  case X86II::MRM_FF:
  case X86II::RawFrmImm8:
    return nullptr;
  case X86II::AddRegFrm:
    return (Instr.Description.Opcode == X86::POP16r ||
            Instr.Description.Opcode == X86::POP32r ||
            Instr.Description.Opcode == X86::PUSH16r ||
            Instr.Description.Opcode == X86::PUSH32r)
               ? "unsupported opcode: unsupported memory access"
               : nullptr;
  // These access memory and are handled.
  case X86II::MRMDestMem:
  case X86II::MRMSrcMem:
  case X86II::MRMSrcMem4VOp3:
  case X86II::MRMSrcMemOp4:
  case X86II::MRMSrcMemCC:
  case X86II::MRMXmCC:
  case X86II::MRMXm:
  case X86II::MRM0m:
  case X86II::MRM1m:
  case X86II::MRM2m:
  case X86II::MRM3m:
  case X86II::MRM4m:
  case X86II::MRM5m:
  case X86II::MRM6m:
  case X86II::MRM7m:
    return nullptr;
  // These access memory and are not handled yet.
  case X86II::RawFrmImm16:
  case X86II::RawFrmMemOffs:
  case X86II::RawFrmSrc:
  case X86II::RawFrmDst:
  case X86II::RawFrmDstSrc:
    return "unsupported opcode: non uniform memory access";
  }
}

// If the opcode is invalid, returns a pointer to a character literal indicating
// the reason. nullptr indicates a valid opcode.
static const char *isInvalidOpcode(const Instruction &Instr) {
  const auto OpcodeName = Instr.Name;
  if ((Instr.Description.TSFlags & X86II::FormMask) == X86II::Pseudo)
    return "unsupported opcode: pseudo instruction";
  if ((OpcodeName.startswith("POP") && !OpcodeName.startswith("POPCNT")) ||
      OpcodeName.startswith("PUSH") || OpcodeName.startswith("ADJCALLSTACK") ||
      OpcodeName.startswith("LEAVE"))
    return "unsupported opcode: Push/Pop/AdjCallStack/Leave";
  switch (Instr.Description.Opcode) {
  case X86::LFS16rm:
  case X86::LFS32rm:
  case X86::LFS64rm:
  case X86::LGS16rm:
  case X86::LGS32rm:
  case X86::LGS64rm:
  case X86::LSS16rm:
  case X86::LSS32rm:
  case X86::LSS64rm:
  case X86::SYSENTER:
    return "unsupported opcode";
  default:
    break;
  }
  if (const auto reason = isInvalidMemoryInstr(Instr))
    return reason;
  // We do not handle instructions with OPERAND_PCREL.
  for (const Operand &Op : Instr.Operands)
    if (Op.isExplicit() &&
        Op.getExplicitOperandInfo().OperandType == MCOI::OPERAND_PCREL)
      return "unsupported opcode: PC relative operand";
  // We do not handle second-form X87 instructions. We only handle first-form
  // ones (_Fp), see comment in X86InstrFPStack.td.
  for (const Operand &Op : Instr.Operands)
    if (Op.isReg() && Op.isExplicit() &&
        Op.getExplicitOperandInfo().RegClass == X86::RSTRegClassID)
      return "unsupported second-form X87 instruction";
  return nullptr;
}

static unsigned getX86FPFlags(const Instruction &Instr) {
  return Instr.Description.TSFlags & X86II::FPTypeMask;
}

// Helper to fill a memory operand with a value.
static void setMemOp(InstructionTemplate &IT, int OpIdx,
                     const MCOperand &OpVal) {
  const auto Op = IT.getInstr().Operands[OpIdx];
  assert(Op.isExplicit() && "invalid memory pattern");
  IT.getValueFor(Op) = OpVal;
}

// Common (latency, uops) code for LEA templates. `RestrictDestRegs` takes the
// addressing base and index registers and restricts the set of candidate LEA
// destination registers accordingly.
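// For illustration, the Config strings built below have the form
// "Disp(%Base, %Index, Scale)", e.g. "42(%RAX, %RDI, 2)".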
static Expected<std::vector<CodeTemplate>> generateLEATemplatesCommon(
    const Instruction &Instr, const BitVector &ForbiddenRegisters,
    const LLVMState &State, const SnippetGenerator::Options &Opts,
    std::function<void(unsigned, unsigned, BitVector &CandidateDestRegs)>
        RestrictDestRegs) {
  assert(Instr.Operands.size() == 6 && "invalid LEA");
  assert(X86II::getMemoryOperandNo(Instr.Description.TSFlags) == 1 &&
         "invalid LEA");

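  // Operand layout for LEA: operand 0 is the destination register, followed
  // by the 5-operand memory reference (base, scale, index, displacement,
  // segment), hence the fixed operand indices used below.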
  constexpr const int kDestOp = 0;
  constexpr const int kBaseOp = 1;
  constexpr const int kIndexOp = 3;
  auto PossibleDestRegs =
      Instr.Operands[kDestOp].getRegisterAliasing().sourceBits();
  remove(PossibleDestRegs, ForbiddenRegisters);
  auto PossibleBaseRegs =
      Instr.Operands[kBaseOp].getRegisterAliasing().sourceBits();
  remove(PossibleBaseRegs, ForbiddenRegisters);
  auto PossibleIndexRegs =
      Instr.Operands[kIndexOp].getRegisterAliasing().sourceBits();
  remove(PossibleIndexRegs, ForbiddenRegisters);

  const auto &RegInfo = State.getRegInfo();
  std::vector<CodeTemplate> Result;
  for (const unsigned BaseReg : PossibleBaseRegs.set_bits()) {
    for (const unsigned IndexReg : PossibleIndexRegs.set_bits()) {
      for (int LogScale = 0; LogScale <= 3; ++LogScale) {
        // FIXME: Add an option for controlling how we explore immediates.
        for (const int Disp : {0, 42}) {
          InstructionTemplate IT(&Instr);
          const int64_t Scale = 1ull << LogScale;
          setMemOp(IT, 1, MCOperand::createReg(BaseReg));
          setMemOp(IT, 2, MCOperand::createImm(Scale));
          setMemOp(IT, 3, MCOperand::createReg(IndexReg));
          setMemOp(IT, 4, MCOperand::createImm(Disp));
          // SegmentReg must be 0 for LEA.
          setMemOp(IT, 5, MCOperand::createReg(0));

          // Output reg candidates are selected by the caller.
          auto PossibleDestRegsNow = PossibleDestRegs;
          RestrictDestRegs(BaseReg, IndexReg, PossibleDestRegsNow);
          assert(PossibleDestRegsNow.set_bits().begin() !=
                     PossibleDestRegsNow.set_bits().end() &&
                 "no remaining registers");
          setMemOp(
              IT, 0,
              MCOperand::createReg(*PossibleDestRegsNow.set_bits().begin()));

          CodeTemplate CT;
          CT.Instructions.push_back(std::move(IT));
          CT.Config = formatv("{3}(%{0}, %{1}, {2})", RegInfo.getName(BaseReg),
                              RegInfo.getName(IndexReg), Scale, Disp)
                          .str();
          Result.push_back(std::move(CT));
          if (Result.size() >= Opts.MaxConfigsPerOpcode)
            return std::move(Result);
        }
      }
    }
  }

  return std::move(Result);
}

namespace {
class X86SerialSnippetGenerator : public SerialSnippetGenerator {
public:
  using SerialSnippetGenerator::SerialSnippetGenerator;

  Expected<std::vector<CodeTemplate>>
  generateCodeTemplates(InstructionTemplate Variant,
                        const BitVector &ForbiddenRegisters) const override;
};
} // namespace

Expected<std::vector<CodeTemplate>>
X86SerialSnippetGenerator::generateCodeTemplates(
    InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const {
  const Instruction &Instr = Variant.getInstr();

  if (const auto reason = isInvalidOpcode(Instr))
    return make_error<Failure>(reason);

  // LEA gets special attention.
  const auto Opcode = Instr.Description.getOpcode();
  if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r) {
    return generateLEATemplatesCommon(
        Instr, ForbiddenRegisters, State, Opts,
        [this](unsigned BaseReg, unsigned IndexReg,
               BitVector &CandidateDestRegs) {
          // We just select a destination register that aliases the base
          // register.
          CandidateDestRegs &=
              State.getRATC().getRegister(BaseReg).aliasedBits();
        });
  }

  if (Instr.hasMemoryOperands())
    return make_error<Failure>(
        "unsupported memory operand in latency measurements");

  switch (getX86FPFlags(Instr)) {
  case X86II::NotFP:
    return SerialSnippetGenerator::generateCodeTemplates(Variant,
                                                         ForbiddenRegisters);
  case X86II::ZeroArgFP:
  case X86II::OneArgFP:
  case X86II::SpecialFP:
  case X86II::CompareFP:
  case X86II::CondMovFP:
    return make_error<Failure>("Unsupported x87 Instruction");
  case X86II::OneArgFPRW:
  case X86II::TwoArgFP:
    // These are instructions like
    // - `ST(0) = fsqrt(ST(0))` (OneArgFPRW)
    // - `ST(0) = ST(0) + ST(i)` (TwoArgFP)
    // They are intrinsically serial and do not modify the state of the stack.
    return generateSelfAliasingCodeTemplates(Variant);
  default:
    llvm_unreachable("Unknown FP Type!");
  }
}

namespace {
class X86ParallelSnippetGenerator : public ParallelSnippetGenerator {
public:
  using ParallelSnippetGenerator::ParallelSnippetGenerator;

  Expected<std::vector<CodeTemplate>>
  generateCodeTemplates(InstructionTemplate Variant,
                        const BitVector &ForbiddenRegisters) const override;
};

} // namespace

Expected<std::vector<CodeTemplate>>
X86ParallelSnippetGenerator::generateCodeTemplates(
    InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const {
  const Instruction &Instr = Variant.getInstr();

  if (const auto reason = isInvalidOpcode(Instr))
    return make_error<Failure>(reason);

  // LEA gets special attention.
  const auto Opcode = Instr.Description.getOpcode();
  if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r) {
    return generateLEATemplatesCommon(
        Instr, ForbiddenRegisters, State, Opts,
        [this](unsigned BaseReg, unsigned IndexReg,
               BitVector &CandidateDestRegs) {
          // Any destination register that is not used for addressing is fine.
          remove(CandidateDestRegs,
                 State.getRATC().getRegister(BaseReg).aliasedBits());
          remove(CandidateDestRegs,
                 State.getRATC().getRegister(IndexReg).aliasedBits());
        });
  }

  switch (getX86FPFlags(Instr)) {
  case X86II::NotFP:
    return ParallelSnippetGenerator::generateCodeTemplates(Variant,
                                                           ForbiddenRegisters);
  case X86II::ZeroArgFP:
  case X86II::OneArgFP:
  case X86II::SpecialFP:
    return make_error<Failure>("Unsupported x87 Instruction");
  case X86II::OneArgFPRW:
  case X86II::TwoArgFP:
    // These are instructions like
    // - `ST(0) = fsqrt(ST(0))` (OneArgFPRW)
    // - `ST(0) = ST(0) + ST(i)` (TwoArgFP)
    // They are intrinsically serial and do not modify the state of the stack.
    // We generate the same code for latency and uops.
    return generateSelfAliasingCodeTemplates(Variant);
  case X86II::CompareFP:
  case X86II::CondMovFP:
    // We can compute uops for any FP instruction that does not grow or shrink
    // the stack (either do not touch the stack or push as much as they pop).
    return generateUnconstrainedCodeTemplates(
        Variant, "instruction does not grow/shrink the FP stack");
  default:
    llvm_unreachable("Unknown FP Type!");
  }
}

static unsigned getLoadImmediateOpcode(unsigned RegBitWidth) {
  switch (RegBitWidth) {
  case 8:
    return X86::MOV8ri;
  case 16:
    return X86::MOV16ri;
  case 32:
    return X86::MOV32ri;
  case 64:
    return X86::MOV64ri;
  }
  llvm_unreachable("Invalid Value Width");
}

// Generates an instruction to load an immediate value into a register.
static MCInst loadImmediate(unsigned Reg, unsigned RegBitWidth,
                            const APInt &Value) {
  if (Value.getBitWidth() > RegBitWidth)
    llvm_unreachable("Value must fit in the Register");
  return MCInstBuilder(getLoadImmediateOpcode(RegBitWidth))
      .addReg(Reg)
      .addImm(Value.getZExtValue());
}

// Allocates scratch memory on the stack.
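// Note: SUB64ri8 encodes its immediate as a sign-extended 8-bit value, so
// this helper only supports small allocations (which is all we need here).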
static MCInst allocateStackSpace(unsigned Bytes) {
  return MCInstBuilder(X86::SUB64ri8)
      .addReg(X86::RSP)
      .addReg(X86::RSP)
      .addImm(Bytes);
}

// Fills scratch memory at offset `OffsetBytes` with value `Imm`.
static MCInst fillStackSpace(unsigned MovOpcode, unsigned OffsetBytes,
                             uint64_t Imm) {
  return MCInstBuilder(MovOpcode)
      // Address = ESP
      .addReg(X86::RSP)    // BaseReg
      .addImm(1)           // ScaleAmt
      .addReg(0)           // IndexReg
      .addImm(OffsetBytes) // Disp
      .addReg(0)           // Segment
      // Immediate.
      .addImm(Imm);
}

// Loads scratch memory into register `Reg` using opcode `RMOpcode`.
static MCInst loadToReg(unsigned Reg, unsigned RMOpcode) {
  return MCInstBuilder(RMOpcode)
      .addReg(Reg)
      // Address = ESP
      .addReg(X86::RSP) // BaseReg
      .addImm(1)        // ScaleAmt
      .addReg(0)        // IndexReg
      .addImm(0)        // Disp
      .addReg(0);       // Segment
}

// Releases scratch memory.
static MCInst releaseStackSpace(unsigned Bytes) {
  return MCInstBuilder(X86::ADD64ri8)
      .addReg(X86::RSP)
      .addReg(X86::RSP)
      .addImm(Bytes);
}

// Reserves some space on the stack, fills it with the content of the provided
// constant, and provides methods to load the stack value into a register.
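// Illustrative usage (see setRegTo() below): ConstantInliner(Value)
//     .loadAndFinalize(Reg, 128, X86::MOVDQUrm) returns the stack setup, the
//     128-bit load into Reg, and the stack cleanup as one MCInst sequence.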
namespace {
struct ConstantInliner {
  explicit ConstantInliner(const APInt &Constant) : Constant_(Constant) {}

  std::vector<MCInst> loadAndFinalize(unsigned Reg, unsigned RegBitWidth,
                                      unsigned Opcode);

  std::vector<MCInst> loadX87STAndFinalize(unsigned Reg);

  std::vector<MCInst> loadX87FPAndFinalize(unsigned Reg);

  std::vector<MCInst> popFlagAndFinalize();

  std::vector<MCInst> loadImplicitRegAndFinalize(unsigned Opcode,
                                                 unsigned Value);

private:
  ConstantInliner &add(const MCInst &Inst) {
    Instructions.push_back(Inst);
    return *this;
  }

  void initStack(unsigned Bytes);

  static constexpr const unsigned kF80Bytes = 10; // 80 bits.

  APInt Constant_;
  std::vector<MCInst> Instructions;
};
} // namespace

std::vector<MCInst> ConstantInliner::loadAndFinalize(unsigned Reg,
                                                     unsigned RegBitWidth,
                                                     unsigned Opcode) {
  assert((RegBitWidth & 7) == 0 && "RegBitWidth must be a multiple of 8 bits");
  initStack(RegBitWidth / 8);
  add(loadToReg(Reg, Opcode));
  add(releaseStackSpace(RegBitWidth / 8));
  return std::move(Instructions);
}

std::vector<MCInst> ConstantInliner::loadX87STAndFinalize(unsigned Reg) {
  initStack(kF80Bytes);
  add(MCInstBuilder(X86::LD_F80m)
          // Address = ESP
          .addReg(X86::RSP) // BaseReg
          .addImm(1)        // ScaleAmt
          .addReg(0)        // IndexReg
          .addImm(0)        // Disp
          .addReg(0));      // Segment
  if (Reg != X86::ST0)
    add(MCInstBuilder(X86::ST_Frr).addReg(Reg));
  add(releaseStackSpace(kF80Bytes));
  return std::move(Instructions);
}

std::vector<MCInst> ConstantInliner::loadX87FPAndFinalize(unsigned Reg) {
  initStack(kF80Bytes);
  add(MCInstBuilder(X86::LD_Fp80m)
          .addReg(Reg)
          // Address = ESP
          .addReg(X86::RSP) // BaseReg
          .addImm(1)        // ScaleAmt
          .addReg(0)        // IndexReg
          .addImm(0)        // Disp
          .addReg(0));      // Segment
  add(releaseStackSpace(kF80Bytes));
  return std::move(Instructions);
}

std::vector<MCInst> ConstantInliner::popFlagAndFinalize() {
  initStack(8);
  add(MCInstBuilder(X86::POPF64));
  return std::move(Instructions);
}

std::vector<MCInst>
ConstantInliner::loadImplicitRegAndFinalize(unsigned Opcode, unsigned Value) {
  add(allocateStackSpace(4));
  add(fillStackSpace(X86::MOV32mi, 0, Value)); // Mask all FP exceptions
  add(MCInstBuilder(Opcode)
          // Address = ESP
          .addReg(X86::RSP) // BaseReg
          .addImm(1)        // ScaleAmt
          .addReg(0)        // IndexReg
          .addImm(0)        // Disp
          .addReg(0));      // Segment
  add(releaseStackSpace(4));
  return std::move(Instructions);
}

void ConstantInliner::initStack(unsigned Bytes) {
  assert(Constant_.getBitWidth() <= Bytes * 8 &&
         "Value does not have the correct size");
  const APInt WideConstant = Constant_.getBitWidth() < Bytes * 8
                                 ? Constant_.sext(Bytes * 8)
                                 : Constant_;
  add(allocateStackSpace(Bytes));
  size_t ByteOffset = 0;
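  // Write the constant into the stack slot 4 bytes at a time, then mop up any
  // remaining 2-byte and 1-byte tail.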
  for (; Bytes - ByteOffset >= 4; ByteOffset += 4)
    add(fillStackSpace(
        X86::MOV32mi, ByteOffset,
        WideConstant.extractBits(32, ByteOffset * 8).getZExtValue()));
  if (Bytes - ByteOffset >= 2) {
    add(fillStackSpace(
        X86::MOV16mi, ByteOffset,
        WideConstant.extractBits(16, ByteOffset * 8).getZExtValue()));
    ByteOffset += 2;
  }
  if (Bytes - ByteOffset >= 1)
    add(fillStackSpace(
        X86::MOV8mi, ByteOffset,
        WideConstant.extractBits(8, ByteOffset * 8).getZExtValue()));
}

#include "X86GenExegesis.inc"

namespace {

class X86SavedState : public ExegesisTarget::SavedState {
public:
  X86SavedState() {
#ifdef __x86_64__
# if defined(_MSC_VER)
    _fxsave64(FPState);
    Eflags = __readeflags();
# elif defined(__GNUC__)
    __builtin_ia32_fxsave64(FPState);
    Eflags = __builtin_ia32_readeflags_u64();
# endif
#else
    llvm_unreachable("X86 exegesis running on non-X86 target");
#endif
  }

  ~X86SavedState() {
    // Restoring the X87 state does not flush pending exceptions; make sure
    // these exceptions are flushed now.
#ifdef __x86_64__
# if defined(_MSC_VER)
    _clearfp();
    _fxrstor64(FPState);
    __writeeflags(Eflags);
# elif defined(__GNUC__)
    asm volatile("fwait");
    __builtin_ia32_fxrstor64(FPState);
    __builtin_ia32_writeeflags_u64(Eflags);
# endif
#else
    llvm_unreachable("X86 exegesis running on non-X86 target");
#endif
  }

private:
#ifdef __x86_64__
  alignas(16) char FPState[512];
  uint64_t Eflags;
#endif
};

class ExegesisX86Target : public ExegesisTarget {
public:
  ExegesisX86Target() : ExegesisTarget(X86CpuPfmCounters) {}

  Expected<std::unique_ptr<pfm::Counter>>
  createCounter(StringRef CounterName, const LLVMState &State) const override {
    // If LbrSamplingPeriod was provided, then ignore the
    // CounterName because we only have one for LBR.
    if (LbrSamplingPeriod > 0) {
      // Can't use LBR without HAVE_LIBPFM, LIBPFM_HAS_FIELD_CYCLES, or without
      // __linux__ (for now)
#if defined(HAVE_LIBPFM) && defined(LIBPFM_HAS_FIELD_CYCLES) && \
    defined(__linux__)
      return std::make_unique<X86LbrCounter>(
          X86LbrPerfEvent(LbrSamplingPeriod));
#else
      return llvm::make_error<llvm::StringError>(
          "LBR counter requested without HAVE_LIBPFM, LIBPFM_HAS_FIELD_CYCLES, "
          "or running on Linux.",
          llvm::errc::invalid_argument);
#endif
    }
    return ExegesisTarget::createCounter(CounterName, State);
  }

private:
  void addTargetSpecificPasses(PassManagerBase &PM) const override;

  unsigned getScratchMemoryRegister(const Triple &TT) const override;

  unsigned getLoopCounterRegister(const Triple &) const override;

  unsigned getMaxMemoryAccessSize() const override { return 64; }

  Error randomizeTargetMCOperand(const Instruction &Instr, const Variable &Var,
                                 MCOperand &AssignedValue,
                                 const BitVector &ForbiddenRegs) const override;

  void fillMemoryOperands(InstructionTemplate &IT, unsigned Reg,
                          unsigned Offset) const override;

  void decrementLoopCounterAndJump(MachineBasicBlock &MBB,
                                   MachineBasicBlock &TargetMBB,
                                   const MCInstrInfo &MII) const override;

  std::vector<MCInst> setRegTo(const MCSubtargetInfo &STI, unsigned Reg,
                               const APInt &Value) const override;

  ArrayRef<unsigned> getUnavailableRegisters() const override {
    return makeArrayRef(kUnavailableRegisters,
                        sizeof(kUnavailableRegisters) /
                            sizeof(kUnavailableRegisters[0]));
  }

  bool allowAsBackToBack(const Instruction &Instr) const override {
    const unsigned Opcode = Instr.Description.Opcode;
    return !isInvalidOpcode(Instr) && Opcode != X86::LEA64r &&
           Opcode != X86::LEA64_32r && Opcode != X86::LEA16r;
  }

  std::vector<InstructionTemplate>
  generateInstructionVariants(const Instruction &Instr,
                              unsigned MaxConfigsPerOpcode) const override;

  std::unique_ptr<SnippetGenerator> createSerialSnippetGenerator(
      const LLVMState &State,
      const SnippetGenerator::Options &Opts) const override {
    return std::make_unique<X86SerialSnippetGenerator>(State, Opts);
  }

  std::unique_ptr<SnippetGenerator> createParallelSnippetGenerator(
      const LLVMState &State,
      const SnippetGenerator::Options &Opts) const override {
    return std::make_unique<X86ParallelSnippetGenerator>(State, Opts);
  }

  bool matchesArch(Triple::ArchType Arch) const override {
    return Arch == Triple::x86_64 || Arch == Triple::x86;
  }

  Error checkFeatureSupport() const override {
    // LBR is the only feature we conditionally support now.
    // So if LBR is not requested, then we should be able to run the benchmarks.
    if (LbrSamplingPeriod == 0)
      return Error::success();

#if defined(__linux__) && defined(HAVE_LIBPFM) && \
    defined(LIBPFM_HAS_FIELD_CYCLES)
    // FIXME: Fix this.
    // https://bugs.llvm.org/show_bug.cgi?id=48918
    // For now, only do the check if we see an Intel machine because
    // the counter uses some Intel-specific magic and it could
    // get confused and think an AMD machine actually has LBR support.
#if defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) || \
    defined(_M_X64)
    using namespace sys::detail::x86;

    if (getVendorSignature() == VendorSignatures::GENUINE_INTEL)
      // If the kernel supports it, the hardware still may not have it.
      return X86LbrCounter::checkLbrSupport();
#else
    llvm_unreachable("Running X86 exegesis on non-X86 target");
#endif
#endif
    return llvm::make_error<llvm::StringError>(
        "LBR not supported on this kernel and/or platform",
        llvm::errc::not_supported);
  }

  std::unique_ptr<SavedState> withSavedState() const override {
    return std::make_unique<X86SavedState>();
  }

  static const unsigned kUnavailableRegisters[4];
};

// We disable a few registers that cannot be encoded on instructions with a REX
// prefix.
const unsigned ExegesisX86Target::kUnavailableRegisters[4] = {X86::AH, X86::BH,
                                                              X86::CH, X86::DH};

// We're using one of R8-R15 because these registers are never hardcoded in
// instructions (e.g. MOVS writes to EDI, ESI, EDX), so they have fewer
// conflicts.
constexpr const unsigned kLoopCounterReg = X86::R8;

} // namespace

void ExegesisX86Target::addTargetSpecificPasses(PassManagerBase &PM) const {
  // Lowers FP pseudo-instructions, e.g. ABS_Fp32 -> ABS_F.
  PM.add(createX86FloatingPointStackifierPass());
}

unsigned ExegesisX86Target::getScratchMemoryRegister(const Triple &TT) const {
  if (!TT.isArch64Bit()) {
    // FIXME: This would require popping from the stack, so we would have to
    // add some additional setup code.
    return 0;
  }
  return TT.isOSWindows() ? X86::RCX : X86::RDI;
}

unsigned ExegesisX86Target::getLoopCounterRegister(const Triple &TT) const {
  if (!TT.isArch64Bit()) {
    return 0;
  }
  return kLoopCounterReg;
}

Error ExegesisX86Target::randomizeTargetMCOperand(
    const Instruction &Instr, const Variable &Var, MCOperand &AssignedValue,
    const BitVector &ForbiddenRegs) const {
  const Operand &Op = Instr.getPrimaryOperand(Var);
  switch (Op.getExplicitOperandInfo().OperandType) {
  case X86::OperandType::OPERAND_ROUNDING_CONTROL:
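    // Pick a random static rounding mode; TO_ZERO is passed as the bound to
    // randomIndex, so (assuming the bound is inclusive) any of the four
    // embedded-rounding modes can be chosen.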
    AssignedValue =
        MCOperand::createImm(randomIndex(X86::STATIC_ROUNDING::TO_ZERO));
    return Error::success();
  default:
    break;
  }
  return make_error<Failure>(
      Twine("unimplemented operand type ")
          .concat(Twine(Op.getExplicitOperandInfo().OperandType)));
}

void ExegesisX86Target::fillMemoryOperands(InstructionTemplate &IT,
                                           unsigned Reg,
                                           unsigned Offset) const {
  assert(!isInvalidMemoryInstr(IT.getInstr()) &&
         "fillMemoryOperands requires a valid memory instruction");
  int MemOpIdx = X86II::getMemoryOperandNo(IT.getInstr().Description.TSFlags);
  assert(MemOpIdx >= 0 && "invalid memory operand index");
  // getMemoryOperandNo() ignores tied operands, so we have to add them back.
  MemOpIdx += X86II::getOperandBias(IT.getInstr().Description);
  setMemOp(IT, MemOpIdx + 0, MCOperand::createReg(Reg));    // BaseReg
  setMemOp(IT, MemOpIdx + 1, MCOperand::createImm(1));      // ScaleAmt
  setMemOp(IT, MemOpIdx + 2, MCOperand::createReg(0));      // IndexReg
  setMemOp(IT, MemOpIdx + 3, MCOperand::createImm(Offset)); // Disp
  setMemOp(IT, MemOpIdx + 4, MCOperand::createReg(0));      // Segment
}

void ExegesisX86Target::decrementLoopCounterAndJump(
    MachineBasicBlock &MBB, MachineBasicBlock &TargetMBB,
    const MCInstrInfo &MII) const {
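  // Decrement the loop counter by adding -1, then jump back to TargetMBB as
  // long as the result is non-zero (COND_NE).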
  BuildMI(&MBB, DebugLoc(), MII.get(X86::ADD64ri8))
      .addDef(kLoopCounterReg)
      .addUse(kLoopCounterReg)
      .addImm(-1);
  BuildMI(&MBB, DebugLoc(), MII.get(X86::JCC_1))
      .addMBB(&TargetMBB)
      .addImm(X86::COND_NE);
}

std::vector<MCInst> ExegesisX86Target::setRegTo(const MCSubtargetInfo &STI,
                                                unsigned Reg,
                                                const APInt &Value) const {
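  // GPRs can be set with a single move-immediate; other register classes are
  // loaded from a stack slot prepared by ConstantInliner.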
  if (X86::GR8RegClass.contains(Reg))
    return {loadImmediate(Reg, 8, Value)};
  if (X86::GR16RegClass.contains(Reg))
    return {loadImmediate(Reg, 16, Value)};
  if (X86::GR32RegClass.contains(Reg))
    return {loadImmediate(Reg, 32, Value)};
  if (X86::GR64RegClass.contains(Reg))
    return {loadImmediate(Reg, 64, Value)};
  ConstantInliner CI(Value);
  if (X86::VR64RegClass.contains(Reg))
    return CI.loadAndFinalize(Reg, 64, X86::MMX_MOVQ64rm);
  if (X86::VR128XRegClass.contains(Reg)) {
    if (STI.getFeatureBits()[X86::FeatureAVX512])
      return CI.loadAndFinalize(Reg, 128, X86::VMOVDQU32Z128rm);
    if (STI.getFeatureBits()[X86::FeatureAVX])
      return CI.loadAndFinalize(Reg, 128, X86::VMOVDQUrm);
    return CI.loadAndFinalize(Reg, 128, X86::MOVDQUrm);
  }
  if (X86::VR256XRegClass.contains(Reg)) {
    if (STI.getFeatureBits()[X86::FeatureAVX512])
      return CI.loadAndFinalize(Reg, 256, X86::VMOVDQU32Z256rm);
    if (STI.getFeatureBits()[X86::FeatureAVX])
      return CI.loadAndFinalize(Reg, 256, X86::VMOVDQUYrm);
  }
  if (X86::VR512RegClass.contains(Reg))
    if (STI.getFeatureBits()[X86::FeatureAVX512])
      return CI.loadAndFinalize(Reg, 512, X86::VMOVDQU32Zrm);
  if (X86::RSTRegClass.contains(Reg)) {
    return CI.loadX87STAndFinalize(Reg);
  }
  if (X86::RFP32RegClass.contains(Reg) || X86::RFP64RegClass.contains(Reg) ||
      X86::RFP80RegClass.contains(Reg)) {
    return CI.loadX87FPAndFinalize(Reg);
  }
  if (Reg == X86::EFLAGS)
    return CI.popFlagAndFinalize();
  if (Reg == X86::MXCSR)
    return CI.loadImplicitRegAndFinalize(
        STI.getFeatureBits()[X86::FeatureAVX] ? X86::VLDMXCSR : X86::LDMXCSR,
        0x1f80);
  if (Reg == X86::FPCW)
    return CI.loadImplicitRegAndFinalize(X86::FLDCW16m, 0x37f);
  return {}; // Not yet implemented.
}

// Instructions can have some variable operands, and we may want to see how
// different operands affect performance. So for each operand position,
// precompute all the possible choices we might care about,
// and greedily generate all the possible combinations of choices.
std::vector<InstructionTemplate> ExegesisX86Target::generateInstructionVariants(
    const Instruction &Instr, unsigned MaxConfigsPerOpcode) const {
  bool Exploration = false;
  SmallVector<SmallVector<MCOperand, 1>, 4> VariableChoices;
  VariableChoices.resize(Instr.Variables.size());
  for (auto I : llvm::zip(Instr.Variables, VariableChoices)) {
    const Variable &Var = std::get<0>(I);
    SmallVectorImpl<MCOperand> &Choices = std::get<1>(I);

    switch (Instr.getPrimaryOperand(Var).getExplicitOperandInfo().OperandType) {
    default:
      // We don't wish to explicitly explore this variable.
      Choices.emplace_back(); // But add invalid MCOperand to simplify logic.
      continue;
    case X86::OperandType::OPERAND_COND_CODE: {
      Exploration = true;
      auto CondCodes = enum_seq_inclusive(X86::CondCode::COND_O,
                                          X86::CondCode::LAST_VALID_COND,
                                          force_iteration_on_noniterable_enum);
      Choices.reserve(CondCodes.size());
      for (int CondCode : CondCodes)
        Choices.emplace_back(MCOperand::createImm(CondCode));
      break;
    }
    }
  }

  // If we don't wish to explore any variables, defer to the baseline method.
  if (!Exploration)
    return ExegesisTarget::generateInstructionVariants(Instr,
                                                       MaxConfigsPerOpcode);

  std::vector<InstructionTemplate> Variants;
  size_t NumVariants;
  CombinationGenerator<MCOperand, decltype(VariableChoices)::value_type, 4> G(
      VariableChoices);

  // How many operand combinations can we produce, within the limit?
  NumVariants = std::min(G.numCombinations(), (size_t)MaxConfigsPerOpcode);
  // And actually produce all the wanted operand combinations.
  Variants.reserve(NumVariants);
  G.generate([&](ArrayRef<MCOperand> State) -> bool {
    Variants.emplace_back(&Instr);
    Variants.back().setVariableValues(State);
    // Did we run out of space for variants?
    return Variants.size() >= NumVariants;
  });

  assert(Variants.size() == NumVariants &&
         Variants.size() <= MaxConfigsPerOpcode &&
         "Should not produce too many variants");
  return Variants;
}

static ExegesisTarget *getTheExegesisX86Target() {
  static ExegesisX86Target Target;
  return &Target;
}

void InitializeX86ExegesisTarget() {
  ExegesisTarget::registerTarget(getTheExegesisX86Target());
}

} // namespace exegesis
} // namespace llvm