//===-- Target.cpp ----------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "../Target.h"

#include "../Error.h"
#include "../ParallelSnippetGenerator.h"
#include "../SerialSnippetGenerator.h"
#include "../SnippetGenerator.h"
#include "MCTargetDesc/X86BaseInfo.h"
#include "MCTargetDesc/X86MCTargetDesc.h"
#include "X86.h"
#include "X86Counter.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/Support/Errc.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/Host.h"

#include <memory>
#include <string>
#include <vector>
#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
#include <immintrin.h>
#include <intrin.h>
#endif

namespace llvm {
namespace exegesis {

static cl::OptionCategory
    BenchmarkOptions("llvm-exegesis benchmark x86-options");

// If a positive value is specified, we are going to use the LBR in
// latency-mode.
//
// Note:
//  - A small value is preferred, but too low a value could result in
//    throttling.
//  - A prime number is preferred to avoid always skipping certain blocks.
//
static cl::opt<unsigned> LbrSamplingPeriod(
    "x86-lbr-sample-period",
    cl::desc("The sample period (nbranches/sample), used for LBR sampling"),
    cl::cat(BenchmarkOptions), cl::init(0));

// FIXME: Validate that repetition-mode is loop if LBR is requested.

// Returns a non-null reason if we cannot handle the memory references in this
// instruction.
static const char *isInvalidMemoryInstr(const Instruction &Instr) {
  switch (Instr.Description.TSFlags & X86II::FormMask) {
  default:
    return "Unknown FormMask value";
  // These have no memory access.
  case X86II::Pseudo:
  case X86II::RawFrm:
  case X86II::AddCCFrm:
  case X86II::PrefixByte:
  case X86II::MRMDestReg:
  case X86II::MRMSrcReg:
  case X86II::MRMSrcReg4VOp3:
  case X86II::MRMSrcRegOp4:
  case X86II::MRMSrcRegCC:
  case X86II::MRMXrCC:
  case X86II::MRMr0:
  case X86II::MRMXr:
  case X86II::MRM0r:
  case X86II::MRM1r:
  case X86II::MRM2r:
  case X86II::MRM3r:
  case X86II::MRM4r:
  case X86II::MRM5r:
  case X86II::MRM6r:
  case X86II::MRM7r:
  case X86II::MRM0X:
  case X86II::MRM1X:
  case X86II::MRM2X:
  case X86II::MRM3X:
  case X86II::MRM4X:
  case X86II::MRM5X:
  case X86II::MRM6X:
  case X86II::MRM7X:
  case X86II::MRM_C0:
  case X86II::MRM_C1:
  case X86II::MRM_C2:
  case X86II::MRM_C3:
  case X86II::MRM_C4:
  case X86II::MRM_C5:
  case X86II::MRM_C6:
  case X86II::MRM_C7:
  case X86II::MRM_C8:
  case X86II::MRM_C9:
  case X86II::MRM_CA:
  case X86II::MRM_CB:
  case X86II::MRM_CC:
  case X86II::MRM_CD:
  case X86II::MRM_CE:
  case X86II::MRM_CF:
  case X86II::MRM_D0:
  case X86II::MRM_D1:
  case X86II::MRM_D2:
  case X86II::MRM_D3:
  case X86II::MRM_D4:
  case X86II::MRM_D5:
  case X86II::MRM_D6:
  case X86II::MRM_D7:
  case X86II::MRM_D8:
  case X86II::MRM_D9:
  case X86II::MRM_DA:
  case X86II::MRM_DB:
  case X86II::MRM_DC:
  case X86II::MRM_DD:
  case X86II::MRM_DE:
  case X86II::MRM_DF:
  case X86II::MRM_E0:
  case X86II::MRM_E1:
  case X86II::MRM_E2:
  case X86II::MRM_E3:
  case X86II::MRM_E4:
  case X86II::MRM_E5:
  case X86II::MRM_E6:
  case X86II::MRM_E7:
  case X86II::MRM_E8:
  case X86II::MRM_E9:
  case X86II::MRM_EA:
  case X86II::MRM_EB:
  case X86II::MRM_EC:
  case X86II::MRM_ED:
  case X86II::MRM_EE:
  case X86II::MRM_EF:
  case X86II::MRM_F0:
  case X86II::MRM_F1:
  case X86II::MRM_F2:
  case X86II::MRM_F3:
  case X86II::MRM_F4:
  case X86II::MRM_F5:
  case X86II::MRM_F6:
  case X86II::MRM_F7:
  case X86II::MRM_F8:
  case X86II::MRM_F9:
  case X86II::MRM_FA:
  case X86II::MRM_FB:
  case X86II::MRM_FC:
  case X86II::MRM_FD:
  case X86II::MRM_FE:
  case X86II::MRM_FF:
  case X86II::RawFrmImm8:
    return nullptr;
  case X86II::AddRegFrm:
    return (Instr.Description.Opcode == X86::POP16r ||
            Instr.Description.Opcode == X86::POP32r ||
            Instr.Description.Opcode == X86::PUSH16r ||
            Instr.Description.Opcode == X86::PUSH32r)
               ? "unsupported opcode: unsupported memory access"
               : nullptr;
  // These access memory and are handled.
  case X86II::MRMDestMem:
  case X86II::MRMSrcMem:
  case X86II::MRMSrcMem4VOp3:
  case X86II::MRMSrcMemOp4:
  case X86II::MRMSrcMemCC:
  case X86II::MRMXmCC:
  case X86II::MRMXm:
  case X86II::MRM0m:
  case X86II::MRM1m:
  case X86II::MRM2m:
  case X86II::MRM3m:
  case X86II::MRM4m:
  case X86II::MRM5m:
  case X86II::MRM6m:
  case X86II::MRM7m:
    return nullptr;
  // These access memory and are not handled yet.
  case X86II::RawFrmImm16:
  case X86II::RawFrmMemOffs:
  case X86II::RawFrmSrc:
  case X86II::RawFrmDst:
  case X86II::RawFrmDstSrc:
    return "unsupported opcode: non uniform memory access";
  }
}

// If the opcode is invalid, returns a pointer to a character literal
// indicating the reason. nullptr indicates a valid opcode.
static const char *isInvalidOpcode(const Instruction &Instr) {
  const auto OpcodeName = Instr.Name;
  if ((Instr.Description.TSFlags & X86II::FormMask) == X86II::Pseudo)
    return "unsupported opcode: pseudo instruction";
  if ((OpcodeName.startswith("POP") && !OpcodeName.startswith("POPCNT")) ||
      OpcodeName.startswith("PUSH") || OpcodeName.startswith("ADJCALLSTACK") ||
      OpcodeName.startswith("LEAVE"))
    return "unsupported opcode: Push/Pop/AdjCallStack/Leave";
  switch (Instr.Description.Opcode) {
  case X86::LFS16rm:
  case X86::LFS32rm:
  case X86::LFS64rm:
  case X86::LGS16rm:
  case X86::LGS32rm:
  case X86::LGS64rm:
  case X86::LSS16rm:
  case X86::LSS32rm:
  case X86::LSS64rm:
  case X86::SYSENTER:
    return "unsupported opcode";
  default:
    break;
  }
  if (const auto reason = isInvalidMemoryInstr(Instr))
    return reason;
  // We do not handle instructions with OPERAND_PCREL.
  for (const Operand &Op : Instr.Operands)
    if (Op.isExplicit() &&
        Op.getExplicitOperandInfo().OperandType == MCOI::OPERAND_PCREL)
      return "unsupported opcode: PC relative operand";
  // We do not handle second-form X87 instructions. We only handle first-form
  // ones (_Fp), see comment in X86InstrFPStack.td.
  for (const Operand &Op : Instr.Operands)
    if (Op.isReg() && Op.isExplicit() &&
        Op.getExplicitOperandInfo().RegClass == X86::RSTRegClassID)
      return "unsupported second-form X87 instruction";
  return nullptr;
}

static unsigned getX86FPFlags(const Instruction &Instr) {
  return Instr.Description.TSFlags & X86II::FPTypeMask;
}

// Helper to fill a memory operand with a value.
static void setMemOp(InstructionTemplate &IT, int OpIdx,
                     const MCOperand &OpVal) {
  const auto Op = IT.getInstr().Operands[OpIdx];
  assert(Op.isExplicit() && "invalid memory pattern");
  IT.getValueFor(Op) = OpVal;
}

// Common (latency, uops) code for LEA templates. `RestrictDestRegs` is given
// the addressing base and index registers and restricts the set of candidate
// LEA destination registers.
static Expected<std::vector<CodeTemplate>> generateLEATemplatesCommon(
    const Instruction &Instr, const BitVector &ForbiddenRegisters,
    const LLVMState &State, const SnippetGenerator::Options &Opts,
    std::function<void(unsigned, unsigned, BitVector &CandidateDestRegs)>
        RestrictDestRegs) {
  assert(Instr.Operands.size() == 6 && "invalid LEA");
  assert(X86II::getMemoryOperandNo(Instr.Description.TSFlags) == 1 &&
         "invalid LEA");

  constexpr const int kDestOp = 0;
  constexpr const int kBaseOp = 1;
  constexpr const int kIndexOp = 3;
  auto PossibleDestRegs =
      Instr.Operands[kDestOp].getRegisterAliasing().sourceBits();
  remove(PossibleDestRegs, ForbiddenRegisters);
  auto PossibleBaseRegs =
      Instr.Operands[kBaseOp].getRegisterAliasing().sourceBits();
  remove(PossibleBaseRegs, ForbiddenRegisters);
  auto PossibleIndexRegs =
      Instr.Operands[kIndexOp].getRegisterAliasing().sourceBits();
  remove(PossibleIndexRegs, ForbiddenRegisters);

  const auto &RegInfo = State.getRegInfo();
  std::vector<CodeTemplate> Result;
  for (const unsigned BaseReg : PossibleBaseRegs.set_bits()) {
    for (const unsigned IndexReg : PossibleIndexRegs.set_bits()) {
      for (int LogScale = 0; LogScale <= 3; ++LogScale) {
        // FIXME: Add an option for controlling how we explore immediates.
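        // For now, probe only a zero and an arbitrary non-zero displacement.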
        for (const int Disp : {0, 42}) {
          InstructionTemplate IT(&Instr);
          const int64_t Scale = 1ull << LogScale;
          setMemOp(IT, 1, MCOperand::createReg(BaseReg));
          setMemOp(IT, 2, MCOperand::createImm(Scale));
          setMemOp(IT, 3, MCOperand::createReg(IndexReg));
          setMemOp(IT, 4, MCOperand::createImm(Disp));
          // SegmentReg must be 0 for LEA.
          setMemOp(IT, 5, MCOperand::createReg(0));

          // Output reg candidates are selected by the caller.
          auto PossibleDestRegsNow = PossibleDestRegs;
          RestrictDestRegs(BaseReg, IndexReg, PossibleDestRegsNow);
          assert(PossibleDestRegsNow.set_bits().begin() !=
                     PossibleDestRegsNow.set_bits().end() &&
                 "no remaining registers");
          setMemOp(
              IT, 0,
              MCOperand::createReg(*PossibleDestRegsNow.set_bits().begin()));

          CodeTemplate CT;
          CT.Instructions.push_back(std::move(IT));
          CT.Config = formatv("{3}(%{0}, %{1}, {2})", RegInfo.getName(BaseReg),
                              RegInfo.getName(IndexReg), Scale, Disp)
                          .str();
          Result.push_back(std::move(CT));
          if (Result.size() >= Opts.MaxConfigsPerOpcode)
            return std::move(Result);
        }
      }
    }
  }

  return std::move(Result);
}

namespace {
class X86SerialSnippetGenerator : public SerialSnippetGenerator {
public:
  using SerialSnippetGenerator::SerialSnippetGenerator;

  Expected<std::vector<CodeTemplate>>
  generateCodeTemplates(InstructionTemplate Variant,
                        const BitVector &ForbiddenRegisters) const override;
};
} // namespace

Expected<std::vector<CodeTemplate>>
X86SerialSnippetGenerator::generateCodeTemplates(
    InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const {
  const Instruction &Instr = Variant.getInstr();

  if (const auto reason = isInvalidOpcode(Instr))
    return make_error<Failure>(reason);

  // LEA gets special attention.
  const auto Opcode = Instr.Description.getOpcode();
  if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r) {
    return generateLEATemplatesCommon(
        Instr, ForbiddenRegisters, State, Opts,
        [this](unsigned BaseReg, unsigned IndexReg,
               BitVector &CandidateDestRegs) {
          // We just select a destination register that aliases the base
          // register.
          CandidateDestRegs &=
              State.getRATC().getRegister(BaseReg).aliasedBits();
        });
  }

  if (Instr.hasMemoryOperands())
    return make_error<Failure>(
        "unsupported memory operand in latency measurements");

  switch (getX86FPFlags(Instr)) {
  case X86II::NotFP:
    return SerialSnippetGenerator::generateCodeTemplates(Variant,
                                                         ForbiddenRegisters);
  case X86II::ZeroArgFP:
  case X86II::OneArgFP:
  case X86II::SpecialFP:
  case X86II::CompareFP:
  case X86II::CondMovFP:
    return make_error<Failure>("Unsupported x87 Instruction");
  case X86II::OneArgFPRW:
  case X86II::TwoArgFP:
    // These are instructions like
    // - `ST(0) = fsqrt(ST(0))` (OneArgFPRW)
    // - `ST(0) = ST(0) + ST(i)` (TwoArgFP)
    // They are intrinsically serial and do not modify the state of the stack.
    return generateSelfAliasingCodeTemplates(Variant);
  default:
    llvm_unreachable("Unknown FP Type!");
  }
}

namespace {
class X86ParallelSnippetGenerator : public ParallelSnippetGenerator {
public:
  using ParallelSnippetGenerator::ParallelSnippetGenerator;

  Expected<std::vector<CodeTemplate>>
  generateCodeTemplates(InstructionTemplate Variant,
                        const BitVector &ForbiddenRegisters) const override;
};

} // namespace

Expected<std::vector<CodeTemplate>>
X86ParallelSnippetGenerator::generateCodeTemplates(
    InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const {
  const Instruction &Instr = Variant.getInstr();

  if (const auto reason = isInvalidOpcode(Instr))
    return make_error<Failure>(reason);

  // LEA gets special attention.
  const auto Opcode = Instr.Description.getOpcode();
  if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r) {
    return generateLEATemplatesCommon(
        Instr, ForbiddenRegisters, State, Opts,
        [this](unsigned BaseReg, unsigned IndexReg,
               BitVector &CandidateDestRegs) {
          // Any destination register that is not used for addressing is fine.
          remove(CandidateDestRegs,
                 State.getRATC().getRegister(BaseReg).aliasedBits());
          remove(CandidateDestRegs,
                 State.getRATC().getRegister(IndexReg).aliasedBits());
        });
  }

  switch (getX86FPFlags(Instr)) {
  case X86II::NotFP:
    return ParallelSnippetGenerator::generateCodeTemplates(Variant,
                                                           ForbiddenRegisters);
  case X86II::ZeroArgFP:
  case X86II::OneArgFP:
  case X86II::SpecialFP:
    return make_error<Failure>("Unsupported x87 Instruction");
  case X86II::OneArgFPRW:
  case X86II::TwoArgFP:
    // These are instructions like
    // - `ST(0) = fsqrt(ST(0))` (OneArgFPRW)
    // - `ST(0) = ST(0) + ST(i)` (TwoArgFP)
    // They are intrinsically serial and do not modify the state of the stack.
    // We generate the same code for latency and uops.
    return generateSelfAliasingCodeTemplates(Variant);
  case X86II::CompareFP:
  case X86II::CondMovFP:
    // We can compute uops for any FP instruction that does not grow or shrink
    // the stack (it either does not touch the stack or pushes as much as it
    // pops).
    return generateUnconstrainedCodeTemplates(
        Variant, "instruction does not grow/shrink the FP stack");
  default:
    llvm_unreachable("Unknown FP Type!");
  }
}

static unsigned getLoadImmediateOpcode(unsigned RegBitWidth) {
  switch (RegBitWidth) {
  case 8:
    return X86::MOV8ri;
  case 16:
    return X86::MOV16ri;
  case 32:
    return X86::MOV32ri;
  case 64:
    return X86::MOV64ri;
  }
  llvm_unreachable("Invalid Value Width");
}

// Generates an instruction to load an immediate value into a register.
static MCInst loadImmediate(unsigned Reg, unsigned RegBitWidth,
                            const APInt &Value) {
  if (Value.getBitWidth() > RegBitWidth)
    llvm_unreachable("Value must fit in the Register");
  return MCInstBuilder(getLoadImmediateOpcode(RegBitWidth))
      .addReg(Reg)
      .addImm(Value.getZExtValue());
}

// Allocates scratch memory on the stack.
static MCInst allocateStackSpace(unsigned Bytes) {
  return MCInstBuilder(X86::SUB64ri8)
      .addReg(X86::RSP)
      .addReg(X86::RSP)
      .addImm(Bytes);
}

// Fills scratch memory at offset `OffsetBytes` with value `Imm`.
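// `MovOpcode` selects the store width, e.g. X86::MOV32mi writes 4 bytes.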
static MCInst fillStackSpace(unsigned MovOpcode, unsigned OffsetBytes,
                             uint64_t Imm) {
  return MCInstBuilder(MovOpcode)
      // Address = RSP
      .addReg(X86::RSP)    // BaseReg
      .addImm(1)           // ScaleAmt
      .addReg(0)           // IndexReg
      .addImm(OffsetBytes) // Disp
      .addReg(0)           // Segment
      // Immediate.
      .addImm(Imm);
}

// Loads scratch memory into register `Reg` using opcode `RMOpcode`.
static MCInst loadToReg(unsigned Reg, unsigned RMOpcode) {
  return MCInstBuilder(RMOpcode)
      .addReg(Reg)
      // Address = RSP
      .addReg(X86::RSP) // BaseReg
      .addImm(1)        // ScaleAmt
      .addReg(0)        // IndexReg
      .addImm(0)        // Disp
      .addReg(0);       // Segment
}

// Releases scratch memory.
static MCInst releaseStackSpace(unsigned Bytes) {
  return MCInstBuilder(X86::ADD64ri8)
      .addReg(X86::RSP)
      .addReg(X86::RSP)
      .addImm(Bytes);
}

// Reserves some space on the stack, fills it with the content of the provided
// constant, and provides methods to load the stack value into a register.
namespace {
struct ConstantInliner {
  explicit ConstantInliner(const APInt &Constant) : Constant_(Constant) {}

  std::vector<MCInst> loadAndFinalize(unsigned Reg, unsigned RegBitWidth,
                                      unsigned Opcode);

  std::vector<MCInst> loadX87STAndFinalize(unsigned Reg);

  std::vector<MCInst> loadX87FPAndFinalize(unsigned Reg);

  std::vector<MCInst> popFlagAndFinalize();

  std::vector<MCInst> loadImplicitRegAndFinalize(unsigned Opcode,
                                                 unsigned Value);

private:
  ConstantInliner &add(const MCInst &Inst) {
    Instructions.push_back(Inst);
    return *this;
  }

  void initStack(unsigned Bytes);

  static constexpr const unsigned kF80Bytes = 10; // 80 bits.
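  // The constant to materialize and the setup instructions generated so far.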
  APInt Constant_;
  std::vector<MCInst> Instructions;
};
} // namespace

std::vector<MCInst> ConstantInliner::loadAndFinalize(unsigned Reg,
                                                     unsigned RegBitWidth,
                                                     unsigned Opcode) {
  assert((RegBitWidth & 7) == 0 && "RegBitWidth must be a multiple of 8 bits");
  initStack(RegBitWidth / 8);
  add(loadToReg(Reg, Opcode));
  add(releaseStackSpace(RegBitWidth / 8));
  return std::move(Instructions);
}

std::vector<MCInst> ConstantInliner::loadX87STAndFinalize(unsigned Reg) {
  initStack(kF80Bytes);
  add(MCInstBuilder(X86::LD_F80m)
          // Address = RSP
          .addReg(X86::RSP) // BaseReg
          .addImm(1)        // ScaleAmt
          .addReg(0)        // IndexReg
          .addImm(0)        // Disp
          .addReg(0));      // Segment
  if (Reg != X86::ST0)
    add(MCInstBuilder(X86::ST_Frr).addReg(Reg));
  add(releaseStackSpace(kF80Bytes));
  return std::move(Instructions);
}

std::vector<MCInst> ConstantInliner::loadX87FPAndFinalize(unsigned Reg) {
  initStack(kF80Bytes);
  add(MCInstBuilder(X86::LD_Fp80m)
          .addReg(Reg)
          // Address = RSP
          .addReg(X86::RSP) // BaseReg
          .addImm(1)        // ScaleAmt
          .addReg(0)        // IndexReg
          .addImm(0)        // Disp
          .addReg(0));      // Segment
  add(releaseStackSpace(kF80Bytes));
  return std::move(Instructions);
}

std::vector<MCInst> ConstantInliner::popFlagAndFinalize() {
  initStack(8);
  add(MCInstBuilder(X86::POPF64));
  return std::move(Instructions);
}

std::vector<MCInst>
ConstantInliner::loadImplicitRegAndFinalize(unsigned Opcode, unsigned Value) {
  add(allocateStackSpace(4));
  add(fillStackSpace(X86::MOV32mi, 0, Value)); // Mask all FP exceptions
  add(MCInstBuilder(Opcode)
          // Address = RSP
          .addReg(X86::RSP) // BaseReg
          .addImm(1)        // ScaleAmt
          .addReg(0)        // IndexReg
          .addImm(0)        // Disp
          .addReg(0));      // Segment
  add(releaseStackSpace(4));
  return std::move(Instructions);
}

void ConstantInliner::initStack(unsigned Bytes) {
  assert(Constant_.getBitWidth() <= Bytes * 8 &&
         "Value does not have the correct size");
  const APInt WideConstant = Constant_.getBitWidth() < Bytes * 8
                                 ? Constant_.sext(Bytes * 8)
                                 : Constant_;
  add(allocateStackSpace(Bytes));
  size_t ByteOffset = 0;
  for (; Bytes - ByteOffset >= 4; ByteOffset += 4)
    add(fillStackSpace(
        X86::MOV32mi, ByteOffset,
        WideConstant.extractBits(32, ByteOffset * 8).getZExtValue()));
  if (Bytes - ByteOffset >= 2) {
    add(fillStackSpace(
        X86::MOV16mi, ByteOffset,
        WideConstant.extractBits(16, ByteOffset * 8).getZExtValue()));
    ByteOffset += 2;
  }
  if (Bytes - ByteOffset >= 1)
    add(fillStackSpace(
        X86::MOV8mi, ByteOffset,
        WideConstant.extractBits(8, ByteOffset * 8).getZExtValue()));
}

#include "X86GenExegesis.inc"

namespace {

class X86SavedState : public ExegesisTarget::SavedState {
public:
  X86SavedState() {
#ifdef __x86_64__
# if defined(_MSC_VER)
    _fxsave64(FPState);
    Eflags = __readeflags();
# elif defined(__GNUC__)
    __builtin_ia32_fxsave64(FPState);
    Eflags = __builtin_ia32_readeflags_u64();
# endif
#else
    llvm_unreachable("X86 exegesis running on non-X86 target");
#endif
  }

  ~X86SavedState() {
    // Restoring the X87 state does not flush pending exceptions, so make sure
    // these exceptions are flushed now.
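    // (This is what the `fwait` / `_clearfp` calls below do.)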
#ifdef __x86_64__
# if defined(_MSC_VER)
    _clearfp();
    _fxrstor64(FPState);
    __writeeflags(Eflags);
# elif defined(__GNUC__)
    asm volatile("fwait");
    __builtin_ia32_fxrstor64(FPState);
    __builtin_ia32_writeeflags_u64(Eflags);
# endif
#else
    llvm_unreachable("X86 exegesis running on non-X86 target");
#endif
  }

private:
#ifdef __x86_64__
  alignas(16) char FPState[512];
  uint64_t Eflags;
#endif
};

class ExegesisX86Target : public ExegesisTarget {
public:
  ExegesisX86Target() : ExegesisTarget(X86CpuPfmCounters) {}

  Expected<std::unique_ptr<pfm::Counter>>
  createCounter(StringRef CounterName, const LLVMState &State) const override {
    // If LbrSamplingPeriod was provided, then ignore the
    // CounterName because we only have one for LBR.
    if (LbrSamplingPeriod > 0) {
      // Can't use LBR without HAVE_LIBPFM, LIBPFM_HAS_FIELD_CYCLES, or
      // __linux__ (for now).
#if defined(HAVE_LIBPFM) && defined(LIBPFM_HAS_FIELD_CYCLES) &&               \
    defined(__linux__)
      return std::make_unique<X86LbrCounter>(
          X86LbrPerfEvent(LbrSamplingPeriod));
#else
      return llvm::make_error<llvm::StringError>(
          "LBR counter requested without HAVE_LIBPFM, LIBPFM_HAS_FIELD_CYCLES, "
          "or running on Linux.",
          llvm::errc::invalid_argument);
#endif
    }
    return ExegesisTarget::createCounter(CounterName, State);
  }

private:
  void addTargetSpecificPasses(PassManagerBase &PM) const override;

  unsigned getScratchMemoryRegister(const Triple &TT) const override;

  unsigned getLoopCounterRegister(const Triple &) const override;

  unsigned getMaxMemoryAccessSize() const override { return 64; }

  Error randomizeTargetMCOperand(const Instruction &Instr, const Variable &Var,
                                 MCOperand &AssignedValue,
                                 const BitVector &ForbiddenRegs) const override;

  void fillMemoryOperands(InstructionTemplate &IT, unsigned Reg,
                          unsigned Offset) const override;

  void decrementLoopCounterAndJump(MachineBasicBlock &MBB,
                                   MachineBasicBlock &TargetMBB,
                                   const MCInstrInfo &MII) const override;

  std::vector<MCInst> setRegTo(const MCSubtargetInfo &STI, unsigned Reg,
                               const APInt &Value) const override;

  ArrayRef<unsigned> getUnavailableRegisters() const override {
    return makeArrayRef(kUnavailableRegisters,
                        sizeof(kUnavailableRegisters) /
                            sizeof(kUnavailableRegisters[0]));
  }

  bool allowAsBackToBack(const Instruction &Instr) const override {
    const unsigned Opcode = Instr.Description.Opcode;
    return !isInvalidOpcode(Instr) && Opcode != X86::LEA64r &&
           Opcode != X86::LEA64_32r && Opcode != X86::LEA16r;
  }

  std::vector<InstructionTemplate>
  generateInstructionVariants(const Instruction &Instr,
                              unsigned MaxConfigsPerOpcode) const override;

  std::unique_ptr<SnippetGenerator> createSerialSnippetGenerator(
      const LLVMState &State,
      const SnippetGenerator::Options &Opts) const override {
    return std::make_unique<X86SerialSnippetGenerator>(State, Opts);
  }

  std::unique_ptr<SnippetGenerator> createParallelSnippetGenerator(
      const LLVMState &State,
      const SnippetGenerator::Options &Opts) const override {
    return std::make_unique<X86ParallelSnippetGenerator>(State, Opts);
  }

  bool matchesArch(Triple::ArchType Arch) const override {
    return Arch == Triple::x86_64 || Arch == Triple::x86;
  }

  Error checkFeatureSupport() const override {
    // LBR is the only feature we conditionally support now.
    // So if LBR is not requested, then we should be able to run the
    // benchmarks.
    if (LbrSamplingPeriod == 0)
      return Error::success();

#if defined(__linux__) && defined(HAVE_LIBPFM) &&                             \
    defined(LIBPFM_HAS_FIELD_CYCLES)
    // FIXME: Fix this.
    // https://bugs.llvm.org/show_bug.cgi?id=48918
    // For now, only do the check if we see an Intel machine because the
    // counter uses some Intel-specific magic and it could be confused and
    // think an AMD machine actually has LBR support.
#if defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) ||           \
    defined(_M_X64)
    using namespace sys::detail::x86;

    if (getVendorSignature() == VendorSignatures::GENUINE_INTEL)
      // If the kernel supports it, the hardware still may not have it.
      return X86LbrCounter::checkLbrSupport();
#else
    llvm_unreachable("Running X86 exegesis on non-X86 target");
#endif
#endif
    return llvm::make_error<llvm::StringError>(
        "LBR not supported on this kernel and/or platform",
        llvm::errc::not_supported);
  }

  std::unique_ptr<SavedState> withSavedState() const override {
    return std::make_unique<X86SavedState>();
  }

  static const unsigned kUnavailableRegisters[4];
};

// We disable a few registers that cannot be encoded on instructions with a
// REX prefix.
const unsigned ExegesisX86Target::kUnavailableRegisters[4] = {X86::AH, X86::BH,
                                                              X86::CH, X86::DH};

// We're using one of R8-R15 because these registers are never hardcoded in
// instructions (e.g. MOVS writes to EDI, ESI), so they have fewer conflicts.
constexpr const unsigned kLoopCounterReg = X86::R8;

} // namespace

void ExegesisX86Target::addTargetSpecificPasses(PassManagerBase &PM) const {
  // Lowers FP pseudo-instructions, e.g. ABS_Fp32 -> ABS_F.
  PM.add(createX86FloatingPointStackifierPass());
}

unsigned ExegesisX86Target::getScratchMemoryRegister(const Triple &TT) const {
  if (!TT.isArch64Bit()) {
    // FIXME: This would require popping from the stack, so we would have to
    // add some additional setup code.
    return 0;
  }
  return TT.isOSWindows() ? X86::RCX : X86::RDI;
}

unsigned ExegesisX86Target::getLoopCounterRegister(const Triple &TT) const {
  if (!TT.isArch64Bit()) {
    return 0;
  }
  return kLoopCounterReg;
}

Error ExegesisX86Target::randomizeTargetMCOperand(
    const Instruction &Instr, const Variable &Var, MCOperand &AssignedValue,
    const BitVector &ForbiddenRegs) const {
  const Operand &Op = Instr.getPrimaryOperand(Var);
  switch (Op.getExplicitOperandInfo().OperandType) {
  case X86::OperandType::OPERAND_ROUNDING_CONTROL:
    AssignedValue =
        MCOperand::createImm(randomIndex(X86::STATIC_ROUNDING::TO_ZERO));
    return Error::success();
  default:
    break;
  }
  return make_error<Failure>(
      Twine("unimplemented operand type ")
          .concat(Twine(Op.getExplicitOperandInfo().OperandType)));
}

void ExegesisX86Target::fillMemoryOperands(InstructionTemplate &IT,
                                           unsigned Reg,
                                           unsigned Offset) const {
  assert(!isInvalidMemoryInstr(IT.getInstr()) &&
         "fillMemoryOperands requires a valid memory instruction");
  int MemOpIdx = X86II::getMemoryOperandNo(IT.getInstr().Description.TSFlags);
  assert(MemOpIdx >= 0 && "invalid memory operand index");
  // getMemoryOperandNo() ignores tied operands, so we have to add them back.
  MemOpIdx += X86II::getOperandBias(IT.getInstr().Description);
  setMemOp(IT, MemOpIdx + 0, MCOperand::createReg(Reg));    // BaseReg
  setMemOp(IT, MemOpIdx + 1, MCOperand::createImm(1));      // ScaleAmt
  setMemOp(IT, MemOpIdx + 2, MCOperand::createReg(0));      // IndexReg
  setMemOp(IT, MemOpIdx + 3, MCOperand::createImm(Offset)); // Disp
  setMemOp(IT, MemOpIdx + 4, MCOperand::createReg(0));      // Segment
}

void ExegesisX86Target::decrementLoopCounterAndJump(
    MachineBasicBlock &MBB, MachineBasicBlock &TargetMBB,
    const MCInstrInfo &MII) const {
  BuildMI(&MBB, DebugLoc(), MII.get(X86::ADD64ri8))
      .addDef(kLoopCounterReg)
      .addUse(kLoopCounterReg)
      .addImm(-1);
  BuildMI(&MBB, DebugLoc(), MII.get(X86::JCC_1))
      .addMBB(&TargetMBB)
      .addImm(X86::COND_NE);
}

std::vector<MCInst> ExegesisX86Target::setRegTo(const MCSubtargetInfo &STI,
                                                unsigned Reg,
                                                const APInt &Value) const {
  if (X86::GR8RegClass.contains(Reg))
    return {loadImmediate(Reg, 8, Value)};
  if (X86::GR16RegClass.contains(Reg))
    return {loadImmediate(Reg, 16, Value)};
  if (X86::GR32RegClass.contains(Reg))
    return {loadImmediate(Reg, 32, Value)};
  if (X86::GR64RegClass.contains(Reg))
    return {loadImmediate(Reg, 64, Value)};
  ConstantInliner CI(Value);
  if (X86::VR64RegClass.contains(Reg))
    return CI.loadAndFinalize(Reg, 64, X86::MMX_MOVQ64rm);
  if (X86::VR128XRegClass.contains(Reg)) {
    if (STI.getFeatureBits()[X86::FeatureAVX512])
      return CI.loadAndFinalize(Reg, 128, X86::VMOVDQU32Z128rm);
    if (STI.getFeatureBits()[X86::FeatureAVX])
      return CI.loadAndFinalize(Reg, 128, X86::VMOVDQUrm);
    return CI.loadAndFinalize(Reg, 128, X86::MOVDQUrm);
  }
  if (X86::VR256XRegClass.contains(Reg)) {
    if (STI.getFeatureBits()[X86::FeatureAVX512])
      return CI.loadAndFinalize(Reg, 256, X86::VMOVDQU32Z256rm);
    if (STI.getFeatureBits()[X86::FeatureAVX])
      return CI.loadAndFinalize(Reg, 256, X86::VMOVDQUYrm);
  }
  if (X86::VR512RegClass.contains(Reg))
    if (STI.getFeatureBits()[X86::FeatureAVX512])
      return CI.loadAndFinalize(Reg, 512, X86::VMOVDQU32Zrm);
  if (X86::RSTRegClass.contains(Reg)) {
    return CI.loadX87STAndFinalize(Reg);
  }
  if (X86::RFP32RegClass.contains(Reg) || X86::RFP64RegClass.contains(Reg) ||
      X86::RFP80RegClass.contains(Reg)) {
    return CI.loadX87FPAndFinalize(Reg);
  }
  if (Reg == X86::EFLAGS)
    return CI.popFlagAndFinalize();
  if (Reg == X86::MXCSR)
    return CI.loadImplicitRegAndFinalize(
        STI.getFeatureBits()[X86::FeatureAVX] ? X86::VLDMXCSR : X86::LDMXCSR,
        0x1f80);
  if (Reg == X86::FPCW)
    return CI.loadImplicitRegAndFinalize(X86::FLDCW16m, 0x37f);
  return {}; // Not yet implemented.
}

// Instructions can have some variable operands, and we may want to see how
// different operands affect performance. So for each operand position,
// precompute all the possible choices we might care about,
// and greedily generate all the possible combinations of choices.
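// At the moment, only condition-code operands are explored this way.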
std::vector<InstructionTemplate> ExegesisX86Target::generateInstructionVariants(
    const Instruction &Instr, unsigned MaxConfigsPerOpcode) const {
  bool Exploration = false;
  SmallVector<SmallVector<MCOperand, 1>, 4> VariableChoices;
  VariableChoices.resize(Instr.Variables.size());
  for (auto I : llvm::zip(Instr.Variables, VariableChoices)) {
    const Variable &Var = std::get<0>(I);
    SmallVectorImpl<MCOperand> &Choices = std::get<1>(I);

    switch (Instr.getPrimaryOperand(Var).getExplicitOperandInfo().OperandType) {
    default:
      // We don't wish to explicitly explore this variable.
      Choices.emplace_back(); // But add invalid MCOperand to simplify logic.
      continue;
    case X86::OperandType::OPERAND_COND_CODE: {
      Exploration = true;
      auto CondCodes = enum_seq_inclusive(X86::CondCode::COND_O,
                                          X86::CondCode::LAST_VALID_COND,
                                          force_iteration_on_noniterable_enum);
      Choices.reserve(CondCodes.size());
      for (int CondCode : CondCodes)
        Choices.emplace_back(MCOperand::createImm(CondCode));
      break;
    }
    }
  }

  // If we don't wish to explore any variables, defer to the baseline method.
  if (!Exploration)
    return ExegesisTarget::generateInstructionVariants(Instr,
                                                       MaxConfigsPerOpcode);

  std::vector<InstructionTemplate> Variants;
  size_t NumVariants;
  CombinationGenerator<MCOperand, decltype(VariableChoices)::value_type, 4> G(
      VariableChoices);

  // How many operand combinations can we produce, within the limit?
  NumVariants = std::min(G.numCombinations(), (size_t)MaxConfigsPerOpcode);
  // And actually produce all the wanted operand combinations.
  Variants.reserve(NumVariants);
  G.generate([&](ArrayRef<MCOperand> State) -> bool {
    Variants.emplace_back(&Instr);
    Variants.back().setVariableValues(State);
    // Did we run out of space for variants?
    return Variants.size() >= NumVariants;
  });

  assert(Variants.size() == NumVariants &&
         Variants.size() <= MaxConfigsPerOpcode &&
         "Should not produce too many variants");
  return Variants;
}

static ExegesisTarget *getTheExegesisX86Target() {
  static ExegesisX86Target Target;
  return &Target;
}

void InitializeX86ExegesisTarget() {
  ExegesisTarget::registerTarget(getTheExegesisX86Target());
}

} // namespace exegesis
} // namespace llvm