//===-- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions --===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file This pass tries to apply several peephole SDWA patterns.
///
/// E.g. original:
///   V_LSHRREV_B32_e32 %vreg0, 16, %vreg1
///   V_ADD_I32_e32 %vreg2, %vreg0, %vreg3
///   V_LSHLREV_B32_e32 %vreg4, 16, %vreg2
///
/// Replace:
///   V_ADD_I32_sdwa %vreg4, %vreg1, %vreg3
///       dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
///
//===----------------------------------------------------------------------===//


#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include <unordered_map>

using namespace llvm;

#define DEBUG_TYPE "si-peephole-sdwa"

STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
STATISTIC(NumSDWAInstructionsPeepholed,
          "Number of instruction converted to SDWA.");

namespace {

class SDWAOperand;

class SIPeepholeSDWA : public MachineFunctionPass {
private:
  MachineRegisterInfo *MRI;
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  // Maps the matched shift/bfe/and instruction to the SDWA operand pattern
  // extracted from it. Populated by matchSDWAOperands, consumed and cleared
  // in runOnMachineFunction.
  std::unordered_map<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;

  Optional<int64_t> foldToImm(const MachineOperand &Op) const;

public:
  static char ID;

  typedef SmallVector<std::unique_ptr<SDWAOperand>, 4> SDWAOperandsVector;

  SIPeepholeSDWA() : MachineFunctionPass(ID) {
    initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
  void matchSDWAOperands(MachineFunction &MF);
  bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);

  StringRef getPassName() const override { return "SI Peephole SDWA"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

// Base class describing one matched SDWA pattern: which operand of the
// converted instruction gets replaced and what it gets replaced with.
class SDWAOperand {
private:
  MachineOperand *Target; // Operand that would be used in converted instruction
  MachineOperand *Replaced; // Operand that would be replaced by Target

public:
  SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
      : Target(TargetOp), Replaced(ReplacedOp) {
    assert(Target->isReg());
    assert(Replaced->isReg());
  }

  virtual ~SDWAOperand() {}

  // Return the instruction that could be converted to SDWA form using this
  // operand, or nullptr if no unique candidate exists.
  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0;
  // Rewrite the corresponding operand of the (already SDWA-encoded) MI.
  // Returns false if this pattern turns out to be inapplicable to MI.
  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;

  MachineOperand *getTargetOperand() const { return Target; }
  MachineOperand *getReplacedOperand() const { return Replaced; }
  MachineInstr *getParentInst() const { return Target->getParent(); }
  MachineRegisterInfo *getMRI() const {
    return &getParentInst()->getParent()->getParent()->getRegInfo();
  }
};

using namespace AMDGPU::SDWA;

// Pattern that supplies a source operand plus its src_sel and modifiers
// (abs/neg for float, sext for integer) to the converted instruction.
class SDWASrcOperand : public SDWAOperand {
private:
  SdwaSel SrcSel;
  bool Abs;
  bool Neg;
  bool Sext;

public:
  SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
                 bool Sext_ = false)
      : SDWAOperand(TargetOp, ReplacedOp), SrcSel(SrcSel_), Abs(Abs_),
        Neg(Neg_), Sext(Sext_) {}

  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  SdwaSel getSrcSel() const { return SrcSel; }
  bool getAbs() const { return Abs; }
  bool getNeg() const { return Neg; }
  bool getSext() const { return Sext; }

  uint64_t getSrcMods() const;
};

// Pattern that supplies the destination operand plus its dst_sel/dst_unused
// encoding to the converted instruction.
class SDWADstOperand : public SDWAOperand {
private:
  SdwaSel DstSel;
  DstUnused DstUn;

public:
  SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
      : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}

  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  SdwaSel getDstSel() const { return DstSel; }
  DstUnused getDstUnused() const { return DstUn; }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIPeepholeSDWA, DEBUG_TYPE, "SI Peephole SDWA", false, false)

char SIPeepholeSDWA::ID = 0;

char &llvm::SIPeepholeSDWAID = SIPeepholeSDWA::ID;

FunctionPass *llvm::createSIPeepholeSDWAPass() {
  return new SIPeepholeSDWA();
}

#ifndef NDEBUG

// Debug-only printers for the SDWA encodings and matched patterns.
static raw_ostream& operator<<(raw_ostream &OS, const SdwaSel &Sel) {
  switch(Sel) {
  case BYTE_0: OS << "BYTE_0"; break;
  case BYTE_1: OS << "BYTE_1"; break;
  case BYTE_2: OS << "BYTE_2"; break;
  case BYTE_3: OS << "BYTE_3"; break;
  case WORD_0: OS << "WORD_0"; break;
  case WORD_1: OS << "WORD_1"; break;
  case DWORD: OS << "DWORD"; break;
  }
  return OS;
}

static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) {
  switch(Un) {
  case UNUSED_PAD: OS << "UNUSED_PAD"; break;
  case UNUSED_SEXT: OS << "UNUSED_SEXT"; break;
  case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break;
  }
  return OS;
}

static raw_ostream& operator<<(raw_ostream &OS, const SDWASrcOperand &Src) {
  OS << "SDWA src: " << *Src.getTargetOperand()
     << " src_sel:" << Src.getSrcSel()
     << " abs:" << Src.getAbs() << " neg:" << Src.getNeg()
     << " sext:" << Src.getSext() << '\n';
  return OS;
}

static raw_ostream& operator<<(raw_ostream &OS, const SDWADstOperand &Dst) {
  OS << "SDWA dst: " << *Dst.getTargetOperand()
     << " dst_sel:" << Dst.getDstSel()
     << " dst_unused:" << Dst.getDstUnused() << '\n';
  return OS;
}

#endif

// Copy register number, subreg index and the relevant flags (undef, and
// kill for uses / dead for defs) from one register operand to another.
static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
  assert(To.isReg() && From.isReg());
  To.setReg(From.getReg());
  To.setSubReg(From.getSubReg());
  To.setIsUndef(From.isUndef());
  if (To.isUse()) {
    To.setIsKill(From.isKill());
  } else {
    To.setIsDead(From.isDead());
  }
}

// True if both operands are registers and name the same register with the
// same subreg index.
static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
  return LHS.isReg() &&
         RHS.isReg() &&
         LHS.getReg() == RHS.getReg() &&
         LHS.getSubReg() == RHS.getSubReg();
}

// True if SubReg and SuperReg name the same register and the lanes accessed
// by SubReg are a subset of the lanes accessed by SuperReg.
static bool isSubregOf(const MachineOperand &SubReg,
                       const MachineOperand &SuperReg,
                       const TargetRegisterInfo *TRI) {

  if (!SuperReg.isReg() || !SubReg.isReg())
    return false;

  if (isSameReg(SuperReg, SubReg))
    return true;

  if (SuperReg.getReg() != SubReg.getReg())
    return false;

  LaneBitmask SuperMask = TRI->getSubRegIndexLaneMask(SuperReg.getSubReg());
  LaneBitmask SubMask = TRI->getSubRegIndexLaneMask(SubReg.getSubReg());
  SuperMask |= ~SubMask;
  return SuperMask.all();
}

// Encode this pattern's abs/neg (float) or sext (integer) flags as a
// src_modifiers immediate. The two families are mutually exclusive.
uint64_t SDWASrcOperand::getSrcMods() const {
  uint64_t Mods = 0;
  if (Abs || Neg) {
    assert(!Sext &&
           "Float and integer src modifiers can't be set simulteniously");
    Mods |= Abs ? SISrcMods::ABS : 0;
    Mods |= Neg ? SISrcMods::NEG : 0;
  } else if (Sext) {
    Mods |= SISrcMods::SEXT;
  }

  return Mods;
}

MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) {
  // For SDWA src operand potential instruction is one that use register
  // defined by parent instruction
  MachineRegisterInfo *MRI = getMRI();
  MachineOperand *Replaced = getReplacedOperand();
  assert(Replaced->isReg());

  MachineInstr *PotentialMI = nullptr;
  for (MachineOperand &PotentialMO : MRI->use_operands(Replaced->getReg())) {
    // If this is use of another subreg of dst reg then do nothing
    if (!isSubregOf(*Replaced, PotentialMO, MRI->getTargetRegisterInfo()))
      continue;

    // If there exists a use of a superreg of dst then we should not combine
    // this operand
    if (!isSameReg(PotentialMO, *Replaced))
      return nullptr;

    // Check that PotentialMI is only instruction that uses dst reg
    if (PotentialMI == nullptr) {
      PotentialMI = PotentialMO.getParent();
    } else if (PotentialMI != PotentialMO.getParent()) {
      return nullptr;
    }
  }

  return PotentialMI;
}

bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Find operand in instruction that matches source operand and replace it with
  // target operand. Set corresponding src_sel

  MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  MachineOperand *SrcMods =
      TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
  assert(Src && Src->isReg());
  if (!isSameReg(*Src, *getReplacedOperand())) {
    // If this is not src0 then it should be src1
    Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);

    assert(Src && Src->isReg());

    if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
        !isSameReg(*Src, *getReplacedOperand())) {
      // In case of v_mac_f16/32_sdwa this pass can try to apply src operand to
      // src2. This is not allowed.
      return false;
    }

    assert(isSameReg(*Src, *getReplacedOperand()) && SrcSel && SrcMods);
  }
  copyRegOperand(*Src, *getTargetOperand());
  SrcSel->setImm(getSrcSel());
  SrcMods->setImm(getSrcMods());
  getTargetOperand()->setIsKill(false);
  return true;
}

MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) {
  // For SDWA dst operand potential instruction is one that defines register
  // that this operand uses
  MachineRegisterInfo *MRI = getMRI();
  MachineInstr *ParentMI = getParentInst();
  MachineOperand *Replaced = getReplacedOperand();
  assert(Replaced->isReg());

  for (MachineOperand &PotentialMO : MRI->def_operands(Replaced->getReg())) {
    if (!isSubregOf(*Replaced, PotentialMO, MRI->getTargetRegisterInfo()))
      continue;

    if (!isSameReg(*Replaced, PotentialMO))
      return nullptr;

    // Check that ParentMI is the only instruction that uses replaced register
    for (MachineOperand &UseMO : MRI->use_operands(PotentialMO.getReg())) {
      if (isSubregOf(UseMO, PotentialMO, MRI->getTargetRegisterInfo()) &&
          UseMO.getParent() != ParentMI) {
        return nullptr;
      }
    }

    // Due to SSA this should be the only def of replaced register, so return it
    return PotentialMO.getParent();
  }

  return nullptr;
}

bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Replace vdst operand in MI with target operand. Set dst_sel and dst_unused

  if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
      getDstSel() != AMDGPU::SDWA::DWORD) {
    // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD
    return false;
  }

  MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  assert(Operand &&
         Operand->isReg() &&
         isSameReg(*Operand, *getReplacedOperand()));
  copyRegOperand(*Operand, *getTargetOperand());
  MachineOperand *DstSel= TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
  assert(DstSel);
  DstSel->setImm(getDstSel());
  MachineOperand *DstUnused= TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  assert(DstUnused);
  DstUnused->setImm(getDstUnused());

  // Remove original instruction because it would conflict with our new
  // instruction by register definition
  getParentInst()->eraseFromParent();
  return true;
}

// If Op is an immediate operand, return its value. If Op is a register whose
// defining instruction is a foldable copy of an immediate, return the copied
// immediate. Otherwise return None.
Optional<int64_t> SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
  if (Op.isImm()) {
    return Op.getImm();
  }

  // If this is not immediate then it can be copy of immediate value, e.g.:
  // %vreg1<def> = S_MOV_B32 255;
  if (Op.isReg()) {
    for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
      if (!isSameReg(Op, Def))
        continue;

      const MachineInstr *DefInst = Def.getParent();
      if (!TII->isFoldableCopy(*DefInst))
        return None;

      const MachineOperand &Copied = DefInst->getOperand(1);
      if (!Copied.isImm())
        return None;

      return Copied.getImm();
    }
  }

  return None;
}

// Scan the whole function for shift/bfe/and instructions that express a
// byte/word extract or insert, and record them as SDWA operand patterns.
void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) {
  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : MBB) {
      unsigned Opcode = MI.getOpcode();
      switch (Opcode) {
      case AMDGPU::V_LSHRREV_B32_e32:
      case AMDGPU::V_ASHRREV_I32_e32:
      case AMDGPU::V_LSHLREV_B32_e32: {
        // from: v_lshrrev_b32_e32 v1, 16/24, v0
        // to SDWA src:v0 src_sel:WORD_1/BYTE_3

        // from: v_ashrrev_i32_e32 v1, 16/24, v0
        // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1

        // from: v_lshlrev_b32_e32 v1, 16/24, v0
        // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
        MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
        auto Imm = foldToImm(*Src0);
        if (!Imm)
          break;

        if (*Imm != 16 && *Imm != 24)
          break;

        MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
        MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
        if (TRI->isPhysicalRegister(Src1->getReg()) ||
            TRI->isPhysicalRegister(Dst->getReg()))
          break;

        if (Opcode == AMDGPU::V_LSHLREV_B32_e32) {
          auto SDWADst = make_unique<SDWADstOperand>(
              Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
          DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n');
          SDWAOperands[&MI] = std::move(SDWADst);
          ++NumSDWAPatternsFound;
        } else {
          // sext for arithmetic (v_ashrrev), no sext for logical (v_lshrrev).
          auto SDWASrc = make_unique<SDWASrcOperand>(
              Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
              Opcode == AMDGPU::V_LSHRREV_B32_e32 ? false : true);
          DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
          SDWAOperands[&MI] = std::move(SDWASrc);
          ++NumSDWAPatternsFound;
        }
        break;
      }

      case AMDGPU::V_LSHRREV_B16_e32:
      case AMDGPU::V_ASHRREV_I16_e32:
      case AMDGPU::V_LSHLREV_B16_e32: {
        // from: v_lshrrev_b16_e32 v1, 8, v0
        // to SDWA src:v0 src_sel:BYTE_1

        // from: v_ashrrev_i16_e32 v1, 8, v0
        // to SDWA src:v0 src_sel:BYTE_1 sext:1

        // from: v_lshlrev_b16_e32 v1, 8, v0
        // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
        MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
        auto Imm = foldToImm(*Src0);
        if (!Imm || *Imm != 8)
          break;

        MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
        MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

        if (TRI->isPhysicalRegister(Src1->getReg()) ||
            TRI->isPhysicalRegister(Dst->getReg()))
          break;

        if (Opcode == AMDGPU::V_LSHLREV_B16_e32) {
          auto SDWADst =
              make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
          DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n');
          SDWAOperands[&MI] = std::move(SDWADst);
          ++NumSDWAPatternsFound;
        } else {
          auto SDWASrc = make_unique<SDWASrcOperand>(
              Src1, Dst, BYTE_1, false, false,
              Opcode == AMDGPU::V_LSHRREV_B16_e32 ? false : true);
          DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
          SDWAOperands[&MI] = std::move(SDWASrc);
          ++NumSDWAPatternsFound;
        }
        break;
      }

      case AMDGPU::V_BFE_I32:
      case AMDGPU::V_BFE_U32: {
        // e.g.:
        // from: v_bfe_u32 v1, v0, 8, 8
        // to SDWA src:v0 src_sel:BYTE_1

        // offset | width | src_sel
        // ------------------------
        // 0      | 8     | BYTE_0
        // 0      | 16    | WORD_0
        // 0      | 32    | DWORD ?
        // 8      | 8     | BYTE_1
        // 16     | 8     | BYTE_2
        // 16     | 16    | WORD_1
        // 24     | 8     | BYTE_3

        MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
        auto Offset = foldToImm(*Src1);
        if (!Offset)
          break;

        MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
        auto Width = foldToImm(*Src2);
        if (!Width)
          break;

        SdwaSel SrcSel = DWORD;

        if (*Offset == 0 && *Width == 8)
          SrcSel = BYTE_0;
        else if (*Offset == 0 && *Width == 16)
          SrcSel = WORD_0;
        else if (*Offset == 0 && *Width == 32)
          SrcSel = DWORD;
        else if (*Offset == 8 && *Width == 8)
          SrcSel = BYTE_1;
        else if (*Offset == 16 && *Width == 8)
          SrcSel = BYTE_2;
        else if (*Offset == 16 && *Width == 16)
          SrcSel = WORD_1;
        else if (*Offset == 24 && *Width == 8)
          SrcSel = BYTE_3;
        else
          break;

        MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
        MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

        if (TRI->isPhysicalRegister(Src0->getReg()) ||
            TRI->isPhysicalRegister(Dst->getReg()))
          break;

        // Signed bitfield extract implies sext.
        auto SDWASrc = make_unique<SDWASrcOperand>(
            Src0, Dst, SrcSel, false, false,
            Opcode == AMDGPU::V_BFE_U32 ? false : true);
        DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
        SDWAOperands[&MI] = std::move(SDWASrc);
        ++NumSDWAPatternsFound;
        break;
      }
      case AMDGPU::V_AND_B32_e32: {
        // e.g.:
        // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
        // to SDWA src:v0 src_sel:WORD_0/BYTE_0

        MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
        auto Imm = foldToImm(*Src0);
        if (!Imm)
          break;

        if (*Imm != 0x0000ffff && *Imm != 0x000000ff)
          break;

        MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
        MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

        if (TRI->isPhysicalRegister(Src1->getReg()) ||
            TRI->isPhysicalRegister(Dst->getReg()))
          break;

        auto SDWASrc = make_unique<SDWASrcOperand>(
            Src1, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
        DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
        SDWAOperands[&MI] = std::move(SDWASrc);
        ++NumSDWAPatternsFound;
        break;
      }
      }
    }
  }
}

// Build the SDWA version of MI, apply all matched operand patterns to it,
// and erase MI on success. Returns true if the conversion happened.
bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
                                   const SDWAOperandsVector &SDWAOperands) {
  // Check if this instruction can be converted to SDWA:
  // 1. Does this opcode support SDWA
  if (AMDGPU::getSDWAOp(MI.getOpcode()) == -1)
    return false;

  // 2. Are all explicit operands VGPRs
  for (const MachineOperand &Operand : MI.explicit_operands()) {
    if (!Operand.isReg() || !TRI->isVGPR(*MRI, Operand.getReg()))
      return false;
  }

  // Convert to sdwa
  int SDWAOpcode = AMDGPU::getSDWAOp(MI.getOpcode());
  assert(SDWAOpcode != -1);

  const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);

  // Create SDWA version of instruction MI and initialize its operands
  MachineInstrBuilder SDWAInst =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc);

  // Copy dst, if it is present in original then should also be present in SDWA
  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  if (Dst) {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1);
    SDWAInst.add(*Dst);
  } else {
    assert(TII->isVOPC(MI));
  }

  // Copy src0, initialize src0_modifiers. All sdwa instructions have src0 and
  // src0_modifiers (except for v_nop_sdwa, but it can't get here)
  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  assert(
      Src0 &&
      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0) != -1 &&
      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_modifiers) != -1);
  SDWAInst.addImm(0);
  SDWAInst.add(*Src0);

  // Copy src1 if present, initialize src1_modifiers.
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1) {
    assert(
        AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1) != -1 &&
        AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_modifiers) != -1);
    SDWAInst.addImm(0);
    SDWAInst.add(*Src1);
  } else {
    assert(TII->isVOP1(MI));
  }

  if (SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
    // v_mac_f16/32 has additional src2 operand tied to vdst
    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    assert(Src2);
    SDWAInst.add(*Src2);
  }

  // Initialize clamp.
  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1);
  SDWAInst.addImm(0);

  // Initialize dst_sel and dst_unused if present
  if (Dst) {
    assert(
        AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1 &&
        AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1);
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
    SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);
  }

  // Initialize src0_sel
  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_sel) != -1);
  SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);


  // Initialize src1_sel if present
  if (Src1) {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_sel) != -1);
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
  }

  // Apply all sdwa operand patterns
  bool Converted = false;
  for (auto &Operand : SDWAOperands) {
    Converted |= Operand->convertToSDWA(*SDWAInst, TII);
  }
  if (!Converted) {
    SDWAInst->eraseFromParent();
    return false;
  }

  DEBUG(dbgs() << "Convert instruction:" << MI
               << "Into:" << *SDWAInst << '\n');
  ++NumSDWAInstructionsPeepholed;

  MI.eraseFromParent();
  return true;
}

bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();

  if (!ST.hasSDWA() ||
      !AMDGPU::isVI(ST)) { // TODO: Add support for SDWA on gfx9
    return false;
  }

  MRI = &MF.getRegInfo();
  TRI = ST.getRegisterInfo();
  TII = ST.getInstrInfo();

  // Group matched operand patterns by the instruction they could convert.
  std::unordered_map<MachineInstr *, SDWAOperandsVector> PotentialMatches;

  matchSDWAOperands(MF);

  for (auto &OperandPair : SDWAOperands) {
    auto &Operand = OperandPair.second;
    MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
    if (PotentialMI) {
      PotentialMatches[PotentialMI].push_back(std::move(Operand));
    }
  }

  for (auto &PotentialPair : PotentialMatches) {
    MachineInstr &PotentialMI = *PotentialPair.first;
    convertToSDWA(PotentialMI, PotentialPair.second);
  }

  SDWAOperands.clear();
  return false;
}