//===-- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions --===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file This pass tries to apply several peephole SDWA patterns.
///
/// E.g. original:
///   V_LSHRREV_B32_e32 %vreg0, 16, %vreg1
///   V_ADD_I32_e32 %vreg2, %vreg0, %vreg3
///   V_LSHLREV_B32_e32 %vreg4, 16, %vreg2
///
/// Replace:
///   V_ADD_I32_sdwa %vreg4, %vreg1, %vreg3
///       dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
///
//===----------------------------------------------------------------------===//


#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include <unordered_map>
#include <unordered_set>

using namespace llvm;

#define DEBUG_TYPE "si-peephole-sdwa"

STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
STATISTIC(NumSDWAInstructionsPeepholed,
          "Number of instruction converted to SDWA.");

namespace {

class SDWAOperand;

class SIPeepholeSDWA : public MachineFunctionPass {
public:
  typedef SmallVector<SDWAOperand *, 4> SDWAOperandsVector;

private:
  MachineRegisterInfo *MRI;
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  // Maps each matched pattern instruction (shift/bfe/and) to the SDWA operand
  // description derived from it.  Owns the SDWAOperand objects.
  std::unordered_map<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
  // Maps each instruction that is a candidate for conversion to SDWA form to
  // the (non-owning) list of operands that would be folded into it.
  std::unordered_map<MachineInstr *, SDWAOperandsVector> PotentialMatches;

  // Try to reduce Op to a compile-time immediate; returns None on failure.
  Optional<int64_t> foldToImm(const MachineOperand &Op) const;

public:
  static char ID;

  SIPeepholeSDWA() : MachineFunctionPass(ID) {
    initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
  // Scan MF and record every shift/bfe/and instruction that encodes an SDWA
  // src or dst selection pattern into SDWAOperands.
  void matchSDWAOperands(MachineFunction &MF);
  // Check that MI has an SDWA counterpart and all explicit operands are VGPRs.
  bool isConvertibleToSDWA(const MachineInstr &MI) const;
  // Rebuild MI in its SDWA form and apply the matched operand patterns.
  // Returns true and erases MI if at least one pattern was applied.
  bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);

  StringRef getPassName() const override { return "SI Peephole SDWA"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

/// Base class describing one foldable SDWA pattern: which operand of the
/// converted instruction to write (Target) and which operand it replaces.
class SDWAOperand {
private:
  MachineOperand *Target; // Operand that would be used in converted instruction
  MachineOperand *Replaced; // Operand that would be replaced by Target

public:
  SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
      : Target(TargetOp), Replaced(ReplacedOp) {
    assert(Target->isReg());
    assert(Replaced->isReg());
  }

  virtual ~SDWAOperand() {}

  // Find the single instruction this pattern could be folded into, or nullptr.
  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0;
  // Apply this pattern to the already-built SDWA instruction MI.
  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;

  MachineOperand *getTargetOperand() const { return Target; }
  MachineOperand *getReplacedOperand() const { return Replaced; }
  MachineInstr *getParentInst() const { return Target->getParent(); }
  MachineRegisterInfo *getMRI() const {
    return &getParentInst()->getParent()->getParent()->getRegInfo();
  }
};

using namespace AMDGPU::SDWA;

/// A source-select pattern (e.g. from v_lshrrev/v_ashrrev/v_bfe/v_and):
/// read only the selected byte/word of the source, optionally with
/// abs/neg (float) or sext (integer) modifiers.
class SDWASrcOperand : public SDWAOperand {
private:
  SdwaSel SrcSel;
  bool Abs;
  bool Neg;
  bool Sext;

public:
  SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
                 bool Sext_ = false)
      : SDWAOperand(TargetOp, ReplacedOp), SrcSel(SrcSel_), Abs(Abs_),
        Neg(Neg_), Sext(Sext_) {}

  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  SdwaSel getSrcSel() const { return SrcSel; }
  bool getAbs() const { return Abs; }
  bool getNeg() const { return Neg; }
  bool getSext() const { return Sext; }

  // Encode Abs/Neg/Sext into the SISrcMods immediate for src*_modifiers.
  uint64_t getSrcMods() const;
};

/// A destination-select pattern (e.g. from v_lshlrev): write only the
/// selected byte/word of the destination, padding the unused part.
class SDWADstOperand : public SDWAOperand {
private:
  SdwaSel DstSel;
  DstUnused DstUn;

public:
  SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
      : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}

  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  SdwaSel getDstSel() const { return DstSel; }
  DstUnused getDstUnused() const { return DstUn; }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIPeepholeSDWA, DEBUG_TYPE, "SI Peephole SDWA", false, false)

char SIPeepholeSDWA::ID = 0;

char &llvm::SIPeepholeSDWAID = SIPeepholeSDWA::ID;

FunctionPass *llvm::createSIPeepholeSDWAPass() {
  return new SIPeepholeSDWA();
}

#ifndef NDEBUG

// Debug-only pretty-printers used by the DEBUG(...) dumps below.

static raw_ostream& operator<<(raw_ostream &OS, const SdwaSel &Sel) {
  switch(Sel) {
  case BYTE_0: OS << "BYTE_0"; break;
  case BYTE_1: OS << "BYTE_1"; break;
  case BYTE_2: OS << "BYTE_2"; break;
  case BYTE_3: OS << "BYTE_3"; break;
  case WORD_0: OS << "WORD_0"; break;
  case WORD_1: OS << "WORD_1"; break;
  case DWORD: OS << "DWORD"; break;
  }
  return OS;
}

static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) {
  switch(Un) {
  case UNUSED_PAD: OS << "UNUSED_PAD"; break;
  case UNUSED_SEXT: OS << "UNUSED_SEXT"; break;
  case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break;
  }
  return OS;
}

static raw_ostream& operator<<(raw_ostream &OS, const SDWASrcOperand &Src) {
  OS << "SDWA src: " << *Src.getTargetOperand()
     << " src_sel:" << Src.getSrcSel()
     << " abs:" << Src.getAbs() << " neg:" << Src.getNeg()
     << " sext:" << Src.getSext() << '\n';
  return OS;
}

static raw_ostream& operator<<(raw_ostream &OS, const SDWADstOperand &Dst) {
  OS << "SDWA dst: " << *Dst.getTargetOperand()
     << " dst_sel:" << Dst.getDstSel()
     << " dst_unused:" << Dst.getDstUnused() << '\n';
  return OS;
}

#endif

// Copy register number, subreg and the relevant kill/dead/undef flags from
// one register operand to another, respecting use vs. def semantics.
static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
  assert(To.isReg() && From.isReg());
  To.setReg(From.getReg());
  To.setSubReg(From.getSubReg());
  To.setIsUndef(From.isUndef());
  if (To.isUse()) {
    To.setIsKill(From.isKill());
  } else {
    To.setIsDead(From.isDead());
  }
}

// True if both operands name exactly the same register and subregister.
static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
  return LHS.isReg() &&
         RHS.isReg() &&
         LHS.getReg() == RHS.getReg() &&
         LHS.getSubReg() == RHS.getSubReg();
}

// True if SubReg covers a (possibly equal) subset of the lanes of SuperReg.
// Both must refer to the same underlying register.
static bool isSubregOf(const MachineOperand &SubReg,
                       const MachineOperand &SuperReg,
                       const TargetRegisterInfo *TRI) {

  if (!SuperReg.isReg() || !SubReg.isReg())
    return false;

  if (isSameReg(SuperReg, SubReg))
    return true;

  if (SuperReg.getReg() != SubReg.getReg())
    return false;

  // Subset check via lane masks: every lane of SubReg must be in SuperReg.
  LaneBitmask SuperMask = TRI->getSubRegIndexLaneMask(SuperReg.getSubReg());
  LaneBitmask SubMask = TRI->getSubRegIndexLaneMask(SubReg.getSubReg());
  SuperMask |= ~SubMask;
  return SuperMask.all();
}

uint64_t SDWASrcOperand::getSrcMods() const {
  uint64_t Mods = 0;
  if (Abs || Neg) {
    // Abs/Neg are float modifiers and are mutually exclusive with the
    // integer sign-extension modifier.
    assert(!Sext &&
           "Float and integer src modifiers can't be set simulteniously");
    Mods |= Abs ? SISrcMods::ABS : 0;
    Mods |= Neg ? SISrcMods::NEG : 0;
  } else if (Sext) {
    Mods |= SISrcMods::SEXT;
  }

  return Mods;
}

MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) {
  // For SDWA src operand potential instruction is one that use register
  // defined by parent instruction
  MachineRegisterInfo *MRI = getMRI();
  MachineOperand *Replaced = getReplacedOperand();
  assert(Replaced->isReg());

  MachineInstr *PotentialMI = nullptr;
  for (MachineOperand &PotentialMO : MRI->use_operands(Replaced->getReg())) {
    // If this is use of another subreg of dst reg then do nothing
    if (!isSubregOf(*Replaced, PotentialMO, MRI->getTargetRegisterInfo()))
      continue;

    // If there exist use of superreg of dst then we should not combine this
    // operand
    if (!isSameReg(PotentialMO, *Replaced))
      return nullptr;

    // Check that PotentialMI is only instruction that uses dst reg
    if (PotentialMI == nullptr) {
      PotentialMI = PotentialMO.getParent();
    } else if (PotentialMI != PotentialMO.getParent()) {
      return nullptr;
    }
  }

  return PotentialMI;
}

bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Find operand in instruction that matches source operand and replace it with
  // target operand. Set corresponding src_sel

  MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  MachineOperand *SrcMods =
      TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
  assert(Src && Src->isReg());
  if (!isSameReg(*Src, *getReplacedOperand())) {
    // If this is not src0 then it should be src1
    Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);

    assert(Src && Src->isReg());

    if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
        !isSameReg(*Src, *getReplacedOperand())) {
      // In case of v_mac_f16/32_sdwa this pass can try to apply src operand to
      // src2. This is not allowed.
      return false;
    }

    assert(isSameReg(*Src, *getReplacedOperand()) && SrcSel && SrcMods);
  }
  copyRegOperand(*Src, *getTargetOperand());
  SrcSel->setImm(getSrcSel());
  SrcMods->setImm(getSrcMods());
  // The target register stays live past this use now; clear any stale kill.
  getTargetOperand()->setIsKill(false);
  return true;
}

MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) {
  // For SDWA dst operand potential instruction is one that defines register
  // that this operand uses
  MachineRegisterInfo *MRI = getMRI();
  MachineInstr *ParentMI = getParentInst();
  MachineOperand *Replaced = getReplacedOperand();
  assert(Replaced->isReg());

  for (MachineOperand &PotentialMO : MRI->def_operands(Replaced->getReg())) {
    if (!isSubregOf(*Replaced, PotentialMO, MRI->getTargetRegisterInfo()))
      continue;

    if (!isSameReg(*Replaced, PotentialMO))
      return nullptr;

    // Check that ParentMI is the only instruction that uses replaced register
    for (MachineOperand &UseMO : MRI->use_operands(PotentialMO.getReg())) {
      if (isSubregOf(UseMO, PotentialMO, MRI->getTargetRegisterInfo()) &&
          UseMO.getParent() != ParentMI) {
        return nullptr;
      }
    }

    // Due to SSA this should be only def of replaced register, so return it
    return PotentialMO.getParent();
  }

  return nullptr;
}

bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Replace vdst operand in MI with target operand. Set dst_sel and dst_unused

  if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
      getDstSel() != AMDGPU::SDWA::DWORD) {
    // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD
    return false;
  }

  MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  assert(Operand &&
         Operand->isReg() &&
         isSameReg(*Operand, *getReplacedOperand()));
  copyRegOperand(*Operand, *getTargetOperand());
  MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
  assert(DstSel);
  DstSel->setImm(getDstSel());
  MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  assert(DstUnused);
  DstUnused->setImm(getDstUnused());

  // Remove original instruction because it would conflict with our new
  // instruction by register definition
  getParentInst()->eraseFromParent();
  return true;
}

Optional<int64_t> SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
  if (Op.isImm()) {
    return Op.getImm();
  }

  // If this is not immediate then it can be copy of immediate value, e.g.:
  // %vreg1<def> = S_MOV_B32 255;
  if (Op.isReg()) {
    for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
      if (!isSameReg(Op, Def))
        continue;

      const MachineInstr *DefInst = Def.getParent();
      if (!TII->isFoldableCopy(*DefInst))
        return None;

      const MachineOperand &Copied = DefInst->getOperand(1);
      if (!Copied.isImm())
        return None;

      return Copied.getImm();
    }
  }

  return None;
}

void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) {
  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : MBB) {
      unsigned Opcode = MI.getOpcode();
      switch (Opcode) {
      case AMDGPU::V_LSHRREV_B32_e32:
      case AMDGPU::V_ASHRREV_I32_e32:
      case AMDGPU::V_LSHLREV_B32_e32: {
        // from: v_lshrrev_b32_e32 v1, 16/24, v0
        // to SDWA src:v0 src_sel:WORD_1/BYTE_3

        // from: v_ashrrev_i32_e32 v1, 16/24, v0
        // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1

        // from: v_lshlrev_b32_e32 v1, 16/24, v0
        // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
        MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
        auto Imm = foldToImm(*Src0);
        if (!Imm)
          break;

        // Only shifts by 16 (word) or 24 (byte 3) map onto an SDWA select.
        if (*Imm != 16 && *Imm != 24)
          break;

        MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
        MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
        if (TRI->isPhysicalRegister(Src1->getReg()) ||
            TRI->isPhysicalRegister(Dst->getReg()))
          break;

        if (Opcode == AMDGPU::V_LSHLREV_B32_e32) {
          auto SDWADst = make_unique<SDWADstOperand>(
              Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
          DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n');
          SDWAOperands[&MI] = std::move(SDWADst);
          ++NumSDWAPatternsFound;
        } else {
          // Arithmetic shift right additionally sign-extends the selection.
          auto SDWASrc = make_unique<SDWASrcOperand>(
              Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
              Opcode == AMDGPU::V_LSHRREV_B32_e32 ? false : true);
          DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
          SDWAOperands[&MI] = std::move(SDWASrc);
          ++NumSDWAPatternsFound;
        }
        break;
      }

      case AMDGPU::V_LSHRREV_B16_e32:
      case AMDGPU::V_ASHRREV_I16_e32:
      case AMDGPU::V_LSHLREV_B16_e32: {
        // from: v_lshrrev_b16_e32 v1, 8, v0
        // to SDWA src:v0 src_sel:BYTE_1

        // from: v_ashrrev_i16_e32 v1, 8, v0
        // to SDWA src:v0 src_sel:BYTE_1 sext:1

        // from: v_lshlrev_b16_e32 v1, 8, v0
        // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
        MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
        auto Imm = foldToImm(*Src0);
        if (!Imm || *Imm != 8)
          break;

        MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
        MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

        if (TRI->isPhysicalRegister(Src1->getReg()) ||
            TRI->isPhysicalRegister(Dst->getReg()))
          break;

        if (Opcode == AMDGPU::V_LSHLREV_B16_e32) {
          auto SDWADst =
              make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
          DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n');
          SDWAOperands[&MI] = std::move(SDWADst);
          ++NumSDWAPatternsFound;
        } else {
          auto SDWASrc = make_unique<SDWASrcOperand>(
              Src1, Dst, BYTE_1, false, false,
              Opcode == AMDGPU::V_LSHRREV_B16_e32 ? false : true);
          DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
          SDWAOperands[&MI] = std::move(SDWASrc);
          ++NumSDWAPatternsFound;
        }
        break;
      }

      case AMDGPU::V_BFE_I32:
      case AMDGPU::V_BFE_U32: {
        // e.g.:
        // from: v_bfe_u32 v1, v0, 8, 8
        // to SDWA src:v0 src_sel:BYTE_1

        // offset | width | src_sel
        // ------------------------
        // 0      | 8     | BYTE_0
        // 0      | 16    | WORD_0
        // 0      | 32    | DWORD ?
        // 8      | 8     | BYTE_1
        // 16     | 8     | BYTE_2
        // 16     | 16    | WORD_1
        // 24     | 8     | BYTE_3

        MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
        auto Offset = foldToImm(*Src1);
        if (!Offset)
          break;

        MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
        auto Width = foldToImm(*Src2);
        if (!Width)
          break;

        SdwaSel SrcSel = DWORD;

        if (*Offset == 0 && *Width == 8)
          SrcSel = BYTE_0;
        else if (*Offset == 0 && *Width == 16)
          SrcSel = WORD_0;
        else if (*Offset == 0 && *Width == 32)
          SrcSel = DWORD;
        else if (*Offset == 8 && *Width == 8)
          SrcSel = BYTE_1;
        else if (*Offset == 16 && *Width == 8)
          SrcSel = BYTE_2;
        else if (*Offset == 16 && *Width == 16)
          SrcSel = WORD_1;
        else if (*Offset == 24 && *Width == 8)
          SrcSel = BYTE_3;
        else
          break;

        MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
        MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

        if (TRI->isPhysicalRegister(Src0->getReg()) ||
            TRI->isPhysicalRegister(Dst->getReg()))
          break;

        // Signed bitfield extract maps onto an SDWA src with sext:1.
        auto SDWASrc = make_unique<SDWASrcOperand>(
            Src0, Dst, SrcSel, false, false,
            Opcode == AMDGPU::V_BFE_U32 ? false : true);
        DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
        SDWAOperands[&MI] = std::move(SDWASrc);
        ++NumSDWAPatternsFound;
        break;
      }
      case AMDGPU::V_AND_B32_e32: {
        // e.g.:
        // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
        // to SDWA src:v0 src_sel:WORD_0/BYTE_0

        MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
        auto Imm = foldToImm(*Src0);
        if (!Imm)
          break;

        if (*Imm != 0x0000ffff && *Imm != 0x000000ff)
          break;

        MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
        MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

        if (TRI->isPhysicalRegister(Src1->getReg()) ||
            TRI->isPhysicalRegister(Dst->getReg()))
          break;

        auto SDWASrc = make_unique<SDWASrcOperand>(
            Src1, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
        DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
        SDWAOperands[&MI] = std::move(SDWASrc);
        ++NumSDWAPatternsFound;
        break;
      }
      }
    }
  }
}

bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI) const {
  // Check if this instruction can be converted to SDWA:
  // 1. Does this opcode support SDWA
  if (AMDGPU::getSDWAOp(MI.getOpcode()) == -1)
    return false;

  // 2. Are all operands - VGPRs
  for (const MachineOperand &Operand : MI.explicit_operands()) {
    if (!Operand.isReg() || !TRI->isVGPR(*MRI, Operand.getReg()))
      return false;
  }

  return true;
}

bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
                                   const SDWAOperandsVector &SDWAOperands) {
  // Convert to sdwa
  int SDWAOpcode = AMDGPU::getSDWAOp(MI.getOpcode());
  assert(SDWAOpcode != -1);

  const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);

  // Create SDWA version of instruction MI and initialize its operands
  MachineInstrBuilder SDWAInst =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc);

  // Copy dst, if it is present in original then should also be present in SDWA
  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  if (Dst) {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1);
    SDWAInst.add(*Dst);
  } else {
    // Only compare (VOPC) instructions have no vdst here.
    assert(TII->isVOPC(MI));
  }

  // Copy src0, initialize src0_modifiers. All sdwa instructions has src0 and
  // src0_modifiers (except for v_nop_sdwa, but it can't get here)
  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  assert(
      Src0 &&
      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0) != -1 &&
      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_modifiers) != -1);
  SDWAInst.addImm(0);
  SDWAInst.add(*Src0);

  // Copy src1 if present, initialize src1_modifiers.
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1) {
    assert(
        AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1) != -1 &&
        AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_modifiers) != -1);
    SDWAInst.addImm(0);
    SDWAInst.add(*Src1);
  } else {
    // Only unary (VOP1) instructions have no src1.
    assert(TII->isVOP1(MI));
  }

  if (SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
    // v_mac_f16/32 has additional src2 operand tied to vdst
    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    assert(Src2);
    SDWAInst.add(*Src2);
  }

  // Initialize clamp.
  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1);
  SDWAInst.addImm(0);

  // Initialize dst_sel and dst_unused if present
  if (Dst) {
    assert(
        AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1 &&
        AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1);
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
    SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);
  }

  // Initialize src0_sel
  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_sel) != -1);
  SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);


  // Initialize src1_sel if present
  if (Src1) {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_sel) != -1);
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
  }

  // Apply all sdwa operand patterns
  bool Converted = false;
  for (auto &Operand : SDWAOperands) {
    // There should be no intersection between SDWA operands and potential MIs
    // e.g.:
    // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
    // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
    // v_add_u32 v3, v4, v2
    //
    // In that example it is possible that we would fold 2nd instruction into 3rd
    // (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd (that was
    // already destroyed). So if SDWAOperand is also a potential MI then do not
    // apply it.
    if (PotentialMatches.count(Operand->getParentInst()) == 0)
      Converted |= Operand->convertToSDWA(*SDWAInst, TII);
  }
  if (!Converted) {
    // No pattern applied: drop the freshly built SDWA instruction and keep MI.
    SDWAInst->eraseFromParent();
    return false;
  }

  DEBUG(dbgs() << "Convert instruction:" << MI
               << "Into:" << *SDWAInst << '\n');
  ++NumSDWAInstructionsPeepholed;

  MI.eraseFromParent();
  return true;
}

bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();

  if (!ST.hasSDWA() ||
      !AMDGPU::isVI(ST)) { // TODO: Add support for SDWA on gfx9
    return false;
  }

  MRI = &MF.getRegInfo();
  TRI = ST.getRegisterInfo();
  TII = ST.getInstrInfo();

  // Find all SDWA operands in MF.
  matchSDWAOperands(MF);

  // Pair each matched operand with the single instruction it can fold into.
  for (const auto &OperandPair : SDWAOperands) {
    const auto &Operand = OperandPair.second;
    MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
    if (PotentialMI && isConvertibleToSDWA(*PotentialMI)) {
      PotentialMatches[PotentialMI].push_back(Operand.get());
    }
  }

  for (auto &PotentialPair : PotentialMatches) {
    MachineInstr &PotentialMI = *PotentialPair.first;
    convertToSDWA(PotentialMI, PotentialPair.second);
  }

  PotentialMatches.clear();
  SDWAOperands.clear();
  // This pass rewrites instructions but reports no analysis invalidation
  // beyond the default; CFG is preserved (see getAnalysisUsage).
  return false;
}