//=======- GCNDPPCombine.cpp - optimization for DPP instructions ---==========//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
// The pass combines V_MOV_B32_dpp instruction with its VALU uses as a DPP src0
// operand. If any of the use instructions cannot be combined with the mov the
// whole sequence is reverted.
//
// $old = ...
// $dpp_value = V_MOV_B32_dpp $old, $vgpr_to_be_read_from_other_lane,
//                            dpp_controls..., $bound_ctrl
// $res = VALU $dpp_value, ...
//
// to
//
// $res = VALU_DPP $folded_old, $vgpr_to_be_read_from_other_lane, ...,
//                 dpp_controls..., $folded_bound_ctrl
//
// Combining rules :
//
// $bound_ctrl is DPP_BOUND_ZERO, $old is any
// $bound_ctrl is DPP_BOUND_OFF, $old is 0
//
// -> $folded_old = undef, $folded_bound_ctrl = DPP_BOUND_ZERO
//
// $bound_ctrl is DPP_BOUND_OFF, $old is undef
//
// -> $folded_old = undef, $folded_bound_ctrl = DPP_BOUND_OFF
//
// $bound_ctrl is DPP_BOUND_OFF, $old is foldable
//
// -> $folded_old = folded value, $folded_bound_ctrl = DPP_BOUND_OFF
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/Pass.h"
#include <cassert>
#include <limits>
53 54 using namespace llvm; 55 56 #define DEBUG_TYPE "gcn-dpp-combine" 57 58 STATISTIC(NumDPPMovsCombined, "Number of DPP moves combined."); 59 60 namespace { 61 62 class GCNDPPCombine : public MachineFunctionPass { 63 MachineRegisterInfo *MRI; 64 const SIInstrInfo *TII; 65 66 using RegSubRegPair = TargetInstrInfo::RegSubRegPair; 67 68 MachineOperand *getOldOpndValue(MachineOperand &OldOpnd) const; 69 70 RegSubRegPair foldOldOpnd(MachineInstr &OrigMI, 71 RegSubRegPair OldOpndVGPR, 72 MachineOperand &OldOpndValue) const; 73 74 MachineInstr *createDPPInst(MachineInstr &OrigMI, 75 MachineInstr &MovMI, 76 RegSubRegPair OldOpndVGPR, 77 MachineOperand *OldOpnd, 78 bool BoundCtrlZero) const; 79 80 MachineInstr *createDPPInst(MachineInstr &OrigMI, 81 MachineInstr &MovMI, 82 RegSubRegPair OldOpndVGPR, 83 bool BoundCtrlZero) const; 84 85 bool hasNoImmOrEqual(MachineInstr &MI, 86 unsigned OpndName, 87 int64_t Value, 88 int64_t Mask = -1) const; 89 90 bool combineDPPMov(MachineInstr &MI) const; 91 92 public: 93 static char ID; 94 95 GCNDPPCombine() : MachineFunctionPass(ID) { 96 initializeGCNDPPCombinePass(*PassRegistry::getPassRegistry()); 97 } 98 99 bool runOnMachineFunction(MachineFunction &MF) override; 100 101 StringRef getPassName() const override { return "GCN DPP Combine"; } 102 103 void getAnalysisUsage(AnalysisUsage &AU) const override { 104 AU.setPreservesCFG(); 105 MachineFunctionPass::getAnalysisUsage(AU); 106 } 107 }; 108 109 } // end anonymous namespace 110 111 INITIALIZE_PASS(GCNDPPCombine, DEBUG_TYPE, "GCN DPP Combine", false, false) 112 113 char GCNDPPCombine::ID = 0; 114 115 char &llvm::GCNDPPCombineID = GCNDPPCombine::ID; 116 117 FunctionPass *llvm::createGCNDPPCombinePass() { 118 return new GCNDPPCombine(); 119 } 120 121 static int getDPPOp(unsigned Op) { 122 auto DPP32 = AMDGPU::getDPPOp32(Op); 123 if (DPP32 != -1) 124 return DPP32; 125 126 auto E32 = AMDGPU::getVOPe32(Op); 127 return E32 != -1 ? 
AMDGPU::getDPPOp32(E32) : -1; 128 } 129 130 // tracks the register operand definition and returns: 131 // 1. immediate operand used to initialize the register if found 132 // 2. nullptr if the register operand is undef 133 // 3. the operand itself otherwise 134 MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const { 135 auto *Def = getVRegSubRegDef(getRegSubRegPair(OldOpnd), *MRI); 136 if (!Def) 137 return nullptr; 138 139 switch(Def->getOpcode()) { 140 default: break; 141 case AMDGPU::IMPLICIT_DEF: 142 return nullptr; 143 case AMDGPU::COPY: 144 case AMDGPU::V_MOV_B32_e32: { 145 auto &Op1 = Def->getOperand(1); 146 if (Op1.isImm()) 147 return &Op1; 148 break; 149 } 150 } 151 return &OldOpnd; 152 } 153 154 MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, 155 MachineInstr &MovMI, 156 RegSubRegPair OldOpndVGPR, 157 bool BoundCtrlZero) const { 158 assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp); 159 assert(TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg() == 160 TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)->getReg()); 161 162 auto OrigOp = OrigMI.getOpcode(); 163 auto DPPOp = getDPPOp(OrigOp); 164 if (DPPOp == -1) { 165 LLVM_DEBUG(dbgs() << " failed: no DPP opcode\n"); 166 return nullptr; 167 } 168 169 auto DPPInst = BuildMI(*OrigMI.getParent(), OrigMI, 170 OrigMI.getDebugLoc(), TII->get(DPPOp)); 171 bool Fail = false; 172 do { 173 auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst); 174 assert(Dst); 175 DPPInst.add(*Dst); 176 int NumOperands = 1; 177 178 const int OldIdx = AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::old); 179 if (OldIdx != -1) { 180 assert(OldIdx == NumOperands); 181 assert(isOfRegClass(OldOpndVGPR, AMDGPU::VGPR_32RegClass, *MRI)); 182 DPPInst.addReg(OldOpndVGPR.Reg, 0, OldOpndVGPR.SubReg); 183 ++NumOperands; 184 } 185 186 if (auto *Mod0 = TII->getNamedOperand(OrigMI, 187 AMDGPU::OpName::src0_modifiers)) { 188 assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp, 189 
AMDGPU::OpName::src0_modifiers)); 190 assert(0LL == (Mod0->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))); 191 DPPInst.addImm(Mod0->getImm()); 192 ++NumOperands; 193 } 194 auto *Src0 = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0); 195 assert(Src0); 196 if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src0)) { 197 LLVM_DEBUG(dbgs() << " failed: src0 is illegal\n"); 198 Fail = true; 199 break; 200 } 201 DPPInst.add(*Src0); 202 ++NumOperands; 203 204 if (auto *Mod1 = TII->getNamedOperand(OrigMI, 205 AMDGPU::OpName::src1_modifiers)) { 206 assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp, 207 AMDGPU::OpName::src1_modifiers)); 208 assert(0LL == (Mod1->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))); 209 DPPInst.addImm(Mod1->getImm()); 210 ++NumOperands; 211 } 212 if (auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) { 213 if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src1)) { 214 LLVM_DEBUG(dbgs() << " failed: src1 is illegal\n"); 215 Fail = true; 216 break; 217 } 218 DPPInst.add(*Src1); 219 ++NumOperands; 220 } 221 222 if (auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2)) { 223 if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src2)) { 224 LLVM_DEBUG(dbgs() << " failed: src2 is illegal\n"); 225 Fail = true; 226 break; 227 } 228 DPPInst.add(*Src2); 229 } 230 231 DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl)); 232 DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask)); 233 DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask)); 234 DPPInst.addImm(BoundCtrlZero ? 
1 : 0); 235 } while (false); 236 237 if (Fail) { 238 DPPInst.getInstr()->eraseFromParent(); 239 return nullptr; 240 } 241 LLVM_DEBUG(dbgs() << " combined: " << *DPPInst.getInstr()); 242 return DPPInst.getInstr(); 243 } 244 245 GCNDPPCombine::RegSubRegPair 246 GCNDPPCombine::foldOldOpnd(MachineInstr &OrigMI, 247 RegSubRegPair OldOpndVGPR, 248 MachineOperand &OldOpndValue) const { 249 assert(OldOpndValue.isImm()); 250 switch (OrigMI.getOpcode()) { 251 default: break; 252 case AMDGPU::V_MAX_U32_e32: 253 if (OldOpndValue.getImm() == std::numeric_limits<uint32_t>::max()) 254 return OldOpndVGPR; 255 break; 256 case AMDGPU::V_MAX_I32_e32: 257 if (OldOpndValue.getImm() == std::numeric_limits<int32_t>::max()) 258 return OldOpndVGPR; 259 break; 260 case AMDGPU::V_MIN_I32_e32: 261 if (OldOpndValue.getImm() == std::numeric_limits<int32_t>::min()) 262 return OldOpndVGPR; 263 break; 264 265 case AMDGPU::V_MUL_I32_I24_e32: 266 case AMDGPU::V_MUL_U32_U24_e32: 267 if (OldOpndValue.getImm() == 1) { 268 auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1); 269 assert(Src1 && Src1->isReg()); 270 return getRegSubRegPair(*Src1); 271 } 272 break; 273 } 274 return RegSubRegPair(); 275 } 276 277 // Cases to combine: 278 // $bound_ctrl is DPP_BOUND_ZERO, $old is any 279 // $bound_ctrl is DPP_BOUND_OFF, $old is 0 280 // -> $old = undef, $bound_ctrl = DPP_BOUND_ZERO 281 282 // $bound_ctrl is DPP_BOUND_OFF, $old is undef 283 // -> $old = undef, $bound_ctrl = DPP_BOUND_OFF 284 285 // $bound_ctrl is DPP_BOUND_OFF, $old is foldable 286 // -> $old = folded value, $bound_ctrl = DPP_BOUND_OFF 287 288 MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, 289 MachineInstr &MovMI, 290 RegSubRegPair OldOpndVGPR, 291 MachineOperand *OldOpndValue, 292 bool BoundCtrlZero) const { 293 assert(OldOpndVGPR.Reg); 294 if (!BoundCtrlZero && OldOpndValue) { 295 assert(OldOpndValue->isImm()); 296 OldOpndVGPR = foldOldOpnd(OrigMI, OldOpndVGPR, *OldOpndValue); 297 if (!OldOpndVGPR.Reg) { 298 
LLVM_DEBUG(dbgs() << " failed: old immediate cannot be folded\n"); 299 return nullptr; 300 } 301 } 302 return createDPPInst(OrigMI, MovMI, OldOpndVGPR, BoundCtrlZero); 303 } 304 305 // returns true if MI doesn't have OpndName immediate operand or the 306 // operand has Value 307 bool GCNDPPCombine::hasNoImmOrEqual(MachineInstr &MI, unsigned OpndName, 308 int64_t Value, int64_t Mask) const { 309 auto *Imm = TII->getNamedOperand(MI, OpndName); 310 if (!Imm) 311 return true; 312 313 assert(Imm->isImm()); 314 return (Imm->getImm() & Mask) == Value; 315 } 316 317 bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { 318 assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp); 319 auto *BCZOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bound_ctrl); 320 assert(BCZOpnd && BCZOpnd->isImm()); 321 bool BoundCtrlZero = 0 != BCZOpnd->getImm(); 322 323 LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI); 324 325 auto *OldOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::old); 326 assert(OldOpnd && OldOpnd->isReg()); 327 auto OldOpndVGPR = getRegSubRegPair(*OldOpnd); 328 auto *OldOpndValue = getOldOpndValue(*OldOpnd); 329 assert(!OldOpndValue || OldOpndValue->isImm() || OldOpndValue == OldOpnd); 330 if (OldOpndValue) { 331 if (BoundCtrlZero) { 332 OldOpndVGPR.Reg = AMDGPU::NoRegister; // should be undef, ignore old opnd 333 OldOpndValue = nullptr; 334 } else { 335 if (!OldOpndValue->isImm()) { 336 LLVM_DEBUG(dbgs() << " failed: old operand isn't an imm or undef\n"); 337 return false; 338 } 339 if (OldOpndValue->getImm() == 0) { 340 OldOpndVGPR.Reg = AMDGPU::NoRegister; // should be undef 341 OldOpndValue = nullptr; 342 BoundCtrlZero = true; 343 } 344 } 345 } 346 347 LLVM_DEBUG(dbgs() << " old="; 348 if (!OldOpndValue) 349 dbgs() << "undef"; 350 else 351 dbgs() << OldOpndValue->getImm(); 352 dbgs() << ", bound_ctrl=" << BoundCtrlZero << '\n'); 353 354 std::vector<MachineInstr*> OrigMIs, DPPMIs; 355 if (!OldOpndVGPR.Reg) { // OldOpndVGPR = undef 356 OldOpndVGPR = 
RegSubRegPair( 357 MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass)); 358 auto UndefInst = BuildMI(*MovMI.getParent(), MovMI, MovMI.getDebugLoc(), 359 TII->get(AMDGPU::IMPLICIT_DEF), OldOpndVGPR.Reg); 360 DPPMIs.push_back(UndefInst.getInstr()); 361 } 362 363 OrigMIs.push_back(&MovMI); 364 bool Rollback = true; 365 for (auto &Use : MRI->use_nodbg_operands( 366 TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg())) { 367 Rollback = true; 368 369 auto &OrigMI = *Use.getParent(); 370 auto OrigOp = OrigMI.getOpcode(); 371 if (TII->isVOP3(OrigOp)) { 372 if (!TII->hasVALU32BitEncoding(OrigOp)) { 373 LLVM_DEBUG(dbgs() << " failed: VOP3 hasn't e32 equivalent\n"); 374 break; 375 } 376 // check if other than abs|neg modifiers are set (opsel for example) 377 const int64_t Mask = ~(SISrcMods::ABS | SISrcMods::NEG); 378 if (!hasNoImmOrEqual(OrigMI, AMDGPU::OpName::src0_modifiers, 0, Mask) || 379 !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::src1_modifiers, 0, Mask) || 380 !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::clamp, 0) || 381 !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::omod, 0)) { 382 LLVM_DEBUG(dbgs() << " failed: VOP3 has non-default modifiers\n"); 383 break; 384 } 385 } else if (!TII->isVOP1(OrigOp) && !TII->isVOP2(OrigOp)) { 386 LLVM_DEBUG(dbgs() << " failed: not VOP1/2/3\n"); 387 break; 388 } 389 390 LLVM_DEBUG(dbgs() << " combining: " << OrigMI); 391 if (&Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)) { 392 if (auto *DPPInst = createDPPInst(OrigMI, MovMI, OldOpndVGPR, 393 OldOpndValue, BoundCtrlZero)) { 394 DPPMIs.push_back(DPPInst); 395 Rollback = false; 396 } 397 } else if (OrigMI.isCommutable() && 398 &Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) { 399 auto *BB = OrigMI.getParent(); 400 auto *NewMI = BB->getParent()->CloneMachineInstr(&OrigMI); 401 BB->insert(OrigMI, NewMI); 402 if (TII->commuteInstruction(*NewMI)) { 403 LLVM_DEBUG(dbgs() << " commuted: " << *NewMI); 404 if (auto *DPPInst = createDPPInst(*NewMI, MovMI, OldOpndVGPR, 405 
OldOpndValue, BoundCtrlZero)) { 406 DPPMIs.push_back(DPPInst); 407 Rollback = false; 408 } 409 } else 410 LLVM_DEBUG(dbgs() << " failed: cannot be commuted\n"); 411 NewMI->eraseFromParent(); 412 } else 413 LLVM_DEBUG(dbgs() << " failed: no suitable operands\n"); 414 if (Rollback) 415 break; 416 OrigMIs.push_back(&OrigMI); 417 } 418 419 for (auto *MI : *(Rollback? &DPPMIs : &OrigMIs)) 420 MI->eraseFromParent(); 421 422 return !Rollback; 423 } 424 425 bool GCNDPPCombine::runOnMachineFunction(MachineFunction &MF) { 426 auto &ST = MF.getSubtarget<GCNSubtarget>(); 427 if (!ST.hasDPP() || skipFunction(MF.getFunction())) 428 return false; 429 430 MRI = &MF.getRegInfo(); 431 TII = ST.getInstrInfo(); 432 433 assert(MRI->isSSA() && "Must be run on SSA"); 434 435 bool Changed = false; 436 for (auto &MBB : MF) { 437 for (auto I = MBB.rbegin(), E = MBB.rend(); I != E;) { 438 auto &MI = *I++; 439 if (MI.getOpcode() == AMDGPU::V_MOV_B32_dpp && combineDPPMov(MI)) { 440 Changed = true; 441 ++NumDPPMovsCombined; 442 } 443 } 444 } 445 return Changed; 446 } 447