//===-- SIFixSGPRCopies.cpp - Remove potential VGPR => SGPR copies --------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Copies from VGPR to SGPR registers are illegal and the register coalescer
/// will sometimes generate these illegal copies in situations like this:
///
///  Register Class <vsrc> is the union of <vgpr> and <sgpr>
///
///  BB0:
///    %vreg0 <sgpr> = SCALAR_INST
///    %vreg1 <vsrc> = COPY %vreg0 <sgpr>
///     ...
///    BRANCH %cond BB1, BB2
///  BB1:
///    %vreg2 <vgpr> = VECTOR_INST
///    %vreg3 <vsrc> = COPY %vreg2 <vgpr>
///  BB2:
///    %vreg4 <vsrc> = PHI %vreg1 <vsrc>, <BB#0>, %vreg3 <vsrc>, <BB#1>
///    %vreg5 <vgpr> = VECTOR_INST %vreg4 <vsrc>
///
/// The coalescer will begin at BB0 and eliminate its copy, then the resulting
/// code will look like this:
///
///  BB0:
///    %vreg0 <sgpr> = SCALAR_INST
///     ...
///    BRANCH %cond BB1, BB2
///  BB1:
///    %vreg2 <vgpr> = VECTOR_INST
///    %vreg3 <vsrc> = COPY %vreg2 <vgpr>
///  BB2:
///    %vreg4 <sgpr> = PHI %vreg0 <sgpr>, <BB#0>, %vreg3 <vsrc>, <BB#1>
///    %vreg5 <vgpr> = VECTOR_INST %vreg4 <sgpr>
///
/// Now that the result of the PHI instruction is an SGPR, the register
/// allocator is forced to constrain the register class of %vreg3 to
/// <sgpr>, so we end up with final code like this:
///
///  BB0:
///    %vreg0 <sgpr> = SCALAR_INST
///     ...
///    BRANCH %cond BB1, BB2
///  BB1:
///    %vreg2 <vgpr> = VECTOR_INST
///    %vreg3 <sgpr> = COPY %vreg2 <vgpr>
///  BB2:
///    %vreg4 <sgpr> = PHI %vreg0 <sgpr>, <BB#0>, %vreg3 <sgpr>, <BB#1>
///    %vreg5 <vgpr> = VECTOR_INST %vreg4 <sgpr>
///
/// Now this code contains an illegal copy from a VGPR to an SGPR.
///
/// In order to avoid this problem, this pass searches for PHI instructions
/// which define a <vsrc> register and constrains their definition class to
/// <vgpr> if a user of the PHI's definition register is a vector instruction.
/// If the PHI's definition class is constrained to <vgpr>, then the coalescer
/// will be unable to perform the COPY removal from the above example, which
/// ultimately led to the creation of an illegal COPY.
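///
/// In practice, the pass walks every machine instruction and legalizes each
/// case that can give rise to a VGPR-to-SGPR copy: plain COPYs (folding a
/// copied immediate into an S_MOV where possible), PHIs, REG_SEQUENCEs, and
/// INSERT_SUBREGs, moving the offending instruction to the VALU with
/// SIInstrInfo::moveToVALU when no cheaper fix applies.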
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#define DEBUG_TYPE "si-fix-sgpr-copies"

namespace {

class SIFixSGPRCopies : public MachineFunctionPass {

  MachineDominatorTree *MDT;

public:
  static char ID;

  SIFixSGPRCopies() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Fix SGPR copies"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace

INITIALIZE_PASS_BEGIN(SIFixSGPRCopies, DEBUG_TYPE,
                      "SI Fix SGPR copies", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_END(SIFixSGPRCopies, DEBUG_TYPE,
                    "SI Fix SGPR copies", false, false)

char SIFixSGPRCopies::ID = 0;

char &llvm::SIFixSGPRCopiesID = SIFixSGPRCopies::ID;

FunctionPass *llvm::createSIFixSGPRCopiesPass() {
  return new SIFixSGPRCopies();
}

// Return true if any operand of MI is a virtual register with a VGPR class.
static bool hasVGPROperands(const MachineInstr &MI, const SIRegisterInfo *TRI) {
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
    if (!MI.getOperand(i).isReg() ||
        !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg()))
      continue;

    if (TRI->hasVGPRs(MRI.getRegClass(MI.getOperand(i).getReg())))
      return true;
  }
  return false;
}

// Return the register classes of a COPY's source and destination operands.
static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
getCopyRegClasses(const MachineInstr &Copy,
                  const SIRegisterInfo &TRI,
                  const MachineRegisterInfo &MRI) {
  unsigned DstReg = Copy.getOperand(0).getReg();
  unsigned SrcReg = Copy.getOperand(1).getReg();

  const TargetRegisterClass *SrcRC =
    TargetRegisterInfo::isVirtualRegister(SrcReg) ?
    MRI.getRegClass(SrcReg) :
    TRI.getPhysRegClass(SrcReg);

  // We don't really care about the subregister here.
  // SrcRC = TRI.getSubRegClass(SrcRC, Copy.getOperand(1).getSubReg());

  const TargetRegisterClass *DstRC =
    TargetRegisterInfo::isVirtualRegister(DstReg) ?
    MRI.getRegClass(DstReg) :
    TRI.getPhysRegClass(DstReg);

  return std::make_pair(SrcRC, DstRC);
}

static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC,
                             const TargetRegisterClass *DstRC,
                             const SIRegisterInfo &TRI) {
  return TRI.isSGPRClass(DstRC) && TRI.hasVGPRs(SrcRC);
}

static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,
                             const TargetRegisterClass *DstRC,
                             const SIRegisterInfo &TRI) {
  return TRI.isSGPRClass(SrcRC) && TRI.hasVGPRs(DstRC);
}

// Distribute an SGPR->VGPR copy of a REG_SEQUENCE into a VGPR REG_SEQUENCE.
//
//  SGPRx = ...
//  SGPRy = REG_SEQUENCE SGPRx, sub0 ...
//  VGPRz = COPY SGPRy
//
// ==>
//
//  VGPRx = COPY SGPRx
//  VGPRz = REG_SEQUENCE VGPRx, sub0
//
// This exposes immediate folding opportunities when materializing 64-bit
// immediates.
static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
                                        const SIRegisterInfo *TRI,
                                        const SIInstrInfo *TII,
                                        MachineRegisterInfo &MRI) {
  assert(MI.isRegSequence());

  unsigned DstReg = MI.getOperand(0).getReg();
  if (!TRI->isSGPRClass(MRI.getRegClass(DstReg)))
    return false;

  if (!MRI.hasOneUse(DstReg))
    return false;

  MachineInstr &CopyUse = *MRI.use_instr_begin(DstReg);
  if (!CopyUse.isCopy())
    return false;

  const TargetRegisterClass *SrcRC, *DstRC;
  std::tie(SrcRC, DstRC) = getCopyRegClasses(CopyUse, *TRI, MRI);

  if (!isSGPRToVGPRCopy(SrcRC, DstRC, *TRI))
    return false;

  // TODO: Could have multiple extracts?
  unsigned SubReg = CopyUse.getOperand(1).getSubReg();
  if (SubReg != AMDGPU::NoSubRegister)
    return false;

  MRI.setRegClass(DstReg, DstRC);

  // SGPRx = ...
  // SGPRy = REG_SEQUENCE SGPRx, sub0 ...
  // VGPRz = COPY SGPRy
  //
  // =>
  //
  // VGPRx = COPY SGPRx
  // VGPRz = REG_SEQUENCE VGPRx, sub0

  MI.getOperand(0).setReg(CopyUse.getOperand(0).getReg());

  for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
    unsigned SrcReg = MI.getOperand(I).getReg();
    unsigned SrcSubReg = MI.getOperand(I).getSubReg();

    const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
    assert(TRI->isSGPRClass(SrcRC) &&
           "Expected SGPR REG_SEQUENCE to only have SGPR inputs");

    SrcRC = TRI->getSubRegClass(SrcRC, SrcSubReg);
    const TargetRegisterClass *NewSrcRC = TRI->getEquivalentVGPRClass(SrcRC);

    unsigned TmpReg = MRI.createVirtualRegister(NewSrcRC);

    BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY),
            TmpReg)
      .addOperand(MI.getOperand(I));

    MI.getOperand(I).setReg(TmpReg);
  }

  CopyUse.eraseFromParent();
  return true;
}

// Return true if any of the PHI's incoming values lives in a VGPR class.
static bool phiHasVGPROperands(const MachineInstr &PHI,
                               const MachineRegisterInfo &MRI,
                               const SIRegisterInfo *TRI,
                               const SIInstrInfo *TII) {
  for (unsigned i = 1; i < PHI.getNumOperands(); i += 2) {
    unsigned Reg = PHI.getOperand(i).getReg();
    if (TRI->hasVGPRs(MRI.getRegClass(Reg)))
      return true;
  }
  return false;
}

// Return true if any incoming value of the PHI is (transitively, through
// other PHIs) defined by one of the SI_*_BREAK pseudo instructions.
static bool phiHasBreakDef(const MachineInstr &PHI,
                           const MachineRegisterInfo &MRI,
                           SmallSet<unsigned, 8> &Visited) {
  for (unsigned i = 1; i < PHI.getNumOperands(); i += 2) {
    unsigned Reg = PHI.getOperand(i).getReg();
    if (Visited.count(Reg))
      continue;

    Visited.insert(Reg);

    MachineInstr *DefInstr = MRI.getUniqueVRegDef(Reg);
    assert(DefInstr);
    switch (DefInstr->getOpcode()) {
    default:
      break;
    case AMDGPU::SI_BREAK:
    case AMDGPU::SI_IF_BREAK:
    case AMDGPU::SI_ELSE_BREAK:
      return true;
    case AMDGPU::PHI:
      if (phiHasBreakDef(*DefInstr, MRI, Visited))
        return true;
    }
  }
  return false;
}

static bool hasTerminatorThatModifiesExec(const MachineBasicBlock &MBB,
                                          const TargetRegisterInfo &TRI) {
  for (MachineBasicBlock::const_iterator I = MBB.getFirstTerminator(),
       E = MBB.end(); I != E; ++I) {
    if (I->modifiesRegister(AMDGPU::EXEC, &TRI))
      return true;
  }
  return false;
}
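
// Check whether a VGPR-to-SGPR COPY whose source is a move-immediate can
// instead be rewritten as a scalar move, e.g.:
//
//  %vgpr = V_MOV_B32_e32 imm
//  %sgpr = COPY %vgpr
//
// ==>
//
//  %sgpr = S_MOV_B32 imm
//
// On success, SMovOp and Imm are set to the scalar move opcode and the
// immediate to use.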
static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy,
                                    const MachineInstr *MoveImm,
                                    const SIInstrInfo *TII,
                                    unsigned &SMovOp,
                                    int64_t &Imm) {
  if (!MoveImm->isMoveImmediate())
    return false;

  const MachineOperand *ImmOp =
    TII->getNamedOperand(*MoveImm, AMDGPU::OpName::src0);
  if (!ImmOp->isImm())
    return false;

  // FIXME: Handle copies with sub-regs.
  if (Copy->getOperand(0).getSubReg())
    return false;

  switch (MoveImm->getOpcode()) {
  default:
    return false;
  case AMDGPU::V_MOV_B32_e32:
    SMovOp = AMDGPU::S_MOV_B32;
    break;
  case AMDGPU::V_MOV_B64_PSEUDO:
    SMovOp = AMDGPU::S_MOV_B64;
    break;
  }
  Imm = ImmOp->getImm();
  return true;
}

bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  MDT = &getAnalysis<MachineDominatorTree>();

  SmallVector<MachineInstr *, 16> Worklist;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {
    MachineBasicBlock &MBB = *BI;
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
         I != E; ++I) {
      MachineInstr &MI = *I;

      switch (MI.getOpcode()) {
      default:
        continue;
      case AMDGPU::COPY: {
        // If the destination register is a physical register there isn't
        // really much we can do to fix this.
        if (!TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg()))
          continue;

        const TargetRegisterClass *SrcRC, *DstRC;
        std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, MRI);
        if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) {
          MachineInstr *DefMI = MRI.getVRegDef(MI.getOperand(1).getReg());
          unsigned SMovOp;
          int64_t Imm;
          // If we are just copying an immediate, we can replace the copy with
          // s_mov_b32.
          if (isSafeToFoldImmIntoCopy(&MI, DefMI, TII, SMovOp, Imm)) {
            MI.getOperand(1).ChangeToImmediate(Imm);
            MI.addImplicitDefUseOperands(MF);
            MI.setDesc(TII->get(SMovOp));
            break;
          }
          TII->moveToVALU(MI);
        }

        break;
      }
      case AMDGPU::PHI: {
        unsigned Reg = MI.getOperand(0).getReg();
        if (!TRI->isSGPRClass(MRI.getRegClass(Reg)))
          break;

        // We don't need to fix the PHI if the common dominator of the
        // two incoming blocks terminates with a uniform branch.
        if (MI.getNumExplicitOperands() == 5) {
          MachineBasicBlock *MBB0 = MI.getOperand(2).getMBB();
          MachineBasicBlock *MBB1 = MI.getOperand(4).getMBB();

          MachineBasicBlock *NCD = MDT->findNearestCommonDominator(MBB0, MBB1);
          if (NCD && !hasTerminatorThatModifiesExec(*NCD, *TRI)) {
            DEBUG(dbgs() << "Not fixing PHI for uniform branch: " << MI << '\n');
            break;
          }
        }

        // If a PHI node defines an SGPR and any of its operands are VGPRs,
        // then we need to move it to the VALU.
        //
        // Also, if a PHI node defines an SGPR and has all SGPR operands
        // we must move it to the VALU, because the SGPR operands will
        // all end up being assigned the same register, which means
        // there is a potential for a conflict if different threads take
        // different control flow paths.
        //
        // For example:
        //
        // sgpr0 = def;
        // ...
        // sgpr1 = def;
        // ...
        // sgpr2 = PHI sgpr0, sgpr1
        // use sgpr2;
        //
        // Will become:
        //
        // sgpr2 = def;
        // ...
        // sgpr2 = def;
        // ...
        // use sgpr2
        //
        // The one exception to this rule is when one of the operands
        // is defined by a SI_BREAK, SI_IF_BREAK, or SI_ELSE_BREAK
        // instruction. In this case, we know the program will
        // never enter the second block (the loop) without entering
        // the first block (where the condition is computed), so there
        // is no chance for values to be overwritten.

        SmallSet<unsigned, 8> Visited;
        if (phiHasVGPROperands(MI, MRI, TRI, TII) ||
            !phiHasBreakDef(MI, MRI, Visited)) {
          DEBUG(dbgs() << "Fixing PHI: " << MI);
          TII->moveToVALU(MI);
        }
        break;
      }
      case AMDGPU::REG_SEQUENCE: {
        if (TRI->hasVGPRs(TII->getOpRegClass(MI, 0)) ||
            !hasVGPROperands(MI, TRI)) {
          foldVGPRCopyIntoRegSequence(MI, TRI, TII, MRI);
          continue;
        }

        DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI);

        TII->moveToVALU(MI);
        break;
      }
      case AMDGPU::INSERT_SUBREG: {
        const TargetRegisterClass *DstRC, *Src0RC, *Src1RC;
        DstRC = MRI.getRegClass(MI.getOperand(0).getReg());
        Src0RC = MRI.getRegClass(MI.getOperand(1).getReg());
        Src1RC = MRI.getRegClass(MI.getOperand(2).getReg());
        if (TRI->isSGPRClass(DstRC) &&
            (TRI->hasVGPRs(Src0RC) || TRI->hasVGPRs(Src1RC))) {
          DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI);
          TII->moveToVALU(MI);
        }
        break;
      }
      }
    }
  }

  return true;
}