//===-- SIOptimizeExecMasking.cpp -----------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/Debug.h"

using namespace llvm;

#define DEBUG_TYPE "si-optimize-exec-masking"

namespace {

class SIOptimizeExecMasking : public MachineFunctionPass {
public:
  static char ID;

  SIOptimizeExecMasking() : MachineFunctionPass(ID) {
    initializeSIOptimizeExecMaskingPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI optimize exec mask operations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(SIOptimizeExecMasking, DEBUG_TYPE,
                      "SI optimize exec mask operations", false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_END(SIOptimizeExecMasking, DEBUG_TYPE,
                    "SI optimize exec mask operations", false, false)

char SIOptimizeExecMasking::ID = 0;

char &llvm::SIOptimizeExecMaskingID = SIOptimizeExecMasking::ID;

/// If \p MI is a copy from exec, return the register copied to.
static unsigned isCopyFromExec(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::S_MOV_B64_term: {
    const MachineOperand &Src = MI.getOperand(1);
    if (Src.isReg() && Src.getReg() == AMDGPU::EXEC)
      return MI.getOperand(0).getReg();
  }
  }

  return AMDGPU::NoRegister;
}

/// If \p MI is a copy to exec, return the register copied from.
static unsigned isCopyToExec(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::S_MOV_B64: {
    const MachineOperand &Dst = MI.getOperand(0);
    if (Dst.isReg() && Dst.getReg() == AMDGPU::EXEC)
      return MI.getOperand(1).getReg();
    break;
  }
  case AMDGPU::S_MOV_B64_term:
    llvm_unreachable("should have been replaced");
  }

  return AMDGPU::NoRegister;
}

/// Return the save-exec form of the scalar logical opcode \p Opc, or
/// INSTRUCTION_LIST_END if it has none.
static unsigned getSaveExecOp(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::S_AND_B64:
    return AMDGPU::S_AND_SAVEEXEC_B64;
  case AMDGPU::S_OR_B64:
    return AMDGPU::S_OR_SAVEEXEC_B64;
  case AMDGPU::S_XOR_B64:
    return AMDGPU::S_XOR_SAVEEXEC_B64;
  case AMDGPU::S_ANDN2_B64:
    return AMDGPU::S_ANDN2_SAVEEXEC_B64;
  case AMDGPU::S_ORN2_B64:
    return AMDGPU::S_ORN2_SAVEEXEC_B64;
  case AMDGPU::S_NAND_B64:
    return AMDGPU::S_NAND_SAVEEXEC_B64;
  case AMDGPU::S_NOR_B64:
    return AMDGPU::S_NOR_SAVEEXEC_B64;
  case AMDGPU::S_XNOR_B64:
    return AMDGPU::S_XNOR_SAVEEXEC_B64;
  default:
    return AMDGPU::INSTRUCTION_LIST_END;
  }
}

// These are only terminators to get correct spill code placement during
// register allocation, so turn them back into normal instructions. Only one of
// these is expected per block.
static bool removeTerminatorBit(const SIInstrInfo &TII, MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::S_MOV_B64_term: {
    MI.setDesc(TII.get(AMDGPU::COPY));
    return true;
  }
  case AMDGPU::S_XOR_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII.get(AMDGPU::S_XOR_B64));
    return true;
  }
  case AMDGPU::S_ANDN2_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(TII.get(AMDGPU::S_ANDN2_B64));
    return true;
  }
  default:
    return false;
  }
}

/// Walk the terminators of \p MBB from the end, turning *_term pseudos back
/// into normal instructions. Returns an iterator to the first non-terminator
/// or rewritten instruction encountered, or MBB.rend() if there is none.
static MachineBasicBlock::reverse_iterator fixTerminators(
  const SIInstrInfo &TII,
  MachineBasicBlock &MBB) {
  MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend();
  for (; I != E; ++I) {
    if (!I->isTerminator())
      return I;

    if (removeTerminatorBit(TII, *I))
      return I;
  }

  return E;
}

/// Scan backwards from \p I for a copy from exec, giving up after a small
/// number of instructions. Returns an iterator to the copy, or MBB.rend() if
/// none was found.
static MachineBasicBlock::reverse_iterator findExecCopy(
  const SIInstrInfo &TII,
  MachineBasicBlock &MBB,
  MachineBasicBlock::reverse_iterator I,
  unsigned CopyToExec) {
  const unsigned InstLimit = 25;

  auto E = MBB.rend();
  for (unsigned N = 0; N <= InstLimit && I != E; ++I, ++N) {
    unsigned CopyFromExec = isCopyFromExec(*I);
    if (CopyFromExec != AMDGPU::NoRegister)
      return I;
  }

  return E;
}

// XXX - Seems LivePhysRegs doesn't work correctly since it will incorrectly
// report the register as unavailable because a super-register with a lane mask
// is unavailable.
static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) {
  for (MachineBasicBlock *Succ : MBB.successors()) {
    if (Succ->isLiveIn(Reg))
      return true;
  }

  return false;
}

bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  // Optimize sequences emitted for control flow lowering. They are originally
  // emitted as separate operations because spill code may need to be inserted
  // for the saved copy of exec.
  //
  //     x = copy exec
  //     z = s_<op>_b64 x, y
  //     exec = copy z
  // =>
  //     x = s_<op>_saveexec_b64 y
  //

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::reverse_iterator I = fixTerminators(*TII, MBB);
    MachineBasicBlock::reverse_iterator E = MBB.rend();
    if (I == E)
      continue;

    unsigned CopyToExec = isCopyToExec(*I);
    if (CopyToExec == AMDGPU::NoRegister)
      continue;

    // Scan backwards to find the def.
    auto CopyToExecInst = &*I;
    auto CopyFromExecInst = findExecCopy(*TII, MBB, I, CopyToExec);
    if (CopyFromExecInst == E)
      continue;

    if (isLiveOut(MBB, CopyToExec)) {
      // The copied register is live out and has a second use in another block.
      DEBUG(dbgs() << "Exec copy source register is live out\n");
      continue;
    }

    unsigned CopyFromExec = CopyFromExecInst->getOperand(0).getReg();
    MachineInstr *SaveExecInst = nullptr;
    SmallVector<MachineInstr *, 4> OtherUseInsts;

    for (MachineBasicBlock::iterator J
           = std::next(CopyFromExecInst->getIterator()), JE = I->getIterator();
         J != JE; ++J) {
      if (SaveExecInst && J->readsRegister(AMDGPU::EXEC, TRI)) {
        DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n');
        // Make sure this is inserted after any VALU ops that may have been
        // scheduled in between.
        SaveExecInst = nullptr;
        break;
      }

      if (J->modifiesRegister(CopyToExec, TRI)) {
        if (SaveExecInst) {
          DEBUG(dbgs() << "Multiple instructions modify "
                << PrintReg(CopyToExec, TRI) << '\n');
          SaveExecInst = nullptr;
          break;
        }

        unsigned SaveExecOp = getSaveExecOp(J->getOpcode());
        if (SaveExecOp == AMDGPU::INSTRUCTION_LIST_END)
          break;

        if (J->readsRegister(CopyFromExec, TRI)) {
          SaveExecInst = &*J;
          DEBUG(dbgs() << "Found save exec op: " << *SaveExecInst << '\n');
          continue;
        } else {
          DEBUG(dbgs() << "Instruction does not read exec copy: " << *J << '\n');
          break;
        }
      }

      if (SaveExecInst && J->readsRegister(CopyToExec, TRI)) {
        assert(SaveExecInst != &*J);
        OtherUseInsts.push_back(&*J);
      }
    }

    if (!SaveExecInst)
      continue;

    DEBUG(dbgs() << "Insert save exec op: " << *SaveExecInst << '\n');

    MachineOperand &Src0 = SaveExecInst->getOperand(1);
    MachineOperand &Src1 = SaveExecInst->getOperand(2);

    MachineOperand *OtherOp = nullptr;

    if (Src0.isReg() && Src0.getReg() == CopyFromExec) {
      OtherOp = &Src1;
    } else if (Src1.isReg() && Src1.getReg() == CopyFromExec) {
      if (!SaveExecInst->isCommutable())
        break;

      OtherOp = &Src0;
    } else
      llvm_unreachable("unexpected");

    CopyFromExecInst->eraseFromParent();

    auto InsPt = SaveExecInst->getIterator();
    const DebugLoc &DL = SaveExecInst->getDebugLoc();

    BuildMI(MBB, InsPt, DL, TII->get(getSaveExecOp(SaveExecInst->getOpcode())),
            CopyFromExec)
      .addReg(OtherOp->getReg());
    SaveExecInst->eraseFromParent();

    CopyToExecInst->eraseFromParent();

    for (MachineInstr *OtherInst : OtherUseInsts) {
      OtherInst->substituteRegister(CopyToExec, AMDGPU::EXEC,
                                    AMDGPU::NoSubRegister, *TRI);
    }
  }

  return true;
}