//===-- AMDGPUAtomicOptimizer.cpp -----------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass optimizes atomic operations by using a single lane of a wavefront
/// to perform the atomic operation, thus reducing contention on that memory
/// location.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#define DEBUG_TYPE "amdgpu-atomic-optimizer"

using namespace llvm;

namespace {

enum DPP_CTRL {
  DPP_ROW_SR1 = 0x111,
  DPP_ROW_SR2 = 0x112,
  DPP_ROW_SR4 = 0x114,
  DPP_ROW_SR8 = 0x118,
  DPP_WF_SR1 = 0x138,
  DPP_ROW_BCAST15 = 0x142,
  DPP_ROW_BCAST31 = 0x143
};

struct ReplacementInfo {
  Instruction *I;
  Instruction::BinaryOps Op;
  unsigned ValIdx;
  bool ValDivergent;
};

class AMDGPUAtomicOptimizer : public FunctionPass,
                              public InstVisitor<AMDGPUAtomicOptimizer> {
private:
  SmallVector<ReplacementInfo, 8> ToReplace;
  const LegacyDivergenceAnalysis *DA;
  const DataLayout *DL;
  DominatorTree *DT;
  bool HasDPP;

  void optimizeAtomic(Instruction &I, Instruction::BinaryOps Op,
                      unsigned ValIdx, bool ValDivergent) const;

  void setConvergent(CallInst *const CI) const;

public:
  static char ID;

  AMDGPUAtomicOptimizer() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<LegacyDivergenceAnalysis>();
    AU.addRequired<TargetPassConfig>();
  }

  void visitAtomicRMWInst(AtomicRMWInst &I);
  void visitIntrinsicInst(IntrinsicInst &I);
};

} // namespace

char AMDGPUAtomicOptimizer::ID = 0;

char &llvm::AMDGPUAtomicOptimizerID = AMDGPUAtomicOptimizer::ID;

bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F)) {
    return false;
  }

  DA = &getAnalysis<LegacyDivergenceAnalysis>();
  DL = &F.getParent()->getDataLayout();
  DominatorTreeWrapperPass *const DTW =
      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  DT = DTW ? &DTW->getDomTree() : nullptr;
  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
  HasDPP = ST.hasDPP();

  visit(F);

  const bool Changed = !ToReplace.empty();

  for (ReplacementInfo &Info : ToReplace) {
    optimizeAtomic(*Info.I, Info.Op, Info.ValIdx, Info.ValDivergent);
  }

  ToReplace.clear();

  return Changed;
}

void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
  // Early exit for unhandled address space atomic instructions.
  switch (I.getPointerAddressSpace()) {
  default:
    return;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::LOCAL_ADDRESS:
    break;
  }

  Instruction::BinaryOps Op;

  switch (I.getOperation()) {
  default:
    return;
  case AtomicRMWInst::Add:
    Op = Instruction::Add;
    break;
  case AtomicRMWInst::Sub:
    Op = Instruction::Sub;
    break;
  }

  const unsigned PtrIdx = 0;
  const unsigned ValIdx = 1;

  // If the pointer operand is divergent, then each lane is doing an atomic
  // operation on a different address, and we cannot optimize that.
  if (DA->isDivergent(I.getOperand(PtrIdx))) {
    return;
  }

  const bool ValDivergent = DA->isDivergent(I.getOperand(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent && (!HasDPP || (DL->getTypeSizeInBits(I.getType()) != 32))) {
    return;
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
  Instruction::BinaryOps Op;

  switch (I.getIntrinsicID()) {
  default:
    return;
  case Intrinsic::amdgcn_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
    Op = Instruction::Add;
    break;
  case Intrinsic::amdgcn_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
    Op = Instruction::Sub;
    break;
  }

  const unsigned ValIdx = 0;

  const bool ValDivergent = DA->isDivergent(I.getOperand(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent && (!HasDPP || (DL->getTypeSizeInBits(I.getType()) != 32))) {
    return;
  }

  // If any of the other arguments to the intrinsic are divergent, we can't
  // optimize the operation.
  for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
    if (DA->isDivergent(I.getOperand(Idx))) {
      return;
    }
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

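// Rewrite the atomic so that only a single lane of the wavefront performs it.
// The per-lane values are combined up front: with a DPP-based scan when the
// value is divergent, or by multiplying the uniform value by the number of
// active lanes otherwise. One lane then issues the atomic with the combined
// value, the value it gets back is broadcast to the whole wavefront, and each
// lane applies its own offset with the same binary operation so that it sees
// the result its original atomic would have returned.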
void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
                                           Instruction::BinaryOps Op,
                                           unsigned ValIdx,
                                           bool ValDivergent) const {
  LLVMContext &Context = I.getContext();

  // Start building just before the instruction.
  IRBuilder<> B(&I);

  Type *const Ty = I.getType();
  const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
  Type *const VecTy = VectorType::get(B.getInt32Ty(), 2);

  // This is the value in the atomic operation we need to combine in order to
  // reduce the number of atomic operations.
  Value *const V = I.getOperand(ValIdx);

  // We need to know how many lanes are active within the wavefront, and we do
  // this by getting the exec register, which tells us all the lanes that are
  // active.
  MDNode *const RegName =
      llvm::MDNode::get(Context, llvm::MDString::get(Context, "exec"));
  Value *const Metadata = llvm::MetadataAsValue::get(Context, RegName);
  CallInst *const Exec =
      B.CreateIntrinsic(Intrinsic::read_register, {B.getInt64Ty()}, {Metadata});
  setConvergent(Exec);

  // We need to know how many lanes are active within the wavefront that are
  // below us. If we counted each lane linearly starting from 0, a lane is
  // below us only if its associated index was less than ours. We do this by
  // using the mbcnt intrinsic.
  Value *const BitCast = B.CreateBitCast(Exec, VecTy);
  Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0));
  Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1));
  CallInst *const PartialMbcnt = B.CreateIntrinsic(
      Intrinsic::amdgcn_mbcnt_lo, {}, {ExtractLo, B.getInt32(0)});
  CallInst *const Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {},
                                            {ExtractHi, PartialMbcnt});

  Value *const MbcntCast = B.CreateIntCast(Mbcnt, Ty, false);

  Value *LaneOffset = nullptr;
  Value *NewV = nullptr;

  // If we have a divergent value in each lane, we need to combine the value
  // using DPP.
  if (ValDivergent) {
    // First we need to set all inactive invocations to 0, so that they can
    // correctly contribute to the final result.
    CallInst *const SetInactive = B.CreateIntrinsic(
        Intrinsic::amdgcn_set_inactive, Ty, {V, B.getIntN(TyBitWidth, 0)});
    setConvergent(SetInactive);
    NewV = SetInactive;

    // The DPP controls below are four row shift-rights followed by two row
    // broadcasts; together with the binary op in the loop they produce a
    // wavefront-wide scan. The row masks restrict the two broadcast steps to
    // the rows they need to update.
    const unsigned Iters = 6;
    const unsigned DPPCtrl[Iters] = {DPP_ROW_SR1,     DPP_ROW_SR2,
                                     DPP_ROW_SR4,     DPP_ROW_SR8,
                                     DPP_ROW_BCAST15, DPP_ROW_BCAST31};
    const unsigned RowMask[Iters] = {0xf, 0xf, 0xf, 0xf, 0xa, 0xc};

    // This loop performs an inclusive scan across the wavefront, with all lanes
    // active (by using the WWM intrinsic).
    for (unsigned Idx = 0; Idx < Iters; Idx++) {
      CallInst *const DPP = B.CreateIntrinsic(Intrinsic::amdgcn_mov_dpp, Ty,
                                              {NewV, B.getInt32(DPPCtrl[Idx]),
                                               B.getInt32(RowMask[Idx]),
                                               B.getInt32(0xf), B.getFalse()});
      setConvergent(DPP);
      Value *const WWM = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, DPP);

      NewV = B.CreateBinOp(Op, NewV, WWM);
      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV);
    }

    // NewV now contains the inclusive scan of V, but for the lane offset we
    // require an exclusive scan. We do this by shifting the values from the
    // entire wavefront right by 1, and by setting the bound_ctrl (last argument
    // to the intrinsic below) to true, we can guarantee that 0 will be shifted
    // into the 0'th invocation.
    CallInst *const DPP =
        B.CreateIntrinsic(Intrinsic::amdgcn_mov_dpp, {Ty},
                          {NewV, B.getInt32(DPP_WF_SR1), B.getInt32(0xf),
                           B.getInt32(0xf), B.getTrue()});
    setConvergent(DPP);
    LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, DPP);

    // Read the value from the last lane, which has accumulated the values of
    // each active lane in the wavefront. This will be the new value which we
    // provide to the atomic operation.
    if (TyBitWidth == 64) {
      Value *const ExtractLo = B.CreateTrunc(NewV, B.getInt32Ty());
      Value *const ExtractHi =
          B.CreateTrunc(B.CreateLShr(NewV, B.getInt64(32)), B.getInt32Ty());
      CallInst *const ReadLaneLo = B.CreateIntrinsic(
          Intrinsic::amdgcn_readlane, {}, {ExtractLo, B.getInt32(63)});
      setConvergent(ReadLaneLo);
      CallInst *const ReadLaneHi = B.CreateIntrinsic(
          Intrinsic::amdgcn_readlane, {}, {ExtractHi, B.getInt32(63)});
      setConvergent(ReadLaneHi);
      Value *const PartialInsert = B.CreateInsertElement(
          UndefValue::get(VecTy), ReadLaneLo, B.getInt32(0));
      Value *const Insert =
          B.CreateInsertElement(PartialInsert, ReadLaneHi, B.getInt32(1));
      NewV = B.CreateBitCast(Insert, Ty);
    } else if (TyBitWidth == 32) {
      CallInst *const ReadLane = B.CreateIntrinsic(Intrinsic::amdgcn_readlane,
                                                   {}, {NewV, B.getInt32(63)});
      setConvergent(ReadLane);
      NewV = ReadLane;
    } else {
      llvm_unreachable("Unhandled atomic bit width");
    }
  } else {
    // Get the total number of active lanes we have by using popcount.
    Instruction *const Ctpop = B.CreateUnaryIntrinsic(Intrinsic::ctpop, Exec);
    Value *const CtpopCast = B.CreateIntCast(Ctpop, Ty, false);

    // Calculate the new value we will be contributing to the atomic operation
    // for the entire wavefront.
    NewV = B.CreateMul(V, CtpopCast);
    LaneOffset = B.CreateMul(V, MbcntCast);
  }

  // We only want a single lane to enter our new control flow, and we do this
  // by checking if there are any active lanes below us. Only one lane will
  // have 0 active lanes below it, so that will be the only one to progress.
  Value *const Cond = B.CreateICmpEQ(MbcntCast, B.getIntN(TyBitWidth, 0));

  // Store I's original basic block before we split the block.
  BasicBlock *const EntryBB = I.getParent();

  // We need to introduce some new control flow to force a single lane to be
  // active. We do this by splitting I's basic block at I, and introducing the
  // new block such that:
  // entry --> single_lane -\
  //       \------------------> exit
  Instruction *const SingleLaneTerminator =
      SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);

  // Move the IR builder into single_lane next.
  B.SetInsertPoint(SingleLaneTerminator);

  // Clone the original atomic operation into single lane, replacing the
  // original value with our newly created one.
  Instruction *const NewI = I.clone();
  B.Insert(NewI);
  NewI->setOperand(ValIdx, NewV);

  // Move the IR builder into exit next, and start inserting just before the
  // original instruction.
  B.SetInsertPoint(&I);

  // Create a PHI node to get our new atomic result into the exit block.
  PHINode *const PHI = B.CreatePHI(Ty, 2);
  PHI->addIncoming(UndefValue::get(Ty), EntryBB);
  PHI->addIncoming(NewI, SingleLaneTerminator->getParent());

  // We need to broadcast the value read from the lowest active lane (the first
  // lane) to all other lanes in the wavefront.
  // We use an intrinsic for this, but have to handle 64-bit broadcasts with
  // two calls to this intrinsic.
  Value *BroadcastI = nullptr;

  if (TyBitWidth == 64) {
    Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());
    Value *const ExtractHi =
        B.CreateTrunc(B.CreateLShr(PHI, B.getInt64(32)), B.getInt32Ty());
    CallInst *const ReadFirstLaneLo =
        B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
    setConvergent(ReadFirstLaneLo);
    CallInst *const ReadFirstLaneHi =
        B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
    setConvergent(ReadFirstLaneHi);
    Value *const PartialInsert = B.CreateInsertElement(
        UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
    Value *const Insert =
        B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
    BroadcastI = B.CreateBitCast(Insert, Ty);
  } else if (TyBitWidth == 32) {
    CallInst *const ReadFirstLane =
        B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
    setConvergent(ReadFirstLane);
    BroadcastI = ReadFirstLane;
  } else {
    llvm_unreachable("Unhandled atomic bit width");
  }

  // Now that we have the result of our single atomic operation, we need to
  // get our individual lane's slice of the result. We combine the lane offset
  // we previously calculated with the atomic result value we got from the
  // first lane to compute the value our own lane's atomic would have returned.
  Value *const Result = B.CreateBinOp(Op, BroadcastI, LaneOffset);

  // Replace the original atomic instruction with the new one.
  I.replaceAllUsesWith(Result);

  // And delete the original.
  I.eraseFromParent();
}

void AMDGPUAtomicOptimizer::setConvergent(CallInst *const CI) const {
  CI->addAttribute(AttributeList::FunctionIndex, Attribute::Convergent);
}

INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                      "AMDGPU atomic optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                    "AMDGPU atomic optimizations", false, false)

FunctionPass *llvm::createAMDGPUAtomicOptimizerPass() {
  return new AMDGPUAtomicOptimizer();
}