//===-- AMDGPUAtomicOptimizer.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass optimizes atomic operations by using a single lane of a wavefront
/// to perform the atomic operation, thus reducing contention on that memory
/// location.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#define DEBUG_TYPE "amdgpu-atomic-optimizer"

using namespace llvm;

namespace {

enum DPP_CTRL {
  DPP_ROW_SR1 = 0x111,
  DPP_ROW_SR2 = 0x112,
  DPP_ROW_SR4 = 0x114,
  DPP_ROW_SR8 = 0x118,
  DPP_WF_SR1 = 0x138,
  DPP_ROW_BCAST15 = 0x142,
  DPP_ROW_BCAST31 = 0x143
};

struct ReplacementInfo {
  Instruction *I;
  Instruction::BinaryOps Op;
  unsigned ValIdx;
  bool ValDivergent;
};

class AMDGPUAtomicOptimizer : public FunctionPass,
                              public InstVisitor<AMDGPUAtomicOptimizer> {
private:
  SmallVector<ReplacementInfo, 8> ToReplace;
  const LegacyDivergenceAnalysis *DA;
  const DataLayout *DL;
  DominatorTree *DT;
  bool HasDPP;
  bool IsPixelShader;

  void optimizeAtomic(Instruction &I, Instruction::BinaryOps Op,
                      unsigned ValIdx, bool ValDivergent) const;

  void setConvergent(CallInst *const CI) const;

public:
  static char ID;

  AMDGPUAtomicOptimizer() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<LegacyDivergenceAnalysis>();
    AU.addRequired<TargetPassConfig>();
  }

  void visitAtomicRMWInst(AtomicRMWInst &I);
  void visitIntrinsicInst(IntrinsicInst &I);
};

} // namespace

char AMDGPUAtomicOptimizer::ID = 0;

char &llvm::AMDGPUAtomicOptimizerID = AMDGPUAtomicOptimizer::ID;

bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F)) {
    return false;
  }

  DA = &getAnalysis<LegacyDivergenceAnalysis>();
  DL = &F.getParent()->getDataLayout();
  DominatorTreeWrapperPass *const DTW =
      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  DT = DTW ? &DTW->getDomTree() : nullptr;
  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
  HasDPP = ST.hasDPP();
  IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;

  visit(F);

  const bool Changed = !ToReplace.empty();

  for (ReplacementInfo &Info : ToReplace) {
    optimizeAtomic(*Info.I, Info.Op, Info.ValIdx, Info.ValDivergent);
  }

  ToReplace.clear();

  return Changed;
}

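// Note: the two visitors below only collect candidate atomics in ToReplace;
// the actual rewriting is deferred to optimizeAtomic, which runOnFunction
// invokes once the InstVisitor traversal of the function has finished.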
void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
  // Early exit for unhandled address space atomic instructions.
  switch (I.getPointerAddressSpace()) {
  default:
    return;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::LOCAL_ADDRESS:
    break;
  }

  Instruction::BinaryOps Op;

  switch (I.getOperation()) {
  default:
    return;
  case AtomicRMWInst::Add:
    Op = Instruction::Add;
    break;
  case AtomicRMWInst::Sub:
    Op = Instruction::Sub;
    break;
  }

  const unsigned PtrIdx = 0;
  const unsigned ValIdx = 1;

  // If the pointer operand is divergent, then each lane is doing an atomic
  // operation on a different address, and we cannot optimize that.
  if (DA->isDivergent(I.getOperand(PtrIdx))) {
    return;
  }

  const bool ValDivergent = DA->isDivergent(I.getOperand(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent && (!HasDPP || (DL->getTypeSizeInBits(I.getType()) != 32))) {
    return;
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
  Instruction::BinaryOps Op;

  switch (I.getIntrinsicID()) {
  default:
    return;
  case Intrinsic::amdgcn_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
    Op = Instruction::Add;
    break;
  case Intrinsic::amdgcn_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
    Op = Instruction::Sub;
    break;
  }

  const unsigned ValIdx = 0;

  const bool ValDivergent = DA->isDivergent(I.getOperand(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent && (!HasDPP || (DL->getTypeSizeInBits(I.getType()) != 32))) {
    return;
  }

  // If any of the other arguments to the intrinsic are divergent, we can't
  // optimize the operation.
  for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
    if (DA->isDivergent(I.getOperand(Idx))) {
      return;
    }
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

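// A rough sketch of the rewrite performed below, for the common uniform-value
// case (the divergent-value case additionally builds a DPP prefix scan of the
// per-lane values): an atomic such as
//
//   %old = atomicrmw add i32* %ptr, i32 %v seq_cst
//
// executed by N active lanes is conceptually turned into a single atomic add
// of %v * N performed by the first active lane only, after which every lane
// reconstructs the value it would have observed as
//
//   readfirstlane(%old) + %v * <number of active lanes below this lane>
//
// The IR above is illustrative only; the exact sequence is built in
// optimizeAtomic.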
void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
                                           Instruction::BinaryOps Op,
                                           unsigned ValIdx,
                                           bool ValDivergent) const {
  LLVMContext &Context = I.getContext();

  // Start building just before the instruction.
  IRBuilder<> B(&I);

  // If we are in a pixel shader, because of how we have to mask out helper
  // lane invocations, we need to record the entry and exit BBs.
  BasicBlock *PixelEntryBB = nullptr;
  BasicBlock *PixelExitBB = nullptr;

  // If we're optimizing an atomic within a pixel shader, we need to wrap the
  // entire atomic operation in a helper-lane check. We do not want any helper
  // lanes that are around only for the purposes of derivatives to take part
  // in any cross-lane communication, and we use a branch on whether the lane
  // is live to do this.
  if (IsPixelShader) {
    // Record I's original position as the entry block.
    PixelEntryBB = I.getParent();

    Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {}, {});
    Instruction *const NonHelperTerminator =
        SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);

    // Record I's new position as the exit block.
    PixelExitBB = I.getParent();

    I.moveBefore(NonHelperTerminator);
    B.SetInsertPoint(&I);
  }

  Type *const Ty = I.getType();
  const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
  Type *const VecTy = VectorType::get(B.getInt32Ty(), 2);

  // This is the value in the atomic operation we need to combine in order to
  // reduce the number of atomic operations.
  Value *const V = I.getOperand(ValIdx);

  // We need to know how many lanes are active within the wavefront, and we do
  // this by getting the exec register, which tells us all the lanes that are
  // active.
  MDNode *const RegName =
      llvm::MDNode::get(Context, llvm::MDString::get(Context, "exec"));
  Value *const Metadata = llvm::MetadataAsValue::get(Context, RegName);
  CallInst *const Exec =
      B.CreateIntrinsic(Intrinsic::read_register, {B.getInt64Ty()}, {Metadata});
  setConvergent(Exec);

  // We need to know how many lanes are active within the wavefront that are
  // below us. If we counted each lane linearly starting from 0, a lane is
  // below us only if its associated index was less than ours. We do this by
  // using the mbcnt intrinsic.
  Value *const BitCast = B.CreateBitCast(Exec, VecTy);
  Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0));
  Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1));
  CallInst *const PartialMbcnt = B.CreateIntrinsic(
      Intrinsic::amdgcn_mbcnt_lo, {}, {ExtractLo, B.getInt32(0)});
  CallInst *const Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {},
                                            {ExtractHi, PartialMbcnt});

  Value *const MbcntCast = B.CreateIntCast(Mbcnt, Ty, false);

  Value *LaneOffset = nullptr;
  Value *NewV = nullptr;

  // If we have a divergent value in each lane, we need to combine the value
  // using DPP.
  if (ValDivergent) {
    // First we need to set all inactive invocations to 0, so that they can
    // correctly contribute to the final result.
    CallInst *const SetInactive = B.CreateIntrinsic(
        Intrinsic::amdgcn_set_inactive, Ty, {V, B.getIntN(TyBitWidth, 0)});
    setConvergent(SetInactive);
    NewV = SetInactive;

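    // The six DPP steps below implement a wavefront-wide inclusive scan in
    // logarithmic time: row_shr:1/2/4/8 accumulate partial sums within each
    // row of 16 lanes, row_bcast:15 (row mask 0xa) adds each even row's total
    // into the odd row above it, and row_bcast:31 (row mask 0xc) adds the
    // lower half's total into the upper 32 lanes.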
    const unsigned Iters = 6;
    const unsigned DPPCtrl[Iters] = {DPP_ROW_SR1,     DPP_ROW_SR2,
                                     DPP_ROW_SR4,     DPP_ROW_SR8,
                                     DPP_ROW_BCAST15, DPP_ROW_BCAST31};
    const unsigned RowMask[Iters] = {0xf, 0xf, 0xf, 0xf, 0xa, 0xc};

    // This loop performs an inclusive scan across the wavefront, with all
    // lanes active (by using the WWM intrinsic).
    for (unsigned Idx = 0; Idx < Iters; Idx++) {
      CallInst *const DPP = B.CreateIntrinsic(Intrinsic::amdgcn_mov_dpp, Ty,
                                              {NewV, B.getInt32(DPPCtrl[Idx]),
                                               B.getInt32(RowMask[Idx]),
                                               B.getInt32(0xf), B.getFalse()});
      setConvergent(DPP);
      Value *const WWM = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, DPP);

      NewV = B.CreateBinOp(Op, NewV, WWM);
      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV);
    }

    // NewV now contains the inclusive scan of V, but for the lane offset we
    // require an exclusive scan. We do this by shifting the values from the
    // entire wavefront right by 1, and by setting the bound_ctrl (last
    // argument to the intrinsic below) to true, we can guarantee that 0 will
    // be shifted into the 0'th invocation.
    CallInst *const DPP =
        B.CreateIntrinsic(Intrinsic::amdgcn_mov_dpp, {Ty},
                          {NewV, B.getInt32(DPP_WF_SR1), B.getInt32(0xf),
                           B.getInt32(0xf), B.getTrue()});
    setConvergent(DPP);
    LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, DPP);

    // Read the value from the last lane, which has accumulated the values of
    // each active lane in the wavefront. This is the new value we will provide
    // to the atomic operation.
    if (TyBitWidth == 64) {
      Value *const ExtractLo = B.CreateTrunc(NewV, B.getInt32Ty());
      Value *const ExtractHi =
          B.CreateTrunc(B.CreateLShr(NewV, B.getInt64(32)), B.getInt32Ty());
      CallInst *const ReadLaneLo = B.CreateIntrinsic(
          Intrinsic::amdgcn_readlane, {}, {ExtractLo, B.getInt32(63)});
      setConvergent(ReadLaneLo);
      CallInst *const ReadLaneHi = B.CreateIntrinsic(
          Intrinsic::amdgcn_readlane, {}, {ExtractHi, B.getInt32(63)});
      setConvergent(ReadLaneHi);
      Value *const PartialInsert = B.CreateInsertElement(
          UndefValue::get(VecTy), ReadLaneLo, B.getInt32(0));
      Value *const Insert =
          B.CreateInsertElement(PartialInsert, ReadLaneHi, B.getInt32(1));
      NewV = B.CreateBitCast(Insert, Ty);
    } else if (TyBitWidth == 32) {
      CallInst *const ReadLane = B.CreateIntrinsic(Intrinsic::amdgcn_readlane,
                                                   {}, {NewV, B.getInt32(63)});
      setConvergent(ReadLane);
      NewV = ReadLane;
    } else {
      llvm_unreachable("Unhandled atomic bit width");
    }
  } else {
    // Get the total number of active lanes we have by using popcount.
    Instruction *const Ctpop = B.CreateUnaryIntrinsic(Intrinsic::ctpop, Exec);
    Value *const CtpopCast = B.CreateIntCast(Ctpop, Ty, false);

    // Calculate the new value we will be contributing to the atomic operation
    // for the entire wavefront.
    NewV = B.CreateMul(V, CtpopCast);
    LaneOffset = B.CreateMul(V, MbcntCast);
  }

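  // At this point NewV holds the combined value that a single lane will feed
  // to the atomic operation on behalf of the whole wavefront, and LaneOffset
  // holds the portion contributed by the active lanes below us, which is
  // combined with the atomic's old value further down to reconstruct the
  // result each lane would have seen.
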
  // We only want a single lane to enter our new control flow, and we do this
  // by checking if there are any active lanes below us. Only one lane will
  // have 0 active lanes below us, so that will be the only one to progress.
  Value *const Cond = B.CreateICmpEQ(MbcntCast, B.getIntN(TyBitWidth, 0));

  // Store I's original basic block before we split the block.
  BasicBlock *const EntryBB = I.getParent();

  // We need to introduce some new control flow to force a single lane to be
  // active. We do this by splitting I's basic block at I, and introducing the
  // new block such that:
  // entry --> single_lane -\
  //       \------------------> exit
  Instruction *const SingleLaneTerminator =
      SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);

  // Move the IR builder into single_lane next.
  B.SetInsertPoint(SingleLaneTerminator);

  // Clone the original atomic operation into single_lane, replacing the
  // original value with our newly created one.
  Instruction *const NewI = I.clone();
  B.Insert(NewI);
  NewI->setOperand(ValIdx, NewV);

  // Move the IR builder into exit next, and start inserting just before the
  // original instruction.
  B.SetInsertPoint(&I);

  // Create a PHI node to get our new atomic result into the exit block.
  PHINode *const PHI = B.CreatePHI(Ty, 2);
  PHI->addIncoming(UndefValue::get(Ty), EntryBB);
  PHI->addIncoming(NewI, SingleLaneTerminator->getParent());

  // We need to broadcast the value from the lowest active lane (the first
  // lane) to all other lanes in the wavefront. We use an intrinsic for this,
  // but have to handle 64-bit broadcasts with two calls to this intrinsic.
  Value *BroadcastI = nullptr;

  if (TyBitWidth == 64) {
    Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());
    Value *const ExtractHi =
        B.CreateTrunc(B.CreateLShr(PHI, B.getInt64(32)), B.getInt32Ty());
    CallInst *const ReadFirstLaneLo =
        B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
    setConvergent(ReadFirstLaneLo);
    CallInst *const ReadFirstLaneHi =
        B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
    setConvergent(ReadFirstLaneHi);
    Value *const PartialInsert = B.CreateInsertElement(
        UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
    Value *const Insert =
        B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
    BroadcastI = B.CreateBitCast(Insert, Ty);
  } else if (TyBitWidth == 32) {
    CallInst *const ReadFirstLane =
        B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
    setConvergent(ReadFirstLane);
    BroadcastI = ReadFirstLane;
  } else {
    llvm_unreachable("Unhandled atomic bit width");
  }

  // Now that we have the result of our single atomic operation, we need to
  // get our individual lane's slice of the result. We combine the lane offset
  // we previously calculated with the atomic result value we got from the
  // first lane to obtain our lane's portion of the atomic result.
  Value *const Result = B.CreateBinOp(Op, BroadcastI, LaneOffset);

  if (IsPixelShader) {
    // Need a final PHI to reconverge to above the helper lane branch mask.
    B.SetInsertPoint(PixelExitBB->getFirstNonPHI());

    PHINode *const PHI = B.CreatePHI(Ty, 2);
    PHI->addIncoming(UndefValue::get(Ty), PixelEntryBB);
    PHI->addIncoming(Result, I.getParent());
    I.replaceAllUsesWith(PHI);
  } else {
    // Replace the original atomic instruction with the new one.
    I.replaceAllUsesWith(Result);
  }

  // And delete the original.
  I.eraseFromParent();
}

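// The cross-lane intrinsics used above must not be moved across control flow
// that could change the set of active lanes, so each such call is marked
// convergent.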
void AMDGPUAtomicOptimizer::setConvergent(CallInst *const CI) const {
  CI->addAttribute(AttributeList::FunctionIndex, Attribute::Convergent);
}

INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                      "AMDGPU atomic optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                    "AMDGPU atomic optimizations", false, false)

FunctionPass *llvm::createAMDGPUAtomicOptimizerPass() {
  return new AMDGPUAtomicOptimizer();
}