//===-- AMDGPUAtomicOptimizer.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass optimizes atomic operations by using a single lane of a wavefront
/// to perform the atomic operation, thus reducing contention on that memory
/// location.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#define DEBUG_TYPE "amdgpu-atomic-optimizer"

using namespace llvm;

namespace {

enum DPP_CTRL {
  DPP_ROW_SR1 = 0x111,
  DPP_ROW_SR2 = 0x112,
  DPP_ROW_SR3 = 0x113,
  DPP_ROW_SR4 = 0x114,
  DPP_ROW_SR8 = 0x118,
  DPP_WF_SR1 = 0x138,
  DPP_ROW_BCAST15 = 0x142,
  DPP_ROW_BCAST31 = 0x143
};

struct ReplacementInfo {
  Instruction *I;
  AtomicRMWInst::BinOp Op;
  unsigned ValIdx;
  bool ValDivergent;
};

class AMDGPUAtomicOptimizer : public FunctionPass,
                              public InstVisitor<AMDGPUAtomicOptimizer> {
private:
  SmallVector<ReplacementInfo, 8> ToReplace;
  const LegacyDivergenceAnalysis *DA;
  const DataLayout *DL;
  DominatorTree *DT;
  bool HasDPP;
  bool IsPixelShader;

  void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
                      bool ValDivergent) const;

public:
  static char ID;

  AMDGPUAtomicOptimizer() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<LegacyDivergenceAnalysis>();
    AU.addRequired<TargetPassConfig>();
  }

  void visitAtomicRMWInst(AtomicRMWInst &I);
  void visitIntrinsicInst(IntrinsicInst &I);
};

} // namespace

char AMDGPUAtomicOptimizer::ID = 0;

char &llvm::AMDGPUAtomicOptimizerID = AMDGPUAtomicOptimizer::ID;

bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F)) {
    return false;
  }

  DA = &getAnalysis<LegacyDivergenceAnalysis>();
  DL = &F.getParent()->getDataLayout();
  DominatorTreeWrapperPass *const DTW =
      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  DT = DTW ? &DTW->getDomTree() : nullptr;
  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
  HasDPP = ST.hasDPP();
  IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;

  visit(F);

  const bool Changed = !ToReplace.empty();

  for (ReplacementInfo &Info : ToReplace) {
    optimizeAtomic(*Info.I, Info.Op, Info.ValIdx, Info.ValDivergent);
  }

  ToReplace.clear();

  return Changed;
}

void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
  // Early exit for unhandled address space atomic instructions.
  switch (I.getPointerAddressSpace()) {
  default:
    return;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::LOCAL_ADDRESS:
    break;
  }

  AtomicRMWInst::BinOp Op = I.getOperation();

  switch (Op) {
  default:
    return;
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
    break;
  }

  const unsigned PtrIdx = 0;
  const unsigned ValIdx = 1;

  // If the pointer operand is divergent, then each lane is doing an atomic
  // operation on a different address, and we cannot optimize that.
  if (DA->isDivergent(I.getOperand(PtrIdx))) {
    return;
  }

  const bool ValDivergent = DA->isDivergent(I.getOperand(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent && (!HasDPP || (DL->getTypeSizeInBits(I.getType()) != 32))) {
    return;
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
  AtomicRMWInst::BinOp Op;

  switch (I.getIntrinsicID()) {
  default:
    return;
  case Intrinsic::amdgcn_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
    Op = AtomicRMWInst::Add;
    break;
  case Intrinsic::amdgcn_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
    Op = AtomicRMWInst::Sub;
    break;
  case Intrinsic::amdgcn_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
    Op = AtomicRMWInst::Min;
    break;
  case Intrinsic::amdgcn_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
    Op = AtomicRMWInst::UMin;
    break;
  case Intrinsic::amdgcn_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
    Op = AtomicRMWInst::Max;
    break;
  case Intrinsic::amdgcn_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
    Op = AtomicRMWInst::UMax;
    break;
  }

  const unsigned ValIdx = 0;

  const bool ValDivergent = DA->isDivergent(I.getOperand(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent && (!HasDPP || (DL->getTypeSizeInBits(I.getType()) != 32))) {
    return;
  }

  // If any of the other arguments to the intrinsic are divergent, we can't
  // optimize the operation.
  for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
    if (DA->isDivergent(I.getOperand(Idx))) {
      return;
    }
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

// Use the builder to create the non-atomic counterpart of the specified
// atomicrmw binary op.
static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                                  Value *LHS, Value *RHS) {
  CmpInst::Predicate Pred;

  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
    return B.CreateBinOp(Instruction::Add, LHS, RHS);
  case AtomicRMWInst::Sub:
    return B.CreateBinOp(Instruction::Sub, LHS, RHS);

  case AtomicRMWInst::Max:
    Pred = CmpInst::ICMP_SGT;
    break;
  case AtomicRMWInst::Min:
    Pred = CmpInst::ICMP_SLT;
    break;
  case AtomicRMWInst::UMax:
    Pred = CmpInst::ICMP_UGT;
    break;
  case AtomicRMWInst::UMin:
    Pred = CmpInst::ICMP_ULT;
    break;
  }
  Value *Cond = B.CreateICmp(Pred, LHS, RHS);
  return B.CreateSelect(Cond, LHS, RHS);
}

static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op,
                                         unsigned BitWidth) {
  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::UMax:
    return APInt::getMinValue(BitWidth);
  case AtomicRMWInst::UMin:
    return APInt::getMaxValue(BitWidth);
  case AtomicRMWInst::Max:
    return APInt::getSignedMinValue(BitWidth);
  case AtomicRMWInst::Min:
    return APInt::getSignedMaxValue(BitWidth);
  }
}

void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
                                           AtomicRMWInst::BinOp Op,
                                           unsigned ValIdx,
                                           bool ValDivergent) const {
  // Start building just before the instruction.
  IRBuilder<> B(&I);

  // If we are in a pixel shader, because of how we have to mask out helper
  // lane invocations, we need to record the entry and exit BB's.
  BasicBlock *PixelEntryBB = nullptr;
  BasicBlock *PixelExitBB = nullptr;

  // If we're optimizing an atomic within a pixel shader, we need to wrap the
  // entire atomic operation in a helper-lane check. We do not want any helper
  // lanes that are around only for the purposes of derivatives to take part
  // in any cross-lane communication, and we use a branch on whether the lane is
  // live to do this.
  if (IsPixelShader) {
    // Record I's original position as the entry block.
    PixelEntryBB = I.getParent();

    Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {}, {});
    Instruction *const NonHelperTerminator =
        SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);

    // Record I's new position as the exit block.
    PixelExitBB = I.getParent();

    I.moveBefore(NonHelperTerminator);
    B.SetInsertPoint(&I);
  }

  Type *const Ty = I.getType();
  const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
  Type *const VecTy = VectorType::get(B.getInt32Ty(), 2);

  // This is the value in the atomic operation we need to combine in order to
  // reduce the number of atomic operations.
  Value *const V = I.getOperand(ValIdx);

  // We need to know how many lanes are active within the wavefront, and we do
  // this by doing a ballot of active lanes.
  CallInst *const Ballot = B.CreateIntrinsic(
      Intrinsic::amdgcn_icmp, {B.getInt64Ty(), B.getInt32Ty()},
      {B.getInt32(1), B.getInt32(0), B.getInt32(CmpInst::ICMP_NE)});

  // We need to know how many lanes are active within the wavefront that are
  // below us. If we counted each lane linearly starting from 0, a lane is
  // below us only if its associated index was less than ours. We do this by
  // using the mbcnt intrinsic.
  Value *const BitCast = B.CreateBitCast(Ballot, VecTy);
  Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0));
  Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1));
  CallInst *const PartialMbcnt = B.CreateIntrinsic(
      Intrinsic::amdgcn_mbcnt_lo, {}, {ExtractLo, B.getInt32(0)});
  CallInst *const Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {},
                                            {ExtractHi, PartialMbcnt});

  Value *const MbcntCast = B.CreateIntCast(Mbcnt, Ty, false);

  Value *const Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth));

  Value *ExclScan = nullptr;
  Value *NewV = nullptr;

  // If we have a divergent value in each lane, we need to combine the value
  // using DPP.
  if (ValDivergent) {
    // First we need to set all inactive invocations to the identity value, so
    // that they can correctly contribute to the final result.
    CallInst *const SetInactive =
        B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});

    CallInst *const FirstDPP =
        B.CreateIntrinsic(Intrinsic::amdgcn_update_dpp, Ty,
                          {Identity, SetInactive, B.getInt32(DPP_WF_SR1),
                           B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});
    ExclScan = FirstDPP;

    const unsigned Iters = 7;
    const unsigned DPPCtrl[Iters] = {
        DPP_ROW_SR1, DPP_ROW_SR2, DPP_ROW_SR3, DPP_ROW_SR4,
        DPP_ROW_SR8, DPP_ROW_BCAST15, DPP_ROW_BCAST31};
    const unsigned RowMask[Iters] = {0xf, 0xf, 0xf, 0xf, 0xf, 0xa, 0xc};
    const unsigned BankMask[Iters] = {0xf, 0xf, 0xf, 0xe, 0xc, 0xf, 0xf};

    // This loop performs an exclusive scan across the wavefront, with all lanes
    // active (by using the WWM intrinsic).
    for (unsigned Idx = 0; Idx < Iters; Idx++) {
      Value *const UpdateValue = Idx < 3 ? FirstDPP : ExclScan;
      CallInst *const DPP = B.CreateIntrinsic(
          Intrinsic::amdgcn_update_dpp, Ty,
          {Identity, UpdateValue, B.getInt32(DPPCtrl[Idx]),
           B.getInt32(RowMask[Idx]), B.getInt32(BankMask[Idx]), B.getFalse()});

      ExclScan = buildNonAtomicBinOp(B, Op, ExclScan, DPP);
    }

    NewV = buildNonAtomicBinOp(B, Op, SetInactive, ExclScan);

    // Read the value from the last lane, which has accumulated the values of
    // each active lane in the wavefront. This will be our new value which we
    // will provide to the atomic operation.
    if (TyBitWidth == 64) {
      Value *const ExtractLo = B.CreateTrunc(NewV, B.getInt32Ty());
      Value *const ExtractHi =
          B.CreateTrunc(B.CreateLShr(NewV, B.getInt64(32)), B.getInt32Ty());
      CallInst *const ReadLaneLo = B.CreateIntrinsic(
          Intrinsic::amdgcn_readlane, {}, {ExtractLo, B.getInt32(63)});
      CallInst *const ReadLaneHi = B.CreateIntrinsic(
          Intrinsic::amdgcn_readlane, {}, {ExtractHi, B.getInt32(63)});
      Value *const PartialInsert = B.CreateInsertElement(
          UndefValue::get(VecTy), ReadLaneLo, B.getInt32(0));
      Value *const Insert =
          B.CreateInsertElement(PartialInsert, ReadLaneHi, B.getInt32(1));
      NewV = B.CreateBitCast(Insert, Ty);
    } else if (TyBitWidth == 32) {
      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
                               {NewV, B.getInt32(63)});
    } else {
      llvm_unreachable("Unhandled atomic bit width");
    }

    // Finally mark the readlanes in the WWM section.
    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV);
  } else {
    switch (Op) {
    default:
      llvm_unreachable("Unhandled atomic op");

    case AtomicRMWInst::Add:
    case AtomicRMWInst::Sub: {
      // Get the total number of active lanes we have by using popcount.
      Instruction *const Ctpop =
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot);
      Value *const CtpopCast = B.CreateIntCast(Ctpop, Ty, false);

      // Calculate the new value we will be contributing to the atomic
      // operation for the entire wavefront.
      NewV = B.CreateMul(V, CtpopCast);
      break;
    }

    case AtomicRMWInst::Max:
    case AtomicRMWInst::Min:
    case AtomicRMWInst::UMax:
    case AtomicRMWInst::UMin:
      // Max/min with a uniform value is idempotent: doing the atomic operation
      // multiple times has the same effect as doing it once.
      NewV = V;
      break;
    }
  }

  // We only want a single lane to enter our new control flow, and we do this
  // by checking if there are any active lanes below us. Only one lane will
  // have 0 active lanes below us, so that will be the only one to progress.
  Value *const Cond = B.CreateICmpEQ(MbcntCast, B.getIntN(TyBitWidth, 0));

  // Store I's original basic block before we split the block.
  BasicBlock *const EntryBB = I.getParent();

  // We need to introduce some new control flow to force a single lane to be
  // active. We do this by splitting I's basic block at I, and introducing the
  // new block such that:
  // entry --> single_lane -\
  //       \------------------> exit
  Instruction *const SingleLaneTerminator =
      SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);

  // Move the IR builder into single_lane next.
  B.SetInsertPoint(SingleLaneTerminator);

  // Clone the original atomic operation into single lane, replacing the
  // original value with our newly created one.
  Instruction *const NewI = I.clone();
  B.Insert(NewI);
  NewI->setOperand(ValIdx, NewV);

  // Move the IR builder into exit next, and start inserting just before the
  // original instruction.
  B.SetInsertPoint(&I);

  // Create a PHI node to get our new atomic result into the exit block.
  PHINode *const PHI = B.CreatePHI(Ty, 2);
  PHI->addIncoming(UndefValue::get(Ty), EntryBB);
  PHI->addIncoming(NewI, SingleLaneTerminator->getParent());

  // We need to broadcast the value from the lowest active lane (the first
  // lane)
  // to all other lanes in the wavefront. We use an intrinsic for this, but
  // have to handle 64-bit broadcasts with two calls to this intrinsic.
  Value *BroadcastI = nullptr;

  if (TyBitWidth == 64) {
    Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());
    Value *const ExtractHi =
        B.CreateTrunc(B.CreateLShr(PHI, B.getInt64(32)), B.getInt32Ty());
    CallInst *const ReadFirstLaneLo =
        B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
    CallInst *const ReadFirstLaneHi =
        B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
    Value *const PartialInsert = B.CreateInsertElement(
        UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
    Value *const Insert =
        B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
    BroadcastI = B.CreateBitCast(Insert, Ty);
  } else if (TyBitWidth == 32) {
    BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
  } else {
    llvm_unreachable("Unhandled atomic bit width");
  }

  // Now that we have the result of our single atomic operation, we need to
  // get our individual lane's slice into the result. We use the lane offset we
  // previously calculated combined with the atomic result value we got from the
  // first lane, to get our lane's index into the atomic result.
  Value *LaneOffset = nullptr;
  if (ValDivergent) {
    LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, ExclScan);
  } else {
    switch (Op) {
    default:
      llvm_unreachable("Unhandled atomic op");
    case AtomicRMWInst::Add:
    case AtomicRMWInst::Sub:
      LaneOffset = B.CreateMul(V, MbcntCast);
      break;
    case AtomicRMWInst::Max:
    case AtomicRMWInst::Min:
    case AtomicRMWInst::UMax:
    case AtomicRMWInst::UMin:
      LaneOffset = B.CreateSelect(Cond, Identity, V);
      break;
    }
  }
  Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);

  if (IsPixelShader) {
    // Need a final PHI to reconverge to above the helper lane branch mask.
    B.SetInsertPoint(PixelExitBB->getFirstNonPHI());

    PHINode *const PHI = B.CreatePHI(Ty, 2);
    PHI->addIncoming(UndefValue::get(Ty), PixelEntryBB);
    PHI->addIncoming(Result, I.getParent());
    I.replaceAllUsesWith(PHI);
  } else {
    // Replace the original atomic instruction with the new one.
    I.replaceAllUsesWith(Result);
  }

  // And delete the original.
  I.eraseFromParent();
}

INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                      "AMDGPU atomic optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                    "AMDGPU atomic optimizations", false, false)

FunctionPass *llvm::createAMDGPUAtomicOptimizerPass() {
  return new AMDGPUAtomicOptimizer();
}