//===-- AMDGPUAtomicOptimizer.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass optimizes atomic operations by using a single lane of a wavefront
/// to perform the atomic operation, thus reducing contention on that memory
/// location.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#define DEBUG_TYPE "amdgpu-atomic-optimizer"

using namespace llvm;
using namespace llvm::AMDGPU;

namespace {

struct ReplacementInfo {
  Instruction *I;
  AtomicRMWInst::BinOp Op;
  unsigned ValIdx;
  bool ValDivergent;
};

class AMDGPUAtomicOptimizer : public FunctionPass,
                              public InstVisitor<AMDGPUAtomicOptimizer> {
private:
  SmallVector<ReplacementInfo, 8> ToReplace;
  const LegacyDivergenceAnalysis *DA;
  const DataLayout *DL;
  DominatorTree *DT;
  const GCNSubtarget *ST;
  bool IsPixelShader;

  Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                        Value *const Identity) const;
  Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                   Value *const Identity) const;
  Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;
  void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
                      bool ValDivergent) const;

public:
  static char ID;

  AMDGPUAtomicOptimizer() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<LegacyDivergenceAnalysis>();
    AU.addRequired<TargetPassConfig>();
  }

  void visitAtomicRMWInst(AtomicRMWInst &I);
  void visitIntrinsicInst(IntrinsicInst &I);
};

} // namespace

char AMDGPUAtomicOptimizer::ID = 0;

char &llvm::AMDGPUAtomicOptimizerID = AMDGPUAtomicOptimizer::ID;

bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F)) {
    return false;
  }

  DA = &getAnalysis<LegacyDivergenceAnalysis>();
  DL = &F.getParent()->getDataLayout();
  DominatorTreeWrapperPass *const DTW =
      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  DT = DTW ? &DTW->getDomTree() : nullptr;
  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  ST = &TM.getSubtarget<GCNSubtarget>(F);
  IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;

  visit(F);

  const bool Changed = !ToReplace.empty();

  for (ReplacementInfo &Info : ToReplace) {
    optimizeAtomic(*Info.I, Info.Op, Info.ValIdx, Info.ValDivergent);
  }

  ToReplace.clear();

  return Changed;
}

void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
  // Early exit for unhandled address space atomic instructions.
  switch (I.getPointerAddressSpace()) {
  default:
    return;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::LOCAL_ADDRESS:
    break;
  }

  AtomicRMWInst::BinOp Op = I.getOperation();

  switch (Op) {
  default:
    return;
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
    break;
  }

  const unsigned PtrIdx = 0;
  const unsigned ValIdx = 1;

  // If the pointer operand is divergent, then each lane is doing an atomic
  // operation on a different address, and we cannot optimize that.
  if (DA->isDivergentUse(&I.getOperandUse(PtrIdx))) {
    return;
  }

  const bool ValDivergent = DA->isDivergentUse(&I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent &&
      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
    return;
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
  AtomicRMWInst::BinOp Op;

  switch (I.getIntrinsicID()) {
  default:
    return;
  case Intrinsic::amdgcn_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
    Op = AtomicRMWInst::Add;
    break;
  case Intrinsic::amdgcn_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
    Op = AtomicRMWInst::Sub;
    break;
  case Intrinsic::amdgcn_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
    Op = AtomicRMWInst::And;
    break;
  case Intrinsic::amdgcn_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
    Op = AtomicRMWInst::Or;
    break;
  case Intrinsic::amdgcn_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
    Op = AtomicRMWInst::Xor;
    break;
  case Intrinsic::amdgcn_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
    Op = AtomicRMWInst::Min;
    break;
  case Intrinsic::amdgcn_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
    Op = AtomicRMWInst::UMin;
    break;
  case Intrinsic::amdgcn_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
    Op = AtomicRMWInst::Max;
    break;
  case Intrinsic::amdgcn_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
    Op = AtomicRMWInst::UMax;
    break;
  }

  const unsigned ValIdx = 0;

  const bool ValDivergent = DA->isDivergentUse(&I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent &&
      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
    return;
  }

  // If any of the other arguments to the intrinsic are divergent, we can't
  // optimize the operation.
  for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
    if (DA->isDivergentUse(&I.getOperandUse(Idx))) {
      return;
    }
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

// Use the builder to create the non-atomic counterpart of the specified
// atomicrmw binary op.
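// Add, Sub, And, Or and Xor map directly onto IR binary operators. Min, Max,
// UMin and UMax have no IR binary operator, so they are expanded into an
// integer compare followed by a select; e.g. max(a, b) becomes
// select (icmp sgt a, b), a, b.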
static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                                  Value *LHS, Value *RHS) {
  CmpInst::Predicate Pred;

  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
    return B.CreateBinOp(Instruction::Add, LHS, RHS);
  case AtomicRMWInst::Sub:
    return B.CreateBinOp(Instruction::Sub, LHS, RHS);
  case AtomicRMWInst::And:
    return B.CreateBinOp(Instruction::And, LHS, RHS);
  case AtomicRMWInst::Or:
    return B.CreateBinOp(Instruction::Or, LHS, RHS);
  case AtomicRMWInst::Xor:
    return B.CreateBinOp(Instruction::Xor, LHS, RHS);

  case AtomicRMWInst::Max:
    Pred = CmpInst::ICMP_SGT;
    break;
  case AtomicRMWInst::Min:
    Pred = CmpInst::ICMP_SLT;
    break;
  case AtomicRMWInst::UMax:
    Pred = CmpInst::ICMP_UGT;
    break;
  case AtomicRMWInst::UMin:
    Pred = CmpInst::ICMP_ULT;
    break;
  }
  Value *Cond = B.CreateICmp(Pred, LHS, RHS);
  return B.CreateSelect(Cond, LHS, RHS);
}

// Use the builder to create a reduction of V across the wavefront, with all
// lanes active, returning the same result in all lanes.
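// The reduction is built out of log-step combining operations:
//  * four DPP row_xmask steps (xor masks 1, 2, 4 and 8) reduce each row of 16
//    lanes to a value that is identical across the whole row;
//  * a permlanex16 step combines the two rows of each 32-lane half;
//  * for wave64, the two halves are combined with permlane64 where available,
//    or with two readlanes and one final scalar operation otherwise.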
Value *AMDGPUAtomicOptimizer::buildReduction(IRBuilder<> &B,
                                             AtomicRMWInst::BinOp Op, Value *V,
                                             Value *const Identity) const {
  Type *const Ty = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);

  // Reduce within each row of 16 lanes.
  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_XMASK0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }

  // Reduce within each pair of rows (i.e. 32 lanes).
  assert(ST->hasPermLaneX16());
  V = buildNonAtomicBinOp(
      B, Op, V,
      B.CreateIntrinsic(
          Intrinsic::amdgcn_permlanex16, {},
          {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()}));

  if (ST->isWave32())
    return V;

  if (ST->hasPermLane64()) {
    // Reduce across the upper and lower 32 lanes.
    return buildNonAtomicBinOp(
        B, Op, V, B.CreateIntrinsic(Intrinsic::amdgcn_permlane64, {}, V));
  }

  // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
  // combine them with a scalar operation.
  Function *ReadLane =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
  Value *const Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
  Value *const Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
  return buildNonAtomicBinOp(B, Op, Lane0, Lane32);
}

// Use the builder to create an inclusive scan of V across the wavefront, with
// all lanes active.
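// The scan uses the same row-wise structure as the reduction above: four DPP
// row_shr steps (shifts of 1, 2, 4 and 8 lanes) build an inclusive prefix
// within each row of 16 lanes, and the running total of each row is then
// propagated into the rows above it, using DPP row broadcasts on GFX9 and
// permlanex16/readlane on GFX10. For example, scanning an add of 1 in every
// lane leaves lane N holding N + 1.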
Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                                        Value *V, Value *const Identity) const {
  Type *const Ty = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);

  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }
  if (ST->hasDPPBroadcasts()) {
    // GFX9 has DPP row broadcast operations.
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST15), B.getInt32(0xa),
                      B.getInt32(0xf), B.getFalse()}));
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST31), B.getInt32(0xc),
                      B.getInt32(0xf), B.getFalse()}));
  } else {
    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.

    // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
    // 48..63).
    assert(ST->hasPermLaneX16());
    Value *const PermX = B.CreateIntrinsic(
        Intrinsic::amdgcn_permlanex16, {},
        {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID),
                      B.getInt32(0xa), B.getInt32(0xf), B.getFalse()}));
    if (!ST->isWave32()) {
      // Combine lane 31 into lanes 32..63.
      Value *const Lane31 = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
                                              {V, B.getInt32(31)});
      V = buildNonAtomicBinOp(
          B, Op, V,
          B.CreateCall(UpdateDPP,
                       {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
                        B.getInt32(0xc), B.getInt32(0xf), B.getFalse()}));
    }
  }
  return V;
}

// Use the builder to create a shift right of V across the wavefront, with all
// lanes active, to turn an inclusive scan into an exclusive scan.
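// Shifting each lane's value up into the next lane and filling lane 0 with the
// identity turns the inclusive scan into an exclusive one: afterwards lane N
// holds the combination of lanes 0..N-1 only. Where DPP shifts cannot cross
// row boundaries, lanes 16, 32 and 48 are patched up explicitly with
// readlane/writelane below.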
Value *AMDGPUAtomicOptimizer::buildShiftRight(IRBuilder<> &B, Value *V,
                                              Value *const Identity) const {
  Type *const Ty = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);

  if (ST->hasDPPWavefrontShifts()) {
    // GFX9 has DPP wavefront shift operations.
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
                      B.getInt32(0xf), B.getFalse()});
  } else {
    Function *ReadLane =
        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
    Function *WriteLane =
        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});

    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.
    Value *Old = V;
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 + 1),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});

    // Copy the old lane 15 to the new lane 16.
    V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
                                 B.getInt32(16), V});

    if (!ST->isWave32()) {
      // Copy the old lane 31 to the new lane 32.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V});

      // Copy the old lane 47 to the new lane 48.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V});
    }
  }

  return V;
}

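// Return the identity value of the given atomic op, i.e. the value X for which
// op(V, X) == V for every V: zero for add, sub, or, xor and umax; all-ones for
// and and umin; the signed minimum for max and the signed maximum for min.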
static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op,
                                         unsigned BitWidth) {
  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::UMax:
    return APInt::getMinValue(BitWidth);
  case AtomicRMWInst::And:
  case AtomicRMWInst::UMin:
    return APInt::getMaxValue(BitWidth);
  case AtomicRMWInst::Max:
    return APInt::getSignedMinValue(BitWidth);
  case AtomicRMWInst::Min:
    return APInt::getSignedMaxValue(BitWidth);
  }
}

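// Build LHS * RHS, folding the common case where LHS is the constant 1 (e.g.
// an atomic increment by one), in which case the product is simply RHS.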
static Value *buildMul(IRBuilder<> &B, Value *LHS, Value *RHS) {
  const ConstantInt *CI = dyn_cast<ConstantInt>(LHS);
  return (CI && CI->isOne()) ? RHS : B.CreateMul(LHS, RHS);
}

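// Rewrite the atomic so that only a single lane of the wavefront performs it.
// For a uniform value the combined operand is computed directly (e.g. an add
// of V by N active lanes becomes one atomic add of V * N); for a divergent
// value the per-lane operands are combined with a wavefront-wide reduction or
// scan first. If the original result is used, each lane then reconstructs its
// own old value from the broadcast atomic result plus a lane-local offset,
// roughly readfirstlane(sum) + V * mbcnt for the uniform add case.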
void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
                                           AtomicRMWInst::BinOp Op,
                                           unsigned ValIdx,
                                           bool ValDivergent) const {
  // Start building just before the instruction.
  IRBuilder<> B(&I);

  // If we are in a pixel shader, because of how we have to mask out helper
  // lane invocations, we need to record the entry and exit BB's.
  BasicBlock *PixelEntryBB = nullptr;
  BasicBlock *PixelExitBB = nullptr;

  // If we're optimizing an atomic within a pixel shader, we need to wrap the
  // entire atomic operation in a helper-lane check. We do not want any helper
  // lanes that are around only for the purposes of derivatives to take part
  // in any cross-lane communication, and we use a branch on whether the lane is
  // live to do this.
  if (IsPixelShader) {
    // Record I's original position as the entry block.
    PixelEntryBB = I.getParent();

    Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {}, {});
    Instruction *const NonHelperTerminator =
        SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);

    // Record I's new position as the exit block.
    PixelExitBB = I.getParent();

    I.moveBefore(NonHelperTerminator);
    B.SetInsertPoint(&I);
  }

  Type *const Ty = I.getType();
  const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
  auto *const VecTy = FixedVectorType::get(B.getInt32Ty(), 2);

  // This is the value in the atomic operation we need to combine in order to
  // reduce the number of atomic operations.
  Value *const V = I.getOperand(ValIdx);

  // We need to know how many lanes are active within the wavefront, and we do
  // this by doing a ballot of active lanes.
  Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
  CallInst *const Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());

  // We need to know how many lanes are active within the wavefront that are
  // below us. If we counted each lane linearly starting from 0, a lane is
  // below us only if its associated index was less than ours. We do this by
  // using the mbcnt intrinsic.
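  // For example, if the ballot mask is ...1011 then lanes 0, 1 and 3 are
  // active, and mbcnt returns 0, 1 and 2 in those lanes respectively: each
  // active lane learns how many active lanes precede it.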
  Value *Mbcnt;
  if (ST->isWave32()) {
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {Ballot, B.getInt32(0)});
  } else {
    Value *const BitCast = B.CreateBitCast(Ballot, VecTy);
    Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0));
    Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1));
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {ExtractLo, B.getInt32(0)});
    Mbcnt =
        B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt});
  }
  Mbcnt = B.CreateIntCast(Mbcnt, Ty, false);

  Value *const Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth));

  Value *ExclScan = nullptr;
  Value *NewV = nullptr;

  const bool NeedResult = !I.use_empty();

  // If we have a divergent value in each lane, we need to combine the value
  // using DPP.
  if (ValDivergent) {
    // First we need to set all inactive invocations to the identity value, so
    // that they can correctly contribute to the final result.
    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});

    const AtomicRMWInst::BinOp ScanOp =
        Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
    if (!NeedResult && ST->hasPermLaneX16()) {
      // On GFX10 the permlanex16 instruction helps us build a reduction without
      // too many readlanes and writelanes, which are generally bad for
      // performance.
      NewV = buildReduction(B, ScanOp, NewV, Identity);
    } else {
      NewV = buildScan(B, ScanOp, NewV, Identity);
      if (NeedResult)
        ExclScan = buildShiftRight(B, NewV, Identity);

      // Read the value from the last lane, which has accumulated the values of
      // each active lane in the wavefront. This will be our new value which we
      // will provide to the atomic operation.
      Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
      assert(TyBitWidth == 32);
      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
                               {NewV, LastLaneIdx});
    }

    // Finally mark the readlanes in the WWM section.
    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
  } else {
    switch (Op) {
    default:
      llvm_unreachable("Unhandled atomic op");

    case AtomicRMWInst::Add:
    case AtomicRMWInst::Sub: {
      // The new value we will be contributing to the atomic operation is the
      // old value times the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, Ctpop);
      break;
    }

    case AtomicRMWInst::And:
    case AtomicRMWInst::Or:
    case AtomicRMWInst::Max:
    case AtomicRMWInst::Min:
    case AtomicRMWInst::UMax:
    case AtomicRMWInst::UMin:
      // These operations with a uniform value are idempotent: doing the atomic
      // operation multiple times has the same effect as doing it once.
      NewV = V;
      break;

    case AtomicRMWInst::Xor:
      // The new value we will be contributing to the atomic operation is the
      // old value times the parity of the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, B.CreateAnd(Ctpop, 1));
      break;
    }
  }

  // We only want a single lane to enter our new control flow, and we do this
  // by checking if there are any active lanes below us. Only one lane will
  // have 0 active lanes below us, so that will be the only one to progress.
  Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getIntN(TyBitWidth, 0));

  // Store I's original basic block before we split the block.
  BasicBlock *const EntryBB = I.getParent();

  // We need to introduce some new control flow to force a single lane to be
  // active. We do this by splitting I's basic block at I, and introducing the
  // new block such that:
  // entry --> single_lane -\
  //       \------------------> exit
  Instruction *const SingleLaneTerminator =
      SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);

  // Move the IR builder into single_lane next.
  B.SetInsertPoint(SingleLaneTerminator);

  // Clone the original atomic operation into single lane, replacing the
  // original value with our newly created one.
  Instruction *const NewI = I.clone();
  B.Insert(NewI);
  NewI->setOperand(ValIdx, NewV);

  // Move the IR builder into exit next, and start inserting just before the
  // original instruction.
  B.SetInsertPoint(&I);

  if (NeedResult) {
    // Create a PHI node to get our new atomic result into the exit block.
    PHINode *const PHI = B.CreatePHI(Ty, 2);
    PHI->addIncoming(UndefValue::get(Ty), EntryBB);
    PHI->addIncoming(NewI, SingleLaneTerminator->getParent());

    // We need to broadcast the value who was the lowest active lane (the first
    // lane) to all other lanes in the wavefront. We use an intrinsic for this,
    // but have to handle 64-bit broadcasts with two calls to this intrinsic.
    Value *BroadcastI = nullptr;

    if (TyBitWidth == 64) {
      Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());
      Value *const ExtractHi =
          B.CreateTrunc(B.CreateLShr(PHI, 32), B.getInt32Ty());
      CallInst *const ReadFirstLaneLo =
          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
      CallInst *const ReadFirstLaneHi =
          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
      Value *const PartialInsert = B.CreateInsertElement(
          UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
      Value *const Insert =
          B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
      BroadcastI = B.CreateBitCast(Insert, Ty);
    } else if (TyBitWidth == 32) {
      BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
    } else {
      llvm_unreachable("Unhandled atomic bit width");
    }

    // Now that we have the result of our single atomic operation, we need to
    // get our individual lane's slice into the result. We use the lane offset
    // we previously calculated combined with the atomic result value we got
    // from the first lane, to get our lane's index into the atomic result.
    Value *LaneOffset = nullptr;
    if (ValDivergent) {
      LaneOffset =
          B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
    } else {
      switch (Op) {
      default:
        llvm_unreachable("Unhandled atomic op");
      case AtomicRMWInst::Add:
      case AtomicRMWInst::Sub:
        LaneOffset = buildMul(B, V, Mbcnt);
        break;
      case AtomicRMWInst::And:
      case AtomicRMWInst::Or:
      case AtomicRMWInst::Max:
      case AtomicRMWInst::Min:
      case AtomicRMWInst::UMax:
      case AtomicRMWInst::UMin:
        LaneOffset = B.CreateSelect(Cond, Identity, V);
        break;
      case AtomicRMWInst::Xor:
        LaneOffset = buildMul(B, V, B.CreateAnd(Mbcnt, 1));
        break;
      }
    }
    Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);

    if (IsPixelShader) {
      // Need a final PHI to reconverge to above the helper lane branch mask.
      B.SetInsertPoint(PixelExitBB->getFirstNonPHI());

      PHINode *const PHI = B.CreatePHI(Ty, 2);
      PHI->addIncoming(UndefValue::get(Ty), PixelEntryBB);
      PHI->addIncoming(Result, I.getParent());
      I.replaceAllUsesWith(PHI);
    } else {
      // Replace the original atomic instruction with the new one.
      I.replaceAllUsesWith(Result);
    }
  }

  // And delete the original.
  I.eraseFromParent();
}

INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                      "AMDGPU atomic optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                    "AMDGPU atomic optimizations", false, false)

FunctionPass *llvm::createAMDGPUAtomicOptimizerPass() {
  return new AMDGPUAtomicOptimizer();
}