//===-- AMDGPUAtomicOptimizer.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass optimizes atomic operations by using a single lane of a wavefront
/// to perform the atomic operation, thus reducing contention on that memory
/// location.
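///
/// In outline: when every active lane targets the same address, the per-lane
/// values are first combined across the wavefront (using a DPP-based
/// reduction or scan when the value is divergent), a single lane performs the
/// atomic operation with the combined value, and each lane then reconstructs
/// its own result from the broadcast old value and its lane-local offset.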
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#define DEBUG_TYPE "amdgpu-atomic-optimizer"

using namespace llvm;
using namespace llvm::AMDGPU;

namespace {

struct ReplacementInfo {
  Instruction *I;
  AtomicRMWInst::BinOp Op;
  unsigned ValIdx;
  bool ValDivergent;
};

class AMDGPUAtomicOptimizer : public FunctionPass,
                              public InstVisitor<AMDGPUAtomicOptimizer> {
private:
  SmallVector<ReplacementInfo, 8> ToReplace;
  const LegacyDivergenceAnalysis *DA;
  const DataLayout *DL;
  DominatorTree *DT;
  const GCNSubtarget *ST;
  bool IsPixelShader;

  Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                        Value *const Identity) const;
  Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                   Value *const Identity) const;
  Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;
  void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
                      bool ValDivergent) const;

public:
  static char ID;

  AMDGPUAtomicOptimizer() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<LegacyDivergenceAnalysis>();
    AU.addRequired<TargetPassConfig>();
  }

  void visitAtomicRMWInst(AtomicRMWInst &I);
  void visitIntrinsicInst(IntrinsicInst &I);
};

} // namespace

char AMDGPUAtomicOptimizer::ID = 0;

char &llvm::AMDGPUAtomicOptimizerID = AMDGPUAtomicOptimizer::ID;

bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F)) {
    return false;
  }

  DA = &getAnalysis<LegacyDivergenceAnalysis>();
  DL = &F.getParent()->getDataLayout();
  DominatorTreeWrapperPass *const DTW =
      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  DT = DTW ? &DTW->getDomTree() : nullptr;
  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  ST = &TM.getSubtarget<GCNSubtarget>(F);
  IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;

  visit(F);

  const bool Changed = !ToReplace.empty();

  for (ReplacementInfo &Info : ToReplace) {
    optimizeAtomic(*Info.I, Info.Op, Info.ValIdx, Info.ValDivergent);
  }

  ToReplace.clear();

  return Changed;
}

void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
  // Early exit for unhandled address space atomic instructions.
  switch (I.getPointerAddressSpace()) {
  default:
    return;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::LOCAL_ADDRESS:
    break;
  }

  AtomicRMWInst::BinOp Op = I.getOperation();

  switch (Op) {
  default:
    return;
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
    break;
  }

  const unsigned PtrIdx = 0;
  const unsigned ValIdx = 1;

  // If the pointer operand is divergent, then each lane is doing an atomic
  // operation on a different address, and we cannot optimize that.
  if (DA->isDivergentUse(&I.getOperandUse(PtrIdx))) {
    return;
  }

  const bool ValDivergent = DA->isDivergentUse(&I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent &&
      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
    return;
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
  AtomicRMWInst::BinOp Op;

  switch (I.getIntrinsicID()) {
  default:
    return;
  case Intrinsic::amdgcn_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
    Op = AtomicRMWInst::Add;
    break;
  case Intrinsic::amdgcn_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
    Op = AtomicRMWInst::Sub;
    break;
  case Intrinsic::amdgcn_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
    Op = AtomicRMWInst::And;
    break;
  case Intrinsic::amdgcn_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
    Op = AtomicRMWInst::Or;
    break;
  case Intrinsic::amdgcn_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
    Op = AtomicRMWInst::Xor;
    break;
  case Intrinsic::amdgcn_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
    Op = AtomicRMWInst::Min;
    break;
  case Intrinsic::amdgcn_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
    Op = AtomicRMWInst::UMin;
    break;
  case Intrinsic::amdgcn_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
    Op = AtomicRMWInst::Max;
    break;
  case Intrinsic::amdgcn_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
    Op = AtomicRMWInst::UMax;
    break;
  }

  const unsigned ValIdx = 0;

  const bool ValDivergent = DA->isDivergentUse(&I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent &&
      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
    return;
  }

  // If any of the other arguments to the intrinsic are divergent, we can't
  // optimize the operation.
  for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
    if (DA->isDivergentUse(&I.getOperandUse(Idx))) {
      return;
    }
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

// Use the builder to create the non-atomic counterpart of the specified
// atomicrmw binary op.
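// Integer min and max have no dedicated IR instruction, so they are expanded
// into an icmp feeding a select.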
static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                                  Value *LHS, Value *RHS) {
  CmpInst::Predicate Pred;

  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
    return B.CreateBinOp(Instruction::Add, LHS, RHS);
  case AtomicRMWInst::Sub:
    return B.CreateBinOp(Instruction::Sub, LHS, RHS);
  case AtomicRMWInst::And:
    return B.CreateBinOp(Instruction::And, LHS, RHS);
  case AtomicRMWInst::Or:
    return B.CreateBinOp(Instruction::Or, LHS, RHS);
  case AtomicRMWInst::Xor:
    return B.CreateBinOp(Instruction::Xor, LHS, RHS);

  case AtomicRMWInst::Max:
    Pred = CmpInst::ICMP_SGT;
    break;
  case AtomicRMWInst::Min:
    Pred = CmpInst::ICMP_SLT;
    break;
  case AtomicRMWInst::UMax:
    Pred = CmpInst::ICMP_UGT;
    break;
  case AtomicRMWInst::UMin:
    Pred = CmpInst::ICMP_ULT;
    break;
  }
  Value *Cond = B.CreateICmp(Pred, LHS, RHS);
  return B.CreateSelect(Cond, LHS, RHS);
}

// Use the builder to create a reduction of V across the wavefront, with all
// lanes active, returning the same result in all lanes.
Value *AMDGPUAtomicOptimizer::buildReduction(IRBuilder<> &B,
                                             AtomicRMWInst::BinOp Op, Value *V,
                                             Value *const Identity) const {
  Type *const Ty = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);

  // Reduce within each row of 16 lanes.
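  // Each step combines lanes whose ids differ in bit Idx of their position
  // within the row (row_xmask 1, 2, 4 and 8), i.e. a butterfly reduction.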
  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_XMASK0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }

  // Reduce within each pair of rows (i.e. 32 lanes).
  assert(ST->hasPermLaneX16());
  V = buildNonAtomicBinOp(
      B, Op, V,
      B.CreateIntrinsic(
          Intrinsic::amdgcn_permlanex16, {},
          {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()}));

  if (ST->isWave32())
    return V;

  if (ST->hasPermLane64()) {
    // Reduce across the upper and lower 32 lanes.
    return buildNonAtomicBinOp(
        B, Op, V, B.CreateIntrinsic(Intrinsic::amdgcn_permlane64, {}, V));
  }

  // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
  // combine them with a scalar operation.
  Function *ReadLane =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
  Value *const Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
  Value *const Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
  return buildNonAtomicBinOp(B, Op, Lane0, Lane32);
}

// Use the builder to create an inclusive scan of V across the wavefront, with
// all lanes active.
Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                                        Value *V, Value *const Identity) const {
  Type *const Ty = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);

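  // Build an inclusive prefix scan within each row of 16 lanes by combining V
  // with copies of itself shifted right by 1, 2, 4 and 8 lanes (row_shr); the
  // identity is substituted where the shift runs off the start of a row.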
  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }
  if (ST->hasDPPBroadcasts()) {
    // GFX9 has DPP row broadcast operations.
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST15), B.getInt32(0xa),
                      B.getInt32(0xf), B.getFalse()}));
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST31), B.getInt32(0xc),
                      B.getInt32(0xf), B.getFalse()}));
  } else {
    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.

    // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
    // 48..63).
    assert(ST->hasPermLaneX16());
    Value *const PermX = B.CreateIntrinsic(
        Intrinsic::amdgcn_permlanex16, {},
        {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID),
                      B.getInt32(0xa), B.getInt32(0xf), B.getFalse()}));
    if (!ST->isWave32()) {
      // Combine lane 31 into lanes 32..63.
      Value *const Lane31 = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
                                              {V, B.getInt32(31)});
      V = buildNonAtomicBinOp(
          B, Op, V,
          B.CreateCall(UpdateDPP,
                       {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
                        B.getInt32(0xc), B.getInt32(0xf), B.getFalse()}));
    }
  }
  return V;
}

// Use the builder to create a shift right of V across the wavefront, with all
// lanes active, to turn an inclusive scan into an exclusive scan.
Value *AMDGPUAtomicOptimizer::buildShiftRight(IRBuilder<> &B, Value *V,
                                              Value *const Identity) const {
  Type *const Ty = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);

  if (ST->hasDPPWavefrontShifts()) {
    // GFX9 has DPP wavefront shift operations.
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
                      B.getInt32(0xf), B.getFalse()});
  } else {
    Function *ReadLane =
        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
    Function *WriteLane =
        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});

    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.
    Value *Old = V;
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 + 1),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});

    // Copy the old lane 15 to the new lane 16.
    V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
                                 B.getInt32(16), V});

    if (!ST->isWave32()) {
      // Copy the old lane 31 to the new lane 32.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V});

      // Copy the old lane 47 to the new lane 48.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V});
    }
  }

  return V;
}

static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op,
                                         unsigned BitWidth) {
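  // The identity leaves any operand unchanged under Op: zero for Add, Sub, Or,
  // Xor and UMax (zero being the unsigned minimum), all ones for And and UMin,
  // and the signed extremes for Max and Min.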
  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::UMax:
    return APInt::getMinValue(BitWidth);
  case AtomicRMWInst::And:
  case AtomicRMWInst::UMin:
    return APInt::getMaxValue(BitWidth);
  case AtomicRMWInst::Max:
    return APInt::getSignedMinValue(BitWidth);
  case AtomicRMWInst::Min:
    return APInt::getSignedMaxValue(BitWidth);
  }
}

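// Helper that folds away the multiply when the LHS is the constant 1.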
static Value *buildMul(IRBuilder<> &B, Value *LHS, Value *RHS) {
  const ConstantInt *CI = dyn_cast<ConstantInt>(LHS);
  return (CI && CI->isOne()) ? RHS : B.CreateMul(LHS, RHS);
}

void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
                                           AtomicRMWInst::BinOp Op,
                                           unsigned ValIdx,
                                           bool ValDivergent) const {
  // Start building just before the instruction.
  IRBuilder<> B(&I);

  // If we are in a pixel shader, because of how we have to mask out helper
  // lane invocations, we need to record the entry and exit BB's.
  BasicBlock *PixelEntryBB = nullptr;
  BasicBlock *PixelExitBB = nullptr;

  // If we're optimizing an atomic within a pixel shader, we need to wrap the
  // entire atomic operation in a helper-lane check. We do not want any helper
  // lanes that are around only for the purposes of derivatives to take part
  // in any cross-lane communication, and we use a branch on whether the lane
  // is live to do this.
  if (IsPixelShader) {
    // Record I's original position as the entry block.
    PixelEntryBB = I.getParent();

    Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {}, {});
    Instruction *const NonHelperTerminator =
        SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);

    // Record I's new position as the exit block.
    PixelExitBB = I.getParent();

    I.moveBefore(NonHelperTerminator);
    B.SetInsertPoint(&I);
  }

  Type *const Ty = I.getType();
  const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
  auto *const VecTy = FixedVectorType::get(B.getInt32Ty(), 2);

  // This is the value in the atomic operation we need to combine in order to
  // reduce the number of atomic operations.
  Value *const V = I.getOperand(ValIdx);

  // We need to know how many lanes are active within the wavefront, and we do
  // this by doing a ballot of active lanes.
  Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
  CallInst *const Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());

  // We need to know how many lanes are active within the wavefront that are
  // below us. If we counted each lane linearly starting from 0, a lane is
  // below us only if its associated index was less than ours. We do this by
  // using the mbcnt intrinsic.
  Value *Mbcnt;
  if (ST->isWave32()) {
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {Ballot, B.getInt32(0)});
  } else {
    Value *const BitCast = B.CreateBitCast(Ballot, VecTy);
    Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0));
    Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1));
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {ExtractLo, B.getInt32(0)});
    Mbcnt =
        B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt});
  }
  Mbcnt = B.CreateIntCast(Mbcnt, Ty, false);
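  // Mbcnt now holds this lane's rank among the active lanes (the number of
  // active lanes with a lower lane id), cast to the atomic's type.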

  Value *const Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth));

  Value *ExclScan = nullptr;
  Value *NewV = nullptr;

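  // If the original atomic's result is unused we only need the combined value
  // for the atomic itself; the per-lane exclusive scan and the broadcast of
  // the old value can be skipped.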
  const bool NeedResult = !I.use_empty();

  // If we have a divergent value in each lane, we need to combine the value
  // using DPP.
  if (ValDivergent) {
    // First we need to set all inactive invocations to the identity value, so
    // that they can correctly contribute to the final result.
    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});

    const AtomicRMWInst::BinOp ScanOp =
        Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
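    // Sub is scanned as Add: the single atomic still subtracts the wave-wide
    // total, and each lane later recovers its result by subtracting its
    // exclusive-scan offset from the broadcast old value.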
    if (!NeedResult && ST->hasPermLaneX16()) {
      // On GFX10 the permlanex16 instruction helps us build a reduction
      // without too many readlanes and writelanes, which are generally bad
      // for performance.
      NewV = buildReduction(B, ScanOp, NewV, Identity);
    } else {
      NewV = buildScan(B, ScanOp, NewV, Identity);
      if (NeedResult)
        ExclScan = buildShiftRight(B, NewV, Identity);

      // Read the value from the last lane, which has accumulated the values of
      // each active lane in the wavefront. This will be our new value which we
      // will provide to the atomic operation.
      Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
      assert(TyBitWidth == 32);
      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
                               {NewV, LastLaneIdx});
    }

    // Finally mark the readlanes in the WWM section.
    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
  } else {
    switch (Op) {
    default:
      llvm_unreachable("Unhandled atomic op");

    case AtomicRMWInst::Add:
    case AtomicRMWInst::Sub: {
      // The new value we will be contributing to the atomic operation is the
      // old value times the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, Ctpop);
      break;
    }

    case AtomicRMWInst::And:
    case AtomicRMWInst::Or:
    case AtomicRMWInst::Max:
    case AtomicRMWInst::Min:
    case AtomicRMWInst::UMax:
    case AtomicRMWInst::UMin:
      // These operations with a uniform value are idempotent: doing the atomic
      // operation multiple times has the same effect as doing it once.
      NewV = V;
      break;

    case AtomicRMWInst::Xor:
      // The new value we will be contributing to the atomic operation is the
      // old value times the parity of the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, B.CreateAnd(Ctpop, 1));
      break;
    }
  }

  // We only want a single lane to enter our new control flow, and we do this
  // by checking if there are any active lanes below us. Only one lane will
  // have 0 active lanes below us, so that will be the only one to progress.
  Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getIntN(TyBitWidth, 0));

  // Store I's original basic block before we split the block.
  BasicBlock *const EntryBB = I.getParent();

  // We need to introduce some new control flow to force a single lane to be
  // active. We do this by splitting I's basic block at I, and introducing the
  // new block such that:
  // entry --> single_lane -\
  //       \------------------> exit
  Instruction *const SingleLaneTerminator =
      SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);

  // Move the IR builder into single_lane next.
  B.SetInsertPoint(SingleLaneTerminator);

  // Clone the original atomic operation into single lane, replacing the
  // original value with our newly created one.
  Instruction *const NewI = I.clone();
  B.Insert(NewI);
  NewI->setOperand(ValIdx, NewV);

  // Move the IR builder into exit next, and start inserting just before the
  // original instruction.
  B.SetInsertPoint(&I);

  if (NeedResult) {
    // Create a PHI node to get our new atomic result into the exit block.
    PHINode *const PHI = B.CreatePHI(Ty, 2);
    PHI->addIncoming(UndefValue::get(Ty), EntryBB);
    PHI->addIncoming(NewI, SingleLaneTerminator->getParent());

    // We need to broadcast the value who was the lowest active lane (the first
    // lane) to all other lanes in the wavefront. We use an intrinsic for this,
    // but have to handle 64-bit broadcasts with two calls to this intrinsic.
    Value *BroadcastI = nullptr;

    if (TyBitWidth == 64) {
      Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());
      Value *const ExtractHi =
          B.CreateTrunc(B.CreateLShr(PHI, 32), B.getInt32Ty());
      CallInst *const ReadFirstLaneLo =
          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
      CallInst *const ReadFirstLaneHi =
          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
      Value *const PartialInsert = B.CreateInsertElement(
          UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
      Value *const Insert =
          B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
      BroadcastI = B.CreateBitCast(Insert, Ty);
    } else if (TyBitWidth == 32) {
      BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
    } else {
      llvm_unreachable("Unhandled atomic bit width");
    }

    // Now that we have the result of our single atomic operation, we need to
    // get our individual lane's slice into the result. We use the lane offset
    // we previously calculated combined with the atomic result value we got
    // from the first lane, to get our lane's index into the atomic result.
    Value *LaneOffset = nullptr;
    if (ValDivergent) {
      LaneOffset =
          B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
    } else {
      switch (Op) {
      default:
        llvm_unreachable("Unhandled atomic op");
      case AtomicRMWInst::Add:
      case AtomicRMWInst::Sub:
        LaneOffset = buildMul(B, V, Mbcnt);
        break;
      case AtomicRMWInst::And:
      case AtomicRMWInst::Or:
      case AtomicRMWInst::Max:
      case AtomicRMWInst::Min:
      case AtomicRMWInst::UMax:
      case AtomicRMWInst::UMin:
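        // The lowest active lane behaves as if it performed the atomic first
        // and keeps the unmodified old value; every other lane sees the old
        // value combined once with the uniform operand.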
        LaneOffset = B.CreateSelect(Cond, Identity, V);
        break;
      case AtomicRMWInst::Xor:
        LaneOffset = buildMul(B, V, B.CreateAnd(Mbcnt, 1));
        break;
      }
    }
    Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);

    if (IsPixelShader) {
      // Need a final PHI to reconverge to above the helper lane branch mask.
      B.SetInsertPoint(PixelExitBB->getFirstNonPHI());

      PHINode *const PHI = B.CreatePHI(Ty, 2);
      PHI->addIncoming(UndefValue::get(Ty), PixelEntryBB);
      PHI->addIncoming(Result, I.getParent());
      I.replaceAllUsesWith(PHI);
    } else {
      // Replace the original atomic instruction with the new one.
      I.replaceAllUsesWith(Result);
    }
  }

  // And delete the original.
  I.eraseFromParent();
}

INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                      "AMDGPU atomic optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                    "AMDGPU atomic optimizations", false, false)

FunctionPass *llvm::createAMDGPUAtomicOptimizerPass() {
  return new AMDGPUAtomicOptimizer();
}