//===-- AMDGPUAtomicOptimizer.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass optimizes atomic operations by using a single lane of a wavefront
/// to perform the atomic operation, thus reducing contention on that memory
/// location.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#define DEBUG_TYPE "amdgpu-atomic-optimizer"

using namespace llvm;
using namespace llvm::AMDGPU;

namespace {

struct ReplacementInfo {
  Instruction *I;
  AtomicRMWInst::BinOp Op;
  unsigned ValIdx;
  bool ValDivergent;
};

class AMDGPUAtomicOptimizer : public FunctionPass,
                              public InstVisitor<AMDGPUAtomicOptimizer> {
private:
  SmallVector<ReplacementInfo, 8> ToReplace;
  const LegacyDivergenceAnalysis *DA;
  const DataLayout *DL;
  DominatorTree *DT;
  const GCNSubtarget *ST;
  bool IsPixelShader;

  Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                        Value *const Identity) const;
  Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                   Value *const Identity) const;
  Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;
  void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
                      bool ValDivergent) const;

public:
  static char ID;

  AMDGPUAtomicOptimizer() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<LegacyDivergenceAnalysis>();
    AU.addRequired<TargetPassConfig>();
  }

  void visitAtomicRMWInst(AtomicRMWInst &I);
  void visitIntrinsicInst(IntrinsicInst &I);
};

} // namespace

char AMDGPUAtomicOptimizer::ID = 0;

char &llvm::AMDGPUAtomicOptimizerID = AMDGPUAtomicOptimizer::ID;

bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F)) {
    return false;
  }

  DA = &getAnalysis<LegacyDivergenceAnalysis>();
  DL = &F.getParent()->getDataLayout();
  DominatorTreeWrapperPass *const DTW =
      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  DT = DTW ? &DTW->getDomTree() : nullptr;
  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  ST = &TM.getSubtarget<GCNSubtarget>(F);
  IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;

  visit(F);

  const bool Changed = !ToReplace.empty();

  for (ReplacementInfo &Info : ToReplace) {
    optimizeAtomic(*Info.I, Info.Op, Info.ValIdx, Info.ValDivergent);
  }

  ToReplace.clear();

  return Changed;
}

void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
  // Early exit for unhandled address space atomic instructions.
  switch (I.getPointerAddressSpace()) {
  default:
    return;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::LOCAL_ADDRESS:
    break;
  }

  AtomicRMWInst::BinOp Op = I.getOperation();

  switch (Op) {
  default:
    return;
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
    break;
  }

  const unsigned PtrIdx = 0;
  const unsigned ValIdx = 1;

  // If the pointer operand is divergent, then each lane is doing an atomic
  // operation on a different address, and we cannot optimize that.
  if (DA->isDivergentUse(&I.getOperandUse(PtrIdx))) {
    return;
  }

  const bool ValDivergent = DA->isDivergentUse(&I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent &&
      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
    return;
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
  AtomicRMWInst::BinOp Op;

  switch (I.getIntrinsicID()) {
  default:
    return;
  case Intrinsic::amdgcn_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
    Op = AtomicRMWInst::Add;
    break;
  case Intrinsic::amdgcn_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
    Op = AtomicRMWInst::Sub;
    break;
  case Intrinsic::amdgcn_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
    Op = AtomicRMWInst::And;
    break;
  case Intrinsic::amdgcn_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
    Op = AtomicRMWInst::Or;
    break;
  case Intrinsic::amdgcn_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
    Op = AtomicRMWInst::Xor;
    break;
  case Intrinsic::amdgcn_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
    Op = AtomicRMWInst::Min;
    break;
  case Intrinsic::amdgcn_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
    Op = AtomicRMWInst::UMin;
    break;
  case Intrinsic::amdgcn_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
    Op = AtomicRMWInst::Max;
    break;
  case Intrinsic::amdgcn_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
    Op = AtomicRMWInst::UMax;
    break;
  }

  const unsigned ValIdx = 0;

  const bool ValDivergent = DA->isDivergentUse(&I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent &&
      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
    return;
  }

  // If any of the other arguments to the intrinsic are divergent, we can't
  // optimize the operation.
  for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
    if (DA->isDivergentUse(&I.getOperandUse(Idx))) {
      return;
    }
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

// Use the builder to create the non-atomic counterpart of the specified
// atomicrmw binary op.
static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                                  Value *LHS, Value *RHS) {
  CmpInst::Predicate Pred;

  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
    return B.CreateBinOp(Instruction::Add, LHS, RHS);
  case AtomicRMWInst::Sub:
    return B.CreateBinOp(Instruction::Sub, LHS, RHS);
  case AtomicRMWInst::And:
    return B.CreateBinOp(Instruction::And, LHS, RHS);
  case AtomicRMWInst::Or:
    return B.CreateBinOp(Instruction::Or, LHS, RHS);
  case AtomicRMWInst::Xor:
    return B.CreateBinOp(Instruction::Xor, LHS, RHS);

  case AtomicRMWInst::Max:
    Pred = CmpInst::ICMP_SGT;
    break;
  case AtomicRMWInst::Min:
    Pred = CmpInst::ICMP_SLT;
    break;
  case AtomicRMWInst::UMax:
    Pred = CmpInst::ICMP_UGT;
    break;
  case AtomicRMWInst::UMin:
    Pred = CmpInst::ICMP_ULT;
    break;
  }
  Value *Cond = B.CreateICmp(Pred, LHS, RHS);
  return B.CreateSelect(Cond, LHS, RHS);
}

// Use the builder to create a reduction of V across the wavefront, with all
// lanes active, returning the same result in all lanes.
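// For example, an add reduction over per-lane inputs v0..v63 returns
// v0 + v1 + ... + v63 in every lane; inactive lanes have already been set to
// the identity value by the caller, so they contribute nothing.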
Value *AMDGPUAtomicOptimizer::buildReduction(IRBuilder<> &B,
                                             AtomicRMWInst::BinOp Op, Value *V,
                                             Value *const Identity) const {
  Type *const Ty = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);

  // Reduce within each row of 16 lanes.
  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_XMASK0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }

  // Reduce within each pair of rows (i.e. 32 lanes).
  assert(ST->hasPermLaneX16());
  V = buildNonAtomicBinOp(
      B, Op, V,
      B.CreateIntrinsic(
          Intrinsic::amdgcn_permlanex16, {},
          {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()}));

  if (ST->isWave32())
    return V;

  // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
  // combine them with a scalar operation.
  Function *ReadLane =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
  Value *const Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
  Value *const Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
  return buildNonAtomicBinOp(B, Op, Lane0, Lane32);
}

// Use the builder to create an inclusive scan of V across the wavefront, with
// all lanes active.
Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                                        Value *V, Value *const Identity) const {
  Type *const Ty = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);

  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }
  if (ST->hasDPPBroadcasts()) {
    // GFX9 has DPP row broadcast operations.
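    // row_bcast:15 (row_mask 0xa) copies lane 15 into lanes 16..31 and lane 47
    // into lanes 48..63; row_bcast:31 (row_mask 0xc) then copies lane 31 into
    // lanes 32..63, completing the inclusive scan across row boundaries.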
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST15), B.getInt32(0xa),
                      B.getInt32(0xf), B.getFalse()}));
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST31), B.getInt32(0xc),
                      B.getInt32(0xf), B.getFalse()}));
  } else {
    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.

    // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
    // 48..63).
    assert(ST->hasPermLaneX16());
    Value *const PermX = B.CreateIntrinsic(
        Intrinsic::amdgcn_permlanex16, {},
        {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID),
                      B.getInt32(0xa), B.getInt32(0xf), B.getFalse()}));
    if (!ST->isWave32()) {
      // Combine lane 31 into lanes 32..63.
      Value *const Lane31 = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
                                              {V, B.getInt32(31)});
      V = buildNonAtomicBinOp(
          B, Op, V,
          B.CreateCall(UpdateDPP,
                       {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
                        B.getInt32(0xc), B.getInt32(0xf), B.getFalse()}));
    }
  }
  return V;
}

// Use the builder to create a shift right of V across the wavefront, with all
// lanes active, to turn an inclusive scan into an exclusive scan.
Value *AMDGPUAtomicOptimizer::buildShiftRight(IRBuilder<> &B, Value *V,
                                              Value *const Identity) const {
  Type *const Ty = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);

  if (ST->hasDPPWavefrontShifts()) {
    // GFX9 has DPP wavefront shift operations.
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
                      B.getInt32(0xf), B.getFalse()});
  } else {
    Function *ReadLane =
        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
    Function *WriteLane =
        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});

    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.
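    // The row_shr:1 below only shifts within each row of 16 lanes, so the
    // values that must cross a row boundary (old lanes 15, 31 and 47) are
    // patched in afterwards with readlane/writelane pairs.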
    Value *Old = V;
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 + 1),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});

    // Copy the old lane 15 to the new lane 16.
    V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
                                 B.getInt32(16), V});

    if (!ST->isWave32()) {
      // Copy the old lane 31 to the new lane 32.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V});

      // Copy the old lane 47 to the new lane 48.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V});
    }
  }

  return V;
}

static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op,
                                         unsigned BitWidth) {
  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::UMax:
    return APInt::getMinValue(BitWidth);
  case AtomicRMWInst::And:
  case AtomicRMWInst::UMin:
    return APInt::getMaxValue(BitWidth);
  case AtomicRMWInst::Max:
    return APInt::getSignedMinValue(BitWidth);
  case AtomicRMWInst::Min:
    return APInt::getSignedMaxValue(BitWidth);
  }
}

static Value *buildMul(IRBuilder<> &B, Value *LHS, Value *RHS) {
  const ConstantInt *CI = dyn_cast<ConstantInt>(LHS);
  return (CI && CI->isOne()) ? RHS : B.CreateMul(LHS, RHS);
}

void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
                                           AtomicRMWInst::BinOp Op,
                                           unsigned ValIdx,
                                           bool ValDivergent) const {
  // Start building just before the instruction.
  IRBuilder<> B(&I);

  // If we are in a pixel shader, because of how we have to mask out helper
  // lane invocations, we need to record the entry and exit BB's.
  BasicBlock *PixelEntryBB = nullptr;
  BasicBlock *PixelExitBB = nullptr;

  // If we're optimizing an atomic within a pixel shader, we need to wrap the
  // entire atomic operation in a helper-lane check. We do not want any helper
  // lanes that are around only for the purposes of derivatives to take part
  // in any cross-lane communication, and we use a branch on whether the lane is
  // live to do this.
  if (IsPixelShader) {
    // Record I's original position as the entry block.
    PixelEntryBB = I.getParent();

    Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {}, {});
    Instruction *const NonHelperTerminator =
        SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);

    // Record I's new position as the exit block.
    PixelExitBB = I.getParent();

    I.moveBefore(NonHelperTerminator);
    B.SetInsertPoint(&I);
  }

  Type *const Ty = I.getType();
  const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
  auto *const VecTy = FixedVectorType::get(B.getInt32Ty(), 2);

  // This is the value in the atomic operation we need to combine in order to
  // reduce the number of atomic operations.
  Value *const V = I.getOperand(ValIdx);

  // We need to know how many lanes are active within the wavefront, and we do
  // this by doing a ballot of active lanes.
  Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
  CallInst *const Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());

  // We need to know how many lanes are active within the wavefront that are
  // below us. If we counted each lane linearly starting from 0, a lane is
  // below us only if its associated index was less than ours. We do this by
  // using the mbcnt intrinsic.
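  // For example, if the ballot mask is 0b101101, the lane with linear index 5
  // sees three set bits below it, so its Mbcnt value is 3.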
  Value *Mbcnt;
  if (ST->isWave32()) {
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {Ballot, B.getInt32(0)});
  } else {
    Value *const BitCast = B.CreateBitCast(Ballot, VecTy);
    Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0));
    Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1));
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {ExtractLo, B.getInt32(0)});
    Mbcnt =
        B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt});
  }
  Mbcnt = B.CreateIntCast(Mbcnt, Ty, false);

  Value *const Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth));

  Value *ExclScan = nullptr;
  Value *NewV = nullptr;

  const bool NeedResult = !I.use_empty();

  // If we have a divergent value in each lane, we need to combine the value
  // using DPP.
  if (ValDivergent) {
    // First we need to set all inactive invocations to the identity value, so
    // that they can correctly contribute to the final result.
    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});

    const AtomicRMWInst::BinOp ScanOp =
        Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
    if (!NeedResult && ST->hasPermLaneX16()) {
      // On GFX10 the permlanex16 instruction helps us build a reduction without
      // too many readlanes and writelanes, which are generally bad for
      // performance.
      NewV = buildReduction(B, ScanOp, NewV, Identity);
    } else {
      NewV = buildScan(B, ScanOp, NewV, Identity);
      if (NeedResult)
        ExclScan = buildShiftRight(B, NewV, Identity);

      // Read the value from the last lane, which has accumulated the values of
      // each active lane in the wavefront. This will be our new value which we
      // will provide to the atomic operation.
      Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
      assert(TyBitWidth == 32);
      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
                               {NewV, LastLaneIdx});
    }

    // Finally mark the readlanes in the WWM section.
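    // strict_wwm marks NewV as having been computed with all lanes enabled;
    // only this final value escapes the whole-wave-mode region back into
    // normally exec-masked code.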
    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
  } else {
    switch (Op) {
    default:
      llvm_unreachable("Unhandled atomic op");

    case AtomicRMWInst::Add:
    case AtomicRMWInst::Sub: {
      // The new value we will be contributing to the atomic operation is the
      // old value times the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, Ctpop);
      break;
    }

    case AtomicRMWInst::And:
    case AtomicRMWInst::Or:
    case AtomicRMWInst::Max:
    case AtomicRMWInst::Min:
    case AtomicRMWInst::UMax:
    case AtomicRMWInst::UMin:
      // These operations with a uniform value are idempotent: doing the atomic
      // operation multiple times has the same effect as doing it once.
      NewV = V;
      break;

    case AtomicRMWInst::Xor:
      // The new value we will be contributing to the atomic operation is the
      // old value times the parity of the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, B.CreateAnd(Ctpop, 1));
      break;
    }
  }

  // We only want a single lane to enter our new control flow, and we do this
  // by checking if there are any active lanes below us. Only one lane will
  // have 0 active lanes below us, so that will be the only one to progress.
  Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getIntN(TyBitWidth, 0));

  // Store I's original basic block before we split the block.
  BasicBlock *const EntryBB = I.getParent();

  // We need to introduce some new control flow to force a single lane to be
  // active. We do this by splitting I's basic block at I, and introducing the
  // new block such that:
  // entry --> single_lane -\
  //       \------------------> exit
  Instruction *const SingleLaneTerminator =
      SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);

  // Move the IR builder into single_lane next.
  B.SetInsertPoint(SingleLaneTerminator);

  // Clone the original atomic operation into single lane, replacing the
  // original value with our newly created one.
  Instruction *const NewI = I.clone();
  B.Insert(NewI);
  NewI->setOperand(ValIdx, NewV);

  // Move the IR builder into exit next, and start inserting just before the
  // original instruction.
  B.SetInsertPoint(&I);

  if (NeedResult) {
    // Create a PHI node to get our new atomic result into the exit block.
    PHINode *const PHI = B.CreatePHI(Ty, 2);
    PHI->addIncoming(UndefValue::get(Ty), EntryBB);
    PHI->addIncoming(NewI, SingleLaneTerminator->getParent());

    // We need to broadcast the value from the lowest active lane (the first
    // lane) to all other lanes in the wavefront. We use an intrinsic for this,
    // but have to handle 64-bit broadcasts with two calls to this intrinsic.
    Value *BroadcastI = nullptr;

    if (TyBitWidth == 64) {
      Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());
      Value *const ExtractHi =
          B.CreateTrunc(B.CreateLShr(PHI, 32), B.getInt32Ty());
      CallInst *const ReadFirstLaneLo =
          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
      CallInst *const ReadFirstLaneHi =
          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
      Value *const PartialInsert = B.CreateInsertElement(
          UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
      Value *const Insert =
          B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
      BroadcastI = B.CreateBitCast(Insert, Ty);
    } else if (TyBitWidth == 32) {

      BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
    } else {
      llvm_unreachable("Unhandled atomic bit width");
    }

    // Now that we have the result of our single atomic operation, we need to
    // get our individual lane's slice into the result. We use the lane offset
    // we previously calculated combined with the atomic result value we got
    // from the first lane, to get our lane's index into the atomic result.
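    // For example, for an add of a uniform value V, lane i's result is
    // BroadcastI + V * Mbcnt_i: the value the location would have held just
    // before lane i's contribution had each lane issued its own atomic.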
    Value *LaneOffset = nullptr;
    if (ValDivergent) {
      LaneOffset =
          B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
    } else {
      switch (Op) {
      default:
        llvm_unreachable("Unhandled atomic op");
      case AtomicRMWInst::Add:
      case AtomicRMWInst::Sub:
        LaneOffset = buildMul(B, V, Mbcnt);
        break;
      case AtomicRMWInst::And:
      case AtomicRMWInst::Or:
      case AtomicRMWInst::Max:
      case AtomicRMWInst::Min:
      case AtomicRMWInst::UMax:
      case AtomicRMWInst::UMin:
        LaneOffset = B.CreateSelect(Cond, Identity, V);
        break;
      case AtomicRMWInst::Xor:
        LaneOffset = buildMul(B, V, B.CreateAnd(Mbcnt, 1));
        break;
      }
    }
    Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);

    if (IsPixelShader) {
      // Need a final PHI to reconverge to above the helper lane branch mask.
      B.SetInsertPoint(PixelExitBB->getFirstNonPHI());

      PHINode *const PHI = B.CreatePHI(Ty, 2);
      PHI->addIncoming(UndefValue::get(Ty), PixelEntryBB);
      PHI->addIncoming(Result, I.getParent());
      I.replaceAllUsesWith(PHI);
    } else {
      // Replace the original atomic instruction with the new one.
      I.replaceAllUsesWith(Result);
    }
  }

  // And delete the original.
  I.eraseFromParent();
}

INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                      "AMDGPU atomic optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                    "AMDGPU atomic optimizations", false, false)

FunctionPass *llvm::createAMDGPUAtomicOptimizerPass() {
  return new AMDGPUAtomicOptimizer();
}