1 //===-- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass ---------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // \file 11 // This file implements a TargetTransformInfo analysis pass specific to the 12 // AMDGPU target machine. It uses the target's detailed information to provide 13 // more precise answers to certain TTI queries, while letting the target 14 // independent and default TTI implementations handle the rest. 15 // 16 //===----------------------------------------------------------------------===// 17 18 #include "AMDGPUTargetTransformInfo.h" 19 #include "llvm/Analysis/LoopInfo.h" 20 #include "llvm/Analysis/TargetTransformInfo.h" 21 #include "llvm/Analysis/ValueTracking.h" 22 #include "llvm/CodeGen/BasicTTIImpl.h" 23 #include "llvm/IR/Module.h" 24 #include "llvm/IR/Intrinsics.h" 25 #include "llvm/Support/Debug.h" 26 #include "llvm/Target/CostTable.h" 27 #include "llvm/Target/TargetLowering.h" 28 using namespace llvm; 29 30 #define DEBUG_TYPE "AMDGPUtti" 31 32 void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, 33 TTI::UnrollingPreferences &UP) { 34 UP.Threshold = 300; // Twice the default. 35 UP.MaxCount = UINT_MAX; 36 UP.Partial = true; 37 38 // TODO: Do we want runtime unrolling? 39 40 for (const BasicBlock *BB : L->getBlocks()) { 41 const DataLayout &DL = BB->getModule()->getDataLayout(); 42 for (const Instruction &I : *BB) { 43 const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I); 44 if (!GEP || GEP->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) 45 continue; 46 47 const Value *Ptr = GEP->getPointerOperand(); 48 const AllocaInst *Alloca = 49 dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL)); 50 if (Alloca) { 51 // We want to do whatever we can to limit the number of alloca 52 // instructions that make it through to the code generator. allocas 53 // require us to use indirect addressing, which is slow and prone to 54 // compiler bugs. If this loop does an address calculation on an 55 // alloca ptr, then we want to use a higher than normal loop unroll 56 // threshold. This will give SROA a better chance to eliminate these 57 // allocas. 58 // 59 // Don't use the maximum allowed value here as it will make some 60 // programs way too big. 61 UP.Threshold = 800; 62 } 63 } 64 } 65 } 66 67 unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) { 68 if (Vec) 69 return 0; 70 71 // Number of VGPRs on SI. 72 if (ST->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) 73 return 256; 74 75 return 4 * 128; // XXX - 4 channels. Should these count as vector instead? 76 } 77 78 unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) { 79 return Vector ? 0 : 32; 80 } 81 82 unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) { 83 // Semi-arbitrary large amount. 84 return 64; 85 } 86 87 unsigned AMDGPUTTIImpl::getCFInstrCost(unsigned Opcode) { 88 // XXX - For some reason this isn't called for switch. 89 switch (Opcode) { 90 case Instruction::Br: 91 case Instruction::Ret: 92 return 10; 93 default: 94 return BaseT::getCFInstrCost(Opcode); 95 } 96 } 97 98 int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, 99 unsigned Index) { 100 switch (Opcode) { 101 case Instruction::ExtractElement: 102 // Dynamic indexing isn't free and is best avoided. 103 return Index == ~0u ? 2 : 0; 104 default: 105 return BaseT::getVectorInstrCost(Opcode, ValTy, Index); 106 } 107 } 108 109 static bool isIntrinsicSourceOfDivergence(const TargetIntrinsicInfo *TII, 110 const IntrinsicInst *I) { 111 switch (I->getIntrinsicID()) { 112 default: 113 return false; 114 case Intrinsic::not_intrinsic: 115 // This means we have an intrinsic that isn't defined in 116 // IntrinsicsAMDGPU.td 117 break; 118 119 case Intrinsic::amdgcn_workitem_id_x: 120 case Intrinsic::amdgcn_workitem_id_y: 121 case Intrinsic::amdgcn_workitem_id_z: 122 case Intrinsic::amdgcn_interp_p1: 123 case Intrinsic::amdgcn_interp_p2: 124 case Intrinsic::amdgcn_mbcnt_hi: 125 case Intrinsic::amdgcn_mbcnt_lo: 126 case Intrinsic::r600_read_tidig_x: 127 case Intrinsic::r600_read_tidig_y: 128 case Intrinsic::r600_read_tidig_z: 129 case Intrinsic::amdgcn_image_atomic_swap: 130 case Intrinsic::amdgcn_image_atomic_add: 131 case Intrinsic::amdgcn_image_atomic_sub: 132 case Intrinsic::amdgcn_image_atomic_smin: 133 case Intrinsic::amdgcn_image_atomic_umin: 134 case Intrinsic::amdgcn_image_atomic_smax: 135 case Intrinsic::amdgcn_image_atomic_umax: 136 case Intrinsic::amdgcn_image_atomic_and: 137 case Intrinsic::amdgcn_image_atomic_or: 138 case Intrinsic::amdgcn_image_atomic_xor: 139 case Intrinsic::amdgcn_image_atomic_inc: 140 case Intrinsic::amdgcn_image_atomic_dec: 141 case Intrinsic::amdgcn_image_atomic_cmpswap: 142 return true; 143 } 144 145 StringRef Name = I->getCalledFunction()->getName(); 146 switch (TII->lookupName((const char *)Name.bytes_begin(), Name.size())) { 147 default: 148 return false; 149 case AMDGPUIntrinsic::SI_tid: 150 case AMDGPUIntrinsic::SI_fs_interp: 151 return true; 152 } 153 } 154 155 static bool isArgPassedInSGPR(const Argument *A) { 156 const Function *F = A->getParent(); 157 unsigned ShaderType = AMDGPU::getShaderType(*F); 158 159 // Arguments to compute shaders are never a source of divergence. 160 if (ShaderType == ShaderType::COMPUTE) 161 return true; 162 163 // For non-compute shaders, SGPR inputs are marked with either inreg or byval. 164 if (F->getAttributes().hasAttribute(A->getArgNo() + 1, Attribute::InReg) || 165 F->getAttributes().hasAttribute(A->getArgNo() + 1, Attribute::ByVal)) 166 return true; 167 168 // Everything else is in VGPRs. 169 return false; 170 } 171 172 /// 173 /// \returns true if the result of the value could potentially be 174 /// different across workitems in a wavefront. 175 bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const { 176 177 if (const Argument *A = dyn_cast<Argument>(V)) 178 return !isArgPassedInSGPR(A); 179 180 // Loads from the private address space are divergent, because threads 181 // can execute the load instruction with the same inputs and get different 182 // results. 183 // 184 // All other loads are not divergent, because if threads issue loads with the 185 // same arguments, they will always get the same result. 186 if (const LoadInst *Load = dyn_cast<LoadInst>(V)) 187 return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS; 188 189 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) { 190 const TargetMachine &TM = getTLI()->getTargetMachine(); 191 return isIntrinsicSourceOfDivergence(TM.getIntrinsicInfo(), Intrinsic); 192 } 193 194 // Assume all function calls are a source of divergence. 195 if (isa<CallInst>(V) || isa<InvokeInst>(V)) 196 return true; 197 198 return false; 199 } 200