//===-- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass ---------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements a TargetTransformInfo analysis pass specific to the
// AMDGPU target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetTransformInfo.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/CostTable.h"
#include "llvm/Target/TargetLowering.h"
using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

static cl::opt<unsigned> UnrollThresholdPrivate(
  "amdgpu-unroll-threshold-private",
  cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
  cl::init(2000), cl::Hidden);

void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L,
                                            TTI::UnrollingPreferences &UP) {
  UP.Threshold = 300; // Twice the default.
  UP.MaxCount = UINT_MAX;
  UP.Partial = true;

  // TODO: Do we want runtime unrolling?

  // Maximum alloca size that can fit in registers. Reserve 16 registers.
  const unsigned MaxAlloca = (256 - 16) * 4;
  for (const BasicBlock *BB : L->getBlocks()) {
    const DataLayout &DL = BB->getModule()->getDataLayout();
    for (const Instruction &I : *BB) {
      const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
      if (!GEP || GEP->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
        continue;

      const Value *Ptr = GEP->getPointerOperand();
      const AllocaInst *Alloca =
          dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL));
      if (Alloca && Alloca->isStaticAlloca()) {
        Type *Ty = Alloca->getAllocatedType();
        unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
        if (AllocaSize > MaxAlloca)
          continue;

        // Check if GEP depends on a value defined by this loop itself.
        bool HasLoopDef = false;
        for (const Value *Op : GEP->operands()) {
          const Instruction *Inst = dyn_cast<Instruction>(Op);
          if (!Inst || L->isLoopInvariant(Op))
            continue;
          if (any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
                return SubLoop->contains(Inst); }))
            continue;
          HasLoopDef = true;
          break;
        }
        if (!HasLoopDef)
          continue;

        // We want to do whatever we can to limit the number of alloca
        // instructions that make it through to the code generator. Allocas
        // require us to use indirect addressing, which is slow and prone to
        // compiler bugs. If this loop does an address calculation on an
        // alloca ptr, then we want to use a higher than normal loop unroll
        // threshold. This will give SROA a better chance to eliminate these
        // allocas.
        //
        // Don't use the maximum allowed value here as it will make some
        // programs way too big.
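        //
        // As an illustrative sketch (hypothetical source, not from this
        // file's tests), a kernel loop such as
        //   float Priv[64];
        //   for (int i = 0; i < 64; ++i)
        //     Priv[i] = f(i);
        // leaves behind a private alloca ('Priv') addressed through a GEP
        // whose index is the loop's induction variable. Raising the
        // threshold lets the unroller fully unroll such a loop, which gives
        // SROA the chance to promote 'Priv' to registers.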
        UP.Threshold = UnrollThresholdPrivate;
        return;
      }
    }
  }
}

unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) {
  if (Vec)
    return 0;

  // Number of VGPRs on SI.
  if (ST->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return 256;

  return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
}

unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) {
  return Vector ? 0 : 32;
}

unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
  switch (AddrSpace) {
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::FLAT_ADDRESS:
    return 128;
  case AMDGPUAS::LOCAL_ADDRESS:
  case AMDGPUAS::REGION_ADDRESS:
    return 64;
  case AMDGPUAS::PRIVATE_ADDRESS:
    return 8 * ST->getMaxPrivateElementSize();
  default:
    if (ST->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS &&
        (AddrSpace == AMDGPUAS::PARAM_D_ADDRESS ||
         AddrSpace == AMDGPUAS::PARAM_I_ADDRESS ||
         (AddrSpace >= AMDGPUAS::CONSTANT_BUFFER_0 &&
          AddrSpace <= AMDGPUAS::CONSTANT_BUFFER_15)))
      return 128;
    llvm_unreachable("unhandled address space");
  }
}

bool AMDGPUTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
                                               unsigned Alignment,
                                               unsigned AddrSpace) const {
  // We allow vectorization of flat stores, even though we may need to
  // decompose them later if they may access private memory. We don't have
  // enough context here, and legalization can handle it.
  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
    return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
      ChainSizeInBytes <= ST->getMaxPrivateElementSize();
  }
  return true;
}

bool AMDGPUTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                                unsigned Alignment,
                                                unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

bool AMDGPUTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                                 unsigned Alignment,
                                                 unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // Disable unrolling if the loop is not vectorized.
  if (VF == 1)
    return 1;

  // Semi-arbitrary large amount.
  return 64;
}

int AMDGPUTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
    TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) {

  EVT OrigTy = TLI->getValueType(DL, Ty);
  if (!OrigTy.isSimple()) {
    return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
                                         Opd1PropInfo, Opd2PropInfo);
  }

  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // Because we don't have any legal vector operations, but do have legal
  // vector types, we need to account for split vectors.
  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;
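
  // As an illustrative sketch of that accounting (assuming v2i32 is a legal
  // type for the subtarget): an add of <2 x i32> legalizes to LT = {1, v2i32},
  // so NElts == 2 and the ISD::ADD case below reports the cost of two
  // full-rate 32-bit adds.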

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  switch (ISD) {
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA: {
    if (SLT == MVT::i64)
      return get64BitInstrCost() * LT.first * NElts;

    // i32
    return getFullRateInstrCost() * LT.first * NElts;
  }
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR: {
    if (SLT == MVT::i64) {
      // i64 add/sub and the bitwise ops are typically split into 2 VALU
      // instructions.
      return 2 * getFullRateInstrCost() * LT.first * NElts;
    }

    return LT.first * NElts * getFullRateInstrCost();
  }
  case ISD::MUL: {
    const int QuarterRateCost = getQuarterRateInstrCost();
    if (SLT == MVT::i64) {
      const int FullRateCost = getFullRateInstrCost();
      return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
    }

    // i32
    return QuarterRateCost * NElts * LT.first;
  }
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
    if (SLT == MVT::f64)
      return LT.first * NElts * get64BitInstrCost();

    if (SLT == MVT::f32 || SLT == MVT::f16)
      return LT.first * NElts * getFullRateInstrCost();
    break;

  case ISD::FDIV:
  case ISD::FREM:
    // FIXME: frem should be handled separately. The fdiv in it is most of it,
    // but the current lowering is also not entirely correct.
    if (SLT == MVT::f64) {
      int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost();

      // Add cost of workaround.
      if (ST->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS)
        Cost += 3 * getFullRateInstrCost();

      return LT.first * Cost * NElts;
    }

    // Assuming no fp32 denormals lowering.
    if (SLT == MVT::f32 || SLT == MVT::f16) {
      assert(!ST->hasFP32Denormals() && "will change when supported");
      int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost();
      return LT.first * NElts * Cost;
    }

    break;
  default:
    break;
  }

  return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
                                       Opd1PropInfo, Opd2PropInfo);
}

unsigned AMDGPUTTIImpl::getCFInstrCost(unsigned Opcode) {
  // XXX - For some reason this isn't called for switch.
  switch (Opcode) {
  case Instruction::Br:
  case Instruction::Ret:
    return 10;
  default:
    return BaseT::getCFInstrCost(Opcode);
  }
}

int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                      unsigned Index) {
  switch (Opcode) {
  case Instruction::ExtractElement:
  case Instruction::InsertElement:
    // Extracts are just reads of a subregister, so are free. Inserts are
    // considered free because we don't want to have any cost for scalarizing
    // operations, and we don't have to copy into a different register class.

    // Dynamic indexing isn't free and is best avoided.
    return Index == ~0u ? 2 : 0;
  default:
    return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
  }
}
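
// The intrinsics listed below produce values that can vary between lanes even
// when all of their operands are uniform, e.g. workitem/lane ids, interpolated
// pixel shader inputs, swizzles, the per-lane ps.live mask, and atomics that
// return the pre-operation memory value.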
static bool isIntrinsicSourceOfDivergence(const IntrinsicInst *I) {
  switch (I->getIntrinsicID()) {
  case Intrinsic::amdgcn_workitem_id_x:
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::amdgcn_interp_mov:
  case Intrinsic::amdgcn_interp_p1:
  case Intrinsic::amdgcn_interp_p2:
  case Intrinsic::amdgcn_mbcnt_hi:
  case Intrinsic::amdgcn_mbcnt_lo:
  case Intrinsic::r600_read_tidig_x:
  case Intrinsic::r600_read_tidig_y:
  case Intrinsic::r600_read_tidig_z:
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_image_atomic_swap:
  case Intrinsic::amdgcn_image_atomic_add:
  case Intrinsic::amdgcn_image_atomic_sub:
  case Intrinsic::amdgcn_image_atomic_smin:
  case Intrinsic::amdgcn_image_atomic_umin:
  case Intrinsic::amdgcn_image_atomic_smax:
  case Intrinsic::amdgcn_image_atomic_umax:
  case Intrinsic::amdgcn_image_atomic_and:
  case Intrinsic::amdgcn_image_atomic_or:
  case Intrinsic::amdgcn_image_atomic_xor:
  case Intrinsic::amdgcn_image_atomic_inc:
  case Intrinsic::amdgcn_image_atomic_dec:
  case Intrinsic::amdgcn_image_atomic_cmpswap:
  case Intrinsic::amdgcn_buffer_atomic_swap:
  case Intrinsic::amdgcn_buffer_atomic_add:
  case Intrinsic::amdgcn_buffer_atomic_sub:
  case Intrinsic::amdgcn_buffer_atomic_smin:
  case Intrinsic::amdgcn_buffer_atomic_umin:
  case Intrinsic::amdgcn_buffer_atomic_smax:
  case Intrinsic::amdgcn_buffer_atomic_umax:
  case Intrinsic::amdgcn_buffer_atomic_and:
  case Intrinsic::amdgcn_buffer_atomic_or:
  case Intrinsic::amdgcn_buffer_atomic_xor:
  case Intrinsic::amdgcn_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_ps_live:
  case Intrinsic::amdgcn_ds_swizzle:
    return true;
  default:
    return false;
  }
}

static bool isArgPassedInSGPR(const Argument *A) {
  const Function *F = A->getParent();

  // Arguments to compute shaders are never a source of divergence.
  if (!AMDGPU::isShader(F->getCallingConv()))
    return true;

  // For non-compute shaders, SGPR inputs are marked with either inreg or byval.
  if (F->getAttributes().hasAttribute(A->getArgNo() + 1, Attribute::InReg) ||
      F->getAttributes().hasAttribute(A->getArgNo() + 1, Attribute::ByVal))
    return true;

  // Everything else is in VGPRs.
  return false;
}

///
/// \returns true if the result of the value could potentially be
/// different across workitems in a wavefront.
bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const {

  if (const Argument *A = dyn_cast<Argument>(V))
    return !isArgPassedInSGPR(A);

  // Loads from the private address space are divergent, because threads
  // can execute the load instruction with the same inputs and get different
  // results.
  //
  // All other loads are not divergent, because if threads issue loads with the
  // same arguments, they will always get the same result.
  if (const LoadInst *Load = dyn_cast<LoadInst>(V))
    return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS;

  // Atomics are divergent because they are executed sequentially: when an
  // atomic operation refers to the same address in each thread, then each
  // thread after the first sees the value written by the previous thread as
  // the original value.
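  //
  // For example (an illustrative case, not from this file's tests): if every
  // lane of a wavefront executes an atomicrmw add on the same address, each
  // lane receives a different pre-operation value, so the result is divergent
  // even though all of the operands are uniform.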
  if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
    return true;

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
    return isIntrinsicSourceOfDivergence(Intrinsic);

  // Assume all function calls are a source of divergence.
  if (isa<CallInst>(V) || isa<InvokeInst>(V))
    return true;

  return false;
}