//===-- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass ---------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements a TargetTransformInfo analysis pass specific to the
// AMDGPU target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetTransformInfo.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/CostTable.h"
#include "llvm/Target/TargetLowering.h"
using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

// Boosted unroll threshold used when a loop addresses private (scratch)
// memory: fully unrolling such loops gives SROA a chance to eliminate the
// alloca entirely, which is very profitable on AMDGPU.
static cl::opt<unsigned> UnrollThresholdPrivate(
  "amdgpu-unroll-threshold-private",
  cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
  cl::init(2000), cl::Hidden);

// Boosted unroll threshold used when a loop addresses local (LDS) memory:
// unrolling lets ds instructions with different constant offsets combine.
static cl::opt<unsigned> UnrollThresholdLocal(
  "amdgpu-unroll-threshold-local",
  cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
  cl::init(1000), cl::Hidden);

// Tune the loop unroller for AMDGPU: start from a boosted base threshold,
// then raise it further (up to the larger of the two cl::opts above) for
// loops containing GEPs into private or local memory whose indices are
// computed inside the loop itself.
void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L,
                                            TTI::UnrollingPreferences &UP) {
  UP.Threshold = 300; // Twice the default.
  UP.MaxCount = UINT_MAX;
  UP.Partial = true;

  // TODO: Do we want runtime unrolling?

  // Maximum alloca size than can fit registers. Reserve 16 registers.
  const unsigned MaxAlloca = (256 - 16) * 4;
  unsigned ThresholdPrivate = UnrollThresholdPrivate;
  unsigned ThresholdLocal = UnrollThresholdLocal;
  // Once UP.Threshold reaches MaxBoost no further boost is possible, so we
  // can stop scanning early (see the return at the bottom of the loop).
  unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
  AMDGPUAS ASST = ST->getAMDGPUAS();
  for (const BasicBlock *BB : L->getBlocks()) {
    const DataLayout &DL = BB->getModule()->getDataLayout();
    unsigned LocalGEPsSeen = 0;

    for (const Instruction &I : *BB) {
      // Only GEPs into the private or local address spaces can trigger a
      // threshold boost; everything else is skipped.
      const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
      if (!GEP)
        continue;

      unsigned AS = GEP->getAddressSpace();
      unsigned Threshold = 0;
      if (AS == ASST.PRIVATE_ADDRESS)
        Threshold = ThresholdPrivate;
      else if (AS == ASST.LOCAL_ADDRESS)
        Threshold = ThresholdLocal;
      else
        continue;

      // Skip if an earlier GEP already boosted the threshold at least this
      // far.
      if (UP.Threshold >= Threshold)
        continue;

      if (AS == ASST.PRIVATE_ADDRESS) {
        // Only boost for a statically-sized alloca small enough to be
        // promoted to registers (bounded by MaxAlloca above).
        const Value *Ptr = GEP->getPointerOperand();
        const AllocaInst *Alloca =
            dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL));
        if (!Alloca || !Alloca->isStaticAlloca())
          continue;
        Type *Ty = Alloca->getAllocatedType();
        unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
        if (AllocaSize > MaxAlloca)
          continue;
      } else if (AS == ASST.LOCAL_ADDRESS) {
        LocalGEPsSeen++;
        // Inhibit unroll for local memory if we have seen addressing not to
        // a variable, most likely we will be unable to combine it.
        // Do not unroll too deep inner loops for local memory to give a chance
        // to unroll an outer loop for a more important reason.
        if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
            (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
             !isa<Argument>(GEP->getPointerOperand())))
          continue;
      }

      // Check if GEP depends on a value defined by this loop itself.
      bool HasLoopDef = false;
      for (const Value *Op : GEP->operands()) {
        const Instruction *Inst = dyn_cast<Instruction>(Op);
        if (!Inst || L->isLoopInvariant(Op))
          continue;

        // An operand defined only inside a subloop does not count as a
        // dependence of this loop; the subloop can be unrolled on its own.
        if (any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
             return SubLoop->contains(Inst); }))
          continue;
        HasLoopDef = true;
        break;
      }
      if (!HasLoopDef)
        continue;

      // We want to do whatever we can to limit the number of alloca
      // instructions that make it through to the code generator.  allocas
      // require us to use indirect addressing, which is slow and prone to
      // compiler bugs.  If this loop does an address calculation on an
      // alloca ptr, then we want to use a higher than normal loop unroll
      // threshold. This will give SROA a better chance to eliminate these
      // allocas.
      //
      // We also want to have more unrolling for local memory to let ds
      // instructions with different offsets combine.
      //
      // Don't use the maximum allowed value here as it will make some
      // programs way too big.
      UP.Threshold = Threshold;
      DEBUG(dbgs() << "Set unroll threshold " << Threshold << " for loop:\n"
                   << *L << " due to " << *GEP << '\n');
      if (UP.Threshold == MaxBoost)
        return;
    }
  }
}

// Number of registers available for the vectorizer's register-pressure
// estimate.  No vector registers are reported: AMDGPU has no legal vector
// ALU operations, only legal vector types (see getArithmeticInstrCost).
unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) {
  if (Vec)
    return 0;

  // Number of VGPRs on SI.
  if (ST->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return 256;

  return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
}

// Register width in bits: 0 for vector registers (none are exposed, matching
// getNumberOfRegisters above), 32 for scalar registers.
unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) {
  return Vector ?
  // Vector registers report width 0 (none exposed); scalar registers are
  // 32 bits wide.
  0 : 32;
}

// Widest load/store (in bits) the load/store vectorizer may form for a
// given address space.
unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
  AMDGPUAS AS = ST->getAMDGPUAS();
  if (AddrSpace == AS.GLOBAL_ADDRESS ||
      AddrSpace == AS.CONSTANT_ADDRESS ||
      AddrSpace == AS.FLAT_ADDRESS)
    return 128;
  if (AddrSpace == AS.LOCAL_ADDRESS ||
      AddrSpace == AS.REGION_ADDRESS)
    return 64;
  // Private (scratch) accesses are limited by the subtarget's maximum
  // private element size.
  if (AddrSpace == AS.PRIVATE_ADDRESS)
    return 8 * ST->getMaxPrivateElementSize();

  // R600-era param / constant-buffer address spaces (pre-SI subtargets only).
  if (ST->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS &&
      (AddrSpace == AS.PARAM_D_ADDRESS ||
      AddrSpace == AS.PARAM_I_ADDRESS ||
      (AddrSpace >= AS.CONSTANT_BUFFER_0 &&
      AddrSpace <= AS.CONSTANT_BUFFER_15)))
    return 128;
  llvm_unreachable("unhandled address space");
}

// Whether a memory chain of the given size/alignment may be vectorized in
// the given address space.  Only private memory is restricted.
bool AMDGPUTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
                                               unsigned Alignment,
                                               unsigned AddrSpace) const {
  // We allow vectorization of flat stores, even though we may need to decompose
  // them later if they may access private memory. We don't have enough context
  // here, and legalization can handle it.
  if (AddrSpace == ST->getAMDGPUAS().PRIVATE_ADDRESS) {
    return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
      ChainSizeInBytes <= ST->getMaxPrivateElementSize();
  }
  return true;
}

// Load chains use the same legality rule as generic memory chains.
bool AMDGPUTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                                unsigned Alignment,
                                                unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

// Store chains use the same legality rule as generic memory chains.
bool AMDGPUTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                                 unsigned Alignment,
                                                 unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // Disable unrolling if the loop is not vectorized.
  if (VF == 1)
    return 1;

  // Semi-arbitrary large amount.
  return 64;
}

// Cost model for arithmetic instructions, expressed in multiples of the
// full/quarter/64-bit instruction-rate helpers and scaled by the number of
// parts type legalization splits the operation into.
int AMDGPUTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
    TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args ) {

  EVT OrigTy = TLI->getValueType(DL, Ty);
  if (!OrigTy.isSimple()) {
    // Types with no simple MVT defer to the generic cost model.
    return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
                                         Opd1PropInfo, Opd2PropInfo);
  }

  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // Because we don't have any legal vector operations, but the legal types, we
  // need to account for split vectors.
  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  switch (ISD) {
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA: {
    if (SLT == MVT::i64)
      return get64BitInstrCost() * LT.first * NElts;

    // i32
    return getFullRateInstrCost() * LT.first * NElts;
  }
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR: {
    if (SLT == MVT::i64){
      // and, or and xor are typically split into 2 VALU instructions.
      return 2 * getFullRateInstrCost() * LT.first * NElts;
    }

    return LT.first * NElts * getFullRateInstrCost();
  }
  case ISD::MUL: {
    const int QuarterRateCost = getQuarterRateInstrCost();
    if (SLT == MVT::i64) {
      const int FullRateCost = getFullRateInstrCost();
      // i64 multiply: four quarter-rate multiplies plus full-rate add/carry
      // instructions for the cross terms.
      return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
    }

    // i32
    return QuarterRateCost * NElts * LT.first;
  }
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
    if (SLT == MVT::f64)
      return LT.first * NElts * get64BitInstrCost();

    if (SLT == MVT::f32 || SLT == MVT::f16)
      return LT.first * NElts * getFullRateInstrCost();
    break;

  case ISD::FDIV:
  case ISD::FREM:
    // FIXME: frem should be handled separately. The fdiv in it is most of it,
    // but the current lowering is also not entirely correct.
    if (SLT == MVT::f64) {
      int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost();

      // Add cost of workaround.
      if (ST->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS)
        Cost += 3 * getFullRateInstrCost();

      return LT.first * Cost * NElts;
    }

    // Assuming no fp32 denormals lowering.
    if (SLT == MVT::f32 || SLT == MVT::f16) {
      assert(!ST->hasFP32Denormals() && "will change when supported");
      int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost();
      return LT.first * NElts * Cost;
    }

    break;
  default:
    break;
  }

  // Anything not modeled above falls back to the generic implementation.
  return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
                                       Opd1PropInfo, Opd2PropInfo);
}

// Control-flow instruction cost; branches and returns are comparatively
// expensive on AMDGPU, so they are charged well above ALU cost.
unsigned AMDGPUTTIImpl::getCFInstrCost(unsigned Opcode) {
  // XXX - For some reason this isn't called for switch.
  switch (Opcode) {
  case Instruction::Br:
  case Instruction::Ret:
    return 10;
  default:
    return BaseT::getCFInstrCost(Opcode);
  }
}

// Cost of inserting/extracting a vector element.  Subregister accesses are
// modeled as free; only dynamic (unknown-index) accesses carry a cost.
int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                      unsigned Index) {
  switch (Opcode) {
  case Instruction::ExtractElement:
  case Instruction::InsertElement:
    // Extracts are just reads of a subregister, so are free. Inserts are
    // considered free because we don't want to have any cost for scalarizing
    // operations, and we don't have to copy into a different register class.

    // Dynamic indexing isn't free and is best avoided.
    return Index == ~0u ? 2 : 0;
  default:
    return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
  }
}

// \returns true if the given intrinsic yields a per-lane (divergent) value:
// workitem ids, lane counters, interpolation, and atomics (whose returned
// previous value differs per lane).
static bool isIntrinsicSourceOfDivergence(const IntrinsicInst *I) {
  switch (I->getIntrinsicID()) {
  case Intrinsic::amdgcn_workitem_id_x:
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::amdgcn_interp_mov:
  case Intrinsic::amdgcn_interp_p1:
  case Intrinsic::amdgcn_interp_p2:
  case Intrinsic::amdgcn_mbcnt_hi:
  case Intrinsic::amdgcn_mbcnt_lo:
  case Intrinsic::r600_read_tidig_x:
  case Intrinsic::r600_read_tidig_y:
  case Intrinsic::r600_read_tidig_z:
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_image_atomic_swap:
  case Intrinsic::amdgcn_image_atomic_add:
  case Intrinsic::amdgcn_image_atomic_sub:
  case Intrinsic::amdgcn_image_atomic_smin:
  case Intrinsic::amdgcn_image_atomic_umin:
  case Intrinsic::amdgcn_image_atomic_smax:
  case Intrinsic::amdgcn_image_atomic_umax:
  case Intrinsic::amdgcn_image_atomic_and:
  case Intrinsic::amdgcn_image_atomic_or:
  case Intrinsic::amdgcn_image_atomic_xor:
  case Intrinsic::amdgcn_image_atomic_inc:
  case Intrinsic::amdgcn_image_atomic_dec:
  case Intrinsic::amdgcn_image_atomic_cmpswap:
  case Intrinsic::amdgcn_buffer_atomic_swap:
  case Intrinsic::amdgcn_buffer_atomic_add:
  case Intrinsic::amdgcn_buffer_atomic_sub:
  case Intrinsic::amdgcn_buffer_atomic_smin:
  case Intrinsic::amdgcn_buffer_atomic_umin:
  case Intrinsic::amdgcn_buffer_atomic_smax:
  case Intrinsic::amdgcn_buffer_atomic_umax:
  case Intrinsic::amdgcn_buffer_atomic_and:
  case Intrinsic::amdgcn_buffer_atomic_or:
  case Intrinsic::amdgcn_buffer_atomic_xor:
  case Intrinsic::amdgcn_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_ps_live:
  case Intrinsic::amdgcn_ds_swizzle:
    return true;
  default:
    return false;
  }
}

// \returns true if argument \p A is passed in an SGPR and is therefore
// uniform across the wavefront.
static bool isArgPassedInSGPR(const Argument *A) {
  const Function *F = A->getParent();

  // Arguments to compute shaders are never a source of divergence.
  if (!AMDGPU::isShader(F->getCallingConv()))
    return true;

  // For non-compute shaders, SGPR inputs are marked with either inreg or byval.
  // NOTE: attribute indices here are 1-based (index 0 is the return value).
  if (F->getAttributes().hasAttribute(A->getArgNo() + 1, Attribute::InReg) ||
      F->getAttributes().hasAttribute(A->getArgNo() + 1, Attribute::ByVal))
    return true;

  // Everything else is in VGPRs.
  return false;
}

///
/// \returns true if the result of the value could potentially be
/// different across workitems in a wavefront.
bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const {

  // VGPR-passed arguments differ per lane; SGPR-passed ones are uniform.
  if (const Argument *A = dyn_cast<Argument>(V))
    return !isArgPassedInSGPR(A);

  // Loads from the private address space are divergent, because threads
  // can execute the load instruction with the same inputs and get different
  // results.
  //
  // All other loads are not divergent, because if threads issue loads with the
  // same arguments, they will always get the same result.
  if (const LoadInst *Load = dyn_cast<LoadInst>(V))
    return Load->getPointerAddressSpace() == ST->getAMDGPUAS().PRIVATE_ADDRESS;

  // Atomics are divergent because they are executed sequentially: when an
  // atomic operation refers to the same address in each thread, then each
  // thread after the first sees the value written by the previous thread as
  // original value.
  if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
    return true;

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
    return isIntrinsicSourceOfDivergence(Intrinsic);

  // Assume all function calls are a source of divergence.
  if (isa<CallInst>(V) || isa<InvokeInst>(V))
    return true;

  return false;
}