//===-- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass ---------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements a TargetTransformInfo analysis pass specific to the
// AMDGPU target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetTransformInfo.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/CostTable.h"
#include "llvm/Target/TargetLowering.h"
using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

static cl::opt<unsigned> UnrollThresholdPrivate(
  "amdgpu-unroll-threshold-private",
  cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
  cl::init(2500), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdLocal(
  "amdgpu-unroll-threshold-local",
  cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
  cl::init(1000), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdIf(
  "amdgpu-unroll-threshold-if",
  cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
  cl::init(150), cl::Hidden);

static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
                              unsigned Depth = 0) {
  const Instruction *I = dyn_cast<Instruction>(Cond);
  if (!I)
    return false;

  for (const Value *V : I->operand_values()) {
    if (!L->contains(I))
      continue;
    if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
      if (none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
                  return SubLoop->contains(PHI); }))
        return true;
    } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
      return true;
  }
  return false;
}

void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                            TTI::UnrollingPreferences &UP) {
  UP.Threshold = 300; // Twice the default.
  UP.MaxCount = UINT_MAX;
  UP.Partial = true;

  // TODO: Do we want runtime unrolling?

  // Maximum alloca size that can fit in registers. Reserve 16 registers.
  const unsigned MaxAlloca = (256 - 16) * 4;
  unsigned ThresholdPrivate = UnrollThresholdPrivate;
  unsigned ThresholdLocal = UnrollThresholdLocal;
  unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
  AMDGPUAS ASST = ST->getAMDGPUAS();
  for (const BasicBlock *BB : L->getBlocks()) {
    const DataLayout &DL = BB->getModule()->getDataLayout();
    unsigned LocalGEPsSeen = 0;

    if (any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
               return SubLoop->contains(BB); }))
      continue; // Block belongs to an inner loop.

    for (const Instruction &I : *BB) {

      // Unroll a loop which contains an "if" statement whose condition is
      // defined by a PHI belonging to the loop. This may help to eliminate
      // the if region and potentially even the PHI itself, saving on both
      // divergence and registers used for the PHI.
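      //
      // Illustrative example (not from the original comments): a branch like
      //   if (Phi & 1) { ... }   // Phi is a PHI node defined in this loop
      // collapses to straight-line code once the loop is fully unrolled, so
      // each such branch below bumps the threshold by UnrollThresholdIf
      // (150 by default).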
      // Add a small bonus for each such "if" statement.
      if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
        if (UP.Threshold < MaxBoost && Br->isConditional()) {
          if (L->isLoopExiting(Br->getSuccessor(0)) ||
              L->isLoopExiting(Br->getSuccessor(1)))
            continue;
          if (dependsOnLocalPhi(L, Br->getCondition())) {
            UP.Threshold += UnrollThresholdIf;
            DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
                         << " for loop:\n" << *L << " due to " << *Br << '\n');
            if (UP.Threshold >= MaxBoost)
              return;
          }
        }
        continue;
      }

      const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
      if (!GEP)
        continue;

      unsigned AS = GEP->getAddressSpace();
      unsigned Threshold = 0;
      if (AS == ASST.PRIVATE_ADDRESS)
        Threshold = ThresholdPrivate;
      else if (AS == ASST.LOCAL_ADDRESS)
        Threshold = ThresholdLocal;
      else
        continue;

      if (UP.Threshold >= Threshold)
        continue;

      if (AS == ASST.PRIVATE_ADDRESS) {
        const Value *Ptr = GEP->getPointerOperand();
        const AllocaInst *Alloca =
            dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL));
        if (!Alloca || !Alloca->isStaticAlloca())
          continue;
        Type *Ty = Alloca->getAllocatedType();
        unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
        if (AllocaSize > MaxAlloca)
          continue;
      } else if (AS == ASST.LOCAL_ADDRESS) {
        LocalGEPsSeen++;
        // Inhibit unrolling for local memory if we have seen addressing not to
        // a variable; most likely we will be unable to combine it.
        // Do not unroll too-deep inner loops for local memory, to give a
        // chance to unroll an outer loop for a more important reason.
        if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
            (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
             !isa<Argument>(GEP->getPointerOperand())))
          continue;
      }

      // Check if GEP depends on a value defined by this loop itself.
      bool HasLoopDef = false;
      for (const Value *Op : GEP->operands()) {
        const Instruction *Inst = dyn_cast<Instruction>(Op);
        if (!Inst || L->isLoopInvariant(Op))
          continue;

        if (any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
                   return SubLoop->contains(Inst); }))
          continue;
        HasLoopDef = true;
        break;
      }
      if (!HasLoopDef)
        continue;

      // We want to do whatever we can to limit the number of alloca
      // instructions that make it through to the code generator. Allocas
      // require us to use indirect addressing, which is slow and prone to
      // compiler bugs. If this loop does an address calculation on an
      // alloca ptr, then we want to use a higher than normal loop unroll
      // threshold. This will give SROA a better chance to eliminate these
      // allocas.
      //
      // We also want to have more unrolling for local memory to let ds
      // instructions with different offsets combine.
      //
      // Don't use the maximum allowed value here as it will make some
      // programs way too big.
      UP.Threshold = Threshold;
      DEBUG(dbgs() << "Set unroll threshold " << Threshold << " for loop:\n"
                   << *L << " due to " << *GEP << '\n');
      if (UP.Threshold >= MaxBoost)
        return;
    }
  }
}

unsigned AMDGPUTTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
  // The concept of vector registers doesn't really exist. Some packed vector
  // operations operate on the normal 32-bit registers.

  // Number of VGPRs on SI.
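  // (Note: each GCN SIMD has a 256-entry file of 32-bit-wide VGPRs;
  // getNumberOfRegisters below intentionally reports only an eighth of this
  // number so the vectorizer does not try to budget for the whole file.)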
  if (ST->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return 256;

  return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
}

unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) const {
  // This is really the number of registers to fill when vectorizing /
  // interleaving loops, so we lie to avoid trying to use all registers.
  return getHardwareNumberOfRegisters(Vec) >> 3;
}

unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) const {
  return 32;
}

unsigned AMDGPUTTIImpl::getMinVectorRegisterBitWidth() const {
  return 32;
}

unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
  AMDGPUAS AS = ST->getAMDGPUAS();
  if (AddrSpace == AS.GLOBAL_ADDRESS ||
      AddrSpace == AS.CONSTANT_ADDRESS ||
      AddrSpace == AS.FLAT_ADDRESS)
    return 128;
  if (AddrSpace == AS.LOCAL_ADDRESS ||
      AddrSpace == AS.REGION_ADDRESS)
    return 64;
  if (AddrSpace == AS.PRIVATE_ADDRESS)
    return 8 * ST->getMaxPrivateElementSize();

  if (ST->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS &&
      (AddrSpace == AS.PARAM_D_ADDRESS ||
       AddrSpace == AS.PARAM_I_ADDRESS ||
       (AddrSpace >= AS.CONSTANT_BUFFER_0 &&
        AddrSpace <= AS.CONSTANT_BUFFER_15)))
    return 128;
  llvm_unreachable("unhandled address space");
}

bool AMDGPUTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
                                               unsigned Alignment,
                                               unsigned AddrSpace) const {
  // We allow vectorization of flat stores, even though we may need to
  // decompose them later if they may access private memory. We don't have
  // enough context here, and legalization can handle it.
  if (AddrSpace == ST->getAMDGPUAS().PRIVATE_ADDRESS) {
    return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
      ChainSizeInBytes <= ST->getMaxPrivateElementSize();
  }
  return true;
}

bool AMDGPUTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                                unsigned Alignment,
                                                unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

bool AMDGPUTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                                 unsigned Alignment,
                                                 unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // Disable unrolling if the loop is not vectorized.
  // TODO: Enable this again.
  if (VF == 1)
    return 1;

  return 8;
}

int AMDGPUTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
    TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) {

  EVT OrigTy = TLI->getValueType(DL, Ty);
  if (!OrigTy.isSimple()) {
    return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
                                         Opd1PropInfo, Opd2PropInfo);
  }

  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // Because we don't have any legal vector operations, but we do have legal
  // vector types, we need to account for split vectors.
  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;
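  // Illustrative example (not from the original comments): a <4 x i32> add
  // keeps its vector type through legalization (LT.first == 1, NElts == 4)
  // but is still issued as four 32-bit VALU instructions, so the costs below
  // scale by NElts as well as by LT.first.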

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  switch (ISD) {
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA: {
    if (SLT == MVT::i64)
      return get64BitInstrCost() * LT.first * NElts;

    // i32
    return getFullRateInstrCost() * LT.first * NElts;
  }
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR: {
    if (SLT == MVT::i64) {
      // and, or and xor are typically split into 2 VALU instructions.
      return 2 * getFullRateInstrCost() * LT.first * NElts;
    }

    return LT.first * NElts * getFullRateInstrCost();
  }
  case ISD::MUL: {
    const int QuarterRateCost = getQuarterRateInstrCost();
    if (SLT == MVT::i64) {
      const int FullRateCost = getFullRateInstrCost();
      return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
    }

    // i32
    return QuarterRateCost * NElts * LT.first;
  }
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
    if (SLT == MVT::f64)
      return LT.first * NElts * get64BitInstrCost();

    if (SLT == MVT::f32 || SLT == MVT::f16)
      return LT.first * NElts * getFullRateInstrCost();
    break;

  case ISD::FDIV:
  case ISD::FREM:
    // FIXME: frem should be handled separately. The fdiv in it is most of it,
    // but the current lowering is also not entirely correct.
    if (SLT == MVT::f64) {
      int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost();

      // Add cost of workaround.
      if (ST->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS)
        Cost += 3 * getFullRateInstrCost();

      return LT.first * Cost * NElts;
    }

    // Assuming no fp32 denormals lowering.
    if (SLT == MVT::f32 || SLT == MVT::f16) {
      assert(!ST->hasFP32Denormals() && "will change when supported");
      int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost();
      return LT.first * NElts * Cost;
    }

    break;
  default:
    break;
  }

  return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
                                       Opd1PropInfo, Opd2PropInfo);
}

unsigned AMDGPUTTIImpl::getCFInstrCost(unsigned Opcode) {
  // XXX - For some reason this isn't called for switch.
  switch (Opcode) {
  case Instruction::Br:
  case Instruction::Ret:
    return 10;
  default:
    return BaseT::getCFInstrCost(Opcode);
  }
}

int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                      unsigned Index) {
  switch (Opcode) {
  case Instruction::ExtractElement:
  case Instruction::InsertElement: {
    unsigned EltSize
      = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
    if (EltSize < 32) {
      if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
        return 0;
      return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
    }

    // Extracts are just reads of a subregister, so are free. Inserts are
    // considered free because we don't want to have any cost for scalarizing
    // operations, and we don't have to copy into a different register class.

    // Dynamic indexing isn't free and is best avoided.
    return Index == ~0u ? 2 : 0;
  }
  default:
    return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
  }
}

static bool isIntrinsicSourceOfDivergence(const IntrinsicInst *I) {
  switch (I->getIntrinsicID()) {
  case Intrinsic::amdgcn_workitem_id_x:
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::amdgcn_interp_mov:
  case Intrinsic::amdgcn_interp_p1:
  case Intrinsic::amdgcn_interp_p2:
  case Intrinsic::amdgcn_mbcnt_hi:
  case Intrinsic::amdgcn_mbcnt_lo:
  case Intrinsic::r600_read_tidig_x:
  case Intrinsic::r600_read_tidig_y:
  case Intrinsic::r600_read_tidig_z:
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_image_atomic_swap:
  case Intrinsic::amdgcn_image_atomic_add:
  case Intrinsic::amdgcn_image_atomic_sub:
  case Intrinsic::amdgcn_image_atomic_smin:
  case Intrinsic::amdgcn_image_atomic_umin:
  case Intrinsic::amdgcn_image_atomic_smax:
  case Intrinsic::amdgcn_image_atomic_umax:
  case Intrinsic::amdgcn_image_atomic_and:
  case Intrinsic::amdgcn_image_atomic_or:
  case Intrinsic::amdgcn_image_atomic_xor:
  case Intrinsic::amdgcn_image_atomic_inc:
  case Intrinsic::amdgcn_image_atomic_dec:
  case Intrinsic::amdgcn_image_atomic_cmpswap:
  case Intrinsic::amdgcn_buffer_atomic_swap:
  case Intrinsic::amdgcn_buffer_atomic_add:
  case Intrinsic::amdgcn_buffer_atomic_sub:
  case Intrinsic::amdgcn_buffer_atomic_smin:
  case Intrinsic::amdgcn_buffer_atomic_umin:
  case Intrinsic::amdgcn_buffer_atomic_smax:
  case Intrinsic::amdgcn_buffer_atomic_umax:
  case Intrinsic::amdgcn_buffer_atomic_and:
  case Intrinsic::amdgcn_buffer_atomic_or:
  case Intrinsic::amdgcn_buffer_atomic_xor:
  case Intrinsic::amdgcn_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_ps_live:
  case Intrinsic::amdgcn_ds_swizzle:
    return true;
  default:
    return false;
  }
}

static bool isArgPassedInSGPR(const Argument *A) {
  const Function *F = A->getParent();

  // Arguments to compute shaders are never a source of divergence.
  CallingConv::ID CC = F->getCallingConv();
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return true;
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
    // For non-compute shaders, SGPR inputs are marked with either inreg or
    // byval. Everything else is in VGPRs.
    return F->getAttributes().hasParamAttribute(A->getArgNo(),
                                                Attribute::InReg) ||
           F->getAttributes().hasParamAttribute(A->getArgNo(),
                                                Attribute::ByVal);
  default:
    // TODO: Should calls support inreg for SGPR inputs?
    return false;
  }
}

///
/// \returns true if the result of the value could potentially be
/// different across workitems in a wavefront.
bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const {

  if (const Argument *A = dyn_cast<Argument>(V))
    return !isArgPassedInSGPR(A);

  // Loads from the private address space are divergent, because threads
  // can execute the load instruction with the same inputs and get different
  // results.
  //
  // All other loads are not divergent, because if threads issue loads with the
  // same arguments, they will always get the same result.
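  //
  // (Private/scratch memory is backed by per-work-item storage, so the "same"
  // private pointer value names a different location in every lane.)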
  if (const LoadInst *Load = dyn_cast<LoadInst>(V))
    return Load->getPointerAddressSpace() == ST->getAMDGPUAS().PRIVATE_ADDRESS;

  // Atomics are divergent because they are executed sequentially: when an
  // atomic operation refers to the same address in each thread, then each
  // thread after the first sees the value written by the previous thread as
  // its original value.
  if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
    return true;

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
    return isIntrinsicSourceOfDivergence(Intrinsic);

  // Assume all function calls are a source of divergence.
  if (isa<CallInst>(V) || isa<InvokeInst>(V))
    return true;

  return false;
}

bool AMDGPUTTIImpl::isAlwaysUniform(const Value *V) const {
  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
    switch (Intrinsic->getIntrinsicID()) {
    default:
      return false;
    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_readlane:
      return true;
    }
  }
  return false;
}

unsigned AMDGPUTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                                       Type *SubTp) {
  if (ST->hasVOP3PInsts()) {
    VectorType *VT = cast<VectorType>(Tp);
    if (VT->getNumElements() == 2 &&
        DL.getTypeSizeInBits(VT->getElementType()) == 16) {
      // With op_sel, VOP3P instructions can freely access either the low or
      // the high half of a register, so any swizzle is free.

      switch (Kind) {
      case TTI::SK_Broadcast:
      case TTI::SK_Reverse:
      case TTI::SK_PermuteSingleSrc:
        return 0;
      default:
        break;
      }
    }
  }

  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}