//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements a TargetTransformInfo analysis pass specific to the
// AMDGPU target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetTransformInfo.h"
#include "AMDGPUSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <cassert>
#include <limits>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

static cl::opt<unsigned> UnrollThresholdPrivate(
    "amdgpu-unroll-threshold-private",
    cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
    cl::init(2700), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdLocal(
    "amdgpu-unroll-threshold-local",
    cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
    cl::init(1000), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdIf(
    "amdgpu-unroll-threshold-if",
    cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
    cl::init(150), cl::Hidden);

static cl::opt<bool> UnrollRuntimeLocal(
    "amdgpu-unroll-runtime-local",
    cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
    cl::init(true), cl::Hidden);

static cl::opt<bool> UseLegacyDA(
    "amdgpu-use-legacy-divergence-analysis",
    cl::desc("Enable legacy divergence analysis for AMDGPU"),
    cl::init(false), cl::Hidden);

static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
    "amdgpu-unroll-max-block-to-analyze",
    cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
    cl::init(32), cl::Hidden);

static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
                              unsigned Depth = 0) {
  const Instruction *I = dyn_cast<Instruction>(Cond);
  if (!I)
    return false;

  for (const Value *V : I->operand_values()) {
    if (!L->contains(I))
      continue;
    if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
      if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
             return SubLoop->contains(PHI); }))
        return true;
    } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
      return true;
  }
  return false;
}

void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                            TTI::UnrollingPreferences &UP) {
  const Function &F = *L->getHeader()->getParent();
  UP.Threshold = AMDGPU::getIntegerAttribute(F, "amdgpu-unroll-threshold", 300);
  UP.MaxCount = std::numeric_limits<unsigned>::max();
  UP.Partial = true;

  // TODO: Do we want runtime unrolling?

  // Maximum alloca size that can fit into registers. Reserve 16 registers.
  const unsigned MaxAlloca = (256 - 16) * 4;
  unsigned ThresholdPrivate = UnrollThresholdPrivate;
  unsigned ThresholdLocal = UnrollThresholdLocal;
  unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
  for (const BasicBlock *BB : L->getBlocks()) {
    const DataLayout &DL = BB->getModule()->getDataLayout();
    unsigned LocalGEPsSeen = 0;

    if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
           return SubLoop->contains(BB); }))
      continue; // Block belongs to an inner loop.

    for (const Instruction &I : *BB) {
      // Unroll a loop which contains an "if" statement whose condition is
      // defined by a PHI belonging to the loop. This may help to eliminate
      // the if region and potentially even the PHI itself, saving on both
      // divergence and registers used for the PHI.
      // Add a small bonus for each such "if" statement.
      if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
        if (UP.Threshold < MaxBoost && Br->isConditional()) {
          BasicBlock *Succ0 = Br->getSuccessor(0);
          BasicBlock *Succ1 = Br->getSuccessor(1);
          if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
              (L->contains(Succ1) && L->isLoopExiting(Succ1)))
            continue;
          if (dependsOnLocalPhi(L, Br->getCondition())) {
            UP.Threshold += UnrollThresholdIf;
            LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
                              << " for loop:\n"
                              << *L << " due to " << *Br << '\n');
            if (UP.Threshold >= MaxBoost)
              return;
          }
        }
        continue;
      }

      const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
      if (!GEP)
        continue;

      unsigned AS = GEP->getAddressSpace();
      unsigned Threshold = 0;
      if (AS == AMDGPUAS::PRIVATE_ADDRESS)
        Threshold = ThresholdPrivate;
      else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
        Threshold = ThresholdLocal;
      else
        continue;

      if (UP.Threshold >= Threshold)
        continue;

      if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
        const Value *Ptr = GEP->getPointerOperand();
        const AllocaInst *Alloca =
            dyn_cast<AllocaInst>(getUnderlyingObject(Ptr));
        if (!Alloca || !Alloca->isStaticAlloca())
          continue;
        Type *Ty = Alloca->getAllocatedType();
        unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
        if (AllocaSize > MaxAlloca)
          continue;
      } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
                 AS == AMDGPUAS::REGION_ADDRESS) {
        LocalGEPsSeen++;
        // Inhibit unrolling for local memory if we have seen addressing that
        // is not to a variable; most likely we will be unable to combine it.
        // Do not unroll too deep inner loops for local memory to give a chance
        // to unroll an outer loop for a more important reason.
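        // (Concretely, the checks below bail out on a second LDS GEP in this
        // block, on loops nested more than two deep, and on pointers that are
        // neither a global variable nor a function argument.)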
        if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
            (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
             !isa<Argument>(GEP->getPointerOperand())))
          continue;
        LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
                          << *L << " due to LDS use.\n");
        UP.Runtime = UnrollRuntimeLocal;
      }

      // Check if GEP depends on a value defined by this loop itself.
      bool HasLoopDef = false;
      for (const Value *Op : GEP->operands()) {
        const Instruction *Inst = dyn_cast<Instruction>(Op);
        if (!Inst || L->isLoopInvariant(Op))
          continue;

        if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
               return SubLoop->contains(Inst); }))
          continue;
        HasLoopDef = true;
        break;
      }
      if (!HasLoopDef)
        continue;

      // We want to do whatever we can to limit the number of alloca
      // instructions that make it through to the code generator. Allocas
      // require us to use indirect addressing, which is slow and prone to
      // compiler bugs. If this loop does an address calculation on an
      // alloca ptr, then we want to use a higher than normal loop unroll
      // threshold. This will give SROA a better chance to eliminate these
      // allocas.
      //
      // We also want to have more unrolling for local memory to let ds
      // instructions with different offsets combine.
      //
      // Don't use the maximum allowed value here as it will make some
      // programs way too big.
      UP.Threshold = Threshold;
      LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
                        << " for loop:\n"
                        << *L << " due to " << *GEP << '\n');
      if (UP.Threshold >= MaxBoost)
        return;
    }

    // If we got a GEP in a small BB from an inner loop, then increase the max
    // trip count to analyze for a better cost estimation in the unroller.
    if (L->empty() && BB->size() < UnrollMaxBlockToAnalyze)
      UP.MaxIterationsCountToAnalyze = 32;
  }
}

void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}

unsigned GCNTTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
  // The concept of vector registers doesn't really exist. Some packed vector
  // operations operate on the normal 32-bit registers.
  return MaxVGPRs;
}

unsigned GCNTTIImpl::getNumberOfRegisters(bool Vec) const {
  // This is really the number of registers to fill when vectorizing /
  // interleaving loops, so we lie to avoid trying to use all registers.
  return getHardwareNumberOfRegisters(Vec) >> 3;
}

unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
  const SIRegisterInfo *TRI = ST->getRegisterInfo();
  const TargetRegisterClass *RC = TRI->getRegClass(RCID);
  unsigned NumVGPRs = (TRI->getRegSizeInBits(*RC) + 31) / 32;
  return getHardwareNumberOfRegisters(false) / NumVGPRs;
}

unsigned GCNTTIImpl::getRegisterBitWidth(bool Vector) const {
  return 32;
}

unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
  return 32;
}

unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                                         unsigned ChainSizeInBytes,
                                         VectorType *VecTy) const {
  unsigned VecRegBitWidth = VF * LoadSize;
  if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
    // TODO: Support element-size less than 32bit?
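    // Clamp the factor so the combined load still fits in a single 128-bit
    // vector register.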
    return 128 / LoadSize;

  return VF;
}

unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                          unsigned ChainSizeInBytes,
                                          VectorType *VecTy) const {
  unsigned VecRegBitWidth = VF * StoreSize;
  if (VecRegBitWidth > 128)
    return 128 / StoreSize;

  return VF;
}

unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
  if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
      AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER) {
    return 512;
  }

  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
    return 8 * ST->getMaxPrivateElementSize();

  // Common to flat, global, local and region. Assume for unknown addrspace.
  return 128;
}

bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
                                            Align Alignment,
                                            unsigned AddrSpace) const {
  // We allow vectorization of flat stores, even though we may need to decompose
  // them later if they may access private memory. We don't have enough context
  // here, and legalization can handle it.
  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
    return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
           ChainSizeInBytes <= ST->getMaxPrivateElementSize();
  }
  return true;
}

bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                             Align Alignment,
                                             unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                              Align Alignment,
                                              unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

// FIXME: Really we would like to issue multiple 128-bit loads and stores per
// iteration. Should we report a larger size and let it legalize?
//
// FIXME: Should we use narrower types for local/region, or account for when
// unaligned access is legal?
//
// FIXME: This could use fine tuning and microbenchmarks.
Type *GCNTTIImpl::getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
                                            unsigned SrcAddrSpace,
                                            unsigned DestAddrSpace,
                                            unsigned SrcAlign,
                                            unsigned DestAlign) const {
  unsigned MinAlign = std::min(SrcAlign, DestAlign);

  // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the
  // hardware into byte accesses. If you assume all alignments are equally
  // probable, it's more efficient on average to use short accesses for this
  // case.
  if (MinAlign == 2)
    return Type::getInt16Ty(Context);

  // Not all subtargets have 128-bit DS instructions, and we currently don't
  // form them by default.
  if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      SrcAddrSpace == AMDGPUAS::REGION_ADDRESS ||
      DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      DestAddrSpace == AMDGPUAS::REGION_ADDRESS) {
    return FixedVectorType::get(Type::getInt32Ty(Context), 2);
  }

  // Global memory works best with 16-byte accesses. Private memory will also
  // hit this, although they'll be decomposed.
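  // In other words, lower the main copy loop to 16-byte <4 x i32> accesses.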
  return FixedVectorType::get(Type::getInt32Ty(Context), 4);
}

void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
    SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
    unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
    unsigned SrcAlign, unsigned DestAlign) const {
  assert(RemainingBytes < 16);

  unsigned MinAlign = std::min(SrcAlign, DestAlign);

  if (MinAlign != 2) {
    Type *I64Ty = Type::getInt64Ty(Context);
    while (RemainingBytes >= 8) {
      OpsOut.push_back(I64Ty);
      RemainingBytes -= 8;
    }

    Type *I32Ty = Type::getInt32Ty(Context);
    while (RemainingBytes >= 4) {
      OpsOut.push_back(I32Ty);
      RemainingBytes -= 4;
    }
  }

  Type *I16Ty = Type::getInt16Ty(Context);
  while (RemainingBytes >= 2) {
    OpsOut.push_back(I16Ty);
    RemainingBytes -= 2;
  }

  Type *I8Ty = Type::getInt8Ty(Context);
  while (RemainingBytes) {
    OpsOut.push_back(I8Ty);
    --RemainingBytes;
  }
}

unsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // Disable unrolling if the loop is not vectorized.
  // TODO: Enable this again.
  if (VF == 1)
    return 1;

  return 8;
}

bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                    MemIntrinsicInfo &Info) const {
  switch (Inst->getIntrinsicID()) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
    auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
    if (!Ordering || !Volatile)
      return false; // Invalid.

    unsigned OrderingVal = Ordering->getZExtValue();
    if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
      return false;

    Info.PtrVal = Inst->getArgOperand(0);
    Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
    Info.ReadMem = true;
    Info.WriteMem = true;
    Info.IsVolatile = !Volatile->isNullValue();
    return true;
  }
  default:
    return false;
  }
}

int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
                                       TTI::TargetCostKind CostKind,
                                       TTI::OperandValueKind Opd1Info,
                                       TTI::OperandValueKind Opd2Info,
                                       TTI::OperandValueProperties Opd1PropInfo,
                                       TTI::OperandValueProperties Opd2PropInfo,
                                       ArrayRef<const Value *> Args,
                                       const Instruction *CxtI) {
  EVT OrigTy = TLI->getValueType(DL, Ty);
  if (!OrigTy.isSimple()) {
    // FIXME: We're having to query the throughput cost so that the basic
    // implementation tries to generate legalize and scalarization costs. Maybe
    // we could hoist the scalarization code here?
    return BaseT::getArithmeticInstrCost(Opcode, Ty, TTI::TCK_RecipThroughput,
                                         Opd1Info, Opd2Info, Opd1PropInfo,
                                         Opd2PropInfo, Args, CxtI);
  }

  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // Because we don't have any legal vector operations, only legal vector
  // types, we need to account for split vectors.
  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  switch (ISD) {
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
    if (SLT == MVT::i64)
      return get64BitInstrCost() * LT.first * NElts;

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    // i32
    return getFullRateInstrCost() * LT.first * NElts;
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    if (SLT == MVT::i64) {
      // and, or and xor are typically split into 2 VALU instructions.
      return 2 * getFullRateInstrCost() * LT.first * NElts;
    }

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    return LT.first * NElts * getFullRateInstrCost();
  case ISD::MUL: {
    const int QuarterRateCost = getQuarterRateInstrCost();
    if (SLT == MVT::i64) {
      const int FullRateCost = getFullRateInstrCost();
      return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
    }

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    // i32
    return QuarterRateCost * NElts * LT.first;
  }
  case ISD::FMUL:
    // Check for a possible fusion of {fadd|fsub}(a, fmul(b, c)) and return a
    // zero cost for the fmul(b, c), assuming the fadd|fsub will get the
    // estimated cost for the whole fused operation.
    if (CxtI && CxtI->hasOneUse())
      if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {
        const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
        if (OPC == ISD::FADD || OPC == ISD::FSUB) {
          if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
            return TargetTransformInfo::TCC_Free;
          if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
            return TargetTransformInfo::TCC_Free;

          // Assume all types may be fused when contract/unsafe flags allow it.
          const TargetOptions &Options = TLI->getTargetMachine().Options;
          if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
              Options.UnsafeFPMath ||
              (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
            return TargetTransformInfo::TCC_Free;
        }
      }
    LLVM_FALLTHROUGH;
  case ISD::FADD:
  case ISD::FSUB:
    if (SLT == MVT::f64)
      return LT.first * NElts * get64BitInstrCost();

    if (ST->has16BitInsts() && SLT == MVT::f16)
      NElts = (NElts + 1) / 2;

    if (SLT == MVT::f32 || SLT == MVT::f16)
      return LT.first * NElts * getFullRateInstrCost();
    break;
  case ISD::FDIV:
  case ISD::FREM:
    // FIXME: frem should be handled separately. The fdiv in it is most of it,
    // but the current lowering is also not entirely correct.
    if (SLT == MVT::f64) {
      int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost();
      // Add cost of workaround.
      if (!ST->hasUsableDivScaleConditionOutput())
        Cost += 3 * getFullRateInstrCost();

      return LT.first * Cost * NElts;
    }

    if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
      // TODO: This is more complicated, unsafe flags etc.
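      // A plain reciprocal is cheap here; presumably it selects to a single
      // quarter-rate rcp instruction, which is the cost returned below.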
      if ((SLT == MVT::f32 && !HasFP32Denormals) ||
          (SLT == MVT::f16 && ST->has16BitInsts())) {
        return LT.first * getQuarterRateInstrCost() * NElts;
      }
    }

    if (SLT == MVT::f16 && ST->has16BitInsts()) {
      // 2 x v_cvt_f32_f16
      // f32 rcp
      // f32 fmul
      // v_cvt_f16_f32
      // f16 div_fixup
      int Cost = 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost();
      return LT.first * Cost * NElts;
    }

    if (SLT == MVT::f32 || SLT == MVT::f16) {
      int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost();

      if (!HasFP32Denormals) {
        // FP mode switches.
        Cost += 2 * getFullRateInstrCost();
      }

      return LT.first * NElts * Cost;
    }
    break;
  case ISD::FNEG:
    // Use the backend's estimation. If fneg is not free, each element will
    // cost one additional instruction.
    return TLI->isFNegFree(SLT) ? 0 : NElts;
  default:
    break;
  }

  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, Opd2Info,
                                       Opd1PropInfo, Opd2PropInfo, Args, CxtI);
}

// Return true if there's a potential benefit from using v2f16/v2i16
// instructions for an intrinsic, even if it requires nontrivial legalization.
static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
  switch (ID) {
  case Intrinsic::fma: // TODO: fmuladd
  // There's a small benefit to using vector ops in the legalized code.
  case Intrinsic::round:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat:
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
    return true;
  default:
    return false;
  }
}

int GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                      TTI::TargetCostKind CostKind) {
  if (ICA.getID() == Intrinsic::fabs)
    return 0;

  if (!intrinsicHasPackedVectorBenefit(ICA.getID()))
    return BaseT::getIntrinsicInstrCost(ICA, CostKind);

  Type *RetTy = ICA.getReturnType();
  EVT OrigTy = TLI->getValueType(DL, RetTy);
  if (!OrigTy.isSimple()) {
    return BaseT::getIntrinsicInstrCost(ICA, CostKind);
  }

  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);

  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  if (SLT == MVT::f64)
    return LT.first * NElts * get64BitInstrCost();

  if (ST->has16BitInsts() && SLT == MVT::f16)
    NElts = (NElts + 1) / 2;

  // TODO: Get more refined intrinsic costs?
  unsigned InstRate = getQuarterRateInstrCost();
  if (ICA.getID() == Intrinsic::fma) {
    InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost()
                                   : getQuarterRateInstrCost();
  }

  return LT.first * NElts * InstRate;
}

unsigned GCNTTIImpl::getCFInstrCost(unsigned Opcode,
                                    TTI::TargetCostKind CostKind) {
  if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
    return Opcode == Instruction::PHI ? 0 : 1;

  // XXX - For some reason this isn't called for switch.
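  // Model branches and returns as relatively expensive for throughput;
  // anything else falls back to the generic cost.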
  switch (Opcode) {
  case Instruction::Br:
  case Instruction::Ret:
    return 10;
  default:
    return BaseT::getCFInstrCost(Opcode, CostKind);
  }
}

int GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                                           bool IsPairwise,
                                           TTI::TargetCostKind CostKind) {
  EVT OrigTy = TLI->getValueType(DL, Ty);

  // Compute the cost on targets that have packed math instructions (which
  // support 16-bit types only).
  if (IsPairwise ||
      !ST->hasVOP3PInsts() ||
      OrigTy.getScalarSizeInBits() != 16)
    return BaseT::getArithmeticReductionCost(Opcode, Ty, IsPairwise, CostKind);

  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  return LT.first * getFullRateInstrCost();
}

int GCNTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
                                       bool IsPairwise, bool IsUnsigned,
                                       TTI::TargetCostKind CostKind) {
  EVT OrigTy = TLI->getValueType(DL, Ty);

  // Compute the cost on targets that have packed math instructions (which
  // support 16-bit types only).
  if (IsPairwise ||
      !ST->hasVOP3PInsts() ||
      OrigTy.getScalarSizeInBits() != 16)
    return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned,
                                         CostKind);

  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  return LT.first * getHalfRateInstrCost();
}

int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                   unsigned Index) {
  switch (Opcode) {
  case Instruction::ExtractElement:
  case Instruction::InsertElement: {
    unsigned EltSize
      = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
    if (EltSize < 32) {
      if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
        return 0;
      return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
    }

    // Extracts are just reads of a subregister, so are free. Inserts are
    // considered free because we don't want to have any cost for scalarizing
    // operations, and we don't have to copy into a different register class.

    // Dynamic indexing isn't free and is best avoided.
    return Index == ~0u ? 2 : 0;
  }
  default:
    return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
  }
}

static bool isArgPassedInSGPR(const Argument *A) {
  const Function *F = A->getParent();

  // Arguments to compute shaders are never a source of divergence.
  CallingConv::ID CC = F->getCallingConv();
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return true;
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
    // For non-compute shaders, SGPR inputs are marked with inreg.
    // Everything else is in VGPRs.
    return F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::InReg);
  default:
    // TODO: Should calls support inreg for SGPR inputs?
    return false;
  }
}

/// Analyze if the results of inline asm are divergent. If \p Indices is empty,
/// this is analyzing the collective result of all output registers. Otherwise,
/// this is only querying a specific result index if this returns multiple
/// registers in a struct.
bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
    const CallInst *CI, ArrayRef<unsigned> Indices) const {
  // TODO: Handle complex extract indices
  if (Indices.size() > 1)
    return true;

  const DataLayout &DL = CI->getModule()->getDataLayout();
  const SIRegisterInfo *TRI = ST->getRegisterInfo();
  TargetLowering::AsmOperandInfoVector TargetConstraints =
      TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);

  const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];

  int OutputIdx = 0;
  for (auto &TC : TargetConstraints) {
    if (TC.Type != InlineAsm::isOutput)
      continue;

    // Skip outputs we don't care about.
    if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
      continue;

    TLI->ComputeConstraintToUse(TC, SDValue());

    Register AssignedReg;
    const TargetRegisterClass *RC;
    std::tie(AssignedReg, RC) = TLI->getRegForInlineAsmConstraint(
        TRI, TC.ConstraintCode, TC.ConstraintVT);
    if (AssignedReg) {
      // FIXME: This is a workaround for getRegForInlineAsmConstraint
      // returning VS_32
      RC = TRI->getPhysRegClass(AssignedReg);
    }

    // For AGPR constraints null is returned on subtargets without AGPRs, so
    // assume divergent for null.
    if (!RC || !TRI->isSGPRClass(RC))
      return true;
  }

  return false;
}

/// \returns true if the new GPU divergence analysis is enabled.
bool GCNTTIImpl::useGPUDivergenceAnalysis() const {
  return !UseLegacyDA;
}

/// \returns true if the result of the value could potentially be
/// different across workitems in a wavefront.
bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
  if (const Argument *A = dyn_cast<Argument>(V))
    return !isArgPassedInSGPR(A);

  // Loads from the private and flat address spaces are divergent, because
  // threads can execute the load instruction with the same inputs and get
  // different results.
  //
  // All other loads are not divergent, because if threads issue loads with the
  // same arguments, they will always get the same result.
  if (const LoadInst *Load = dyn_cast<LoadInst>(V))
    return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
           Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;

  // Atomics are divergent because they are executed sequentially: when an
  // atomic operation refers to the same address in each thread, then each
  // thread after the first sees the value written by the previous thread as
  // the original value.
  if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
    return true;

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
    return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());

  // Assume all function calls are a source of divergence.
  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
    if (CI->isInlineAsm())
      return isInlineAsmSourceOfDivergence(CI);
    return true;
  }

  // Assume all function calls are a source of divergence.
  if (isa<InvokeInst>(V))
    return true;

  return false;
}

bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
    switch (Intrinsic->getIntrinsicID()) {
    default:
      return false;
    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_readlane:
    case Intrinsic::amdgcn_icmp:
    case Intrinsic::amdgcn_fcmp:
    case Intrinsic::amdgcn_ballot:
    case Intrinsic::amdgcn_if_break:
      return true;
    }
  }

  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
    if (CI->isInlineAsm())
      return !isInlineAsmSourceOfDivergence(CI);
    return false;
  }

  const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
  if (!ExtValue)
    return false;

  const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));
  if (!CI)
    return false;

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
    switch (Intrinsic->getIntrinsicID()) {
    default:
      return false;
    case Intrinsic::amdgcn_if:
    case Intrinsic::amdgcn_else: {
      ArrayRef<unsigned> Indices = ExtValue->getIndices();
      return Indices.size() == 1 && Indices[0] == 1;
    }
    }
  }

  // If we have inline asm returning mixed SGPR and VGPR results, we inferred
  // divergent for the overall struct return. We need to override it in the
  // case we're extracting an SGPR component here.
  if (CI->isInlineAsm())
    return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());

  return false;
}

bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                            Intrinsic::ID IID) const {
  switch (IID) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    OpIndexes.push_back(0);
    return true;
  default:
    return false;
  }
}

Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
                                                    Value *OldV,
                                                    Value *NewV) const {
  auto IntrID = II->getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    const ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4));
    if (!IsVolatile->isZero())
      return nullptr;
    Module *M = II->getParent()->getParent()->getParent();
    Type *DestTy = II->getType();
    Type *SrcTy = NewV->getType();
    Function *NewDecl =
        Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
    II->setArgOperand(0, NewV);
    II->setCalledFunction(NewDecl);
    return II;
  }
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
    unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
      AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
    unsigned NewAS = NewV->getType()->getPointerAddressSpace();
    LLVMContext &Ctx = NewV->getType()->getContext();
    ConstantInt *NewVal = (TrueAS == NewAS) ?
      ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);
    return NewVal;
  }
  case Intrinsic::ptrmask: {
    unsigned OldAS = OldV->getType()->getPointerAddressSpace();
    unsigned NewAS = NewV->getType()->getPointerAddressSpace();
    Value *MaskOp = II->getArgOperand(1);
    Type *MaskTy = MaskOp->getType();

    bool DoTruncate = false;

    const GCNTargetMachine &TM =
        static_cast<const GCNTargetMachine &>(getTLI()->getTargetMachine());
    if (!TM.isNoopAddrSpaceCast(OldAS, NewAS)) {
      // All valid 64-bit to 32-bit casts work by chopping off the high bits.
      // Any masking that only clears the low bits will also apply in the new
      // address space.
      if (DL.getPointerSizeInBits(OldAS) != 64 ||
          DL.getPointerSizeInBits(NewAS) != 32)
        return nullptr;

      // TODO: Do we need to thread more context in here?
      KnownBits Known = computeKnownBits(MaskOp, DL, 0, nullptr, II);
      if (Known.countMinLeadingOnes() < 32)
        return nullptr;

      DoTruncate = true;
    }

    IRBuilder<> B(II);
    if (DoTruncate) {
      MaskTy = B.getInt32Ty();
      MaskOp = B.CreateTrunc(MaskOp, MaskTy);
    }

    return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
                             {NewV, MaskOp});
  }
  default:
    return nullptr;
  }
}

unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *VT,
                                    int Index, VectorType *SubTp) {
  if (ST->hasVOP3PInsts()) {
    if (cast<FixedVectorType>(VT)->getNumElements() == 2 &&
        DL.getTypeSizeInBits(VT->getElementType()) == 16) {
      // With op_sel, VOP3P instructions can freely access the low or high
      // half of a register, so any swizzle is free.

      switch (Kind) {
      case TTI::SK_Broadcast:
      case TTI::SK_Reverse:
      case TTI::SK_PermuteSingleSrc:
        return 0;
      default:
        break;
      }
    }
  }

  return BaseT::getShuffleCost(Kind, VT, Index, SubTp);
}

bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();
  const GCNSubtarget *CallerST
    = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
  const GCNSubtarget *CalleeST
    = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));

  const FeatureBitset &CallerBits = CallerST->getFeatureBits();
  const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();

  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
  if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
    return false;

  // FIXME: dx10_clamp can just take the caller setting, but there seems to be
  // no way to support merge for backend defined attributes.
  AMDGPU::SIModeRegisterDefaults CallerMode(*Caller);
  AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee);
  return CallerMode.isInlineCompatible(CalleeMode);
}

void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP) {
  CommonTTI.getUnrollingPreferences(L, SE, UP);
}

void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                       TTI::PeelingPreferences &PP) {
  CommonTTI.getPeelingPreferences(L, SE, PP);
}

unsigned R600TTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
  return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
}

unsigned R600TTIImpl::getNumberOfRegisters(bool Vec) const {
  return getHardwareNumberOfRegisters(Vec);
}

unsigned R600TTIImpl::getRegisterBitWidth(bool Vector) const {
  return 32;
}

unsigned R600TTIImpl::getMinVectorRegisterBitWidth() const {
  return 32;
}

unsigned R600TTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
  if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS)
    return 128;
  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      AddrSpace == AMDGPUAS::REGION_ADDRESS)
    return 64;
  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
    return 32;

  if ((AddrSpace == AMDGPUAS::PARAM_D_ADDRESS ||
       AddrSpace == AMDGPUAS::PARAM_I_ADDRESS ||
       (AddrSpace >= AMDGPUAS::CONSTANT_BUFFER_0 &&
        AddrSpace <= AMDGPUAS::CONSTANT_BUFFER_15)))
    return 128;
  llvm_unreachable("unhandled address space");
}

bool R600TTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
                                             Align Alignment,
                                             unsigned AddrSpace) const {
  // We allow vectorization of flat stores, even though we may need to decompose
  // them later if they may access private memory. We don't have enough context
  // here, and legalization can handle it.
  return (AddrSpace != AMDGPUAS::PRIVATE_ADDRESS);
}

bool R600TTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                              Align Alignment,
                                              unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

bool R600TTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                               Align Alignment,
                                               unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

unsigned R600TTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // Disable unrolling if the loop is not vectorized.
  // TODO: Enable this again.
  if (VF == 1)
    return 1;

  return 8;
}

unsigned R600TTIImpl::getCFInstrCost(unsigned Opcode,
                                     TTI::TargetCostKind CostKind) {
  if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
    return Opcode == Instruction::PHI ? 0 : 1;

  // XXX - For some reason this isn't called for switch.
  switch (Opcode) {
  case Instruction::Br:
  case Instruction::Ret:
    return 10;
  default:
    return BaseT::getCFInstrCost(Opcode, CostKind);
  }
}

int R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                    unsigned Index) {
  switch (Opcode) {
  case Instruction::ExtractElement:
  case Instruction::InsertElement: {
    unsigned EltSize
      = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
    if (EltSize < 32) {
      return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
    }

    // Extracts are just reads of a subregister, so are free. Inserts are
    // considered free because we don't want to have any cost for scalarizing
    // operations, and we don't have to copy into a different register class.

    // Dynamic indexing isn't free and is best avoided.
    return Index == ~0u ? 2 : 0;
  }
  default:
    return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
  }
}

void R600TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::UnrollingPreferences &UP) {
  CommonTTI.getUnrollingPreferences(L, SE, UP);
}

void R600TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                        TTI::PeelingPreferences &PP) {
  CommonTTI.getPeelingPreferences(L, SE, PP);
}