//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements a TargetTransformInfo analysis pass specific to the
// AMDGPU target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetTransformInfo.h"
#include "AMDGPUSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <cassert>
#include <limits>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

static cl::opt<unsigned> UnrollThresholdPrivate(
  "amdgpu-unroll-threshold-private",
  cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
  cl::init(2700), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdLocal(
  "amdgpu-unroll-threshold-local",
  cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
  cl::init(1000), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdIf(
  "amdgpu-unroll-threshold-if",
  cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
  cl::init(150), cl::Hidden);

static cl::opt<bool> UnrollRuntimeLocal(
  "amdgpu-unroll-runtime-local",
  cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
  cl::init(true), cl::Hidden);

static cl::opt<bool> UseLegacyDA(
  "amdgpu-use-legacy-divergence-analysis",
  cl::desc("Enable legacy divergence analysis for AMDGPU"),
  cl::init(false), cl::Hidden);

static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
    "amdgpu-unroll-max-block-to-analyze",
    cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
    cl::init(20), cl::Hidden);

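// Returns true if the branch condition \p Cond depends, directly or through a
// short chain of instructions inside loop \p L, on a PHI node that is not
// defined in any of L's subloops. The recursion depth is capped to bound
// compile time.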
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
                              unsigned Depth = 0) {
  const Instruction *I = dyn_cast<Instruction>(Cond);
  if (!I)
    return false;

  for (const Value *V : I->operand_values()) {
    if (!L->contains(I))
      continue;
    if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
      if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
                  return SubLoop->contains(PHI); }))
        return true;
    } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
      return true;
  }
  return false;
}

void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                            TTI::UnrollingPreferences &UP) {
  const Function &F = *L->getHeader()->getParent();
  UP.Threshold = AMDGPU::getIntegerAttribute(F, "amdgpu-unroll-threshold", 300);
  UP.MaxCount = std::numeric_limits<unsigned>::max();
  UP.Partial = true;

  // TODO: Do we want runtime unrolling?

  // Maximum alloca size that can fit in registers. Reserve 16 registers.
  const unsigned MaxAlloca = (256 - 16) * 4;
  unsigned ThresholdPrivate = UnrollThresholdPrivate;
  unsigned ThresholdLocal = UnrollThresholdLocal;
  unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
  for (const BasicBlock *BB : L->getBlocks()) {
    const DataLayout &DL = BB->getModule()->getDataLayout();
    unsigned LocalGEPsSeen = 0;

    if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
               return SubLoop->contains(BB); }))
        continue; // Block belongs to an inner loop.

    for (const Instruction &I : *BB) {
      // Unroll a loop which contains an "if" statement whose condition is
      // defined by a PHI belonging to the loop. This may help to eliminate the
      // if region and potentially even the PHI itself, saving on both
      // divergence and the registers used for the PHI.
      // Add a small bonus for each such "if" statement.
      if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
        if (UP.Threshold < MaxBoost && Br->isConditional()) {
          BasicBlock *Succ0 = Br->getSuccessor(0);
          BasicBlock *Succ1 = Br->getSuccessor(1);
          if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
              (L->contains(Succ1) && L->isLoopExiting(Succ1)))
            continue;
          if (dependsOnLocalPhi(L, Br->getCondition())) {
            UP.Threshold += UnrollThresholdIf;
            LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
                              << " for loop:\n"
                              << *L << " due to " << *Br << '\n');
            if (UP.Threshold >= MaxBoost)
              return;
          }
        }
        continue;
      }

      const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
      if (!GEP)
        continue;

      unsigned AS = GEP->getAddressSpace();
      unsigned Threshold = 0;
      if (AS == AMDGPUAS::PRIVATE_ADDRESS)
        Threshold = ThresholdPrivate;
      else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
        Threshold = ThresholdLocal;
      else
        continue;

      if (UP.Threshold >= Threshold)
        continue;

      if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
        const Value *Ptr = GEP->getPointerOperand();
        const AllocaInst *Alloca =
            dyn_cast<AllocaInst>(getUnderlyingObject(Ptr));
        if (!Alloca || !Alloca->isStaticAlloca())
          continue;
        Type *Ty = Alloca->getAllocatedType();
        unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
        if (AllocaSize > MaxAlloca)
          continue;
      } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
                 AS == AMDGPUAS::REGION_ADDRESS) {
        LocalGEPsSeen++;
        // Inhibit unrolling for local memory if we have seen addressing that
        // is not to a variable; most likely we will be unable to combine it.
        // Do not unroll too-deep inner loops for local memory, to give an
        // outer loop a chance to be unrolled for a more important reason.
        if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
            (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
             !isa<Argument>(GEP->getPointerOperand())))
          continue;
        LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
                          << *L << " due to LDS use.\n");
        UP.Runtime = UnrollRuntimeLocal;
      }

      // Check if GEP depends on a value defined by this loop itself.
      bool HasLoopDef = false;
      for (const Value *Op : GEP->operands()) {
        const Instruction *Inst = dyn_cast<Instruction>(Op);
        if (!Inst || L->isLoopInvariant(Op))
          continue;

        if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
             return SubLoop->contains(Inst); }))
          continue;
        HasLoopDef = true;
        break;
      }
      if (!HasLoopDef)
        continue;

      // We want to do whatever we can to limit the number of alloca
      // instructions that make it through to the code generator.  Allocas
      // require us to use indirect addressing, which is slow and prone to
      // compiler bugs.  If this loop does an address calculation on an
      // alloca ptr, then we want to use a higher than normal loop unroll
      // threshold. This will give SROA a better chance to eliminate these
      // allocas.
      //
      // We also want to have more unrolling for local memory to let ds
      // instructions with different offsets combine.
      //
      // Don't use the maximum allowed value here as it will make some
      // programs way too big.
      UP.Threshold = Threshold;
      LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
                        << " for loop:\n"
                        << *L << " due to " << *GEP << '\n');
      if (UP.Threshold >= MaxBoost)
        return;
    }

    // If we got a GEP in a small BB from an inner loop, then increase the max
    // trip count to analyze for a better cost estimation in the unroller.
    if (L->empty() && BB->size() < UnrollMaxBlockToAnalyze)
      UP.MaxIterationsCountToAnalyze = 32;
  }
}

void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}

unsigned GCNTTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
  // The concept of vector registers doesn't really exist. Some packed vector
  // operations operate on the normal 32-bit registers.
  return MaxVGPRs;
}

unsigned GCNTTIImpl::getNumberOfRegisters(bool Vec) const {
  // This is really the number of registers to fill when vectorizing /
  // interleaving loops, so we lie to avoid trying to use all registers.
  return getHardwareNumberOfRegisters(Vec) >> 3;
}

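// Estimate how many registers of class \p RCID are available by dividing the
// VGPR budget by the number of 32-bit registers the class occupies.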
unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
  const SIRegisterInfo *TRI = ST->getRegisterInfo();
  const TargetRegisterClass *RC = TRI->getRegClass(RCID);
  unsigned NumVGPRs = (TRI->getRegSizeInBits(*RC) + 31) / 32;
  return getHardwareNumberOfRegisters(false) / NumVGPRs;
}

unsigned GCNTTIImpl::getRegisterBitWidth(bool Vector) const {
  return 32;
}

unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
  return 32;
}

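// Cap the vectorization factor so that a vectorized load of sub-dword elements
// does not exceed 128 bits; wider vectors are left alone for 32-bit or larger
// elements.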
unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                                         unsigned ChainSizeInBytes,
                                         VectorType *VecTy) const {
  unsigned VecRegBitWidth = VF * LoadSize;
  if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
    // TODO: Support element-size less than 32bit?
    return 128 / LoadSize;

  return VF;
}

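// Stores are capped at 128 bits per vectorized access regardless of the
// element size.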
unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                             unsigned ChainSizeInBytes,
                                             VectorType *VecTy) const {
  unsigned VecRegBitWidth = VF * StoreSize;
  if (VecRegBitWidth > 128)
    return 128 / StoreSize;

  return VF;
}

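// Maximum vector width (in bits) the load/store vectorizer is allowed to form
// for a given address space.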
unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
  if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
      AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER) {
    return 512;
  }

  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
    return 8 * ST->getMaxPrivateElementSize();

  // Common to flat, global, local and region. Assumed for unknown address spaces.
  return 128;
}

bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
                                            Align Alignment,
                                            unsigned AddrSpace) const {
  // We allow vectorization of flat stores, even though we may need to decompose
  // them later if they may access private memory. We don't have enough context
  // here, and legalization can handle it.
  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
    return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
      ChainSizeInBytes <= ST->getMaxPrivateElementSize();
  }
  return true;
}

bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                             Align Alignment,
                                             unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                              Align Alignment,
                                              unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

// FIXME: Really we would like to issue multiple 128-bit loads and stores per
// iteration. Should we report a larger size and let it legalize?
//
// FIXME: Should we use narrower types for local/region, or account for when
// unaligned access is legal?
//
// FIXME: This could use fine tuning and microbenchmarks.
Type *GCNTTIImpl::getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
                                            unsigned SrcAddrSpace,
                                            unsigned DestAddrSpace,
                                            unsigned SrcAlign,
                                            unsigned DestAlign) const {
  unsigned MinAlign = std::min(SrcAlign, DestAlign);

  // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the
  // hardware into byte accesses. If you assume all alignments are equally
  // probable, it's more efficient on average to use short accesses for this
  // case.
  if (MinAlign == 2)
    return Type::getInt16Ty(Context);

  // Not all subtargets have 128-bit DS instructions, and we currently don't
  // form them by default.
  if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      SrcAddrSpace == AMDGPUAS::REGION_ADDRESS ||
      DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      DestAddrSpace == AMDGPUAS::REGION_ADDRESS) {
    return FixedVectorType::get(Type::getInt32Ty(Context), 2);
  }

  // Global memory works best with 16-byte accesses. Private memory will also
  // hit this, although they'll be decomposed.
  return FixedVectorType::get(Type::getInt32Ty(Context), 4);
}

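// Lower the residual of a memcpy expansion (fewer than 16 bytes) into i64/i32
// pieces where possible, falling back to i16 and i8 for the tail; copies with
// a minimum alignment of 2 use only i16/i8 pieces.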
void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
  SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
  unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
  unsigned SrcAlign, unsigned DestAlign) const {
  assert(RemainingBytes < 16);

  unsigned MinAlign = std::min(SrcAlign, DestAlign);

  if (MinAlign != 2) {
    Type *I64Ty = Type::getInt64Ty(Context);
    while (RemainingBytes >= 8) {
      OpsOut.push_back(I64Ty);
      RemainingBytes -= 8;
    }

    Type *I32Ty = Type::getInt32Ty(Context);
    while (RemainingBytes >= 4) {
      OpsOut.push_back(I32Ty);
      RemainingBytes -= 4;
    }
  }

  Type *I16Ty = Type::getInt16Ty(Context);
  while (RemainingBytes >= 2) {
    OpsOut.push_back(I16Ty);
    RemainingBytes -= 2;
  }

  Type *I8Ty = Type::getInt8Ty(Context);
  while (RemainingBytes) {
    OpsOut.push_back(I8Ty);
    --RemainingBytes;
  }
}

unsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // Disable unrolling if the loop is not vectorized.
  // TODO: Enable this again.
  if (VF == 1)
    return 1;

  return 8;
}

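// Describe the target DS and atomic intrinsics as memory operations so that
// generic analyses can reason about them: report the pointer operand, the
// atomic ordering, and the volatility encoded in the intrinsic's arguments.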
bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                       MemIntrinsicInfo &Info) const {
  switch (Inst->getIntrinsicID()) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
    auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
    if (!Ordering || !Volatile)
      return false; // Invalid.

    unsigned OrderingVal = Ordering->getZExtValue();
    if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
      return false;

    Info.PtrVal = Inst->getArgOperand(0);
    Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
    Info.ReadMem = true;
    Info.WriteMem = true;
    Info.IsVolatile = !Volatile->isNullValue();
    return true;
  }
  default:
    return false;
  }
}

int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
                                       TTI::TargetCostKind CostKind,
                                       TTI::OperandValueKind Opd1Info,
                                       TTI::OperandValueKind Opd2Info,
                                       TTI::OperandValueProperties Opd1PropInfo,
                                       TTI::OperandValueProperties Opd2PropInfo,
                                       ArrayRef<const Value *> Args,
                                       const Instruction *CxtI) {
  EVT OrigTy = TLI->getValueType(DL, Ty);
  if (!OrigTy.isSimple()) {
    // FIXME: We're having to query the throughput cost so that the basic
    // implementation tries to generate legalize and scalarization costs. Maybe
    // we could hoist the scalarization code here?
    return BaseT::getArithmeticInstrCost(Opcode, Ty, TTI::TCK_RecipThroughput,
                                         Opd1Info, Opd2Info, Opd1PropInfo,
                                         Opd2PropInfo, Args, CxtI);
  }

  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // Because we don't have any legal vector operations, only the legal types,
  // we need to account for split vectors.
  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  switch (ISD) {
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
    if (SLT == MVT::i64)
      return get64BitInstrCost() * LT.first * NElts;

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    // i32
    return getFullRateInstrCost() * LT.first * NElts;
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    if (SLT == MVT::i64) {
      // and, or and xor are typically split into 2 VALU instructions.
      return 2 * getFullRateInstrCost() * LT.first * NElts;
    }

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    return LT.first * NElts * getFullRateInstrCost();
  case ISD::MUL: {
    const int QuarterRateCost = getQuarterRateInstrCost();
    if (SLT == MVT::i64) {
      const int FullRateCost = getFullRateInstrCost();
      return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
    }

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    // i32
    return QuarterRateCost * NElts * LT.first;
  }
  case ISD::FMUL:
    // Check for a possible fusion of {fadd|fsub}(a, fmul(b, c)) and return a
    // zero cost for the fmul(b, c), assuming the fadd|fsub will be given the
    // estimated cost of the whole fused operation.
    if (CxtI && CxtI->hasOneUse())
      if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {
        const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
        if (OPC == ISD::FADD || OPC == ISD::FSUB) {
          if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
            return TargetTransformInfo::TCC_Free;
          if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
            return TargetTransformInfo::TCC_Free;

          // Assume any type may be fused when contract/unsafe flags are present.
          const TargetOptions &Options = TLI->getTargetMachine().Options;
          if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
              Options.UnsafeFPMath ||
              (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
            return TargetTransformInfo::TCC_Free;
        }
      }
    LLVM_FALLTHROUGH;
  case ISD::FADD:
  case ISD::FSUB:
    if (SLT == MVT::f64)
      return LT.first * NElts * get64BitInstrCost();

    if (ST->has16BitInsts() && SLT == MVT::f16)
      NElts = (NElts + 1) / 2;

    if (SLT == MVT::f32 || SLT == MVT::f16)
      return LT.first * NElts * getFullRateInstrCost();
    break;
  case ISD::FDIV:
  case ISD::FREM:
    // FIXME: frem should be handled separately. The fdiv in it is most of it,
    // but the current lowering is also not entirely correct.
    if (SLT == MVT::f64) {
      int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost();
      // Add cost of workaround.
      if (!ST->hasUsableDivScaleConditionOutput())
        Cost += 3 * getFullRateInstrCost();

      return LT.first * Cost * NElts;
    }

    if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
      // TODO: This is more complicated, unsafe flags etc.
      if ((SLT == MVT::f32 && !HasFP32Denormals) ||
          (SLT == MVT::f16 && ST->has16BitInsts())) {
        return LT.first * getQuarterRateInstrCost() * NElts;
      }
    }

    if (SLT == MVT::f16 && ST->has16BitInsts()) {
      // 2 x v_cvt_f32_f16
      // f32 rcp
      // f32 fmul
      // v_cvt_f16_f32
      // f16 div_fixup
      int Cost = 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost();
      return LT.first * Cost * NElts;
    }

    if (SLT == MVT::f32 || SLT == MVT::f16) {
      int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost();

      if (!HasFP32Denormals) {
        // FP mode switches.
        Cost += 2 * getFullRateInstrCost();
      }

      return LT.first * NElts * Cost;
    }
    break;
  case ISD::FNEG:
    // Use the backend's estimation. If fneg is not free, each element will
    // cost one additional instruction.
    return TLI->isFNegFree(SLT) ? 0 : NElts;
  default:
    break;
  }

  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, Opd2Info,
                                       Opd1PropInfo, Opd2PropInfo, Args, CxtI);
}

// Return true if there's a potential benefit from using v2f16/v2i16
// instructions for an intrinsic, even if it requires nontrivial legalization.
static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
  switch (ID) {
  case Intrinsic::fma: // TODO: fmuladd
  // There's a small benefit to using vector ops in the legalized code.
  case Intrinsic::round:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat:
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
    return true;
  default:
    return false;
  }
}

int GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                      TTI::TargetCostKind CostKind) {
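  // fabs is treated as free; it typically folds into a source modifier.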
  if (ICA.getID() == Intrinsic::fabs)
    return 0;

  if (!intrinsicHasPackedVectorBenefit(ICA.getID()))
    return BaseT::getIntrinsicInstrCost(ICA, CostKind);

  Type *RetTy = ICA.getReturnType();
  EVT OrigTy = TLI->getValueType(DL, RetTy);
  if (!OrigTy.isSimple()) {
    return BaseT::getIntrinsicInstrCost(ICA, CostKind);
  }

  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);

  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  if (SLT == MVT::f64)
    return LT.first * NElts * get64BitInstrCost();

  if (ST->has16BitInsts() && SLT == MVT::f16)
    NElts = (NElts + 1) / 2;

  // TODO: Get more refined intrinsic costs?
  unsigned InstRate = getQuarterRateInstrCost();
  if (ICA.getID() == Intrinsic::fma) {
    InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost()
                                   : getQuarterRateInstrCost();
  }

  return LT.first * NElts * InstRate;
}

unsigned GCNTTIImpl::getCFInstrCost(unsigned Opcode,
                                    TTI::TargetCostKind CostKind) {
  if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
    return Opcode == Instruction::PHI ? 0 : 1;

  // XXX - For some reason this isn't called for switch.
  switch (Opcode) {
  case Instruction::Br:
  case Instruction::Ret:
    return 10;
  default:
    return BaseT::getCFInstrCost(Opcode, CostKind);
  }
}

int GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                                           bool IsPairwise,
                                           TTI::TargetCostKind CostKind) {
  EVT OrigTy = TLI->getValueType(DL, Ty);

  // Computes the cost on targets that have packed math instructions (which
  // support 16-bit types only).
  if (IsPairwise ||
      !ST->hasVOP3PInsts() ||
      OrigTy.getScalarSizeInBits() != 16)
    return BaseT::getArithmeticReductionCost(Opcode, Ty, IsPairwise, CostKind);

  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  return LT.first * getFullRateInstrCost();
}

int GCNTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
                                       bool IsPairwise, bool IsUnsigned,
                                       TTI::TargetCostKind CostKind) {
  EVT OrigTy = TLI->getValueType(DL, Ty);

  // Computes the cost on targets that have packed math instructions (which
  // support 16-bit types only).
  if (IsPairwise ||
      !ST->hasVOP3PInsts() ||
      OrigTy.getScalarSizeInBits() != 16)
    return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned,
                                         CostKind);

  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  return LT.first * getHalfRateInstrCost();
}

int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                      unsigned Index) {
  switch (Opcode) {
  case Instruction::ExtractElement:
  case Instruction::InsertElement: {
    unsigned EltSize
      = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
    if (EltSize < 32) {
      if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
        return 0;
      return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
    }

    // Extracts are just reads of a subregister, so are free. Inserts are
    // considered free because we don't want to have any cost for scalarizing
    // operations, and we don't have to copy into a different register class.

    // Dynamic indexing isn't free and is best avoided.
    return Index == ~0u ? 2 : 0;
  }
  default:
    return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
  }
}

static bool isArgPassedInSGPR(const Argument *A) {
  const Function *F = A->getParent();

  // Arguments to compute shaders are never a source of divergence.
  CallingConv::ID CC = F->getCallingConv();
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return true;
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
    // For these shader calling conventions, SGPR inputs are marked with the
    // 'inreg' attribute. Everything else is in VGPRs.
    return F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::InReg);
  default:
    // TODO: Should calls support inreg for SGPR inputs?
    return false;
  }
}

/// Analyze if the results of inline asm are divergent. If \p Indices is empty,
/// this is analyzing the collective result of all output registers. Otherwise,
/// this is only querying a specific result index if this returns multiple
/// registers in a struct.
bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
  const CallInst *CI, ArrayRef<unsigned> Indices) const {
  // TODO: Handle complex extract indices
  if (Indices.size() > 1)
    return true;

  const DataLayout &DL = CI->getModule()->getDataLayout();
  const SIRegisterInfo *TRI = ST->getRegisterInfo();
  TargetLowering::AsmOperandInfoVector TargetConstraints =
      TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);

  const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];

  int OutputIdx = 0;
  for (auto &TC : TargetConstraints) {
    if (TC.Type != InlineAsm::isOutput)
      continue;

    // Skip outputs we don't care about.
    if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
      continue;

    TLI->ComputeConstraintToUse(TC, SDValue());

    Register AssignedReg;
    const TargetRegisterClass *RC;
    std::tie(AssignedReg, RC) = TLI->getRegForInlineAsmConstraint(
      TRI, TC.ConstraintCode, TC.ConstraintVT);
    if (AssignedReg) {
      // FIXME: This is a workaround for getRegForInlineAsmConstraint
      // returning VS_32
      RC = TRI->getPhysRegClass(AssignedReg);
    }

    // For AGPR constraints null is returned on subtargets without AGPRs, so
    // assume divergent for null.
    if (!RC || !TRI->isSGPRClass(RC))
      return true;
  }

  return false;
}

/// \returns true if the new GPU divergence analysis is enabled.
bool GCNTTIImpl::useGPUDivergenceAnalysis() const {
  return !UseLegacyDA;
}

/// \returns true if the result of the value could potentially be
/// different across workitems in a wavefront.
bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
  if (const Argument *A = dyn_cast<Argument>(V))
    return !isArgPassedInSGPR(A);

  // Loads from the private and flat address spaces are divergent, because
  // threads can execute the load instruction with the same inputs and get
  // different results.
  //
  // All other loads are not divergent, because if threads issue loads with the
  // same arguments, they will always get the same result.
  if (const LoadInst *Load = dyn_cast<LoadInst>(V))
    return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
           Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;

  // Atomics are divergent because they are executed sequentially: when an
  // atomic operation refers to the same address in each thread, then each
  // thread after the first sees the value written by the previous thread as
  // its original value.
  if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
    return true;

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
    return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());

  // Assume all function calls are a source of divergence.
  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
    if (CI->isInlineAsm())
      return isInlineAsmSourceOfDivergence(CI);
    return true;
  }

  // Assume all function calls are a source of divergence.
  if (isa<InvokeInst>(V))
    return true;

  return false;
}

bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
    switch (Intrinsic->getIntrinsicID()) {
    default:
      return false;
    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_readlane:
    case Intrinsic::amdgcn_icmp:
    case Intrinsic::amdgcn_fcmp:
    case Intrinsic::amdgcn_ballot:
    case Intrinsic::amdgcn_if_break:
      return true;
    }
  }

  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
    if (CI->isInlineAsm())
      return !isInlineAsmSourceOfDivergence(CI);
    return false;
  }

  const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
  if (!ExtValue)
    return false;

  const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));
  if (!CI)
    return false;

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
    switch (Intrinsic->getIntrinsicID()) {
    default:
      return false;
    case Intrinsic::amdgcn_if:
    case Intrinsic::amdgcn_else: {
      ArrayRef<unsigned> Indices = ExtValue->getIndices();
      return Indices.size() == 1 && Indices[0] == 1;
    }
    }
  }

  // If we have inline asm returning mixed SGPR and VGPR results, we inferred
  // divergence for the overall struct return. We need to override that in the
  // case where we're extracting an SGPR component here.
  if (CI->isInlineAsm())
    return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());

  return false;
}

bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                            Intrinsic::ID IID) const {
  switch (IID) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    OpIndexes.push_back(0);
    return true;
  default:
    return false;
  }
}

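// Rewrite an intrinsic so that it operates on \p NewV, a pointer whose address
// space has been inferred to be more specific than that of \p OldV. Returns
// the replacement value, or nullptr if the intrinsic cannot be rewritten.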
Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
                                                    Value *OldV,
                                                    Value *NewV) const {
  auto IntrID = II->getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    const ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4));
    if (!IsVolatile->isZero())
      return nullptr;
    Module *M = II->getParent()->getParent()->getParent();
    Type *DestTy = II->getType();
    Type *SrcTy = NewV->getType();
    Function *NewDecl =
        Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
    II->setArgOperand(0, NewV);
    II->setCalledFunction(NewDecl);
    return II;
  }
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
    unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
      AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
    unsigned NewAS = NewV->getType()->getPointerAddressSpace();
    LLVMContext &Ctx = NewV->getType()->getContext();
    ConstantInt *NewVal = (TrueAS == NewAS) ?
      ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);
    return NewVal;
  }
  case Intrinsic::ptrmask: {
    unsigned OldAS = OldV->getType()->getPointerAddressSpace();
    unsigned NewAS = NewV->getType()->getPointerAddressSpace();
    Value *MaskOp = II->getArgOperand(1);
    Type *MaskTy = MaskOp->getType();

    bool DoTruncate = false;

    const GCNTargetMachine &TM =
        static_cast<const GCNTargetMachine &>(getTLI()->getTargetMachine());
    if (!TM.isNoopAddrSpaceCast(OldAS, NewAS)) {
      // All valid 64-bit to 32-bit casts work by chopping off the high
      // bits. Any masking only clearing the low bits will also apply in the new
      // address space.
      if (DL.getPointerSizeInBits(OldAS) != 64 ||
          DL.getPointerSizeInBits(NewAS) != 32)
        return nullptr;

      // TODO: Do we need to thread more context in here?
      KnownBits Known = computeKnownBits(MaskOp, DL, 0, nullptr, II);
      if (Known.countMinLeadingOnes() < 32)
        return nullptr;

      DoTruncate = true;
    }

    IRBuilder<> B(II);
    if (DoTruncate) {
      MaskTy = B.getInt32Ty();
      MaskOp = B.CreateTrunc(MaskOp, MaskTy);
    }

    return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
                             {NewV, MaskOp});
  }
  default:
    return nullptr;
  }
}

unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *VT,
                                    int Index, VectorType *SubTp) {
  if (ST->hasVOP3PInsts()) {
    if (cast<FixedVectorType>(VT)->getNumElements() == 2 &&
        DL.getTypeSizeInBits(VT->getElementType()) == 16) {
      // With op_sel, VOP3P instructions can freely access the low or high half
      // of a register, so any swizzle is free.

      switch (Kind) {
      case TTI::SK_Broadcast:
      case TTI::SK_Reverse:
      case TTI::SK_PermuteSingleSrc:
        return 0;
      default:
        break;
      }
    }
  }

  return BaseT::getShuffleCost(Kind, VT, Index, SubTp);
}

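// Inlining is only considered compatible when the caller's subtarget features
// are a superset of the callee's (ignoring features on the ignore list) and
// the FP mode register defaults are compatible.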
bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();
  const GCNSubtarget *CallerST
    = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
  const GCNSubtarget *CalleeST
    = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));

  const FeatureBitset &CallerBits = CallerST->getFeatureBits();
  const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();

  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
  if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
    return false;

  // FIXME: dx10_clamp can just take the caller setting, but there seems to be
  // no way to support merge for backend defined attributes.
  AMDGPU::SIModeRegisterDefaults CallerMode(*Caller);
  AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee);
  return CallerMode.isInlineCompatible(CalleeMode);
}

void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP) {
  CommonTTI.getUnrollingPreferences(L, SE, UP);
}

void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                       TTI::PeelingPreferences &PP) {
  CommonTTI.getPeelingPreferences(L, SE, PP);
}

unsigned R600TTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
  return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
}

unsigned R600TTIImpl::getNumberOfRegisters(bool Vec) const {
  return getHardwareNumberOfRegisters(Vec);
}

unsigned R600TTIImpl::getRegisterBitWidth(bool Vector) const {
  return 32;
}

unsigned R600TTIImpl::getMinVectorRegisterBitWidth() const {
  return 32;
}

unsigned R600TTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
  if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS)
    return 128;
  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      AddrSpace == AMDGPUAS::REGION_ADDRESS)
    return 64;
  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
    return 32;

  if ((AddrSpace == AMDGPUAS::PARAM_D_ADDRESS ||
      AddrSpace == AMDGPUAS::PARAM_I_ADDRESS ||
      (AddrSpace >= AMDGPUAS::CONSTANT_BUFFER_0 &&
      AddrSpace <= AMDGPUAS::CONSTANT_BUFFER_15)))
    return 128;
  llvm_unreachable("unhandled address space");
}

bool R600TTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
                                             Align Alignment,
                                             unsigned AddrSpace) const {
  // We allow vectorization of flat stores, even though we may need to decompose
  // them later if they may access private memory. We don't have enough context
  // here, and legalization can handle it.
  return (AddrSpace != AMDGPUAS::PRIVATE_ADDRESS);
}

bool R600TTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                              Align Alignment,
                                              unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

bool R600TTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                               Align Alignment,
                                               unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

unsigned R600TTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // Disable unrolling if the loop is not vectorized.
  // TODO: Enable this again.
  if (VF == 1)
    return 1;

  return 8;
}

unsigned R600TTIImpl::getCFInstrCost(unsigned Opcode,
                                     TTI::TargetCostKind CostKind) {
  if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
    return Opcode == Instruction::PHI ? 0 : 1;

  // XXX - For some reason this isn't called for switch.
  switch (Opcode) {
  case Instruction::Br:
  case Instruction::Ret:
    return 10;
  default:
    return BaseT::getCFInstrCost(Opcode, CostKind);
  }
}

int R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                    unsigned Index) {
  switch (Opcode) {
  case Instruction::ExtractElement:
  case Instruction::InsertElement: {
    unsigned EltSize
      = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
    if (EltSize < 32) {
      return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
    }

    // Extracts are just reads of a subregister, so are free. Inserts are
    // considered free because we don't want to have any cost for scalarizing
    // operations, and we don't have to copy into a different register class.

    // Dynamic indexing isn't free and is best avoided.
    return Index == ~0u ? 2 : 0;
  }
  default:
    return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
  }
}

void R600TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::UnrollingPreferences &UP) {
  CommonTTI.getUnrollingPreferences(L, SE, UP);
}

void R600TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                        TTI::PeelingPreferences &PP) {
  CommonTTI.getPeelingPreferences(L, SE, PP);
}