1 //===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // \file
10 // This file implements a TargetTransformInfo analysis pass specific to the
11 // AMDGPU target machine. It uses the target's detailed information to provide
12 // more precise answers to certain TTI queries, while letting the target
13 // independent and default TTI implementations handle the rest.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #include "AMDGPUTargetTransformInfo.h"
18 #include "AMDGPUSubtarget.h"
19 #include "Utils/AMDGPUBaseInfo.h"
20 #include "llvm/ADT/STLExtras.h"
21 #include "llvm/Analysis/LoopInfo.h"
22 #include "llvm/Analysis/TargetTransformInfo.h"
23 #include "llvm/Analysis/ValueTracking.h"
24 #include "llvm/CodeGen/ISDOpcodes.h"
25 #include "llvm/CodeGen/ValueTypes.h"
26 #include "llvm/IR/Argument.h"
27 #include "llvm/IR/Attributes.h"
28 #include "llvm/IR/BasicBlock.h"
29 #include "llvm/IR/CallingConv.h"
30 #include "llvm/IR/DataLayout.h"
31 #include "llvm/IR/DerivedTypes.h"
32 #include "llvm/IR/Function.h"
33 #include "llvm/IR/Instruction.h"
34 #include "llvm/IR/Instructions.h"
35 #include "llvm/IR/IntrinsicInst.h"
36 #include "llvm/IR/Module.h"
37 #include "llvm/IR/PatternMatch.h"
38 #include "llvm/IR/Type.h"
39 #include "llvm/IR/Value.h"
40 #include "llvm/MC/SubtargetFeature.h"
41 #include "llvm/Support/Casting.h"
42 #include "llvm/Support/CommandLine.h"
43 #include "llvm/Support/Debug.h"
44 #include "llvm/Support/ErrorHandling.h"
45 #include "llvm/Support/KnownBits.h"
46 #include "llvm/Support/MachineValueType.h"
47 #include "llvm/Support/raw_ostream.h"
48 #include "llvm/Target/TargetMachine.h"
49 #include <algorithm>
50 #include <cassert>
51 #include <limits>
52 #include <utility>
53 
54 using namespace llvm;
55 
56 #define DEBUG_TYPE "AMDGPUtti"
57 
58 static cl::opt<unsigned> UnrollThresholdPrivate(
59   "amdgpu-unroll-threshold-private",
60   cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
61   cl::init(2700), cl::Hidden);
62 
63 static cl::opt<unsigned> UnrollThresholdLocal(
64   "amdgpu-unroll-threshold-local",
65   cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
66   cl::init(1000), cl::Hidden);
67 
68 static cl::opt<unsigned> UnrollThresholdIf(
69   "amdgpu-unroll-threshold-if",
70   cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
71   cl::init(150), cl::Hidden);
72 
73 static cl::opt<bool> UnrollRuntimeLocal(
74   "amdgpu-unroll-runtime-local",
75   cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
76   cl::init(true), cl::Hidden);
77 
78 static cl::opt<bool> UseLegacyDA(
79   "amdgpu-use-legacy-divergence-analysis",
80   cl::desc("Enable legacy divergence analysis for AMDGPU"),
81   cl::init(false), cl::Hidden);
82 
83 static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
84     "amdgpu-unroll-max-block-to-analyze",
85     cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
86     cl::init(32), cl::Hidden);
87 
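// Returns true if the branch condition \p Cond (transitively, up to a small
// recursion depth) depends on a PHI node that lives in \p L itself rather than
// in one of its subloops.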
88 static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
89                               unsigned Depth = 0) {
90   const Instruction *I = dyn_cast<Instruction>(Cond);
91   if (!I)
92     return false;
93 
94   for (const Value *V : I->operand_values()) {
95     if (!L->contains(I))
96       continue;
97     if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
98       if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
99                   return SubLoop->contains(PHI); }))
100         return true;
101     } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
102       return true;
103   }
104   return false;
105 }
106 
107 void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
108                                             TTI::UnrollingPreferences &UP) {
109   const Function &F = *L->getHeader()->getParent();
110   UP.Threshold = AMDGPU::getIntegerAttribute(F, "amdgpu-unroll-threshold", 300);
111   UP.MaxCount = std::numeric_limits<unsigned>::max();
112   UP.Partial = true;
113 
114   // TODO: Do we want runtime unrolling?
115 
116   // Maximum alloca size that can fit in registers. Reserve 16 registers.
117   const unsigned MaxAlloca = (256 - 16) * 4;
118   unsigned ThresholdPrivate = UnrollThresholdPrivate;
119   unsigned ThresholdLocal = UnrollThresholdLocal;
120   unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
121   for (const BasicBlock *BB : L->getBlocks()) {
122     const DataLayout &DL = BB->getModule()->getDataLayout();
123     unsigned LocalGEPsSeen = 0;
124 
125     if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
126                return SubLoop->contains(BB); }))
127         continue; // Block belongs to an inner loop.
128 
129     for (const Instruction &I : *BB) {
130       // Unroll a loop which contains an "if" statement whose condition is
131       // defined by a PHI belonging to the loop. This may help to eliminate the
132       // if region and potentially even the PHI itself, saving on both divergence
133       // and registers used for the PHI.
134       // Add a small bonus for each such "if" statement.
135       if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
136         if (UP.Threshold < MaxBoost && Br->isConditional()) {
137           BasicBlock *Succ0 = Br->getSuccessor(0);
138           BasicBlock *Succ1 = Br->getSuccessor(1);
139           if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
140               (L->contains(Succ1) && L->isLoopExiting(Succ1)))
141             continue;
142           if (dependsOnLocalPhi(L, Br->getCondition())) {
143             UP.Threshold += UnrollThresholdIf;
144             LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
145                               << " for loop:\n"
146                               << *L << " due to " << *Br << '\n');
147             if (UP.Threshold >= MaxBoost)
148               return;
149           }
150         }
151         continue;
152       }
153 
154       const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
155       if (!GEP)
156         continue;
157 
158       unsigned AS = GEP->getAddressSpace();
159       unsigned Threshold = 0;
160       if (AS == AMDGPUAS::PRIVATE_ADDRESS)
161         Threshold = ThresholdPrivate;
162       else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
163         Threshold = ThresholdLocal;
164       else
165         continue;
166 
167       if (UP.Threshold >= Threshold)
168         continue;
169 
170       if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
171         const Value *Ptr = GEP->getPointerOperand();
172         const AllocaInst *Alloca =
173             dyn_cast<AllocaInst>(getUnderlyingObject(Ptr));
174         if (!Alloca || !Alloca->isStaticAlloca())
175           continue;
176         Type *Ty = Alloca->getAllocatedType();
177         unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
178         if (AllocaSize > MaxAlloca)
179           continue;
180       } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
181                  AS == AMDGPUAS::REGION_ADDRESS) {
182         LocalGEPsSeen++;
183         // Inhibit unrolling for local memory if we have seen addressing not
184         // based on a global variable or an argument; most likely we will be
185         // unable to combine it. Do not unroll too-deep inner loops for local
186         // memory, to give a chance to unroll an outer loop for a more important reason.
187         if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
188             (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
189              !isa<Argument>(GEP->getPointerOperand())))
190           continue;
191         LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
192                           << *L << " due to LDS use.\n");
193         UP.Runtime = UnrollRuntimeLocal;
194       }
195 
196       // Check if GEP depends on a value defined by this loop itself.
197       bool HasLoopDef = false;
198       for (const Value *Op : GEP->operands()) {
199         const Instruction *Inst = dyn_cast<Instruction>(Op);
200         if (!Inst || L->isLoopInvariant(Op))
201           continue;
202 
203         if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
204              return SubLoop->contains(Inst); }))
205           continue;
206         HasLoopDef = true;
207         break;
208       }
209       if (!HasLoopDef)
210         continue;
211 
212       // We want to do whatever we can to limit the number of alloca
213       // instructions that make it through to the code generator. Allocas
214       // require us to use indirect addressing, which is slow and prone to
215       // compiler bugs. If this loop does an address calculation on an
216       // alloca pointer, then we want to use a higher than normal loop unroll
217       // threshold. This will give SROA a better chance to eliminate these
218       // allocas.
219       //
220       // We also want to have more unrolling for local memory to let ds
221       // instructions with different offsets combine.
222       //
223       // Don't use the maximum allowed value here as it will make some
224       // programs way too big.
225       UP.Threshold = Threshold;
226       LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
227                         << " for loop:\n"
228                         << *L << " due to " << *GEP << '\n');
229       if (UP.Threshold >= MaxBoost)
230         return;
231     }
232 
233     // If we got a GEP in a small BB from an inner loop then increase the max
234     // trip count to analyze, for a better cost estimation in the unroller.
235     if (L->empty() && BB->size() < UnrollMaxBlockToAnalyze)
236       UP.MaxIterationsCountToAnalyze = 32;
237   }
238 }
239 
240 void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
241                                           TTI::PeelingPreferences &PP) {
242   BaseT::getPeelingPreferences(L, SE, PP);
243 }

244 unsigned GCNTTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
245   // The concept of vector registers doesn't really exist. Some packed vector
246   // operations operate on the normal 32-bit registers.
247   return MaxVGPRs;
248 }
249 
250 unsigned GCNTTIImpl::getNumberOfRegisters(bool Vec) const {
251   // This is really the number of registers to fill when vectorizing /
252   // interleaving loops, so we lie to avoid trying to use all registers.
253   return getHardwareNumberOfRegisters(Vec) >> 3;
254 }
255 
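// Number of registers available for values of the given register class: the
// VGPR budget divided by the number of 32-bit registers one value of the
// class occupies.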
256 unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
257   const SIRegisterInfo *TRI = ST->getRegisterInfo();
258   const TargetRegisterClass *RC = TRI->getRegClass(RCID);
259   unsigned NumVGPRs = (TRI->getRegSizeInBits(*RC) + 31) / 32;
260   return getHardwareNumberOfRegisters(false) / NumVGPRs;
261 }
262 
263 unsigned GCNTTIImpl::getRegisterBitWidth(bool Vector) const {
264   return 32;
265 }
266 
267 unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
268   return 32;
269 }
270 
271 unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
272                                          unsigned ChainSizeInBytes,
273                                          VectorType *VecTy) const {
274   unsigned VecRegBitWidth = VF * LoadSize;
275   if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
276     // TODO: Support element-size less than 32bit?
277     return 128 / LoadSize;
278 
279   return VF;
280 }
281 
282 unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
283                                              unsigned ChainSizeInBytes,
284                                              VectorType *VecTy) const {
285   unsigned VecRegBitWidth = VF * StoreSize;
286   if (VecRegBitWidth > 128)
287     return 128 / StoreSize;
288 
289   return VF;
290 }
291 
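// Widest vector access, in bits, that the load/store vectorizer may form for
// the given address space.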
292 unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
293   if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
294       AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
295       AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
296       AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER) {
297     return 512;
298   }
299 
300   if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
301     return 8 * ST->getMaxPrivateElementSize();
302 
303   // Common to flat, global, local and region. Assume the same for unknown address spaces.
304   return 128;
305 }
306 
307 bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
308                                             Align Alignment,
309                                             unsigned AddrSpace) const {
310   // We allow vectorization of flat stores, even though we may need to decompose
311   // them later if they may access private memory. We don't have enough context
312   // here, and legalization can handle it.
313   if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
314     return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
315       ChainSizeInBytes <= ST->getMaxPrivateElementSize();
316   }
317   return true;
318 }
319 
320 bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
321                                              Align Alignment,
322                                              unsigned AddrSpace) const {
323   return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
324 }
325 
326 bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
327                                               Align Alignment,
328                                               unsigned AddrSpace) const {
329   return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
330 }
331 
332 // FIXME: Really we would like to issue multiple 128-bit loads and stores per
333 // iteration. Should we report a larger size and let it legalize?
334 //
335 // FIXME: Should we use narrower types for local/region, or account for when
336 // unaligned access is legal?
337 //
338 // FIXME: This could use fine tuning and microbenchmarks.
339 Type *GCNTTIImpl::getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
340                                             unsigned SrcAddrSpace,
341                                             unsigned DestAddrSpace,
342                                             unsigned SrcAlign,
343                                             unsigned DestAlign) const {
344   unsigned MinAlign = std::min(SrcAlign, DestAlign);
345 
346   // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the
347   // hardware into byte accesses. If you assume all alignments are equally
348   // probable, it's more efficient on average to use short accesses for this
349   // case.
350   if (MinAlign == 2)
351     return Type::getInt16Ty(Context);
352 
353   // Not all subtargets have 128-bit DS instructions, and we currently don't
354   // form them by default.
355   if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
356       SrcAddrSpace == AMDGPUAS::REGION_ADDRESS ||
357       DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
358       DestAddrSpace == AMDGPUAS::REGION_ADDRESS) {
359     return FixedVectorType::get(Type::getInt32Ty(Context), 2);
360   }
361 
362   // Global memory works best with 16-byte accesses. Private memory will also
363   // hit this, although such accesses will be decomposed.
364   return FixedVectorType::get(Type::getInt32Ty(Context), 4);
365 }
366 
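// Choose the sequence of types used to copy the residual bytes (fewer than 16)
// left over after the main memcpy loop, widest first as far as the alignment
// allows. For example, 11 residual bytes with MinAlign != 2 are copied as
// { i64, i16, i8 }.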
367 void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
368   SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
369   unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
370   unsigned SrcAlign, unsigned DestAlign) const {
371   assert(RemainingBytes < 16);
372 
373   unsigned MinAlign = std::min(SrcAlign, DestAlign);
374 
375   if (MinAlign != 2) {
376     Type *I64Ty = Type::getInt64Ty(Context);
377     while (RemainingBytes >= 8) {
378       OpsOut.push_back(I64Ty);
379       RemainingBytes -= 8;
380     }
381 
382     Type *I32Ty = Type::getInt32Ty(Context);
383     while (RemainingBytes >= 4) {
384       OpsOut.push_back(I32Ty);
385       RemainingBytes -= 4;
386     }
387   }
388 
389   Type *I16Ty = Type::getInt16Ty(Context);
390   while (RemainingBytes >= 2) {
391     OpsOut.push_back(I16Ty);
392     RemainingBytes -= 2;
393   }
394 
395   Type *I8Ty = Type::getInt8Ty(Context);
396   while (RemainingBytes) {
397     OpsOut.push_back(I8Ty);
398     --RemainingBytes;
399   }
400 }
401 
402 unsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) {
403   // Disable unrolling if the loop is not vectorized.
404   // TODO: Enable this again.
405   if (VF == 1)
406     return 1;
407 
408   return 8;
409 }
410 
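// Describe the DS/atomic intrinsics handled here as ordinary memory operations
// (pointer, ordering, volatility) so that generic analyses can reason about
// them.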
411 bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
412                                        MemIntrinsicInfo &Info) const {
413   switch (Inst->getIntrinsicID()) {
414   case Intrinsic::amdgcn_atomic_inc:
415   case Intrinsic::amdgcn_atomic_dec:
416   case Intrinsic::amdgcn_ds_ordered_add:
417   case Intrinsic::amdgcn_ds_ordered_swap:
418   case Intrinsic::amdgcn_ds_fadd:
419   case Intrinsic::amdgcn_ds_fmin:
420   case Intrinsic::amdgcn_ds_fmax: {
421     auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
422     auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
423     if (!Ordering || !Volatile)
424       return false; // Invalid.
425 
426     unsigned OrderingVal = Ordering->getZExtValue();
427     if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
428       return false;
429 
430     Info.PtrVal = Inst->getArgOperand(0);
431     Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
432     Info.ReadMem = true;
433     Info.WriteMem = true;
434     Info.IsVolatile = !Volatile->isNullValue();
435     return true;
436   }
437   default:
438     return false;
439   }
440 }
441 
442 int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
443                                        TTI::TargetCostKind CostKind,
444                                        TTI::OperandValueKind Opd1Info,
445                                        TTI::OperandValueKind Opd2Info,
446                                        TTI::OperandValueProperties Opd1PropInfo,
447                                        TTI::OperandValueProperties Opd2PropInfo,
448                                        ArrayRef<const Value *> Args,
449                                        const Instruction *CxtI) {
450   EVT OrigTy = TLI->getValueType(DL, Ty);
451   if (!OrigTy.isSimple()) {
452     // FIXME: We're having to query the throughput cost so that the basic
453     // implementation tries to generate legalize and scalarization costs. Maybe
454     // we could hoist the scalarization code here?
455     return BaseT::getArithmeticInstrCost(Opcode, Ty, TTI::TCK_RecipThroughput,
456                                          Opd1Info, Opd2Info, Opd1PropInfo,
457                                          Opd2PropInfo, Args, CxtI);
458   }
459 
460   // Legalize the type.
461   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
462   int ISD = TLI->InstructionOpcodeToISD(Opcode);
463 
464   // Because we don't have any legal vector operations, only the legal types, we
465   // need to account for split vectors.
466   unsigned NElts = LT.second.isVector() ?
467     LT.second.getVectorNumElements() : 1;
468 
469   MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
470 
471   switch (ISD) {
472   case ISD::SHL:
473   case ISD::SRL:
474   case ISD::SRA:
475     if (SLT == MVT::i64)
476       return get64BitInstrCost() * LT.first * NElts;
477 
478     if (ST->has16BitInsts() && SLT == MVT::i16)
479       NElts = (NElts + 1) / 2;
480 
481     // i32
482     return getFullRateInstrCost() * LT.first * NElts;
483   case ISD::ADD:
484   case ISD::SUB:
485   case ISD::AND:
486   case ISD::OR:
487   case ISD::XOR:
488     if (SLT == MVT::i64) {
489       // and, or and xor are typically split into 2 VALU instructions.
490       return 2 * getFullRateInstrCost() * LT.first * NElts;
491     }
492 
493     if (ST->has16BitInsts() && SLT == MVT::i16)
494       NElts = (NElts + 1) / 2;
495 
496     return LT.first * NElts * getFullRateInstrCost();
497   case ISD::MUL: {
498     const int QuarterRateCost = getQuarterRateInstrCost();
499     if (SLT == MVT::i64) {
500       const int FullRateCost = getFullRateInstrCost();
501       return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
502     }
503 
504     if (ST->has16BitInsts() && SLT == MVT::i16)
505       NElts = (NElts + 1) / 2;
506 
507     // i32
508     return QuarterRateCost * NElts * LT.first;
509   }
510   case ISD::FMUL:
511     // Check for possible fusion of {fadd|fsub}(a, fmul(b,c)); return zero cost
512     // for the fmul(b,c), assuming the fadd|fsub will be charged the estimated
513     // cost of the whole fused operation.
514     if (CxtI && CxtI->hasOneUse())
515       if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {
516         const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
517         if (OPC == ISD::FADD || OPC == ISD::FSUB) {
518           if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
519             return TargetTransformInfo::TCC_Free;
520           if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
521             return TargetTransformInfo::TCC_Free;
522 
523           // Assume all types may be fused when contract/unsafe-math flags allow it.
524           const TargetOptions &Options = TLI->getTargetMachine().Options;
525           if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
526               Options.UnsafeFPMath ||
527               (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
528             return TargetTransformInfo::TCC_Free;
529         }
530       }
531     LLVM_FALLTHROUGH;
532   case ISD::FADD:
533   case ISD::FSUB:
534     if (SLT == MVT::f64)
535       return LT.first * NElts * get64BitInstrCost();
536 
537     if (ST->has16BitInsts() && SLT == MVT::f16)
538       NElts = (NElts + 1) / 2;
539 
540     if (SLT == MVT::f32 || SLT == MVT::f16)
541       return LT.first * NElts * getFullRateInstrCost();
542     break;
543   case ISD::FDIV:
544   case ISD::FREM:
545     // FIXME: frem should be handled separately. The fdiv in it is most of it,
546     // but the current lowering is also not entirely correct.
547     if (SLT == MVT::f64) {
548       int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost();
549       // Add cost of workaround.
550       if (!ST->hasUsableDivScaleConditionOutput())
551         Cost += 3 * getFullRateInstrCost();
552 
553       return LT.first * Cost * NElts;
554     }
555 
556     if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
557       // TODO: This is more complicated, unsafe flags etc.
558       if ((SLT == MVT::f32 && !HasFP32Denormals) ||
559           (SLT == MVT::f16 && ST->has16BitInsts())) {
560         return LT.first * getQuarterRateInstrCost() * NElts;
561       }
562     }
563 
564     if (SLT == MVT::f16 && ST->has16BitInsts()) {
565       // 2 x v_cvt_f32_f16
566       // f32 rcp
567       // f32 fmul
568       // v_cvt_f16_f32
569       // f16 div_fixup
570       int Cost = 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost();
571       return LT.first * Cost * NElts;
572     }
573 
574     if (SLT == MVT::f32 || SLT == MVT::f16) {
575       int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost();
576 
577       if (!HasFP32Denormals) {
578         // FP mode switches.
579         Cost += 2 * getFullRateInstrCost();
580       }
581 
582       return LT.first * NElts * Cost;
583     }
584     break;
585   case ISD::FNEG:
586     // Use the backend's estimation. If fneg is not free, each element will cost
587     // one additional instruction.
588     return TLI->isFNegFree(SLT) ? 0 : NElts;
589   default:
590     break;
591   }
592 
593   return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, Opd2Info,
594                                        Opd1PropInfo, Opd2PropInfo, Args, CxtI);
595 }
596 
597 // Return true if there's a potential benefit from using v2f16/v2i16
598 // instructions for an intrinsic, even if it requires nontrivial legalization.
599 static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
600   switch (ID) {
601   case Intrinsic::fma: // TODO: fmuladd
602   // There's a small benefit to using vector ops in the legalized code.
603   case Intrinsic::round:
604   case Intrinsic::uadd_sat:
605   case Intrinsic::usub_sat:
606   case Intrinsic::sadd_sat:
607   case Intrinsic::ssub_sat:
608     return true;
609   default:
610     return false;
611   }
612 }
613 
614 int GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
615                                       TTI::TargetCostKind CostKind) {
616   if (ICA.getID() == Intrinsic::fabs)
617     return 0;
618 
619   if (!intrinsicHasPackedVectorBenefit(ICA.getID()))
620     return BaseT::getIntrinsicInstrCost(ICA, CostKind);
621 
622   Type *RetTy = ICA.getReturnType();
623   EVT OrigTy = TLI->getValueType(DL, RetTy);
624   if (!OrigTy.isSimple()) {
625     return BaseT::getIntrinsicInstrCost(ICA, CostKind);
626   }
627 
628   // Legalize the type.
629   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
630 
631   unsigned NElts = LT.second.isVector() ?
632     LT.second.getVectorNumElements() : 1;
633 
634   MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
635 
636   if (SLT == MVT::f64)
637     return LT.first * NElts * get64BitInstrCost();
638 
639   if (ST->has16BitInsts() && SLT == MVT::f16)
640     NElts = (NElts + 1) / 2;
641 
642   // TODO: Get more refined intrinsic costs?
643   unsigned InstRate = getQuarterRateInstrCost();
644   if (ICA.getID() == Intrinsic::fma) {
645     InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost()
646                                    : getQuarterRateInstrCost();
647   }
648 
649   return LT.first * NElts * InstRate;
650 }
651 
652 unsigned GCNTTIImpl::getCFInstrCost(unsigned Opcode,
653                                     TTI::TargetCostKind CostKind) {
654   if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
655     return Opcode == Instruction::PHI ? 0 : 1;
656 
657   // XXX - For some reason this isn't called for switch.
658   switch (Opcode) {
659   case Instruction::Br:
660   case Instruction::Ret:
661     return 10;
662   default:
663     return BaseT::getCFInstrCost(Opcode, CostKind);
664   }
665 }
666 
667 int GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
668                                            bool IsPairwise,
669                                            TTI::TargetCostKind CostKind) {
670   EVT OrigTy = TLI->getValueType(DL, Ty);
671 
672   // Compute the cost on targets that have packed math instructions (which
673   // support 16-bit types only).
674   if (IsPairwise ||
675       !ST->hasVOP3PInsts() ||
676       OrigTy.getScalarSizeInBits() != 16)
677     return BaseT::getArithmeticReductionCost(Opcode, Ty, IsPairwise, CostKind);
678 
679   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
680   return LT.first * getFullRateInstrCost();
681 }
682 
683 int GCNTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
684                                        bool IsPairwise, bool IsUnsigned,
685                                        TTI::TargetCostKind CostKind) {
686   EVT OrigTy = TLI->getValueType(DL, Ty);
687 
688   // Compute the cost on targets that have packed math instructions (which
689   // support 16-bit types only).
690   if (IsPairwise ||
691       !ST->hasVOP3PInsts() ||
692       OrigTy.getScalarSizeInBits() != 16)
693     return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned,
694                                          CostKind);
695 
696   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
697   return LT.first * getHalfRateInstrCost();
698 }
699 
700 int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
701                                       unsigned Index) {
702   switch (Opcode) {
703   case Instruction::ExtractElement:
704   case Instruction::InsertElement: {
705     unsigned EltSize
706       = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
707     if (EltSize < 32) {
708       if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
709         return 0;
710       return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
711     }
712 
713     // Extracts are just reads of a subregister, so are free. Inserts are
714     // considered free because we don't want to have any cost for scalarizing
715     // operations, and we don't have to copy into a different register class.
716 
717     // Dynamic indexing isn't free and is best avoided.
718     return Index == ~0u ? 2 : 0;
719   }
720   default:
721     return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
722   }
723 }
724 
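// Returns true if the argument is known to be passed in an SGPR and is
// therefore uniform across the wavefront.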
725 static bool isArgPassedInSGPR(const Argument *A) {
726   const Function *F = A->getParent();
727 
728   // Arguments to compute shaders are never a source of divergence.
729   CallingConv::ID CC = F->getCallingConv();
730   switch (CC) {
731   case CallingConv::AMDGPU_KERNEL:
732   case CallingConv::SPIR_KERNEL:
733     return true;
734   case CallingConv::AMDGPU_VS:
735   case CallingConv::AMDGPU_LS:
736   case CallingConv::AMDGPU_HS:
737   case CallingConv::AMDGPU_ES:
738   case CallingConv::AMDGPU_GS:
739   case CallingConv::AMDGPU_PS:
740   case CallingConv::AMDGPU_CS:
741     // For non-compute shaders, SGPR inputs are marked with inreg.
742     // Everything else is in VGPRs.
743     return F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::InReg);
744   default:
745     // TODO: Should calls support inreg for SGPR inputs?
746     return false;
747   }
748 }
749 
750 /// Analyze if the results of inline asm are divergent. If \p Indices is empty,
751 /// this is analyzing the collective result of all output registers. Otherwise,
752 /// this is only querying a specific result index if this returns multiple
753 /// registers in a struct.
754 bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
755   const CallInst *CI, ArrayRef<unsigned> Indices) const {
756   // TODO: Handle complex extract indices
757   if (Indices.size() > 1)
758     return true;
759 
760   const DataLayout &DL = CI->getModule()->getDataLayout();
761   const SIRegisterInfo *TRI = ST->getRegisterInfo();
762   TargetLowering::AsmOperandInfoVector TargetConstraints =
763       TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);
764 
765   const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];
766 
767   int OutputIdx = 0;
768   for (auto &TC : TargetConstraints) {
769     if (TC.Type != InlineAsm::isOutput)
770       continue;
771 
772     // Skip outputs we don't care about.
773     if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
774       continue;
775 
776     TLI->ComputeConstraintToUse(TC, SDValue());
777 
778     Register AssignedReg;
779     const TargetRegisterClass *RC;
780     std::tie(AssignedReg, RC) = TLI->getRegForInlineAsmConstraint(
781       TRI, TC.ConstraintCode, TC.ConstraintVT);
782     if (AssignedReg) {
783       // FIXME: This is a workaround for getRegForInlineAsmConstraint
784       // returning VS_32
785       RC = TRI->getPhysRegClass(AssignedReg);
786     }
787 
788     // For AGPR constraints null is returned on subtargets without AGPRs, so
789     // assume divergent for null.
790     if (!RC || !TRI->isSGPRClass(RC))
791       return true;
792   }
793 
794   return false;
795 }
796 
797 /// \returns true if the new GPU divergence analysis is enabled.
798 bool GCNTTIImpl::useGPUDivergenceAnalysis() const {
799   return !UseLegacyDA;
800 }
801 
802 /// \returns true if the result of the value could potentially be
803 /// different across workitems in a wavefront.
804 bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
805   if (const Argument *A = dyn_cast<Argument>(V))
806     return !isArgPassedInSGPR(A);
807 
808   // Loads from the private and flat address spaces are divergent, because
809   // threads can execute the load instruction with the same inputs and get
810   // different results.
811   //
812   // All other loads are not divergent, because if threads issue loads with the
813   // same arguments, they will always get the same result.
814   if (const LoadInst *Load = dyn_cast<LoadInst>(V))
815     return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
816            Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
817 
818   // Atomics are divergent because they are executed sequentially: when an
819   // atomic operation refers to the same address in each thread, then each
820   // thread after the first sees the value written by the previous thread as
821   // its original value.
822   if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
823     return true;
824 
825   if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
826     return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());
827 
828   // Assume all function calls are a source of divergence.
829   if (const CallInst *CI = dyn_cast<CallInst>(V)) {
830     if (CI->isInlineAsm())
831       return isInlineAsmSourceOfDivergence(CI);
832     return true;
833   }
834 
835   // Assume all function calls are a source of divergence.
836   if (isa<InvokeInst>(V))
837     return true;
838 
839   return false;
840 }
841 
842 bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
843   if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
844     switch (Intrinsic->getIntrinsicID()) {
845     default:
846       return false;
847     case Intrinsic::amdgcn_readfirstlane:
848     case Intrinsic::amdgcn_readlane:
849     case Intrinsic::amdgcn_icmp:
850     case Intrinsic::amdgcn_fcmp:
851     case Intrinsic::amdgcn_ballot:
852     case Intrinsic::amdgcn_if_break:
853       return true;
854     }
855   }
856 
857   if (const CallInst *CI = dyn_cast<CallInst>(V)) {
858     if (CI->isInlineAsm())
859       return !isInlineAsmSourceOfDivergence(CI);
860     return false;
861   }
862 
863   const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
864   if (!ExtValue)
865     return false;
866 
867   const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));
868   if (!CI)
869     return false;
870 
871   if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
872     switch (Intrinsic->getIntrinsicID()) {
873     default:
874       return false;
875     case Intrinsic::amdgcn_if:
876     case Intrinsic::amdgcn_else: {
877       ArrayRef<unsigned> Indices = ExtValue->getIndices();
878       return Indices.size() == 1 && Indices[0] == 1;
879     }
880     }
881   }
882 
883   // If we have inline asm returning mixed SGPR and VGPR results, we inferred
884   // divergent for the overall struct return. We need to override it in the
885   // case we're extracting an SGPR component here.
886   if (CI->isInlineAsm())
887     return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());
888 
889   return false;
890 }
891 
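// Record which operands of the intrinsic are pointers that InferAddressSpaces
// may rewrite to a more specific address space.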
892 bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
893                                             Intrinsic::ID IID) const {
894   switch (IID) {
895   case Intrinsic::amdgcn_atomic_inc:
896   case Intrinsic::amdgcn_atomic_dec:
897   case Intrinsic::amdgcn_ds_fadd:
898   case Intrinsic::amdgcn_ds_fmin:
899   case Intrinsic::amdgcn_ds_fmax:
900   case Intrinsic::amdgcn_is_shared:
901   case Intrinsic::amdgcn_is_private:
902     OpIndexes.push_back(0);
903     return true;
904   default:
905     return false;
906   }
907 }
908 
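// Rewrite the intrinsic to operate on \p NewV, a pointer into a specific
// address space, instead of the flat pointer \p OldV. Returns nullptr if the
// rewrite is not possible (e.g. for volatile DS atomics).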
909 Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
910                                                     Value *OldV,
911                                                     Value *NewV) const {
912   auto IntrID = II->getIntrinsicID();
913   switch (IntrID) {
914   case Intrinsic::amdgcn_atomic_inc:
915   case Intrinsic::amdgcn_atomic_dec:
916   case Intrinsic::amdgcn_ds_fadd:
917   case Intrinsic::amdgcn_ds_fmin:
918   case Intrinsic::amdgcn_ds_fmax: {
919     const ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4));
920     if (!IsVolatile->isZero())
921       return nullptr;
922     Module *M = II->getParent()->getParent()->getParent();
923     Type *DestTy = II->getType();
924     Type *SrcTy = NewV->getType();
925     Function *NewDecl =
926         Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
927     II->setArgOperand(0, NewV);
928     II->setCalledFunction(NewDecl);
929     return II;
930   }
931   case Intrinsic::amdgcn_is_shared:
932   case Intrinsic::amdgcn_is_private: {
933     unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
934       AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
935     unsigned NewAS = NewV->getType()->getPointerAddressSpace();
936     LLVMContext &Ctx = NewV->getType()->getContext();
937     ConstantInt *NewVal = (TrueAS == NewAS) ?
938       ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);
939     return NewVal;
940   }
941   case Intrinsic::ptrmask: {
942     unsigned OldAS = OldV->getType()->getPointerAddressSpace();
943     unsigned NewAS = NewV->getType()->getPointerAddressSpace();
944     Value *MaskOp = II->getArgOperand(1);
945     Type *MaskTy = MaskOp->getType();
946 
947     bool DoTruncate = false;
948 
949     const GCNTargetMachine &TM =
950         static_cast<const GCNTargetMachine &>(getTLI()->getTargetMachine());
951     if (!TM.isNoopAddrSpaceCast(OldAS, NewAS)) {
952       // All valid 64-bit to 32-bit casts work by chopping off the high
953       // bits. Any masking only clearing the low bits will also apply in the new
954       // address space.
955       if (DL.getPointerSizeInBits(OldAS) != 64 ||
956           DL.getPointerSizeInBits(NewAS) != 32)
957         return nullptr;
958 
959       // TODO: Do we need to thread more context in here?
960       KnownBits Known = computeKnownBits(MaskOp, DL, 0, nullptr, II);
961       if (Known.countMinLeadingOnes() < 32)
962         return nullptr;
963 
964       DoTruncate = true;
965     }
966 
967     IRBuilder<> B(II);
968     if (DoTruncate) {
969       MaskTy = B.getInt32Ty();
970       MaskOp = B.CreateTrunc(MaskOp, MaskTy);
971     }
972 
973     return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
974                              {NewV, MaskOp});
975   }
976   default:
977     return nullptr;
978   }
979 }
980 
981 unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *VT,
982                                     int Index, VectorType *SubTp) {
983   if (ST->hasVOP3PInsts()) {
984     if (cast<FixedVectorType>(VT)->getNumElements() == 2 &&
985         DL.getTypeSizeInBits(VT->getElementType()) == 16) {
986       // With op_sel, VOP3P instructions can freely access either the low or the
987       // high half of a register, so any swizzle is free.
988 
989       switch (Kind) {
990       case TTI::SK_Broadcast:
991       case TTI::SK_Reverse:
992       case TTI::SK_PermuteSingleSrc:
993         return 0;
994       default:
995         break;
996       }
997     }
998   }
999 
1000   return BaseT::getShuffleCost(Kind, VT, Index, SubTp);
1001 }
1002 
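// Inlining is only considered compatible if the callee's subtarget features
// (ignoring features on the inline ignore list) are a subset of the caller's
// and the two functions' FP mode register settings are compatible.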
1003 bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
1004                                      const Function *Callee) const {
1005   const TargetMachine &TM = getTLI()->getTargetMachine();
1006   const GCNSubtarget *CallerST
1007     = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
1008   const GCNSubtarget *CalleeST
1009     = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));
1010 
1011   const FeatureBitset &CallerBits = CallerST->getFeatureBits();
1012   const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();
1013 
1014   FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
1015   FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
1016   if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
1017     return false;
1018 
1019   // FIXME: dx10_clamp can just take the caller setting, but there seems to be
1020   // no way to support merge for backend defined attributes.
1021   AMDGPU::SIModeRegisterDefaults CallerMode(*Caller);
1022   AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee);
1023   return CallerMode.isInlineCompatible(CalleeMode);
1024 }
1025 
1026 void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1027                                          TTI::UnrollingPreferences &UP) {
1028   CommonTTI.getUnrollingPreferences(L, SE, UP);
1029 }
1030 
1031 void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1032                                        TTI::PeelingPreferences &PP) {
1033   CommonTTI.getPeelingPreferences(L, SE, PP);
1034 }
1035 
1036 unsigned R600TTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
1037   return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
1038 }
1039 
1040 unsigned R600TTIImpl::getNumberOfRegisters(bool Vec) const {
1041   return getHardwareNumberOfRegisters(Vec);
1042 }
1043 
1044 unsigned R600TTIImpl::getRegisterBitWidth(bool Vector) const {
1045   return 32;
1046 }
1047 
1048 unsigned R600TTIImpl::getMinVectorRegisterBitWidth() const {
1049   return 32;
1050 }
1051 
1052 unsigned R600TTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
1053   if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
1054       AddrSpace == AMDGPUAS::CONSTANT_ADDRESS)
1055     return 128;
1056   if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1057       AddrSpace == AMDGPUAS::REGION_ADDRESS)
1058     return 64;
1059   if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
1060     return 32;
1061 
1062   if ((AddrSpace == AMDGPUAS::PARAM_D_ADDRESS ||
1063       AddrSpace == AMDGPUAS::PARAM_I_ADDRESS ||
1064       (AddrSpace >= AMDGPUAS::CONSTANT_BUFFER_0 &&
1065       AddrSpace <= AMDGPUAS::CONSTANT_BUFFER_15)))
1066     return 128;
1067   llvm_unreachable("unhandled address space");
1068 }
1069 
1070 bool R600TTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
1071                                              Align Alignment,
1072                                              unsigned AddrSpace) const {
1073   // We allow vectorization of flat stores, even though we may need to decompose
1074   // them later if they may access private memory. We don't have enough context
1075   // here, and legalization can handle it.
1076   return (AddrSpace != AMDGPUAS::PRIVATE_ADDRESS);
1077 }
1078 
1079 bool R600TTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
1080                                               Align Alignment,
1081                                               unsigned AddrSpace) const {
1082   return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
1083 }
1084 
1085 bool R600TTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
1086                                                Align Alignment,
1087                                                unsigned AddrSpace) const {
1088   return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
1089 }
1090 
1091 unsigned R600TTIImpl::getMaxInterleaveFactor(unsigned VF) {
1092   // Disable unrolling if the loop is not vectorized.
1093   // TODO: Enable this again.
1094   if (VF == 1)
1095     return 1;
1096 
1097   return 8;
1098 }
1099 
1100 unsigned R600TTIImpl::getCFInstrCost(unsigned Opcode,
1101                                      TTI::TargetCostKind CostKind) {
1102   if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
1103     return Opcode == Instruction::PHI ? 0 : 1;
1104 
1105   // XXX - For some reason this isn't called for switch.
1106   switch (Opcode) {
1107   case Instruction::Br:
1108   case Instruction::Ret:
1109     return 10;
1110   default:
1111     return BaseT::getCFInstrCost(Opcode, CostKind);
1112   }
1113 }
1114 
1115 int R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
1116                                     unsigned Index) {
1117   switch (Opcode) {
1118   case Instruction::ExtractElement:
1119   case Instruction::InsertElement: {
1120     unsigned EltSize
1121       = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
1122     if (EltSize < 32) {
1123       return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
1124     }
1125 
1126     // Extracts are just reads of a subregister, so are free. Inserts are
1127     // considered free because we don't want to have any cost for scalarizing
1128     // operations, and we don't have to copy into a different register class.
1129 
1130     // Dynamic indexing isn't free and is best avoided.
1131     return Index == ~0u ? 2 : 0;
1132   }
1133   default:
1134     return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
1135   }
1136 }
1137 
1138 void R600TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1139                                           TTI::UnrollingPreferences &UP) {
1140   CommonTTI.getUnrollingPreferences(L, SE, UP);
1141 }
1142 
1143 void R600TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1144                                         TTI::PeelingPreferences &PP) {
1145   CommonTTI.getPeelingPreferences(L, SE, PP);
1146 }
1147