1 //===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // \file
10 // This file implements a TargetTransformInfo analysis pass specific to the
11 // AMDGPU target machine. It uses the target's detailed information to provide
12 // more precise answers to certain TTI queries, while letting the target
13 // independent and default TTI implementations handle the rest.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #include "AMDGPUTargetTransformInfo.h"
18 #include "AMDGPUSubtarget.h"
19 #include "Utils/AMDGPUBaseInfo.h"
20 #include "llvm/ADT/STLExtras.h"
21 #include "llvm/Analysis/LoopInfo.h"
22 #include "llvm/Analysis/TargetTransformInfo.h"
23 #include "llvm/Analysis/ValueTracking.h"
24 #include "llvm/CodeGen/ISDOpcodes.h"
25 #include "llvm/CodeGen/ValueTypes.h"
26 #include "llvm/IR/Argument.h"
27 #include "llvm/IR/Attributes.h"
28 #include "llvm/IR/BasicBlock.h"
29 #include "llvm/IR/CallingConv.h"
30 #include "llvm/IR/DataLayout.h"
31 #include "llvm/IR/DerivedTypes.h"
32 #include "llvm/IR/Function.h"
33 #include "llvm/IR/Instruction.h"
34 #include "llvm/IR/Instructions.h"
35 #include "llvm/IR/IntrinsicInst.h"
36 #include "llvm/IR/Module.h"
37 #include "llvm/IR/PatternMatch.h"
38 #include "llvm/IR/Type.h"
39 #include "llvm/IR/Value.h"
40 #include "llvm/MC/SubtargetFeature.h"
41 #include "llvm/Support/Casting.h"
42 #include "llvm/Support/CommandLine.h"
43 #include "llvm/Support/Debug.h"
44 #include "llvm/Support/ErrorHandling.h"
45 #include "llvm/Support/MachineValueType.h"
46 #include "llvm/Support/raw_ostream.h"
47 #include "llvm/Target/TargetMachine.h"
48 #include <algorithm>
49 #include <cassert>
50 #include <limits>
51 #include <utility>
52 
53 using namespace llvm;
54 
55 #define DEBUG_TYPE "AMDGPUtti"
56 
// Hidden command-line options consumed by the heuristics in this file.

// Unroll threshold applied when a loop addresses private memory (see
// getUnrollingPreferences). Defaults to 2700.
static cl::opt<unsigned> UnrollThresholdPrivate(
  "amdgpu-unroll-threshold-private",
  cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
  cl::init(2700), cl::Hidden);

// Unroll threshold applied when a loop addresses local or region memory (see
// getUnrollingPreferences). Defaults to 1000.
static cl::opt<unsigned> UnrollThresholdLocal(
  "amdgpu-unroll-threshold-local",
  cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
  cl::init(1000), cl::Hidden);

// Amount added to the unroll threshold for each qualifying conditional branch
// found in a loop. Defaults to 150.
static cl::opt<unsigned> UnrollThresholdIf(
  "amdgpu-unroll-threshold-if",
  cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
  cl::init(150), cl::Hidden);

// Value assigned to UP.Runtime when a loop qualifies for the local-memory
// unrolling path. Defaults to true.
static cl::opt<bool> UnrollRuntimeLocal(
  "amdgpu-unroll-runtime-local",
  cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
  cl::init(true), cl::Hidden);

// When set, useGPUDivergenceAnalysis() reports false. Defaults to false.
static cl::opt<bool> UseLegacyDA(
  "amdgpu-use-legacy-divergence-analysis",
  cl::desc("Enable legacy divergence analysis for AMDGPU"),
  cl::init(false), cl::Hidden);

// Blocks of an innermost loop smaller than this many instructions raise
// UP.MaxIterationsCountToAnalyze in getUnrollingPreferences. Defaults to 20.
static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
    "amdgpu-unroll-max-block-to-analyze",
    cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
    cl::init(20), cl::Hidden);
86 
87 static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
88                               unsigned Depth = 0) {
89   const Instruction *I = dyn_cast<Instruction>(Cond);
90   if (!I)
91     return false;
92 
93   for (const Value *V : I->operand_values()) {
94     if (!L->contains(I))
95       continue;
96     if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
97       if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
98                   return SubLoop->contains(PHI); }))
99         return true;
100     } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
101       return true;
102   }
103   return false;
104 }
105 
/// Fill in \p UP with AMDGPU-specific loop unrolling preferences for \p L.
///
/// The base threshold comes from the "amdgpu-unroll-threshold" function
/// attribute (default 300). It is then raised while scanning the blocks that
/// belong directly to \p L (blocks of sub-loops are skipped):
///   * by UnrollThresholdIf for each conditional branch whose condition
///     depends on a PHI (see dependsOnLocalPhi) and whose successors are not
///     exiting blocks of the loop;
///   * to UnrollThresholdPrivate / UnrollThresholdLocal when a GEP into the
///     private / local / region address space uses a value defined by the
///     loop itself.
/// Scanning stops as soon as the threshold reaches the larger of the two
/// address-space thresholds.
void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                            TTI::UnrollingPreferences &UP) {
  const Function &F = *L->getHeader()->getParent();
  UP.Threshold = AMDGPU::getIntegerAttribute(F, "amdgpu-unroll-threshold", 300);
  UP.MaxCount = std::numeric_limits<unsigned>::max();
  UP.Partial = true;

  // TODO: Do we want runtime unrolling?

  // Maximum alloca size that can fit registers. Reserve 16 registers.
  const unsigned MaxAlloca = (256 - 16) * 4;
  unsigned ThresholdPrivate = UnrollThresholdPrivate;
  unsigned ThresholdLocal = UnrollThresholdLocal;
  // Once the threshold reaches this value no further boost is possible, so
  // the scan below returns early.
  unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
  for (const BasicBlock *BB : L->getBlocks()) {
    const DataLayout &DL = BB->getModule()->getDataLayout();
    // Counted per basic block: number of local/region GEPs seen so far.
    unsigned LocalGEPsSeen = 0;

    if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
               return SubLoop->contains(BB); }))
        continue; // Block belongs to an inner loop.

    for (const Instruction &I : *BB) {
      // Unroll a loop which contains an "if" statement whose condition
      // defined by a PHI belonging to the loop. This may help to eliminate
      // if region and potentially even PHI itself, saving on both divergence
      // and registers used for the PHI.
      // Add a small bonus for each of such "if" statements.
      if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
        if (UP.Threshold < MaxBoost && Br->isConditional()) {
          BasicBlock *Succ0 = Br->getSuccessor(0);
          BasicBlock *Succ1 = Br->getSuccessor(1);
          // Skip branches that lead to a block of this loop which is itself
          // a loop-exiting block.
          if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
              (L->contains(Succ1) && L->isLoopExiting(Succ1)))
            continue;
          if (dependsOnLocalPhi(L, Br->getCondition())) {
            UP.Threshold += UnrollThresholdIf;
            LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
                              << " for loop:\n"
                              << *L << " due to " << *Br << '\n');
            if (UP.Threshold >= MaxBoost)
              return;
          }
        }
        continue;
      }

      // Everything below only concerns GEPs.
      const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
      if (!GEP)
        continue;

      // Pick the candidate threshold from the GEP's address space; other
      // address spaces do not influence the threshold.
      unsigned AS = GEP->getAddressSpace();
      unsigned Threshold = 0;
      if (AS == AMDGPUAS::PRIVATE_ADDRESS)
        Threshold = ThresholdPrivate;
      else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
        Threshold = ThresholdLocal;
      else
        continue;

      // The current threshold is already at least as high as the candidate.
      if (UP.Threshold >= Threshold)
        continue;

      if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
        // Only static allocas no larger than MaxAlloca bytes qualify.
        const Value *Ptr = GEP->getPointerOperand();
        const AllocaInst *Alloca =
            dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL));
        if (!Alloca || !Alloca->isStaticAlloca())
          continue;
        Type *Ty = Alloca->getAllocatedType();
        // Unsized types are treated as size 0 and therefore pass the check.
        unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
        if (AllocaSize > MaxAlloca)
          continue;
      } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
                 AS == AMDGPUAS::REGION_ADDRESS) {
        LocalGEPsSeen++;
        // Inhibit unroll for local memory if we have seen addressing not to
        // a variable, most likely we will be unable to combine it.
        // Do not unroll too deep inner loops for local memory to give a chance
        // to unroll an outer loop for a more important reason.
        if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
            (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
             !isa<Argument>(GEP->getPointerOperand())))
          continue;
        LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
                          << *L << " due to LDS use.\n");
        UP.Runtime = UnrollRuntimeLocal;
      }

      // Check if GEP depends on a value defined by this loop itself.
      bool HasLoopDef = false;
      for (const Value *Op : GEP->operands()) {
        const Instruction *Inst = dyn_cast<Instruction>(Op);
        if (!Inst || L->isLoopInvariant(Op))
          continue;

        // Definitions that live in a sub-loop do not count.
        if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
             return SubLoop->contains(Inst); }))
          continue;
        HasLoopDef = true;
        break;
      }
      if (!HasLoopDef)
        continue;

      // We want to do whatever we can to limit the number of alloca
      // instructions that make it through to the code generator.  allocas
      // require us to use indirect addressing, which is slow and prone to
      // compiler bugs.  If this loop does an address calculation on an
      // alloca ptr, then we want to use a higher than normal loop unroll
      // threshold. This will give SROA a better chance to eliminate these
      // allocas.
      //
      // We also want to have more unrolling for local memory to let ds
      // instructions with different offsets combine.
      //
      // Don't use the maximum allowed value here as it will make some
      // programs way too big.
      UP.Threshold = Threshold;
      LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
                        << " for loop:\n"
                        << *L << " due to " << *GEP << '\n');
      if (UP.Threshold >= MaxBoost)
        return;
    }

    // If we got a GEP in a small BB from inner loop then increase max trip
    // count to analyze for better estimation cost in unroll
    // NOTE(review): this is reached for every fully scanned block of a loop
    // without sub-loops, whether or not a GEP was seen, and is skipped when
    // one of the early returns above fires - confirm this is intended.
    if (L->empty() && BB->size() < UnrollMaxBlockToAnalyze)
      UP.MaxIterationsCountToAnalyze = 32;
  }
}
238 
/// Loop peeling preferences for \p L. No AMDGPU-specific adjustment is made:
/// the request is forwarded unchanged to the base implementation.
void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}
/// \returns MaxVGPRs; the \p Vec flag is ignored, so scalar and vector
/// queries get the same answer.
unsigned GCNTTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
  // The concept of vector registers doesn't really exist. Some packed vector
  // operations operate on the normal 32-bit registers.
  return MaxVGPRs;
}
248 
249 unsigned GCNTTIImpl::getNumberOfRegisters(bool Vec) const {
250   // This is really the number of registers to fill when vectorizing /
251   // interleaving loops, so we lie to avoid trying to use all registers.
252   return getHardwareNumberOfRegisters(Vec) >> 3;
253 }
254 
255 unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
256   const SIRegisterInfo *TRI = ST->getRegisterInfo();
257   const TargetRegisterClass *RC = TRI->getRegClass(RCID);
258   unsigned NumVGPRs = (TRI->getRegSizeInBits(*RC) + 31) / 32;
259   return getHardwareNumberOfRegisters(false) / NumVGPRs;
260 }
261 
/// \returns 32 for both scalar and vector queries; \p Vector is ignored.
unsigned GCNTTIImpl::getRegisterBitWidth(bool Vector) const {
  return 32;
}
265 
/// \returns 32, the same width reported by getRegisterBitWidth().
unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
  return 32;
}
269 
270 unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
271                                          unsigned ChainSizeInBytes,
272                                          VectorType *VecTy) const {
273   unsigned VecRegBitWidth = VF * LoadSize;
274   if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
275     // TODO: Support element-size less than 32bit?
276     return 128 / LoadSize;
277 
278   return VF;
279 }
280 
281 unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
282                                              unsigned ChainSizeInBytes,
283                                              VectorType *VecTy) const {
284   unsigned VecRegBitWidth = VF * StoreSize;
285   if (VecRegBitWidth > 128)
286     return 128 / StoreSize;
287 
288   return VF;
289 }
290 
291 unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
292   if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
293       AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
294       AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
295       AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER) {
296     return 512;
297   }
298 
299   if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
300     return 8 * ST->getMaxPrivateElementSize();
301 
302   // Common to flat, global, local and region. Assume for unknown addrspace.
303   return 128;
304 }
305 
306 bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
307                                             Align Alignment,
308                                             unsigned AddrSpace) const {
309   // We allow vectorization of flat stores, even though we may need to decompose
310   // them later if they may access private memory. We don't have enough context
311   // here, and legalization can handle it.
312   if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
313     return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
314       ChainSizeInBytes <= ST->getMaxPrivateElementSize();
315   }
316   return true;
317 }
318 
/// Load chains follow the same rule as any memory chain; see
/// isLegalToVectorizeMemChain.
bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                             Align Alignment,
                                             unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}
324 
/// Store chains follow the same rule as any memory chain; see
/// isLegalToVectorizeMemChain.
bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                              Align Alignment,
                                              unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}
330 
331 // FIXME: Really we would like to issue multiple 128-bit loads and stores per
332 // iteration. Should we report a larger size and let it legalize?
333 //
334 // FIXME: Should we use narrower types for local/region, or account for when
335 // unaligned access is legal?
336 //
337 // FIXME: This could use fine tuning and microbenchmarks.
338 Type *GCNTTIImpl::getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
339                                             unsigned SrcAddrSpace,
340                                             unsigned DestAddrSpace,
341                                             unsigned SrcAlign,
342                                             unsigned DestAlign) const {
343   unsigned MinAlign = std::min(SrcAlign, DestAlign);
344 
345   // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the
346   // hardware into byte accesses. If you assume all alignments are equally
347   // probable, it's more efficient on average to use short accesses for this
348   // case.
349   if (MinAlign == 2)
350     return Type::getInt16Ty(Context);
351 
352   // Not all subtargets have 128-bit DS instructions, and we currently don't
353   // form them by default.
354   if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
355       SrcAddrSpace == AMDGPUAS::REGION_ADDRESS ||
356       DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
357       DestAddrSpace == AMDGPUAS::REGION_ADDRESS) {
358     return FixedVectorType::get(Type::getInt32Ty(Context), 2);
359   }
360 
361   // Global memory works best with 16-byte accesses. Private memory will also
362   // hit this, although they'll be decomposed.
363   return FixedVectorType::get(Type::getInt32Ty(Context), 4);
364 }
365 
366 void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
367   SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
368   unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
369   unsigned SrcAlign, unsigned DestAlign) const {
370   assert(RemainingBytes < 16);
371 
372   unsigned MinAlign = std::min(SrcAlign, DestAlign);
373 
374   if (MinAlign != 2) {
375     Type *I64Ty = Type::getInt64Ty(Context);
376     while (RemainingBytes >= 8) {
377       OpsOut.push_back(I64Ty);
378       RemainingBytes -= 8;
379     }
380 
381     Type *I32Ty = Type::getInt32Ty(Context);
382     while (RemainingBytes >= 4) {
383       OpsOut.push_back(I32Ty);
384       RemainingBytes -= 4;
385     }
386   }
387 
388   Type *I16Ty = Type::getInt16Ty(Context);
389   while (RemainingBytes >= 2) {
390     OpsOut.push_back(I16Ty);
391     RemainingBytes -= 2;
392   }
393 
394   Type *I8Ty = Type::getInt8Ty(Context);
395   while (RemainingBytes) {
396     OpsOut.push_back(I8Ty);
397     --RemainingBytes;
398   }
399 }
400 
401 unsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) {
402   // Disable unrolling if the loop is not vectorized.
403   // TODO: Enable this again.
404   if (VF == 1)
405     return 1;
406 
407   return 8;
408 }
409 
410 bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
411                                        MemIntrinsicInfo &Info) const {
412   switch (Inst->getIntrinsicID()) {
413   case Intrinsic::amdgcn_atomic_inc:
414   case Intrinsic::amdgcn_atomic_dec:
415   case Intrinsic::amdgcn_ds_ordered_add:
416   case Intrinsic::amdgcn_ds_ordered_swap:
417   case Intrinsic::amdgcn_ds_fadd:
418   case Intrinsic::amdgcn_ds_fmin:
419   case Intrinsic::amdgcn_ds_fmax: {
420     auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
421     auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
422     if (!Ordering || !Volatile)
423       return false; // Invalid.
424 
425     unsigned OrderingVal = Ordering->getZExtValue();
426     if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
427       return false;
428 
429     Info.PtrVal = Inst->getArgOperand(0);
430     Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
431     Info.ReadMem = true;
432     Info.WriteMem = true;
433     Info.IsVolatile = !Volatile->isNullValue();
434     return true;
435   }
436   default:
437     return false;
438   }
439 }
440 
/// Estimate the cost of the arithmetic instruction \p Opcode on type \p Ty.
///
/// The type is legalized first; the result is generally
/// (number of legalized parts) * (elements per part) * (per-element rate),
/// where the rate depends on the ISD opcode and the scalar element type.
/// Types that do not map to a simple value type, and opcode/type combinations
/// not handled by the switch, are delegated to the base implementation.
///
/// \p CxtI, when provided, is used only to detect an fmul whose single user
/// is an fadd/fsub. \p Args is used only to detect a division whose first
/// operand is the floating-point constant one.
int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
                                       TTI::TargetCostKind CostKind,
                                       TTI::OperandValueKind Opd1Info,
                                       TTI::OperandValueKind Opd2Info,
                                       TTI::OperandValueProperties Opd1PropInfo,
                                       TTI::OperandValueProperties Opd2PropInfo,
                                       ArrayRef<const Value *> Args,
                                       const Instruction *CxtI) {
  EVT OrigTy = TLI->getValueType(DL, Ty);
  if (!OrigTy.isSimple()) {
    // FIXME: We're having to query the throughput cost so that the basic
    // implementation tries to generate legalize and scalarization costs. Maybe
    // we could hoist the scalarization code here?
    return BaseT::getArithmeticInstrCost(Opcode, Ty, TTI::TCK_RecipThroughput,
                                         Opd1Info, Opd2Info, Opd1PropInfo,
                                         Opd2PropInfo, Args, CxtI);
  }

  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // Because we don't have any legal vector operations, but the legal types, we
  // need to account for split vectors.
  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  // Scalar element type of the legalized type; selects the rate below.
  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  switch (ISD) {
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
    if (SLT == MVT::i64)
      return get64BitInstrCost() * LT.first * NElts;

    // With 16-bit instructions, i16 elements are costed in pairs (rounded up).
    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    // i32
    return getFullRateInstrCost() * LT.first * NElts;
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    if (SLT == MVT::i64) {
      // and, or and xor are typically split into 2 VALU instructions.
      return 2 * getFullRateInstrCost() * LT.first * NElts;
    }

    // With 16-bit instructions, i16 elements are costed in pairs (rounded up).
    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    return LT.first * NElts * getFullRateInstrCost();
  case ISD::MUL: {
    const int QuarterRateCost = getQuarterRateInstrCost();
    if (SLT == MVT::i64) {
      // Costed as four quarter-rate plus four full-rate instructions.
      const int FullRateCost = getFullRateInstrCost();
      return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
    }

    // With 16-bit instructions, i16 elements are costed in pairs (rounded up).
    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    // i32
    return QuarterRateCost * NElts * LT.first;
  }
  case ISD::FMUL:
    // Check possible fuse {fadd|fsub}(a,fmul(b,c)) and return zero cost for
    // fmul(b,c) supposing the fadd|fsub will get estimated cost for the whole
    // fused operation.
    if (!HasFP32Denormals && SLT == MVT::f32 && CxtI && CxtI->hasOneUse())
      if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {
        const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
        if (OPC == ISD::FADD || OPC == ISD::FSUB) {
          return TargetTransformInfo::TCC_Free;
        }
      }
    // A non-fusable fmul is costed like fadd/fsub.
    LLVM_FALLTHROUGH;
  case ISD::FADD:
  case ISD::FSUB:
    if (SLT == MVT::f64)
      return LT.first * NElts * get64BitInstrCost();

    // With 16-bit instructions, f16 elements are costed in pairs (rounded up).
    if (ST->has16BitInsts() && SLT == MVT::f16)
      NElts = (NElts + 1) / 2;

    if (SLT == MVT::f32 || SLT == MVT::f16)
      return LT.first * NElts * getFullRateInstrCost();
    // Other element types are left to the base implementation.
    break;
  case ISD::FDIV:
  case ISD::FREM:
    // FIXME: frem should be handled separately. The fdiv in it is most of it,
    // but the current lowering is also not entirely correct.
    if (SLT == MVT::f64) {
      int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost();
      // Add cost of workaround.
      if (!ST->hasUsableDivScaleConditionOutput())
        Cost += 3 * getFullRateInstrCost();

      return LT.first * Cost * NElts;
    }

    // Division with a numerator of 1.0 gets a single quarter-rate cost for
    // f32 without denormals and for f16 with 16-bit instructions.
    if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
      // TODO: This is more complicated, unsafe flags etc.
      if ((SLT == MVT::f32 && !HasFP32Denormals) ||
          (SLT == MVT::f16 && ST->has16BitInsts())) {
        return LT.first * getQuarterRateInstrCost() * NElts;
      }
    }

    if (SLT == MVT::f16 && ST->has16BitInsts()) {
      // 2 x v_cvt_f32_f16
      // f32 rcp
      // f32 fmul
      // v_cvt_f16_f32
      // f16 div_fixup
      int Cost = 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost();
      return LT.first * Cost * NElts;
    }

    // Remaining f32 case, plus f16 on subtargets without 16-bit instructions.
    if (SLT == MVT::f32 || SLT == MVT::f16) {
      int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost();

      if (!HasFP32Denormals) {
        // FP mode switches.
        Cost += 2 * getFullRateInstrCost();
      }

      return LT.first * NElts * Cost;
    }
    // Other element types are left to the base implementation.
    break;
  case ISD::FNEG:
    // Use the backend's estimation. If fneg is not free each element will cost
    // one additional instruction.
    return TLI->isFNegFree(SLT) ? 0 : NElts;
  default:
    break;
  }

  // Not handled above: defer to the base implementation, this time passing
  // the caller's cost kind through unchanged.
  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, Opd2Info,
                                       Opd1PropInfo, Opd2PropInfo, Args, CxtI);
}
585 
586 // Return true if there's a potential benefit from using v2f16/v2i16
587 // instructions for an intrinsic, even if it requires nontrivial legalization.
588 static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
589   switch (ID) {
590   case Intrinsic::fma: // TODO: fmuladd
591   // There's a small benefit to using vector ops in the legalized code.
592   case Intrinsic::round:
593   case Intrinsic::uadd_sat:
594   case Intrinsic::usub_sat:
595   case Intrinsic::sadd_sat:
596   case Intrinsic::ssub_sat:
597     return true;
598   default:
599     return false;
600   }
601 }
602 
603 int GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
604                                       TTI::TargetCostKind CostKind) {
605   if (ICA.getID() == Intrinsic::fabs)
606     return 0;
607 
608   if (!intrinsicHasPackedVectorBenefit(ICA.getID()))
609     return BaseT::getIntrinsicInstrCost(ICA, CostKind);
610 
611   Type *RetTy = ICA.getReturnType();
612   EVT OrigTy = TLI->getValueType(DL, RetTy);
613   if (!OrigTy.isSimple()) {
614     return BaseT::getIntrinsicInstrCost(ICA, CostKind);
615   }
616 
617   // Legalize the type.
618   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
619 
620   unsigned NElts = LT.second.isVector() ?
621     LT.second.getVectorNumElements() : 1;
622 
623   MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
624 
625   if (SLT == MVT::f64)
626     return LT.first * NElts * get64BitInstrCost();
627 
628   if (ST->has16BitInsts() && SLT == MVT::f16)
629     NElts = (NElts + 1) / 2;
630 
631   // TODO: Get more refined intrinsic costs?
632   unsigned InstRate = getQuarterRateInstrCost();
633   if (ICA.getID() == Intrinsic::fma) {
634     InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost()
635                                    : getQuarterRateInstrCost();
636   }
637 
638   return LT.first * NElts * InstRate;
639 }
640 
641 unsigned GCNTTIImpl::getCFInstrCost(unsigned Opcode,
642                                     TTI::TargetCostKind CostKind) {
643   if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
644     return Opcode == Instruction::PHI ? 0 : 1;
645 
646   // XXX - For some reason this isn't called for switch.
647   switch (Opcode) {
648   case Instruction::Br:
649   case Instruction::Ret:
650     return 10;
651   default:
652     return BaseT::getCFInstrCost(Opcode, CostKind);
653   }
654 }
655 
656 int GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
657                                            bool IsPairwise,
658                                            TTI::TargetCostKind CostKind) {
659   EVT OrigTy = TLI->getValueType(DL, Ty);
660 
661   // Computes cost on targets that have packed math instructions(which support
662   // 16-bit types only).
663   if (IsPairwise ||
664       !ST->hasVOP3PInsts() ||
665       OrigTy.getScalarSizeInBits() != 16)
666     return BaseT::getArithmeticReductionCost(Opcode, Ty, IsPairwise, CostKind);
667 
668   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
669   return LT.first * getFullRateInstrCost();
670 }
671 
672 int GCNTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
673                                        bool IsPairwise, bool IsUnsigned,
674                                        TTI::TargetCostKind CostKind) {
675   EVT OrigTy = TLI->getValueType(DL, Ty);
676 
677   // Computes cost on targets that have packed math instructions(which support
678   // 16-bit types only).
679   if (IsPairwise ||
680       !ST->hasVOP3PInsts() ||
681       OrigTy.getScalarSizeInBits() != 16)
682     return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned,
683                                          CostKind);
684 
685   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
686   return LT.first * getHalfRateInstrCost();
687 }
688 
689 int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
690                                       unsigned Index) {
691   switch (Opcode) {
692   case Instruction::ExtractElement:
693   case Instruction::InsertElement: {
694     unsigned EltSize
695       = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
696     if (EltSize < 32) {
697       if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
698         return 0;
699       return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
700     }
701 
702     // Extracts are just reads of a subregister, so are free. Inserts are
703     // considered free because we don't want to have any cost for scalarizing
704     // operations, and we don't have to copy into a different register class.
705 
706     // Dynamic indexing isn't free and is best avoided.
707     return Index == ~0u ? 2 : 0;
708   }
709   default:
710     return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
711   }
712 }
713 
714 static bool isArgPassedInSGPR(const Argument *A) {
715   const Function *F = A->getParent();
716 
717   // Arguments to compute shaders are never a source of divergence.
718   CallingConv::ID CC = F->getCallingConv();
719   switch (CC) {
720   case CallingConv::AMDGPU_KERNEL:
721   case CallingConv::SPIR_KERNEL:
722     return true;
723   case CallingConv::AMDGPU_VS:
724   case CallingConv::AMDGPU_LS:
725   case CallingConv::AMDGPU_HS:
726   case CallingConv::AMDGPU_ES:
727   case CallingConv::AMDGPU_GS:
728   case CallingConv::AMDGPU_PS:
729   case CallingConv::AMDGPU_CS:
730     // For non-compute shaders, SGPR inputs are marked with either inreg.
731     // Everything else is in VGPRs.
732     return F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::InReg);
733   default:
734     // TODO: Should calls support inreg for SGPR inputs?
735     return false;
736   }
737 }
738 
739 /// Analyze if the results of inline asm are divergent. If \p Indices is empty,
740 /// this is analyzing the collective result of all output registers. Otherwise,
741 /// this is only querying a specific result index if this returns multiple
742 /// registers in a struct.
743 bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
744   const CallInst *CI, ArrayRef<unsigned> Indices) const {
745   // TODO: Handle complex extract indices
746   if (Indices.size() > 1)
747     return true;
748 
749   const DataLayout &DL = CI->getModule()->getDataLayout();
750   const SIRegisterInfo *TRI = ST->getRegisterInfo();
751   TargetLowering::AsmOperandInfoVector TargetConstraints =
752       TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);
753 
754   const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];
755 
756   int OutputIdx = 0;
757   for (auto &TC : TargetConstraints) {
758     if (TC.Type != InlineAsm::isOutput)
759       continue;
760 
761     // Skip outputs we don't care about.
762     if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
763       continue;
764 
765     TLI->ComputeConstraintToUse(TC, SDValue());
766 
767     Register AssignedReg;
768     const TargetRegisterClass *RC;
769     std::tie(AssignedReg, RC) = TLI->getRegForInlineAsmConstraint(
770       TRI, TC.ConstraintCode, TC.ConstraintVT);
771     if (AssignedReg) {
772       // FIXME: This is a workaround for getRegForInlineAsmConstraint
773       // returning VS_32
774       RC = TRI->getPhysRegClass(AssignedReg);
775     }
776 
777     // For AGPR constraints null is returned on subtargets without AGPRs, so
778     // assume divergent for null.
779     if (!RC || !TRI->isSGPRClass(RC))
780       return true;
781   }
782 
783   return false;
784 }
785 
/// \returns true if the new GPU divergence analysis is enabled, i.e. unless
/// the hidden -amdgpu-use-legacy-divergence-analysis option is set.
bool GCNTTIImpl::useGPUDivergenceAnalysis() const {
  return !UseLegacyDA;
}
790 
791 /// \returns true if the result of the value could potentially be
792 /// different across workitems in a wavefront.
793 bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
794   if (const Argument *A = dyn_cast<Argument>(V))
795     return !isArgPassedInSGPR(A);
796 
797   // Loads from the private and flat address spaces are divergent, because
798   // threads can execute the load instruction with the same inputs and get
799   // different results.
800   //
801   // All other loads are not divergent, because if threads issue loads with the
802   // same arguments, they will always get the same result.
803   if (const LoadInst *Load = dyn_cast<LoadInst>(V))
804     return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
805            Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
806 
807   // Atomics are divergent because they are executed sequentially: when an
808   // atomic operation refers to the same address in each thread, then each
809   // thread after the first sees the value written by the previous thread as
810   // original value.
811   if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
812     return true;
813 
814   if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
815     return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());
816 
817   // Assume all function calls are a source of divergence.
818   if (const CallInst *CI = dyn_cast<CallInst>(V)) {
819     if (CI->isInlineAsm())
820       return isInlineAsmSourceOfDivergence(CI);
821     return true;
822   }
823 
824   // Assume all function calls are a source of divergence.
825   if (isa<InvokeInst>(V))
826     return true;
827 
828   return false;
829 }
830 
831 bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
832   if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
833     switch (Intrinsic->getIntrinsicID()) {
834     default:
835       return false;
836     case Intrinsic::amdgcn_readfirstlane:
837     case Intrinsic::amdgcn_readlane:
838     case Intrinsic::amdgcn_icmp:
839     case Intrinsic::amdgcn_fcmp:
840     case Intrinsic::amdgcn_ballot:
841     case Intrinsic::amdgcn_if_break:
842       return true;
843     }
844   }
845 
846   if (const CallInst *CI = dyn_cast<CallInst>(V)) {
847     if (CI->isInlineAsm())
848       return !isInlineAsmSourceOfDivergence(CI);
849     return false;
850   }
851 
852   const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
853   if (!ExtValue)
854     return false;
855 
856   const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));
857   if (!CI)
858     return false;
859 
860   if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
861     switch (Intrinsic->getIntrinsicID()) {
862     default:
863       return false;
864     case Intrinsic::amdgcn_if:
865     case Intrinsic::amdgcn_else: {
866       ArrayRef<unsigned> Indices = ExtValue->getIndices();
867       return Indices.size() == 1 && Indices[0] == 1;
868     }
869     }
870   }
871 
872   // If we have inline asm returning mixed SGPR and VGPR results, we inferred
873   // divergent for the overall struct return. We need to override it in the
874   // case we're extracting an SGPR component here.
875   if (CI->isInlineAsm())
876     return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());
877 
878   return false;
879 }
880 
881 bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
882                                             Intrinsic::ID IID) const {
883   switch (IID) {
884   case Intrinsic::amdgcn_atomic_inc:
885   case Intrinsic::amdgcn_atomic_dec:
886   case Intrinsic::amdgcn_ds_fadd:
887   case Intrinsic::amdgcn_ds_fmin:
888   case Intrinsic::amdgcn_ds_fmax:
889   case Intrinsic::amdgcn_is_shared:
890   case Intrinsic::amdgcn_is_private:
891     OpIndexes.push_back(0);
892     return true;
893   default:
894     return false;
895   }
896 }
897 
898 Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
899                                                     Value *OldV,
900                                                     Value *NewV) const {
901   auto IntrID = II->getIntrinsicID();
902   switch (IntrID) {
903   case Intrinsic::amdgcn_atomic_inc:
904   case Intrinsic::amdgcn_atomic_dec:
905   case Intrinsic::amdgcn_ds_fadd:
906   case Intrinsic::amdgcn_ds_fmin:
907   case Intrinsic::amdgcn_ds_fmax: {
908     const ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4));
909     if (!IsVolatile->isZero())
910       return nullptr;
911     Module *M = II->getParent()->getParent()->getParent();
912     Type *DestTy = II->getType();
913     Type *SrcTy = NewV->getType();
914     Function *NewDecl =
915         Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
916     II->setArgOperand(0, NewV);
917     II->setCalledFunction(NewDecl);
918     return II;
919   }
920   case Intrinsic::amdgcn_is_shared:
921   case Intrinsic::amdgcn_is_private: {
922     unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
923       AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
924     unsigned NewAS = NewV->getType()->getPointerAddressSpace();
925     LLVMContext &Ctx = NewV->getType()->getContext();
926     ConstantInt *NewVal = (TrueAS == NewAS) ?
927       ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);
928     return NewVal;
929   }
930   case Intrinsic::ptrmask: {
931     unsigned OldAS = OldV->getType()->getPointerAddressSpace();
932     unsigned NewAS = NewV->getType()->getPointerAddressSpace();
933     Value *MaskOp = II->getArgOperand(1);
934     Type *MaskTy = MaskOp->getType();
935 
936     bool DoTruncate = false;
937     if (!getTLI()->isNoopAddrSpaceCast(OldAS, NewAS)) {
938       // All valid 64-bit to 32-bit casts work by chopping off the high
939       // bits. Any masking only clearing the low bits will also apply in the new
940       // address space.
941       if (DL.getPointerSizeInBits(OldAS) != 64 ||
942           DL.getPointerSizeInBits(NewAS) != 32)
943         return nullptr;
944 
945       // TODO: Do we need to thread more context in here?
946       KnownBits Known = computeKnownBits(MaskOp, DL, 0, nullptr, II);
947       if (Known.countMinLeadingOnes() < 32)
948         return nullptr;
949 
950       DoTruncate = true;
951     }
952 
953     IRBuilder<> B(II);
954     if (DoTruncate) {
955       MaskTy = B.getInt32Ty();
956       MaskOp = B.CreateTrunc(MaskOp, MaskTy);
957     }
958 
959     return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
960                              {NewV, MaskOp});
961   }
962   default:
963     return nullptr;
964   }
965 }
966 
967 unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *VT,
968                                     int Index, VectorType *SubTp) {
969   if (ST->hasVOP3PInsts()) {
970     if (cast<FixedVectorType>(VT)->getNumElements() == 2 &&
971         DL.getTypeSizeInBits(VT->getElementType()) == 16) {
972       // With op_sel VOP3P instructions freely can access the low half or high
973       // half of a register, so any swizzle is free.
974 
975       switch (Kind) {
976       case TTI::SK_Broadcast:
977       case TTI::SK_Reverse:
978       case TTI::SK_PermuteSingleSrc:
979         return 0;
980       default:
981         break;
982       }
983     }
984   }
985 
986   return BaseT::getShuffleCost(Kind, VT, Index, SubTp);
987 }
988 
989 bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
990                                      const Function *Callee) const {
991   const TargetMachine &TM = getTLI()->getTargetMachine();
992   const GCNSubtarget *CallerST
993     = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
994   const GCNSubtarget *CalleeST
995     = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));
996 
997   const FeatureBitset &CallerBits = CallerST->getFeatureBits();
998   const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();
999 
1000   FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
1001   FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
1002   if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
1003     return false;
1004 
1005   // FIXME: dx10_clamp can just take the caller setting, but there seems to be
1006   // no way to support merge for backend defined attributes.
1007   AMDGPU::SIModeRegisterDefaults CallerMode(*Caller);
1008   AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee);
1009   return CallerMode.isInlineCompatible(CalleeMode);
1010 }
1011 
/// Fills in the unrolling preferences \p UP for loop \p L.
///
/// This forwards unchanged to the CommonTTI implementation.
void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP) {
  CommonTTI.getUnrollingPreferences(L, SE, UP);
}
1016 
/// Fills in the peeling preferences \p PP for loop \p L.
///
/// This forwards unchanged to the CommonTTI implementation.
void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                       TTI::PeelingPreferences &PP) {
  CommonTTI.getPeelingPreferences(L, SE, PP);
}
1021 
/// \returns the fixed register count 4 * 128 (512). The \p Vec argument is
/// not consulted.
unsigned R600TTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
  return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
}
1025 
/// \returns the same value as getHardwareNumberOfRegisters for \p Vec; no
/// adjustment is applied here.
unsigned R600TTIImpl::getNumberOfRegisters(bool Vec) const {
  return getHardwareNumberOfRegisters(Vec);
}
1029 
/// \returns 32 as the register width in bits. The \p Vector argument is not
/// consulted, so the same width is reported for scalar and vector queries.
unsigned R600TTIImpl::getRegisterBitWidth(bool Vector) const {
  return 32;
}
1033 
/// \returns 32 as the minimum vector register width in bits.
unsigned R600TTIImpl::getMinVectorRegisterBitWidth() const {
  return 32;
}
1037 
1038 unsigned R600TTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
1039   if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
1040       AddrSpace == AMDGPUAS::CONSTANT_ADDRESS)
1041     return 128;
1042   if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1043       AddrSpace == AMDGPUAS::REGION_ADDRESS)
1044     return 64;
1045   if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
1046     return 32;
1047 
1048   if ((AddrSpace == AMDGPUAS::PARAM_D_ADDRESS ||
1049       AddrSpace == AMDGPUAS::PARAM_I_ADDRESS ||
1050       (AddrSpace >= AMDGPUAS::CONSTANT_BUFFER_0 &&
1051       AddrSpace <= AMDGPUAS::CONSTANT_BUFFER_15)))
1052     return 128;
1053   llvm_unreachable("unhandled address space");
1054 }
1055 
1056 bool R600TTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
1057                                              Align Alignment,
1058                                              unsigned AddrSpace) const {
1059   // We allow vectorization of flat stores, even though we may need to decompose
1060   // them later if they may access private memory. We don't have enough context
1061   // here, and legalization can handle it.
1062   return (AddrSpace != AMDGPUAS::PRIVATE_ADDRESS);
1063 }
1064 
/// \returns whether a chain of loads may be vectorized. Loads get no special
/// treatment: the answer is whatever isLegalToVectorizeMemChain reports for
/// the same arguments.
bool R600TTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                              Align Alignment,
                                              unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}
1070 
/// \returns whether a chain of stores may be vectorized. Stores get no
/// special treatment: the answer is whatever isLegalToVectorizeMemChain
/// reports for the same arguments.
bool R600TTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                               Align Alignment,
                                               unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}
1076 
1077 unsigned R600TTIImpl::getMaxInterleaveFactor(unsigned VF) {
1078   // Disable unrolling if the loop is not vectorized.
1079   // TODO: Enable this again.
1080   if (VF == 1)
1081     return 1;
1082 
1083   return 8;
1084 }
1085 
1086 unsigned R600TTIImpl::getCFInstrCost(unsigned Opcode,
1087                                      TTI::TargetCostKind CostKind) {
1088   if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
1089     return Opcode == Instruction::PHI ? 0 : 1;
1090 
1091   // XXX - For some reason this isn't called for switch.
1092   switch (Opcode) {
1093   case Instruction::Br:
1094   case Instruction::Ret:
1095     return 10;
1096   default:
1097     return BaseT::getCFInstrCost(Opcode, CostKind);
1098   }
1099 }
1100 
1101 int R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
1102                                     unsigned Index) {
1103   switch (Opcode) {
1104   case Instruction::ExtractElement:
1105   case Instruction::InsertElement: {
1106     unsigned EltSize
1107       = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
1108     if (EltSize < 32) {
1109       return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
1110     }
1111 
1112     // Extracts are just reads of a subregister, so are free. Inserts are
1113     // considered free because we don't want to have any cost for scalarizing
1114     // operations, and we don't have to copy into a different register class.
1115 
1116     // Dynamic indexing isn't free and is best avoided.
1117     return Index == ~0u ? 2 : 0;
1118   }
1119   default:
1120     return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
1121   }
1122 }
1123 
/// Fills in the unrolling preferences \p UP for loop \p L.
///
/// This forwards unchanged to the CommonTTI implementation.
void R600TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::UnrollingPreferences &UP) {
  CommonTTI.getUnrollingPreferences(L, SE, UP);
}
1128 
/// Fills in the peeling preferences \p PP for loop \p L.
///
/// This forwards unchanged to the CommonTTI implementation.
void R600TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                        TTI::PeelingPreferences &PP) {
  CommonTTI.getPeelingPreferences(L, SE, PP);
}
1133