1 //===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // \file
10 // This file implements a TargetTransformInfo analysis pass specific to the
11 // AMDGPU target machine. It uses the target's detailed information to provide
12 // more precise answers to certain TTI queries, while letting the target
13 // independent and default TTI implementations handle the rest.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #include "AMDGPUTargetTransformInfo.h"
18 #include "AMDGPUSubtarget.h"
19 #include "Utils/AMDGPUBaseInfo.h"
20 #include "llvm/ADT/STLExtras.h"
21 #include "llvm/Analysis/LoopInfo.h"
22 #include "llvm/Analysis/TargetTransformInfo.h"
23 #include "llvm/Analysis/ValueTracking.h"
24 #include "llvm/CodeGen/ISDOpcodes.h"
25 #include "llvm/CodeGen/ValueTypes.h"
26 #include "llvm/IR/Argument.h"
27 #include "llvm/IR/Attributes.h"
28 #include "llvm/IR/BasicBlock.h"
29 #include "llvm/IR/CallingConv.h"
30 #include "llvm/IR/DataLayout.h"
31 #include "llvm/IR/DerivedTypes.h"
32 #include "llvm/IR/Function.h"
33 #include "llvm/IR/Instruction.h"
34 #include "llvm/IR/Instructions.h"
35 #include "llvm/IR/IntrinsicInst.h"
36 #include "llvm/IR/Module.h"
37 #include "llvm/IR/PatternMatch.h"
38 #include "llvm/IR/Type.h"
39 #include "llvm/IR/Value.h"
40 #include "llvm/MC/SubtargetFeature.h"
41 #include "llvm/Support/Casting.h"
42 #include "llvm/Support/CommandLine.h"
43 #include "llvm/Support/Debug.h"
44 #include "llvm/Support/ErrorHandling.h"
45 #include "llvm/Support/MachineValueType.h"
46 #include "llvm/Support/raw_ostream.h"
47 #include "llvm/Target/TargetMachine.h"
48 #include <algorithm>
49 #include <cassert>
50 #include <limits>
51 #include <utility>
52 
53 using namespace llvm;
54 
55 #define DEBUG_TYPE "AMDGPUtti"
56 
57 static cl::opt<unsigned> UnrollThresholdPrivate(
58   "amdgpu-unroll-threshold-private",
59   cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
60   cl::init(2700), cl::Hidden);
61 
62 static cl::opt<unsigned> UnrollThresholdLocal(
63   "amdgpu-unroll-threshold-local",
64   cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
65   cl::init(1000), cl::Hidden);
66 
67 static cl::opt<unsigned> UnrollThresholdIf(
68   "amdgpu-unroll-threshold-if",
69   cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
70   cl::init(150), cl::Hidden);
71 
72 static cl::opt<bool> UnrollRuntimeLocal(
73   "amdgpu-unroll-runtime-local",
74   cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
75   cl::init(true), cl::Hidden);
76 
77 static cl::opt<bool> UseLegacyDA(
78   "amdgpu-use-legacy-divergence-analysis",
79   cl::desc("Enable legacy divergence analysis for AMDGPU"),
80   cl::init(false), cl::Hidden);
81 
82 static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
83     "amdgpu-unroll-max-block-to-analyze",
84     cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
85     cl::init(20), cl::Hidden);
86 
87 static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
88                               unsigned Depth = 0) {
89   const Instruction *I = dyn_cast<Instruction>(Cond);
90   if (!I)
91     return false;
92 
93   for (const Value *V : I->operand_values()) {
94     if (!L->contains(I))
95       continue;
96     if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
97       if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
98                   return SubLoop->contains(PHI); }))
99         return true;
100     } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
101       return true;
102   }
103   return false;
104 }
105 
/// Fill in AMDGPU-specific unrolling preferences for loop \p L.
///
/// The base threshold comes from the "amdgpu-unroll-threshold" function
/// attribute (default 300), partial unrolling is enabled and MaxCount is left
/// unlimited. The blocks that belong directly to \p L (not to one of its
/// sub-loops) are then scanned and UP.Threshold is raised when:
///  - a conditional branch depends on a PHI of the loop (adds
///    UnrollThresholdIf per such branch), or
///  - a GEP into private, local or region memory uses a value defined by the
///    loop (sets the threshold to the per-address-space value).
/// The scan stops as soon as the threshold reaches the larger of the private
/// and local thresholds.
///
/// \param L  Loop being considered for unrolling.
/// \param SE Scalar evolution analysis; not used by this implementation.
/// \param UP Preferences structure updated in place.
void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                            TTI::UnrollingPreferences &UP) {
  const Function &F = *L->getHeader()->getParent();
  UP.Threshold = AMDGPU::getIntegerAttribute(F, "amdgpu-unroll-threshold", 300);
  UP.MaxCount = std::numeric_limits<unsigned>::max();
  UP.Partial = true;

  // TODO: Do we want runtime unrolling?

  // Maximum alloca size that can fit registers. Reserve 16 registers.
  const unsigned MaxAlloca = (256 - 16) * 4;
  unsigned ThresholdPrivate = UnrollThresholdPrivate;
  unsigned ThresholdLocal = UnrollThresholdLocal;
  // Upper bound for any boost applied below; once reached we are done.
  unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
  for (const BasicBlock *BB : L->getBlocks()) {
    const DataLayout &DL = BB->getModule()->getDataLayout();
    // Counts local/region GEPs seen in this block only (reset per block).
    unsigned LocalGEPsSeen = 0;

    if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
               return SubLoop->contains(BB); }))
        continue; // Block belongs to an inner loop.

    for (const Instruction &I : *BB) {
      // Unroll a loop which contains an "if" statement whose condition
      // defined by a PHI belonging to the loop. This may help to eliminate
      // if region and potentially even PHI itself, saving on both divergence
      // and registers used for the PHI.
      // Add a small bonus for each of such "if" statements.
      if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
        if (UP.Threshold < MaxBoost && Br->isConditional()) {
          BasicBlock *Succ0 = Br->getSuccessor(0);
          BasicBlock *Succ1 = Br->getSuccessor(1);
          // Skip branches with a successor inside the loop that is itself a
          // loop-exiting block.
          if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
              (L->contains(Succ1) && L->isLoopExiting(Succ1)))
            continue;
          if (dependsOnLocalPhi(L, Br->getCondition())) {
            UP.Threshold += UnrollThresholdIf;
            LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
                              << " for loop:\n"
                              << *L << " due to " << *Br << '\n');
            if (UP.Threshold >= MaxBoost)
              return;
          }
        }
        continue;
      }

      // Everything below only concerns address computations.
      const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
      if (!GEP)
        continue;

      // Pick the candidate threshold from the GEP's address space; other
      // address spaces do not affect unrolling.
      unsigned AS = GEP->getAddressSpace();
      unsigned Threshold = 0;
      if (AS == AMDGPUAS::PRIVATE_ADDRESS)
        Threshold = ThresholdPrivate;
      else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
        Threshold = ThresholdLocal;
      else
        continue;

      // Nothing to gain if the current threshold is already at least as high.
      if (UP.Threshold >= Threshold)
        continue;

      if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
        // Only static allocas no larger than MaxAlloca qualify.
        const Value *Ptr = GEP->getPointerOperand();
        const AllocaInst *Alloca =
            dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL));
        if (!Alloca || !Alloca->isStaticAlloca())
          continue;
        Type *Ty = Alloca->getAllocatedType();
        // Unsized types are treated as size 0 and therefore pass the check.
        unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
        if (AllocaSize > MaxAlloca)
          continue;
      } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
                 AS == AMDGPUAS::REGION_ADDRESS) {
        LocalGEPsSeen++;
        // Inhibit unroll for local memory if we have seen addressing not to
        // a variable, most likely we will be unable to combine it.
        // Do not unroll too deep inner loops for local memory to give a chance
        // to unroll an outer loop for a more important reason.
        if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
            (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
             !isa<Argument>(GEP->getPointerOperand())))
          continue;
        LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
                          << *L << " due to LDS use.\n");
        UP.Runtime = UnrollRuntimeLocal;
      }

      // Check if GEP depends on a value defined by this loop itself.
      bool HasLoopDef = false;
      for (const Value *Op : GEP->operands()) {
        const Instruction *Inst = dyn_cast<Instruction>(Op);
        if (!Inst || L->isLoopInvariant(Op))
          continue;

        // Definitions inside a sub-loop do not count as defined by this loop.
        if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
             return SubLoop->contains(Inst); }))
          continue;
        HasLoopDef = true;
        break;
      }
      if (!HasLoopDef)
        continue;

      // We want to do whatever we can to limit the number of alloca
      // instructions that make it through to the code generator.  allocas
      // require us to use indirect addressing, which is slow and prone to
      // compiler bugs.  If this loop does an address calculation on an
      // alloca ptr, then we want to use a higher than normal loop unroll
      // threshold. This will give SROA a better chance to eliminate these
      // allocas.
      //
      // We also want to have more unrolling for local memory to let ds
      // instructions with different offsets combine.
      //
      // Don't use the maximum allowed value here as it will make some
      // programs way too big.
      UP.Threshold = Threshold;
      LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
                        << " for loop:\n"
                        << *L << " due to " << *GEP << '\n');
      if (UP.Threshold >= MaxBoost)
        return;
    }

    // If we got a GEP in a small BB from inner loop then increase max trip
    // count to analyze for better estimation cost in unroll
    // NOTE(review): the condition tests only L->empty() and the block size; it
    // does not check that a GEP was actually seen -- confirm intent.
    if (L->empty() && BB->size() < UnrollMaxBlockToAnalyze)
      UP.MaxIterationsCountToAnalyze = 32;
  }
}
238 
/// Provide loop peeling preferences for \p L by forwarding all arguments to
/// the base class implementation; no AMDGPU-specific adjustment is made here.
void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}
/// Return the register count reported for the hardware, which is MaxVGPRs.
/// The \p Vec flag is ignored: the same value is returned for both queries.
unsigned GCNTTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
  // The concept of vector registers doesn't really exist. Some packed vector
  // operations operate on the normal 32-bit registers.
  return MaxVGPRs;
}
248 
249 unsigned GCNTTIImpl::getNumberOfRegisters(bool Vec) const {
250   // This is really the number of registers to fill when vectorizing /
251   // interleaving loops, so we lie to avoid trying to use all registers.
252   return getHardwareNumberOfRegisters(Vec) >> 3;
253 }
254 
255 unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
256   const SIRegisterInfo *TRI = ST->getRegisterInfo();
257   const TargetRegisterClass *RC = TRI->getRegClass(RCID);
258   unsigned NumVGPRs = (TRI->getRegSizeInBits(*RC) + 31) / 32;
259   return getHardwareNumberOfRegisters(false) / NumVGPRs;
260 }
261 
/// Return the register width in bits. Always 32; \p Vector is ignored.
unsigned GCNTTIImpl::getRegisterBitWidth(bool Vector) const {
  return 32;
}
265 
/// Return the minimum vector register width in bits. Always 32.
unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
  return 32;
}
269 
270 unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
271                                          unsigned ChainSizeInBytes,
272                                          VectorType *VecTy) const {
273   unsigned VecRegBitWidth = VF * LoadSize;
274   if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
275     // TODO: Support element-size less than 32bit?
276     return 128 / LoadSize;
277 
278   return VF;
279 }
280 
281 unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
282                                              unsigned ChainSizeInBytes,
283                                              VectorType *VecTy) const {
284   unsigned VecRegBitWidth = VF * StoreSize;
285   if (VecRegBitWidth > 128)
286     return 128 / StoreSize;
287 
288   return VF;
289 }
290 
291 unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
292   if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
293       AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
294       AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
295       AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER) {
296     return 512;
297   }
298 
299   if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
300     return 8 * ST->getMaxPrivateElementSize();
301 
302   // Common to flat, global, local and region. Assume for unknown addrspace.
303   return 128;
304 }
305 
306 bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
307                                             Align Alignment,
308                                             unsigned AddrSpace) const {
309   // We allow vectorization of flat stores, even though we may need to decompose
310   // them later if they may access private memory. We don't have enough context
311   // here, and legalization can handle it.
312   if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
313     return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
314       ChainSizeInBytes <= ST->getMaxPrivateElementSize();
315   }
316   return true;
317 }
318 
/// Decide whether a chain of loads may be vectorized. Loads use the same rule
/// as any other memory chain, so this forwards to isLegalToVectorizeMemChain.
bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                             Align Alignment,
                                             unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}
324 
/// Decide whether a chain of stores may be vectorized. Stores use the same
/// rule as any other memory chain, so this forwards to
/// isLegalToVectorizeMemChain.
bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                              Align Alignment,
                                              unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}
330 
331 // FIXME: Really we would like to issue multiple 128-bit loads and stores per
332 // iteration. Should we report a larger size and let it legalize?
333 //
334 // FIXME: Should we use narrower types for local/region, or account for when
335 // unaligned access is legal?
336 //
337 // FIXME: This could use fine tuning and microbenchmarks.
338 Type *GCNTTIImpl::getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
339                                             unsigned SrcAddrSpace,
340                                             unsigned DestAddrSpace,
341                                             unsigned SrcAlign,
342                                             unsigned DestAlign) const {
343   unsigned MinAlign = std::min(SrcAlign, DestAlign);
344 
345   // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the
346   // hardware into byte accesses. If you assume all alignments are equally
347   // probable, it's more efficient on average to use short accesses for this
348   // case.
349   if (MinAlign == 2)
350     return Type::getInt16Ty(Context);
351 
352   // Not all subtargets have 128-bit DS instructions, and we currently don't
353   // form them by default.
354   if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
355       SrcAddrSpace == AMDGPUAS::REGION_ADDRESS ||
356       DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
357       DestAddrSpace == AMDGPUAS::REGION_ADDRESS) {
358     return FixedVectorType::get(Type::getInt32Ty(Context), 2);
359   }
360 
361   // Global memory works best with 16-byte accesses. Private memory will also
362   // hit this, although they'll be decomposed.
363   return FixedVectorType::get(Type::getInt32Ty(Context), 4);
364 }
365 
366 void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
367   SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
368   unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
369   unsigned SrcAlign, unsigned DestAlign) const {
370   assert(RemainingBytes < 16);
371 
372   unsigned MinAlign = std::min(SrcAlign, DestAlign);
373 
374   if (MinAlign != 2) {
375     Type *I64Ty = Type::getInt64Ty(Context);
376     while (RemainingBytes >= 8) {
377       OpsOut.push_back(I64Ty);
378       RemainingBytes -= 8;
379     }
380 
381     Type *I32Ty = Type::getInt32Ty(Context);
382     while (RemainingBytes >= 4) {
383       OpsOut.push_back(I32Ty);
384       RemainingBytes -= 4;
385     }
386   }
387 
388   Type *I16Ty = Type::getInt16Ty(Context);
389   while (RemainingBytes >= 2) {
390     OpsOut.push_back(I16Ty);
391     RemainingBytes -= 2;
392   }
393 
394   Type *I8Ty = Type::getInt8Ty(Context);
395   while (RemainingBytes) {
396     OpsOut.push_back(I8Ty);
397     --RemainingBytes;
398   }
399 }
400 
401 unsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) {
402   // Disable unrolling if the loop is not vectorized.
403   // TODO: Enable this again.
404   if (VF == 1)
405     return 1;
406 
407   return 8;
408 }
409 
410 bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
411                                        MemIntrinsicInfo &Info) const {
412   switch (Inst->getIntrinsicID()) {
413   case Intrinsic::amdgcn_atomic_inc:
414   case Intrinsic::amdgcn_atomic_dec:
415   case Intrinsic::amdgcn_ds_ordered_add:
416   case Intrinsic::amdgcn_ds_ordered_swap:
417   case Intrinsic::amdgcn_ds_fadd:
418   case Intrinsic::amdgcn_ds_fmin:
419   case Intrinsic::amdgcn_ds_fmax: {
420     auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
421     auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
422     if (!Ordering || !Volatile)
423       return false; // Invalid.
424 
425     unsigned OrderingVal = Ordering->getZExtValue();
426     if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
427       return false;
428 
429     Info.PtrVal = Inst->getArgOperand(0);
430     Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
431     Info.ReadMem = true;
432     Info.WriteMem = true;
433     Info.IsVolatile = !Volatile->isNullValue();
434     return true;
435   }
436   default:
437     return false;
438   }
439 }
440 
/// Estimate the cost of arithmetic instruction \p Opcode on type \p Ty.
///
/// Types that do not map to a simple value type are delegated to the base
/// implementation. Otherwise the type is legalized and, for the opcodes
/// handled in the switch below, the cost is the product of the legalization
/// split count (LT.first), the number of vector elements of the legal type and
/// a per-instruction rate cost. Opcode/type combinations not handled here fall
/// back to the base implementation with the caller's \p CostKind.
///
/// \param Args Operand values, if known; only inspected for the fdiv/frem
///             case to detect a constant 1.0 first operand.
/// \param CxtI Context instruction, if any; only inspected for fmul to detect
///             a single fadd/fsub user.
int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
                                       TTI::TargetCostKind CostKind,
                                       TTI::OperandValueKind Opd1Info,
                                       TTI::OperandValueKind Opd2Info,
                                       TTI::OperandValueProperties Opd1PropInfo,
                                       TTI::OperandValueProperties Opd2PropInfo,
                                       ArrayRef<const Value *> Args,
                                       const Instruction *CxtI) {
  EVT OrigTy = TLI->getValueType(DL, Ty);
  if (!OrigTy.isSimple()) {
    // FIXME: We're having to query the throughput cost so that the basic
    // implementation tries to generate legalize and scalarization costs. Maybe
    // we could hoist the scalarization code here?
    return BaseT::getArithmeticInstrCost(Opcode, Ty, TTI::TCK_RecipThroughput,
                                         Opd1Info, Opd2Info, Opd1PropInfo,
                                         Opd2PropInfo, Args, CxtI);
  }

  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // Because we don't have any legal vector operations, but the legal types, we
  // need to account for split vectors.
  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  // Scalar element type of the legalized type; selects the cost formula.
  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  switch (ISD) {
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
    if (SLT == MVT::i64)
      return get64BitInstrCost() * LT.first * NElts;

    // With 16-bit instructions, i16 elements are counted in pairs (rounded
    // up).
    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    // i32
    return getFullRateInstrCost() * LT.first * NElts;
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    if (SLT == MVT::i64) {
      // and, or and xor are typically split into 2 VALU instructions.
      return 2 * getFullRateInstrCost() * LT.first * NElts;
    }

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    return LT.first * NElts * getFullRateInstrCost();
  case ISD::MUL: {
    const int QuarterRateCost = getQuarterRateInstrCost();
    if (SLT == MVT::i64) {
      // Per element: four quarter-rate plus four full-rate instructions.
      const int FullRateCost = getFullRateInstrCost();
      return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
    }

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    // i32
    return QuarterRateCost * NElts * LT.first;
  }
  case ISD::FMUL:
    // Check possible fuse {fadd|fsub}(a,fmul(b,c)) and return zero cost for
    // fmul(b,c) supposing the fadd|fsub will get estimated cost for the whole
    // fused operation.
    if (!HasFP32Denormals && SLT == MVT::f32 && CxtI && CxtI->hasOneUse())
      if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {
        const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
        if (OPC == ISD::FADD || OPC == ISD::FSUB) {
          return TargetTransformInfo::TCC_Free;
        }
      }
    // Not fused: cost the fmul like fadd/fsub.
    LLVM_FALLTHROUGH;
  case ISD::FADD:
  case ISD::FSUB:
    if (SLT == MVT::f64)
      return LT.first * NElts * get64BitInstrCost();

    if (ST->has16BitInsts() && SLT == MVT::f16)
      NElts = (NElts + 1) / 2;

    if (SLT == MVT::f32 || SLT == MVT::f16)
      return LT.first * NElts * getFullRateInstrCost();
    // Other scalar types fall through to the base implementation below.
    break;
  case ISD::FDIV:
  case ISD::FREM:
    // FIXME: frem should be handled separately. The fdiv in it is most of it,
    // but the current lowering is also not entirely correct.
    if (SLT == MVT::f64) {
      int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost();
      // Add cost of workaround.
      if (!ST->hasUsableDivScaleConditionOutput())
        Cost += 3 * getFullRateInstrCost();

      return LT.first * Cost * NElts;
    }

    // Division of the constant 1.0 by a value is costed as a single
    // quarter-rate instruction where the conditions below hold.
    if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
      // TODO: This is more complicated, unsafe flags etc.
      if ((SLT == MVT::f32 && !HasFP32Denormals) ||
          (SLT == MVT::f16 && ST->has16BitInsts())) {
        return LT.first * getQuarterRateInstrCost() * NElts;
      }
    }

    if (SLT == MVT::f16 && ST->has16BitInsts()) {
      // 2 x v_cvt_f32_f16
      // f32 rcp
      // f32 fmul
      // v_cvt_f16_f32
      // f16 div_fixup
      int Cost = 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost();
      return LT.first * Cost * NElts;
    }

    if (SLT == MVT::f32 || SLT == MVT::f16) {
      int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost();

      if (!HasFP32Denormals) {
        // FP mode switches.
        Cost += 2 * getFullRateInstrCost();
      }

      return LT.first * NElts * Cost;
    }
    break;
  case ISD::FNEG:
    // Use the backend's estimation. If fneg is not free each element will cost
    // one additional instruction.
    // NOTE(review): unlike the other cases, LT.first is not factored in here.
    return TLI->isFNegFree(SLT) ? 0 : NElts;
  default:
    break;
  }

  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, Opd2Info,
                                       Opd1PropInfo, Opd2PropInfo, Args, CxtI);
}
585 
586 // Return true if there's a potential benefit from using v2f16 instructions for
587 // an intrinsic, even if it requires nontrivial legalization.
588 static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
589   switch (ID) {
590   case Intrinsic::fma: // TODO: fmuladd
591   // There's a small benefit to using vector ops in the legalized code.
592   case Intrinsic::round:
593     return true;
594   default:
595     return false;
596   }
597 }
598 
599 int GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
600                                       TTI::TargetCostKind CostKind) {
601   if (ICA.getID() == Intrinsic::fabs)
602     return 0;
603 
604   if (!intrinsicHasPackedVectorBenefit(ICA.getID()))
605     return BaseT::getIntrinsicInstrCost(ICA, CostKind);
606 
607   Type *RetTy = ICA.getReturnType();
608   EVT OrigTy = TLI->getValueType(DL, RetTy);
609   if (!OrigTy.isSimple()) {
610     return BaseT::getIntrinsicInstrCost(ICA, CostKind);
611   }
612 
613   // Legalize the type.
614   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
615 
616   unsigned NElts = LT.second.isVector() ?
617     LT.second.getVectorNumElements() : 1;
618 
619   MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
620 
621   if (SLT == MVT::f64)
622     return LT.first * NElts * get64BitInstrCost();
623 
624   if (ST->has16BitInsts() && SLT == MVT::f16)
625     NElts = (NElts + 1) / 2;
626 
627   // TODO: Get more refined intrinsic costs?
628   unsigned InstRate = getQuarterRateInstrCost();
629   if (ICA.getID() == Intrinsic::fma) {
630     InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost()
631                                    : getQuarterRateInstrCost();
632   }
633 
634   return LT.first * NElts * InstRate;
635 }
636 
637 unsigned GCNTTIImpl::getCFInstrCost(unsigned Opcode,
638                                     TTI::TargetCostKind CostKind) {
639   if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
640     return Opcode == Instruction::PHI ? 0 : 1;
641 
642   // XXX - For some reason this isn't called for switch.
643   switch (Opcode) {
644   case Instruction::Br:
645   case Instruction::Ret:
646     return 10;
647   default:
648     return BaseT::getCFInstrCost(Opcode, CostKind);
649   }
650 }
651 
652 int GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
653                                            bool IsPairwise,
654                                            TTI::TargetCostKind CostKind) {
655   EVT OrigTy = TLI->getValueType(DL, Ty);
656 
657   // Computes cost on targets that have packed math instructions(which support
658   // 16-bit types only).
659   if (IsPairwise ||
660       !ST->hasVOP3PInsts() ||
661       OrigTy.getScalarSizeInBits() != 16)
662     return BaseT::getArithmeticReductionCost(Opcode, Ty, IsPairwise, CostKind);
663 
664   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
665   return LT.first * getFullRateInstrCost();
666 }
667 
668 int GCNTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
669                                        bool IsPairwise, bool IsUnsigned,
670                                        TTI::TargetCostKind CostKind) {
671   EVT OrigTy = TLI->getValueType(DL, Ty);
672 
673   // Computes cost on targets that have packed math instructions(which support
674   // 16-bit types only).
675   if (IsPairwise ||
676       !ST->hasVOP3PInsts() ||
677       OrigTy.getScalarSizeInBits() != 16)
678     return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned,
679                                          CostKind);
680 
681   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
682   return LT.first * getHalfRateInstrCost();
683 }
684 
685 int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
686                                       unsigned Index) {
687   switch (Opcode) {
688   case Instruction::ExtractElement:
689   case Instruction::InsertElement: {
690     unsigned EltSize
691       = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
692     if (EltSize < 32) {
693       if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
694         return 0;
695       return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
696     }
697 
698     // Extracts are just reads of a subregister, so are free. Inserts are
699     // considered free because we don't want to have any cost for scalarizing
700     // operations, and we don't have to copy into a different register class.
701 
702     // Dynamic indexing isn't free and is best avoided.
703     return Index == ~0u ? 2 : 0;
704   }
705   default:
706     return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
707   }
708 }
709 
710 static bool isArgPassedInSGPR(const Argument *A) {
711   const Function *F = A->getParent();
712 
713   // Arguments to compute shaders are never a source of divergence.
714   CallingConv::ID CC = F->getCallingConv();
715   switch (CC) {
716   case CallingConv::AMDGPU_KERNEL:
717   case CallingConv::SPIR_KERNEL:
718     return true;
719   case CallingConv::AMDGPU_VS:
720   case CallingConv::AMDGPU_LS:
721   case CallingConv::AMDGPU_HS:
722   case CallingConv::AMDGPU_ES:
723   case CallingConv::AMDGPU_GS:
724   case CallingConv::AMDGPU_PS:
725   case CallingConv::AMDGPU_CS:
726     // For non-compute shaders, SGPR inputs are marked with either inreg or byval.
727     // Everything else is in VGPRs.
728     return F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::InReg) ||
729            F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::ByVal);
730   default:
731     // TODO: Should calls support inreg for SGPR inputs?
732     return false;
733   }
734 }
735 
736 /// Analyze if the results of inline asm are divergent. If \p Indices is empty,
737 /// this is analyzing the collective result of all output registers. Otherwise,
738 /// this is only querying a specific result index if this returns multiple
739 /// registers in a struct.
740 bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
741   const CallInst *CI, ArrayRef<unsigned> Indices) const {
742   // TODO: Handle complex extract indices
743   if (Indices.size() > 1)
744     return true;
745 
746   const DataLayout &DL = CI->getModule()->getDataLayout();
747   const SIRegisterInfo *TRI = ST->getRegisterInfo();
748   TargetLowering::AsmOperandInfoVector TargetConstraints =
749       TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);
750 
751   const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];
752 
753   int OutputIdx = 0;
754   for (auto &TC : TargetConstraints) {
755     if (TC.Type != InlineAsm::isOutput)
756       continue;
757 
758     // Skip outputs we don't care about.
759     if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
760       continue;
761 
762     TLI->ComputeConstraintToUse(TC, SDValue());
763 
764     Register AssignedReg;
765     const TargetRegisterClass *RC;
766     std::tie(AssignedReg, RC) = TLI->getRegForInlineAsmConstraint(
767       TRI, TC.ConstraintCode, TC.ConstraintVT);
768     if (AssignedReg) {
769       // FIXME: This is a workaround for getRegForInlineAsmConstraint
770       // returning VS_32
771       RC = TRI->getPhysRegClass(AssignedReg);
772     }
773 
774     // For AGPR constraints null is returned on subtargets without AGPRs, so
775     // assume divergent for null.
776     if (!RC || !TRI->isSGPRClass(RC))
777       return true;
778   }
779 
780   return false;
781 }
782 
/// \returns true if the new GPU divergence analysis is enabled, i.e. unless
/// the -amdgpu-use-legacy-divergence-analysis option was set.
bool GCNTTIImpl::useGPUDivergenceAnalysis() const {
  return !UseLegacyDA;
}
787 
788 /// \returns true if the result of the value could potentially be
789 /// different across workitems in a wavefront.
790 bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
791   if (const Argument *A = dyn_cast<Argument>(V))
792     return !isArgPassedInSGPR(A);
793 
794   // Loads from the private and flat address spaces are divergent, because
795   // threads can execute the load instruction with the same inputs and get
796   // different results.
797   //
798   // All other loads are not divergent, because if threads issue loads with the
799   // same arguments, they will always get the same result.
800   if (const LoadInst *Load = dyn_cast<LoadInst>(V))
801     return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
802            Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
803 
804   // Atomics are divergent because they are executed sequentially: when an
805   // atomic operation refers to the same address in each thread, then each
806   // thread after the first sees the value written by the previous thread as
807   // original value.
808   if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
809     return true;
810 
811   if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
812     return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());
813 
814   // Assume all function calls are a source of divergence.
815   if (const CallInst *CI = dyn_cast<CallInst>(V)) {
816     if (CI->isInlineAsm())
817       return isInlineAsmSourceOfDivergence(CI);
818     return true;
819   }
820 
821   // Assume all function calls are a source of divergence.
822   if (isa<InvokeInst>(V))
823     return true;
824 
825   return false;
826 }
827 
828 bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
829   if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
830     switch (Intrinsic->getIntrinsicID()) {
831     default:
832       return false;
833     case Intrinsic::amdgcn_readfirstlane:
834     case Intrinsic::amdgcn_readlane:
835     case Intrinsic::amdgcn_icmp:
836     case Intrinsic::amdgcn_fcmp:
837     case Intrinsic::amdgcn_ballot:
838     case Intrinsic::amdgcn_if_break:
839       return true;
840     }
841   }
842 
843   if (const CallInst *CI = dyn_cast<CallInst>(V)) {
844     if (CI->isInlineAsm())
845       return !isInlineAsmSourceOfDivergence(CI);
846     return false;
847   }
848 
849   const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
850   if (!ExtValue)
851     return false;
852 
853   const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));
854   if (!CI)
855     return false;
856 
857   if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
858     switch (Intrinsic->getIntrinsicID()) {
859     default:
860       return false;
861     case Intrinsic::amdgcn_if:
862     case Intrinsic::amdgcn_else: {
863       ArrayRef<unsigned> Indices = ExtValue->getIndices();
864       return Indices.size() == 1 && Indices[0] == 1;
865     }
866     }
867   }
868 
869   // If we have inline asm returning mixed SGPR and VGPR results, we inferred
870   // divergent for the overall struct return. We need to override it in the
871   // case we're extracting an SGPR component here.
872   if (CI->isInlineAsm())
873     return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());
874 
875   return false;
876 }
877 
878 bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
879                                             Intrinsic::ID IID) const {
880   switch (IID) {
881   case Intrinsic::amdgcn_atomic_inc:
882   case Intrinsic::amdgcn_atomic_dec:
883   case Intrinsic::amdgcn_ds_fadd:
884   case Intrinsic::amdgcn_ds_fmin:
885   case Intrinsic::amdgcn_ds_fmax:
886   case Intrinsic::amdgcn_is_shared:
887   case Intrinsic::amdgcn_is_private:
888     OpIndexes.push_back(0);
889     return true;
890   default:
891     return false;
892   }
893 }
894 
895 Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
896                                                     Value *OldV,
897                                                     Value *NewV) const {
898   auto IntrID = II->getIntrinsicID();
899   switch (IntrID) {
900   case Intrinsic::amdgcn_atomic_inc:
901   case Intrinsic::amdgcn_atomic_dec:
902   case Intrinsic::amdgcn_ds_fadd:
903   case Intrinsic::amdgcn_ds_fmin:
904   case Intrinsic::amdgcn_ds_fmax: {
905     const ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4));
906     if (!IsVolatile->isZero())
907       return nullptr;
908     Module *M = II->getParent()->getParent()->getParent();
909     Type *DestTy = II->getType();
910     Type *SrcTy = NewV->getType();
911     Function *NewDecl =
912         Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
913     II->setArgOperand(0, NewV);
914     II->setCalledFunction(NewDecl);
915     return II;
916   }
917   case Intrinsic::amdgcn_is_shared:
918   case Intrinsic::amdgcn_is_private: {
919     unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
920       AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
921     unsigned NewAS = NewV->getType()->getPointerAddressSpace();
922     LLVMContext &Ctx = NewV->getType()->getContext();
923     ConstantInt *NewVal = (TrueAS == NewAS) ?
924       ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);
925     return NewVal;
926   }
927   case Intrinsic::ptrmask: {
928     unsigned OldAS = OldV->getType()->getPointerAddressSpace();
929     unsigned NewAS = NewV->getType()->getPointerAddressSpace();
930     Value *MaskOp = II->getArgOperand(1);
931     Type *MaskTy = MaskOp->getType();
932 
933     bool DoTruncate = false;
934     if (!getTLI()->isNoopAddrSpaceCast(OldAS, NewAS)) {
935       // All valid 64-bit to 32-bit casts work by chopping off the high
936       // bits. Any masking only clearing the low bits will also apply in the new
937       // address space.
938       if (DL.getPointerSizeInBits(OldAS) != 64 ||
939           DL.getPointerSizeInBits(NewAS) != 32)
940         return nullptr;
941 
942       // TODO: Do we need to thread more context in here?
943       KnownBits Known = computeKnownBits(MaskOp, DL, 0, nullptr, II);
944       if (Known.countMinLeadingOnes() < 32)
945         return nullptr;
946 
947       DoTruncate = true;
948     }
949 
950     IRBuilder<> B(II);
951     if (DoTruncate) {
952       MaskTy = B.getInt32Ty();
953       MaskOp = B.CreateTrunc(MaskOp, MaskTy);
954     }
955 
956     return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
957                              {NewV, MaskOp});
958   }
959   default:
960     return nullptr;
961   }
962 }
963 
964 unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *VT,
965                                     int Index, VectorType *SubTp) {
966   if (ST->hasVOP3PInsts()) {
967     if (cast<FixedVectorType>(VT)->getNumElements() == 2 &&
968         DL.getTypeSizeInBits(VT->getElementType()) == 16) {
969       // With op_sel VOP3P instructions freely can access the low half or high
970       // half of a register, so any swizzle is free.
971 
972       switch (Kind) {
973       case TTI::SK_Broadcast:
974       case TTI::SK_Reverse:
975       case TTI::SK_PermuteSingleSrc:
976         return 0;
977       default:
978         break;
979       }
980     }
981   }
982 
983   return BaseT::getShuffleCost(Kind, VT, Index, SubTp);
984 }
985 
986 bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
987                                      const Function *Callee) const {
988   const TargetMachine &TM = getTLI()->getTargetMachine();
989   const GCNSubtarget *CallerST
990     = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
991   const GCNSubtarget *CalleeST
992     = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));
993 
994   const FeatureBitset &CallerBits = CallerST->getFeatureBits();
995   const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();
996 
997   FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
998   FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
999   if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
1000     return false;
1001 
1002   // FIXME: dx10_clamp can just take the caller setting, but there seems to be
1003   // no way to support merge for backend defined attributes.
1004   AMDGPU::SIModeRegisterDefaults CallerMode(*Caller);
1005   AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee);
1006   return CallerMode.isInlineCompatible(CalleeMode);
1007 }
1008 
/// Forwards the unrolling-preference query for loop \p L unchanged to the
/// CommonTTI implementation; \p UP is passed through by reference.
void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP) {
  CommonTTI.getUnrollingPreferences(L, SE, UP);
}
1013 
/// Forwards the peeling-preference query for loop \p L unchanged to the
/// CommonTTI implementation; \p PP is passed through by reference.
void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                       TTI::PeelingPreferences &PP) {
  CommonTTI.getPeelingPreferences(L, SE, PP);
}
1018 
1019 unsigned R600TTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
1020   return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
1021 }
1022 
1023 unsigned R600TTIImpl::getNumberOfRegisters(bool Vec) const {
1024   return getHardwareNumberOfRegisters(Vec);
1025 }
1026 
1027 unsigned R600TTIImpl::getRegisterBitWidth(bool Vector) const {
1028   return 32;
1029 }
1030 
1031 unsigned R600TTIImpl::getMinVectorRegisterBitWidth() const {
1032   return 32;
1033 }
1034 
1035 unsigned R600TTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
1036   if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
1037       AddrSpace == AMDGPUAS::CONSTANT_ADDRESS)
1038     return 128;
1039   if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1040       AddrSpace == AMDGPUAS::REGION_ADDRESS)
1041     return 64;
1042   if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
1043     return 32;
1044 
1045   if ((AddrSpace == AMDGPUAS::PARAM_D_ADDRESS ||
1046       AddrSpace == AMDGPUAS::PARAM_I_ADDRESS ||
1047       (AddrSpace >= AMDGPUAS::CONSTANT_BUFFER_0 &&
1048       AddrSpace <= AMDGPUAS::CONSTANT_BUFFER_15)))
1049     return 128;
1050   llvm_unreachable("unhandled address space");
1051 }
1052 
1053 bool R600TTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
1054                                              Align Alignment,
1055                                              unsigned AddrSpace) const {
1056   // We allow vectorization of flat stores, even though we may need to decompose
1057   // them later if they may access private memory. We don't have enough context
1058   // here, and legalization can handle it.
1059   return (AddrSpace != AMDGPUAS::PRIVATE_ADDRESS);
1060 }
1061 
1062 bool R600TTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
1063                                               Align Alignment,
1064                                               unsigned AddrSpace) const {
1065   return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
1066 }
1067 
1068 bool R600TTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
1069                                                Align Alignment,
1070                                                unsigned AddrSpace) const {
1071   return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
1072 }
1073 
1074 unsigned R600TTIImpl::getMaxInterleaveFactor(unsigned VF) {
1075   // Disable unrolling if the loop is not vectorized.
1076   // TODO: Enable this again.
1077   if (VF == 1)
1078     return 1;
1079 
1080   return 8;
1081 }
1082 
1083 unsigned R600TTIImpl::getCFInstrCost(unsigned Opcode,
1084                                      TTI::TargetCostKind CostKind) {
1085   if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
1086     return Opcode == Instruction::PHI ? 0 : 1;
1087 
1088   // XXX - For some reason this isn't called for switch.
1089   switch (Opcode) {
1090   case Instruction::Br:
1091   case Instruction::Ret:
1092     return 10;
1093   default:
1094     return BaseT::getCFInstrCost(Opcode, CostKind);
1095   }
1096 }
1097 
1098 int R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
1099                                     unsigned Index) {
1100   switch (Opcode) {
1101   case Instruction::ExtractElement:
1102   case Instruction::InsertElement: {
1103     unsigned EltSize
1104       = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
1105     if (EltSize < 32) {
1106       return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
1107     }
1108 
1109     // Extracts are just reads of a subregister, so are free. Inserts are
1110     // considered free because we don't want to have any cost for scalarizing
1111     // operations, and we don't have to copy into a different register class.
1112 
1113     // Dynamic indexing isn't free and is best avoided.
1114     return Index == ~0u ? 2 : 0;
1115   }
1116   default:
1117     return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
1118   }
1119 }
1120 
/// Forwards the unrolling-preference query for loop \p L unchanged to the
/// CommonTTI implementation; \p UP is passed through by reference.
void R600TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::UnrollingPreferences &UP) {
  CommonTTI.getUnrollingPreferences(L, SE, UP);
}
1125 
/// Forwards the peeling-preference query for loop \p L unchanged to the
/// CommonTTI implementation; \p PP is passed through by reference.
void R600TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                        TTI::PeelingPreferences &PP) {
  CommonTTI.getPeelingPreferences(L, SE, PP);
}
1130