1 //===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // \file
10 // This file implements a TargetTransformInfo analysis pass specific to the
11 // AMDGPU target machine. It uses the target's detailed information to provide
12 // more precise answers to certain TTI queries, while letting the target
13 // independent and default TTI implementations handle the rest.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #include "AMDGPUTargetTransformInfo.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "llvm/Analysis/LoopInfo.h"
20 #include "llvm/Analysis/ValueTracking.h"
21 #include "llvm/IR/IntrinsicsAMDGPU.h"
22 #include "llvm/IR/IRBuilder.h"
23 #include "llvm/IR/PatternMatch.h"
24 #include "llvm/Support/KnownBits.h"
25 
26 using namespace llvm;
27 
28 #define DEBUG_TYPE "AMDGPUtti"
29 
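// The cl::opt flags below tune the AMDGPU-specific unrolling and inlining
// heuristics. They are internal backend options; illustratively, they can be
// passed to llc directly (e.g. -amdgpu-unroll-threshold-private=3000) or from
// clang via -mllvm.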
30 static cl::opt<unsigned> UnrollThresholdPrivate(
31   "amdgpu-unroll-threshold-private",
32   cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
33   cl::init(2700), cl::Hidden);
34 
35 static cl::opt<unsigned> UnrollThresholdLocal(
36   "amdgpu-unroll-threshold-local",
37   cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
38   cl::init(1000), cl::Hidden);
39 
40 static cl::opt<unsigned> UnrollThresholdIf(
41   "amdgpu-unroll-threshold-if",
42   cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
43   cl::init(200), cl::Hidden);
44 
45 static cl::opt<bool> UnrollRuntimeLocal(
46   "amdgpu-unroll-runtime-local",
47   cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
48   cl::init(true), cl::Hidden);
49 
50 static cl::opt<bool> UseLegacyDA(
51   "amdgpu-use-legacy-divergence-analysis",
52   cl::desc("Enable legacy divergence analysis for AMDGPU"),
53   cl::init(false), cl::Hidden);
54 
55 static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
56     "amdgpu-unroll-max-block-to-analyze",
57     cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
58     cl::init(32), cl::Hidden);
59 
60 static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
61                                        cl::Hidden, cl::init(4000),
62                                        cl::desc("Cost of alloca argument"));
63 
// If the amount of scratch memory to eliminate exceeds our ability to allocate
// it into registers, we gain nothing by aggressively inlining functions for
// that heuristic.
67 static cl::opt<unsigned>
68     ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
69                     cl::init(256),
70                     cl::desc("Maximum alloca size to use for inline cost"));
71 
72 // Inliner constraint to achieve reasonable compilation time.
73 static cl::opt<size_t> InlineMaxBB(
74     "amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
75     cl::desc("Maximum number of BBs allowed in a function after inlining"
76              " (compile time constraint)"));
77 
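// Return true if \p Cond, looking through its instruction operands up to a
// small depth, depends on a PHI that is not defined in one of \p L's
// subloops, i.e. most likely a PHI of \p L itself. Such conditions tend to
// simplify once the loop is unrolled, so getUnrollingPreferences() below
// grants an extra threshold bonus for them.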
78 static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
79                               unsigned Depth = 0) {
80   const Instruction *I = dyn_cast<Instruction>(Cond);
81   if (!I)
82     return false;
83 
84   for (const Value *V : I->operand_values()) {
85     if (!L->contains(I))
86       continue;
87     if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
88       if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
89                   return SubLoop->contains(PHI); }))
90         return true;
91     } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
92       return true;
93   }
94   return false;
95 }
96 
97 AMDGPUTTIImpl::AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
98     : BaseT(TM, F.getParent()->getDataLayout()),
99       TargetTriple(TM->getTargetTriple()),
100       ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
101       TLI(ST->getTargetLowering()) {}
102 
103 void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
104                                             TTI::UnrollingPreferences &UP,
105                                             OptimizationRemarkEmitter *ORE) {
106   const Function &F = *L->getHeader()->getParent();
107   UP.Threshold = AMDGPU::getIntegerAttribute(F, "amdgpu-unroll-threshold", 300);
108   UP.MaxCount = std::numeric_limits<unsigned>::max();
109   UP.Partial = true;
110 
  // A conditional branch in a loop back edge needs 3 additional exec
  // manipulations on average.
113   UP.BEInsns += 3;
114 
115   // TODO: Do we want runtime unrolling?
116 
  // Maximum alloca size that can fit in registers. Reserve 16 registers.
118   const unsigned MaxAlloca = (256 - 16) * 4;
119   unsigned ThresholdPrivate = UnrollThresholdPrivate;
120   unsigned ThresholdLocal = UnrollThresholdLocal;
121 
  // If this loop has the amdgpu.loop.unroll.threshold metadata, we will use
  // the provided threshold value as the default for Threshold.
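  //
  // Illustrative IR shape (the surrounding loop metadata may differ); the
  // option node's second operand supplies the threshold:
  //   br i1 %cond, label %header, label %exit, !llvm.loop !0
  //   !0 = distinct !{!0, !1}
  //   !1 = !{!"amdgpu.loop.unroll.threshold", i32 100}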
124   if (MDNode *LoopUnrollThreshold =
125           findOptionMDForLoop(L, "amdgpu.loop.unroll.threshold")) {
126     if (LoopUnrollThreshold->getNumOperands() == 2) {
127       ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>(
128           LoopUnrollThreshold->getOperand(1));
129       if (MetaThresholdValue) {
130         // We will also use the supplied value for PartialThreshold for now.
131         // We may introduce additional metadata if it becomes necessary in the
132         // future.
133         UP.Threshold = MetaThresholdValue->getSExtValue();
134         UP.PartialThreshold = UP.Threshold;
135         ThresholdPrivate = std::min(ThresholdPrivate, UP.Threshold);
136         ThresholdLocal = std::min(ThresholdLocal, UP.Threshold);
137       }
138     }
139   }
140 
141   unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
142   for (const BasicBlock *BB : L->getBlocks()) {
143     const DataLayout &DL = BB->getModule()->getDataLayout();
144     unsigned LocalGEPsSeen = 0;
145 
146     if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
147                return SubLoop->contains(BB); }))
148         continue; // Block belongs to an inner loop.
149 
150     for (const Instruction &I : *BB) {
      // Unroll a loop which contains an "if" statement whose condition is
      // defined by a PHI belonging to the loop. This may help to eliminate
      // the if region and potentially even the PHI itself, saving on both
      // divergence and registers used for the PHI.
      // Add a small bonus for each such "if" statement.
156       if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
157         if (UP.Threshold < MaxBoost && Br->isConditional()) {
158           BasicBlock *Succ0 = Br->getSuccessor(0);
159           BasicBlock *Succ1 = Br->getSuccessor(1);
160           if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
161               (L->contains(Succ1) && L->isLoopExiting(Succ1)))
162             continue;
163           if (dependsOnLocalPhi(L, Br->getCondition())) {
164             UP.Threshold += UnrollThresholdIf;
165             LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
166                               << " for loop:\n"
167                               << *L << " due to " << *Br << '\n');
168             if (UP.Threshold >= MaxBoost)
169               return;
170           }
171         }
172         continue;
173       }
174 
175       const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
176       if (!GEP)
177         continue;
178 
179       unsigned AS = GEP->getAddressSpace();
180       unsigned Threshold = 0;
181       if (AS == AMDGPUAS::PRIVATE_ADDRESS)
182         Threshold = ThresholdPrivate;
183       else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
184         Threshold = ThresholdLocal;
185       else
186         continue;
187 
188       if (UP.Threshold >= Threshold)
189         continue;
190 
191       if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
192         const Value *Ptr = GEP->getPointerOperand();
193         const AllocaInst *Alloca =
194             dyn_cast<AllocaInst>(getUnderlyingObject(Ptr));
195         if (!Alloca || !Alloca->isStaticAlloca())
196           continue;
197         Type *Ty = Alloca->getAllocatedType();
198         unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
199         if (AllocaSize > MaxAlloca)
200           continue;
201       } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
202                  AS == AMDGPUAS::REGION_ADDRESS) {
203         LocalGEPsSeen++;
        // Inhibit unrolling for local memory if we have seen addressing not
        // based on a variable (global or argument); most likely we will be
        // unable to combine it.
        // Do not unroll too-deep inner loops for local memory, to give an
        // outer loop a chance to be unrolled for a more important reason.
208         if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
209             (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
210              !isa<Argument>(GEP->getPointerOperand())))
211           continue;
212         LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
213                           << *L << " due to LDS use.\n");
214         UP.Runtime = UnrollRuntimeLocal;
215       }
216 
217       // Check if GEP depends on a value defined by this loop itself.
218       bool HasLoopDef = false;
219       for (const Value *Op : GEP->operands()) {
220         const Instruction *Inst = dyn_cast<Instruction>(Op);
221         if (!Inst || L->isLoopInvariant(Op))
222           continue;
223 
224         if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
225              return SubLoop->contains(Inst); }))
226           continue;
227         HasLoopDef = true;
228         break;
229       }
230       if (!HasLoopDef)
231         continue;
232 
      // We want to do whatever we can to limit the number of alloca
      // instructions that make it through to the code generator. Allocas
      // require us to use indirect addressing, which is slow and prone to
      // compiler bugs. If this loop does an address calculation on an
      // alloca pointer, then we want to use a higher than normal loop unroll
      // threshold. This will give SROA a better chance to eliminate these
      // allocas.
240       //
241       // We also want to have more unrolling for local memory to let ds
242       // instructions with different offsets combine.
243       //
244       // Don't use the maximum allowed value here as it will make some
245       // programs way too big.
246       UP.Threshold = Threshold;
247       LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
248                         << " for loop:\n"
249                         << *L << " due to " << *GEP << '\n');
250       if (UP.Threshold >= MaxBoost)
251         return;
252     }
253 
    // If we got a GEP in a small BB from an inner loop, then increase the max
    // trip count to analyze, for a better cost estimation in the unroller.
256     if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)
257       UP.MaxIterationsCountToAnalyze = 32;
258   }
259 }
260 
261 void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
262                                           TTI::PeelingPreferences &PP) {
263   BaseT::getPeelingPreferences(L, SE, PP);
264 }
265 
266 const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
267     // Codegen control options which don't matter.
268     AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
269     AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal,
270     AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess,
271     AMDGPU::FeatureUnalignedAccessMode,
272 
273     AMDGPU::FeatureAutoWaitcntBeforeBarrier,
274 
275     // Property of the kernel/environment which can't actually differ.
276     AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
277     AMDGPU::FeatureTrapHandler,
278 
    // The default assumption needs to be that ECC is enabled, but no directly
    // exposed operations depend on it, so it can be safely inlined.
281     AMDGPU::FeatureSRAMECC,
282 
283     // Perf-tuning features
284     AMDGPU::FeatureFastFMAF32, AMDGPU::HalfRate64Ops};
285 
286 GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
287     : BaseT(TM, F.getParent()->getDataLayout()),
288       ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
289       TLI(ST->getTargetLowering()), CommonTTI(TM, F),
290       IsGraphics(AMDGPU::isGraphics(F.getCallingConv())),
291       MaxVGPRs(ST->getMaxNumVGPRs(
292           std::max(ST->getWavesPerEU(F).first,
293                    ST->getWavesPerEUForWorkGroup(
294                        ST->getFlatWorkGroupSizes(F).second)))) {
295   AMDGPU::SIModeRegisterDefaults Mode(F);
296   HasFP32Denormals = Mode.allFP32Denormals();
297   HasFP64FP16Denormals = Mode.allFP64FP16Denormals();
298 }
299 
300 unsigned GCNTTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
301   // The concept of vector registers doesn't really exist. Some packed vector
302   // operations operate on the normal 32-bit registers.
303   return MaxVGPRs;
304 }
305 
306 unsigned GCNTTIImpl::getNumberOfRegisters(bool Vec) const {
307   // This is really the number of registers to fill when vectorizing /
308   // interleaving loops, so we lie to avoid trying to use all registers.
309   return getHardwareNumberOfRegisters(Vec) >> 3;
310 }
311 
312 unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
313   const SIRegisterInfo *TRI = ST->getRegisterInfo();
314   const TargetRegisterClass *RC = TRI->getRegClass(RCID);
315   unsigned NumVGPRs = (TRI->getRegSizeInBits(*RC) + 31) / 32;
316   return getHardwareNumberOfRegisters(false) / NumVGPRs;
317 }
318 
319 TypeSize
320 GCNTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
321   switch (K) {
322   case TargetTransformInfo::RGK_Scalar:
323     return TypeSize::getFixed(32);
324   case TargetTransformInfo::RGK_FixedWidthVector:
325     return TypeSize::getFixed(ST->hasPackedFP32Ops() ? 64 : 32);
326   case TargetTransformInfo::RGK_ScalableVector:
327     return TypeSize::getScalable(0);
328   }
329   llvm_unreachable("Unsupported register kind");
330 }
331 
332 unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
333   return 32;
334 }
335 
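// Maximum profitable vectorization factor for a given element width. Loads
// and stores may be combined up to a 128-bit (dword x4) access; other opcodes
// only benefit from handling two elements at a time for 16-bit types (or
// 32-bit types when packed FP32 instructions are available).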
336 unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
337   if (Opcode == Instruction::Load || Opcode == Instruction::Store)
338     return 32 * 4 / ElemWidth;
339   return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
340        : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
341        : 1;
342 }
343 
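// Clamp the load vectorization factor so that a chain of sub-dword elements
// does not exceed a single 128-bit load.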
344 unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
345                                          unsigned ChainSizeInBytes,
346                                          VectorType *VecTy) const {
347   unsigned VecRegBitWidth = VF * LoadSize;
348   if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
    // TODO: Support element sizes of less than 32 bits?
350     return 128 / LoadSize;
351 
352   return VF;
353 }
354 
unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                          unsigned ChainSizeInBytes,
                                          VectorType *VecTy) const {
358   unsigned VecRegBitWidth = VF * StoreSize;
359   if (VecRegBitWidth > 128)
360     return 128 / StoreSize;
361 
362   return VF;
363 }
364 
365 unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
366   if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
367       AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
368       AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
369       AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER) {
370     return 512;
371   }
372 
373   if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
374     return 8 * ST->getMaxPrivateElementSize();
375 
376   // Common to flat, global, local and region. Assume for unknown addrspace.
377   return 128;
378 }
379 
380 bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
381                                             Align Alignment,
382                                             unsigned AddrSpace) const {
383   // We allow vectorization of flat stores, even though we may need to decompose
384   // them later if they may access private memory. We don't have enough context
385   // here, and legalization can handle it.
386   if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
387     return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
388       ChainSizeInBytes <= ST->getMaxPrivateElementSize();
389   }
390   return true;
391 }
392 
393 bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
394                                              Align Alignment,
395                                              unsigned AddrSpace) const {
396   return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
397 }
398 
399 bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
400                                               Align Alignment,
401                                               unsigned AddrSpace) const {
402   return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
403 }
404 
405 // FIXME: Really we would like to issue multiple 128-bit loads and stores per
406 // iteration. Should we report a larger size and let it legalize?
407 //
408 // FIXME: Should we use narrower types for local/region, or account for when
409 // unaligned access is legal?
410 //
411 // FIXME: This could use fine tuning and microbenchmarks.
412 Type *GCNTTIImpl::getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
413                                             unsigned SrcAddrSpace,
414                                             unsigned DestAddrSpace,
415                                             unsigned SrcAlign,
416                                             unsigned DestAlign) const {
417   unsigned MinAlign = std::min(SrcAlign, DestAlign);
418 
419   // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the
420   // hardware into byte accesses. If you assume all alignments are equally
421   // probable, it's more efficient on average to use short accesses for this
422   // case.
423   if (MinAlign == 2)
424     return Type::getInt16Ty(Context);
425 
426   // Not all subtargets have 128-bit DS instructions, and we currently don't
427   // form them by default.
428   if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
429       SrcAddrSpace == AMDGPUAS::REGION_ADDRESS ||
430       DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
431       DestAddrSpace == AMDGPUAS::REGION_ADDRESS) {
432     return FixedVectorType::get(Type::getInt32Ty(Context), 2);
433   }
434 
435   // Global memory works best with 16-byte accesses. Private memory will also
436   // hit this, although they'll be decomposed.
437   return FixedVectorType::get(Type::getInt32Ty(Context), 4);
438 }
439 
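// Cover the residual bytes of a memcpy (fewer than 16) greedily with the
// widest available integer types, using only i16/i8 pieces when the minimum
// alignment is 2. For example, 13 residual bytes with alignment >= 4 are
// emitted as i64 + i32 + i8.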
440 void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
441   SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
442   unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
443   unsigned SrcAlign, unsigned DestAlign) const {
444   assert(RemainingBytes < 16);
445 
446   unsigned MinAlign = std::min(SrcAlign, DestAlign);
447 
448   if (MinAlign != 2) {
449     Type *I64Ty = Type::getInt64Ty(Context);
450     while (RemainingBytes >= 8) {
451       OpsOut.push_back(I64Ty);
452       RemainingBytes -= 8;
453     }
454 
455     Type *I32Ty = Type::getInt32Ty(Context);
456     while (RemainingBytes >= 4) {
457       OpsOut.push_back(I32Ty);
458       RemainingBytes -= 4;
459     }
460   }
461 
462   Type *I16Ty = Type::getInt16Ty(Context);
463   while (RemainingBytes >= 2) {
464     OpsOut.push_back(I16Ty);
465     RemainingBytes -= 2;
466   }
467 
468   Type *I8Ty = Type::getInt8Ty(Context);
469   while (RemainingBytes) {
470     OpsOut.push_back(I8Ty);
471     --RemainingBytes;
472   }
473 }
474 
475 unsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) {
476   // Disable unrolling if the loop is not vectorized.
477   // TODO: Enable this again.
478   if (VF == 1)
479     return 1;
480 
481   return 8;
482 }
483 
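// For the DS and atomic intrinsics handled below, operand 0 is the pointer,
// operand 2 the atomic ordering and operand 4 the volatile flag; reject the
// intrinsic unless both of the latter are valid constants.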
484 bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
485                                        MemIntrinsicInfo &Info) const {
486   switch (Inst->getIntrinsicID()) {
487   case Intrinsic::amdgcn_atomic_inc:
488   case Intrinsic::amdgcn_atomic_dec:
489   case Intrinsic::amdgcn_ds_ordered_add:
490   case Intrinsic::amdgcn_ds_ordered_swap:
491   case Intrinsic::amdgcn_ds_fadd:
492   case Intrinsic::amdgcn_ds_fmin:
493   case Intrinsic::amdgcn_ds_fmax: {
494     auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
495     auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
496     if (!Ordering || !Volatile)
497       return false; // Invalid.
498 
499     unsigned OrderingVal = Ordering->getZExtValue();
    if (OrderingVal >
        static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
501       return false;
502 
503     Info.PtrVal = Inst->getArgOperand(0);
504     Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
505     Info.ReadMem = true;
506     Info.WriteMem = true;
507     Info.IsVolatile = !Volatile->isNullValue();
508     return true;
509   }
510   default:
511     return false;
512   }
513 }
514 
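// Arithmetic costs below are expressed in units of a full-rate 32-bit VALU
// instruction; lower-rate operations are scaled via getHalfRateInstrCost /
// getQuarterRateInstrCost / get64BitInstrCost and by the number of legalized
// vector parts.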
515 InstructionCost GCNTTIImpl::getArithmeticInstrCost(
516     unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
517     TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info,
518     TTI::OperandValueProperties Opd1PropInfo,
519     TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
520     const Instruction *CxtI) {
521   EVT OrigTy = TLI->getValueType(DL, Ty);
522   if (!OrigTy.isSimple()) {
    // FIXME: We're having to query the throughput cost so that the basic
    // implementation tries to generate legalization and scalarization costs.
    // Maybe we could hoist the scalarization code here?
526     if (CostKind != TTI::TCK_CodeSize)
527       return BaseT::getArithmeticInstrCost(Opcode, Ty, TTI::TCK_RecipThroughput,
528                                            Opd1Info, Opd2Info, Opd1PropInfo,
529                                            Opd2PropInfo, Args, CxtI);
530     // Scalarization
531 
532     // Check if any of the operands are vector operands.
533     int ISD = TLI->InstructionOpcodeToISD(Opcode);
534     assert(ISD && "Invalid opcode");
535 
536     std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
537 
538     bool IsFloat = Ty->isFPOrFPVectorTy();
539     // Assume that floating point arithmetic operations cost twice as much as
540     // integer operations.
541     unsigned OpCost = (IsFloat ? 2 : 1);
542 
543     if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
544       // The operation is legal. Assume it costs 1.
545       // TODO: Once we have extract/insert subvector cost we need to use them.
546       return LT.first * OpCost;
547     }
548 
549     if (!TLI->isOperationExpand(ISD, LT.second)) {
550       // If the operation is custom lowered, then assume that the code is twice
551       // as expensive.
552       return LT.first * 2 * OpCost;
553     }
554 
555     // Else, assume that we need to scalarize this op.
556     // TODO: If one of the types get legalized by splitting, handle this
557     // similarly to what getCastInstrCost() does.
558     if (auto *VTy = dyn_cast<VectorType>(Ty)) {
559       unsigned Num = cast<FixedVectorType>(VTy)->getNumElements();
560       InstructionCost Cost = getArithmeticInstrCost(
561           Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
562           Opd1PropInfo, Opd2PropInfo, Args, CxtI);
      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values.
565       SmallVector<Type *> Tys(Args.size(), Ty);
566       return getScalarizationOverhead(VTy, Args, Tys) + Num * Cost;
567     }
568 
569     // We don't know anything about this scalar instruction.
570     return OpCost;
571   }
572 
573   // Legalize the type.
574   std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
575   int ISD = TLI->InstructionOpcodeToISD(Opcode);
576 
  // Because we don't have any legal vector operations, only legal types, we
  // need to account for split vectors.
579   unsigned NElts = LT.second.isVector() ?
580     LT.second.getVectorNumElements() : 1;
581 
582   MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
583 
584   switch (ISD) {
585   case ISD::SHL:
586   case ISD::SRL:
587   case ISD::SRA:
588     if (SLT == MVT::i64)
589       return get64BitInstrCost(CostKind) * LT.first * NElts;
590 
591     if (ST->has16BitInsts() && SLT == MVT::i16)
592       NElts = (NElts + 1) / 2;
593 
594     // i32
595     return getFullRateInstrCost() * LT.first * NElts;
596   case ISD::ADD:
597   case ISD::SUB:
598   case ISD::AND:
599   case ISD::OR:
600   case ISD::XOR:
601     if (SLT == MVT::i64) {
      // These 64-bit operations are typically split into 2 VALU instructions.
603       return 2 * getFullRateInstrCost() * LT.first * NElts;
604     }
605 
606     if (ST->has16BitInsts() && SLT == MVT::i16)
607       NElts = (NElts + 1) / 2;
608 
609     return LT.first * NElts * getFullRateInstrCost();
610   case ISD::MUL: {
611     const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
612     if (SLT == MVT::i64) {
613       const int FullRateCost = getFullRateInstrCost();
614       return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
615     }
616 
617     if (ST->has16BitInsts() && SLT == MVT::i16)
618       NElts = (NElts + 1) / 2;
619 
620     // i32
621     return QuarterRateCost * NElts * LT.first;
622   }
623   case ISD::FMUL:
    // Check for a possible fusion of {fadd|fsub}(a, fmul(b, c)) and return a
    // zero cost for the fmul(b, c), assuming the fadd|fsub will get the
    // estimated cost for the whole fused operation.
627     if (CxtI && CxtI->hasOneUse())
628       if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {
629         const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
630         if (OPC == ISD::FADD || OPC == ISD::FSUB) {
631           if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
632             return TargetTransformInfo::TCC_Free;
633           if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
634             return TargetTransformInfo::TCC_Free;
635 
          // Assume all types may be fused when contract or unsafe-math flags
          // are set.
637           const TargetOptions &Options = TLI->getTargetMachine().Options;
638           if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
639               Options.UnsafeFPMath ||
640               (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
641             return TargetTransformInfo::TCC_Free;
642         }
643       }
644     LLVM_FALLTHROUGH;
645   case ISD::FADD:
646   case ISD::FSUB:
647     if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
648       NElts = (NElts + 1) / 2;
649     if (SLT == MVT::f64)
650       return LT.first * NElts * get64BitInstrCost(CostKind);
651 
652     if (ST->has16BitInsts() && SLT == MVT::f16)
653       NElts = (NElts + 1) / 2;
654 
655     if (SLT == MVT::f32 || SLT == MVT::f16)
656       return LT.first * NElts * getFullRateInstrCost();
657     break;
658   case ISD::FDIV:
659   case ISD::FREM:
660     // FIXME: frem should be handled separately. The fdiv in it is most of it,
661     // but the current lowering is also not entirely correct.
662     if (SLT == MVT::f64) {
663       int Cost = 7 * get64BitInstrCost(CostKind) +
664                  getQuarterRateInstrCost(CostKind) +
665                  3 * getHalfRateInstrCost(CostKind);
666       // Add cost of workaround.
667       if (!ST->hasUsableDivScaleConditionOutput())
668         Cost += 3 * getFullRateInstrCost();
669 
670       return LT.first * Cost * NElts;
671     }
672 
673     if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
674       // TODO: This is more complicated, unsafe flags etc.
675       if ((SLT == MVT::f32 && !HasFP32Denormals) ||
676           (SLT == MVT::f16 && ST->has16BitInsts())) {
677         return LT.first * getQuarterRateInstrCost(CostKind) * NElts;
678       }
679     }
680 
681     if (SLT == MVT::f16 && ST->has16BitInsts()) {
682       // 2 x v_cvt_f32_f16
683       // f32 rcp
684       // f32 fmul
685       // v_cvt_f16_f32
686       // f16 div_fixup
687       int Cost =
688           4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind);
689       return LT.first * Cost * NElts;
690     }
691 
692     if (SLT == MVT::f32 || SLT == MVT::f16) {
693       // 4 more v_cvt_* insts without f16 insts support
694       int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
695                  1 * getQuarterRateInstrCost(CostKind);
696 
697       if (!HasFP32Denormals) {
698         // FP mode switches.
699         Cost += 2 * getFullRateInstrCost();
700       }
701 
702       return LT.first * NElts * Cost;
703     }
704     break;
705   case ISD::FNEG:
    // Use the backend's estimation. If fneg is not free, each element will
    // cost one additional instruction.
708     return TLI->isFNegFree(SLT) ? 0 : NElts;
709   default:
710     break;
711   }
712 
713   return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, Opd2Info,
714                                        Opd1PropInfo, Opd2PropInfo, Args, CxtI);
715 }
716 
717 // Return true if there's a potential benefit from using v2f16/v2i16
718 // instructions for an intrinsic, even if it requires nontrivial legalization.
719 static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
720   switch (ID) {
721   case Intrinsic::fma: // TODO: fmuladd
722   // There's a small benefit to using vector ops in the legalized code.
723   case Intrinsic::round:
724   case Intrinsic::uadd_sat:
725   case Intrinsic::usub_sat:
726   case Intrinsic::sadd_sat:
727   case Intrinsic::ssub_sat:
728     return true;
729   default:
730     return false;
731   }
732 }
733 
734 InstructionCost
735 GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
736                                   TTI::TargetCostKind CostKind) {
737   if (ICA.getID() == Intrinsic::fabs)
738     return 0;
739 
740   if (!intrinsicHasPackedVectorBenefit(ICA.getID()))
741     return BaseT::getIntrinsicInstrCost(ICA, CostKind);
742 
743   Type *RetTy = ICA.getReturnType();
744   EVT OrigTy = TLI->getValueType(DL, RetTy);
745   if (!OrigTy.isSimple()) {
746     if (CostKind != TTI::TCK_CodeSize)
747       return BaseT::getIntrinsicInstrCost(ICA, CostKind);
748 
749     // TODO: Combine these two logic paths.
750     if (ICA.isTypeBasedOnly())
751       return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
752 
753     unsigned RetVF =
754         (RetTy->isVectorTy() ? cast<FixedVectorType>(RetTy)->getNumElements()
755                              : 1);
756     const IntrinsicInst *I = ICA.getInst();
757     const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
758     FastMathFlags FMF = ICA.getFlags();
759     // Assume that we need to scalarize this intrinsic.
760 
    // Compute the scalarization overhead based on Args for a vector
    // intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while
    // the CostModel will pass a vector RetTy and a VF of 1.
764     InstructionCost ScalarizationCost = InstructionCost::getInvalid();
765     if (RetVF > 1) {
766       ScalarizationCost = 0;
767       if (!RetTy->isVoidTy())
768         ScalarizationCost +=
769             getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
770       ScalarizationCost +=
771           getOperandsScalarizationOverhead(Args, ICA.getArgTypes());
772     }
773 
774     IntrinsicCostAttributes Attrs(ICA.getID(), RetTy, ICA.getArgTypes(), FMF, I,
775                                   ScalarizationCost);
776     return getIntrinsicInstrCost(Attrs, CostKind);
777   }
778 
779   // Legalize the type.
780   std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
781 
782   unsigned NElts = LT.second.isVector() ?
783     LT.second.getVectorNumElements() : 1;
784 
785   MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
786 
787   if (SLT == MVT::f64)
788     return LT.first * NElts * get64BitInstrCost(CostKind);
789 
790   if ((ST->has16BitInsts() && SLT == MVT::f16) ||
791       (ST->hasPackedFP32Ops() && SLT == MVT::f32))
792     NElts = (NElts + 1) / 2;
793 
794   // TODO: Get more refined intrinsic costs?
795   unsigned InstRate = getQuarterRateInstrCost(CostKind);
796 
797   switch (ICA.getID()) {
798   case Intrinsic::fma:
799     InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
800                                    : getQuarterRateInstrCost(CostKind);
801     break;
802   case Intrinsic::uadd_sat:
803   case Intrinsic::usub_sat:
804   case Intrinsic::sadd_sat:
805   case Intrinsic::ssub_sat:
806     static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
807     if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
808       NElts = 1;
809     break;
810   }
811 
812   return LT.first * NElts * InstRate;
813 }
814 
815 InstructionCost GCNTTIImpl::getCFInstrCost(unsigned Opcode,
816                                            TTI::TargetCostKind CostKind,
817                                            const Instruction *I) {
818   assert((I == nullptr || I->getOpcode() == Opcode) &&
819          "Opcode should reflect passed instruction.");
820   const bool SCost =
821       (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency);
822   const int CBrCost = SCost ? 5 : 7;
823   switch (Opcode) {
824   case Instruction::Br: {
825     // Branch instruction takes about 4 slots on gfx900.
826     auto BI = dyn_cast_or_null<BranchInst>(I);
827     if (BI && BI->isUnconditional())
828       return SCost ? 1 : 4;
    // Assume a conditional branch takes an additional 3 exec manipulation
    // instructions on average.
831     return CBrCost;
832   }
833   case Instruction::Switch: {
834     auto SI = dyn_cast_or_null<SwitchInst>(I);
    // Each case (including default) takes 1 cmp + 1 cbr instruction on
    // average.
837     return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);
838   }
839   case Instruction::Ret:
840     return SCost ? 1 : 10;
841   }
842   return BaseT::getCFInstrCost(Opcode, CostKind, I);
843 }
844 
845 InstructionCost
846 GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
847                                        Optional<FastMathFlags> FMF,
848                                        TTI::TargetCostKind CostKind) {
849   if (TTI::requiresOrderedReduction(FMF))
850     return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
851 
852   EVT OrigTy = TLI->getValueType(DL, Ty);
853 
  // Compute the cost on targets that have packed math instructions (which
  // support 16-bit types only).
856   if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
857     return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
858 
859   std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
860   return LT.first * getFullRateInstrCost();
861 }
862 
863 InstructionCost
864 GCNTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
865                                    bool IsUnsigned,
866                                    TTI::TargetCostKind CostKind) {
867   EVT OrigTy = TLI->getValueType(DL, Ty);
868 
  // Compute the cost on targets that have packed math instructions (which
  // support 16-bit types only).
871   if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
872     return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
873 
874   std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
875   return LT.first * getHalfRateInstrCost(CostKind);
876 }
877 
878 InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
879                                                unsigned Index) {
880   switch (Opcode) {
881   case Instruction::ExtractElement:
882   case Instruction::InsertElement: {
883     unsigned EltSize
884       = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
885     if (EltSize < 32) {
886       if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
887         return 0;
888       return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
889     }
890 
891     // Extracts are just reads of a subregister, so are free. Inserts are
892     // considered free because we don't want to have any cost for scalarizing
893     // operations, and we don't have to copy into a different register class.
894 
895     // Dynamic indexing isn't free and is best avoided.
896     return Index == ~0u ? 2 : 0;
897   }
898   default:
899     return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
900   }
901 }
902 
903 /// Analyze if the results of inline asm are divergent. If \p Indices is empty,
904 /// this is analyzing the collective result of all output registers. Otherwise,
905 /// this is only querying a specific result index if this returns multiple
906 /// registers in a struct.
907 bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
908   const CallInst *CI, ArrayRef<unsigned> Indices) const {
909   // TODO: Handle complex extract indices
910   if (Indices.size() > 1)
911     return true;
912 
913   const DataLayout &DL = CI->getModule()->getDataLayout();
914   const SIRegisterInfo *TRI = ST->getRegisterInfo();
915   TargetLowering::AsmOperandInfoVector TargetConstraints =
916       TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);
917 
918   const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];
919 
920   int OutputIdx = 0;
921   for (auto &TC : TargetConstraints) {
922     if (TC.Type != InlineAsm::isOutput)
923       continue;
924 
925     // Skip outputs we don't care about.
926     if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
927       continue;
928 
929     TLI->ComputeConstraintToUse(TC, SDValue());
930 
931     Register AssignedReg;
932     const TargetRegisterClass *RC;
933     std::tie(AssignedReg, RC) = TLI->getRegForInlineAsmConstraint(
934       TRI, TC.ConstraintCode, TC.ConstraintVT);
935     if (AssignedReg) {
936       // FIXME: This is a workaround for getRegForInlineAsmConstraint
937       // returning VS_32
938       RC = TRI->getPhysRegClass(AssignedReg);
939     }
940 
941     // For AGPR constraints null is returned on subtargets without AGPRs, so
942     // assume divergent for null.
943     if (!RC || !TRI->isSGPRClass(RC))
944       return true;
945   }
946 
947   return false;
948 }
949 
950 /// \returns true if the new GPU divergence analysis is enabled.
951 bool GCNTTIImpl::useGPUDivergenceAnalysis() const {
952   return !UseLegacyDA;
953 }
954 
955 /// \returns true if the result of the value could potentially be
956 /// different across workitems in a wavefront.
957 bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
958   if (const Argument *A = dyn_cast<Argument>(V))
959     return !AMDGPU::isArgPassedInSGPR(A);
960 
961   // Loads from the private and flat address spaces are divergent, because
962   // threads can execute the load instruction with the same inputs and get
963   // different results.
964   //
965   // All other loads are not divergent, because if threads issue loads with the
966   // same arguments, they will always get the same result.
967   if (const LoadInst *Load = dyn_cast<LoadInst>(V))
968     return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
969            Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
970 
  // Atomics are divergent because they are executed sequentially: when an
  // atomic operation refers to the same address in each thread, then each
  // thread after the first sees the value written by the previous thread as
  // the original value.
975   if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
976     return true;
977 
978   if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
979     return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());
980 
981   // Assume all function calls are a source of divergence.
982   if (const CallInst *CI = dyn_cast<CallInst>(V)) {
983     if (CI->isInlineAsm())
984       return isInlineAsmSourceOfDivergence(CI);
985     return true;
986   }
987 
988   // Assume all function calls are a source of divergence.
989   if (isa<InvokeInst>(V))
990     return true;
991 
992   return false;
993 }
994 
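/// \returns true if the value is known to be uniform across a wavefront:
/// lane-broadcast and ballot-style intrinsics (readfirstlane, readlane, icmp,
/// fcmp, ballot, if_break), inline asm constrained to SGPR outputs, and the
/// uniform second result of amdgcn.if/else extracted via extractvalue.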
995 bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
996   if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
997     switch (Intrinsic->getIntrinsicID()) {
998     default:
999       return false;
1000     case Intrinsic::amdgcn_readfirstlane:
1001     case Intrinsic::amdgcn_readlane:
1002     case Intrinsic::amdgcn_icmp:
1003     case Intrinsic::amdgcn_fcmp:
1004     case Intrinsic::amdgcn_ballot:
1005     case Intrinsic::amdgcn_if_break:
1006       return true;
1007     }
1008   }
1009 
1010   if (const CallInst *CI = dyn_cast<CallInst>(V)) {
1011     if (CI->isInlineAsm())
1012       return !isInlineAsmSourceOfDivergence(CI);
1013     return false;
1014   }
1015 
1016   const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
1017   if (!ExtValue)
1018     return false;
1019 
1020   const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));
1021   if (!CI)
1022     return false;
1023 
1024   if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
1025     switch (Intrinsic->getIntrinsicID()) {
1026     default:
1027       return false;
1028     case Intrinsic::amdgcn_if:
1029     case Intrinsic::amdgcn_else: {
1030       ArrayRef<unsigned> Indices = ExtValue->getIndices();
1031       return Indices.size() == 1 && Indices[0] == 1;
1032     }
1033     }
1034   }
1035 
1036   // If we have inline asm returning mixed SGPR and VGPR results, we inferred
1037   // divergent for the overall struct return. We need to override it in the
1038   // case we're extracting an SGPR component here.
1039   if (CI->isInlineAsm())
1040     return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());
1041 
1042   return false;
1043 }
1044 
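// Operand 0 of these intrinsics is a flat pointer that may be rewritten to a
// more specific address space; see rewriteIntrinsicWithAddressSpace() below.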
1045 bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
1046                                             Intrinsic::ID IID) const {
1047   switch (IID) {
1048   case Intrinsic::amdgcn_atomic_inc:
1049   case Intrinsic::amdgcn_atomic_dec:
1050   case Intrinsic::amdgcn_ds_fadd:
1051   case Intrinsic::amdgcn_ds_fmin:
1052   case Intrinsic::amdgcn_ds_fmax:
1053   case Intrinsic::amdgcn_is_shared:
1054   case Intrinsic::amdgcn_is_private:
1055     OpIndexes.push_back(0);
1056     return true;
1057   default:
1058     return false;
1059   }
1060 }
1061 
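// Rewrite intrinsic \p II to operate on \p NewV, a pointer into a specific
// address space, instead of the flat pointer \p OldV. Returns the replacement
// value, or nullptr if the rewrite is not safe (e.g. a volatile DS atomic, or
// a ptrmask whose mask might affect bits dropped by the address space cast).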
1062 Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
1063                                                     Value *OldV,
1064                                                     Value *NewV) const {
1065   auto IntrID = II->getIntrinsicID();
1066   switch (IntrID) {
1067   case Intrinsic::amdgcn_atomic_inc:
1068   case Intrinsic::amdgcn_atomic_dec:
1069   case Intrinsic::amdgcn_ds_fadd:
1070   case Intrinsic::amdgcn_ds_fmin:
1071   case Intrinsic::amdgcn_ds_fmax: {
1072     const ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4));
1073     if (!IsVolatile->isZero())
1074       return nullptr;
1075     Module *M = II->getParent()->getParent()->getParent();
1076     Type *DestTy = II->getType();
1077     Type *SrcTy = NewV->getType();
1078     Function *NewDecl =
1079         Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
1080     II->setArgOperand(0, NewV);
1081     II->setCalledFunction(NewDecl);
1082     return II;
1083   }
1084   case Intrinsic::amdgcn_is_shared:
1085   case Intrinsic::amdgcn_is_private: {
1086     unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
1087       AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
1088     unsigned NewAS = NewV->getType()->getPointerAddressSpace();
1089     LLVMContext &Ctx = NewV->getType()->getContext();
1090     ConstantInt *NewVal = (TrueAS == NewAS) ?
1091       ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);
1092     return NewVal;
1093   }
1094   case Intrinsic::ptrmask: {
1095     unsigned OldAS = OldV->getType()->getPointerAddressSpace();
1096     unsigned NewAS = NewV->getType()->getPointerAddressSpace();
1097     Value *MaskOp = II->getArgOperand(1);
1098     Type *MaskTy = MaskOp->getType();
1099 
1100     bool DoTruncate = false;
1101 
1102     const GCNTargetMachine &TM =
1103         static_cast<const GCNTargetMachine &>(getTLI()->getTargetMachine());
1104     if (!TM.isNoopAddrSpaceCast(OldAS, NewAS)) {
      // All valid 64-bit to 32-bit casts work by chopping off the high
      // bits. Any masking that only clears the low bits will also apply in
      // the new address space.
1108       if (DL.getPointerSizeInBits(OldAS) != 64 ||
1109           DL.getPointerSizeInBits(NewAS) != 32)
1110         return nullptr;
1111 
1112       // TODO: Do we need to thread more context in here?
1113       KnownBits Known = computeKnownBits(MaskOp, DL, 0, nullptr, II);
1114       if (Known.countMinLeadingOnes() < 32)
1115         return nullptr;
1116 
1117       DoTruncate = true;
1118     }
1119 
1120     IRBuilder<> B(II);
1121     if (DoTruncate) {
1122       MaskTy = B.getInt32Ty();
1123       MaskOp = B.CreateTrunc(MaskOp, MaskTy);
1124     }
1125 
1126     return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
1127                              {NewV, MaskOp});
1128   }
1129   default:
1130     return nullptr;
1131   }
1132 }
1133 
1134 InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1135                                            VectorType *VT, ArrayRef<int> Mask,
1136                                            int Index, VectorType *SubTp) {
1137   Kind = improveShuffleKindFromMask(Kind, Mask);
1138   if (ST->hasVOP3PInsts()) {
1139     if (cast<FixedVectorType>(VT)->getNumElements() == 2 &&
1140         DL.getTypeSizeInBits(VT->getElementType()) == 16) {
      // With op_sel, VOP3P instructions can freely access the low half or
      // high half of a register, so any swizzle is free.
1143 
1144       switch (Kind) {
1145       case TTI::SK_Broadcast:
1146       case TTI::SK_Reverse:
1147       case TTI::SK_PermuteSingleSrc:
1148         return 0;
1149       default:
1150         break;
1151       }
1152     }
1153   }
1154 
1155   return BaseT::getShuffleCost(Kind, VT, Mask, Index, SubTp);
1156 }
1157 
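// Inlining is compatible only if the callee's required subtarget features
// (minus the codegen-tuning features in InlineFeatureIgnoreList) are a subset
// of the caller's, the FP mode register defaults are compatible, and the
// resulting function stays within the InlineMaxBB compile-time limit.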
1158 bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
1159                                      const Function *Callee) const {
1160   const TargetMachine &TM = getTLI()->getTargetMachine();
1161   const GCNSubtarget *CallerST
1162     = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
1163   const GCNSubtarget *CalleeST
1164     = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));
1165 
1166   const FeatureBitset &CallerBits = CallerST->getFeatureBits();
1167   const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();
1168 
1169   FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
1170   FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
1171   if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
1172     return false;
1173 
1174   // FIXME: dx10_clamp can just take the caller setting, but there seems to be
1175   // no way to support merge for backend defined attributes.
1176   AMDGPU::SIModeRegisterDefaults CallerMode(*Caller);
1177   AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee);
1178   if (!CallerMode.isInlineCompatible(CalleeMode))
1179     return false;
1180 
1181   if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||
1182       Callee->hasFnAttribute(Attribute::InlineHint))
1183     return true;
1184 
1185   // Hack to make compile times reasonable.
1186   if (InlineMaxBB) {
    // A single-BB callee does not increase the total BB count.
1188     if (Callee->size() == 1)
1189       return true;
1190     size_t BBSize = Caller->size() + Callee->size() - 1;
1191     return BBSize <= InlineMaxBB;
1192   }
1193 
1194   return true;
1195 }
1196 
1197 unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
  // If we have a pointer to a private array passed into a function,
  // it will not be optimized out, leaving scratch usage.
  // Increase the inline threshold to allow inlining in this case.
1201   uint64_t AllocaSize = 0;
1202   SmallPtrSet<const AllocaInst *, 8> AIVisited;
1203   for (Value *PtrArg : CB->args()) {
1204     PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
1205     if (!Ty || (Ty->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS &&
1206                 Ty->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS))
1207       continue;
1208 
1209     PtrArg = getUnderlyingObject(PtrArg);
1210     if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) {
1211       if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second)
1212         continue;
1213       AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
      // If the amount of stack memory is excessive, we will not be able to
      // get rid of the scratch anyway, so bail out.
1216       if (AllocaSize > ArgAllocaCutoff) {
1217         AllocaSize = 0;
1218         break;
1219       }
1220     }
1221   }
1222   if (AllocaSize)
1223     return ArgAllocaCost;
1224   return 0;
1225 }
1226 
1227 void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1228                                          TTI::UnrollingPreferences &UP,
1229                                          OptimizationRemarkEmitter *ORE) {
1230   CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
1231 }
1232 
1233 void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1234                                        TTI::PeelingPreferences &PP) {
1235   CommonTTI.getPeelingPreferences(L, SE, PP);
1236 }
1237 
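// Relative cost of a 64-bit VALU operation, depending on whether the
// subtarget executes 64-bit ops at full, half or quarter rate.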
1238 int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
1239   return ST->hasFullRate64Ops()
1240              ? getFullRateInstrCost()
1241              : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
1242                                       : getQuarterRateInstrCost(CostKind);
1243 }
1244 
1245 R600TTIImpl::R600TTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
1246     : BaseT(TM, F.getParent()->getDataLayout()),
1247       ST(static_cast<const R600Subtarget *>(TM->getSubtargetImpl(F))),
1248       TLI(ST->getTargetLowering()), CommonTTI(TM, F) {}
1249 
1250 unsigned R600TTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
1251   return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
1252 }
1253 
1254 unsigned R600TTIImpl::getNumberOfRegisters(bool Vec) const {
1255   return getHardwareNumberOfRegisters(Vec);
1256 }
1257 
1258 TypeSize
1259 R600TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
1260   return TypeSize::getFixed(32);
1261 }
1262 
1263 unsigned R600TTIImpl::getMinVectorRegisterBitWidth() const {
1264   return 32;
1265 }
1266 
1267 unsigned R600TTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
1268   if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
1269       AddrSpace == AMDGPUAS::CONSTANT_ADDRESS)
1270     return 128;
1271   if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1272       AddrSpace == AMDGPUAS::REGION_ADDRESS)
1273     return 64;
1274   if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
1275     return 32;
1276 
1277   if ((AddrSpace == AMDGPUAS::PARAM_D_ADDRESS ||
1278       AddrSpace == AMDGPUAS::PARAM_I_ADDRESS ||
1279       (AddrSpace >= AMDGPUAS::CONSTANT_BUFFER_0 &&
1280       AddrSpace <= AMDGPUAS::CONSTANT_BUFFER_15)))
1281     return 128;
1282   llvm_unreachable("unhandled address space");
1283 }
1284 
1285 bool R600TTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
1286                                              Align Alignment,
1287                                              unsigned AddrSpace) const {
1288   // We allow vectorization of flat stores, even though we may need to decompose
1289   // them later if they may access private memory. We don't have enough context
1290   // here, and legalization can handle it.
1291   return (AddrSpace != AMDGPUAS::PRIVATE_ADDRESS);
1292 }
1293 
1294 bool R600TTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
1295                                               Align Alignment,
1296                                               unsigned AddrSpace) const {
1297   return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
1298 }
1299 
1300 bool R600TTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
1301                                                Align Alignment,
1302                                                unsigned AddrSpace) const {
1303   return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
1304 }
1305 
1306 unsigned R600TTIImpl::getMaxInterleaveFactor(unsigned VF) {
1307   // Disable unrolling if the loop is not vectorized.
1308   // TODO: Enable this again.
1309   if (VF == 1)
1310     return 1;
1311 
1312   return 8;
1313 }
1314 
1315 InstructionCost R600TTIImpl::getCFInstrCost(unsigned Opcode,
1316                                             TTI::TargetCostKind CostKind,
1317                                             const Instruction *I) {
1318   if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
1319     return Opcode == Instruction::PHI ? 0 : 1;
1320 
1321   // XXX - For some reason this isn't called for switch.
1322   switch (Opcode) {
1323   case Instruction::Br:
1324   case Instruction::Ret:
1325     return 10;
1326   default:
1327     return BaseT::getCFInstrCost(Opcode, CostKind, I);
1328   }
1329 }
1330 
1331 InstructionCost R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
1332                                                 unsigned Index) {
1333   switch (Opcode) {
1334   case Instruction::ExtractElement:
1335   case Instruction::InsertElement: {
1336     unsigned EltSize
1337       = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
1338     if (EltSize < 32) {
1339       return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
1340     }
1341 
1342     // Extracts are just reads of a subregister, so are free. Inserts are
1343     // considered free because we don't want to have any cost for scalarizing
1344     // operations, and we don't have to copy into a different register class.
1345 
1346     // Dynamic indexing isn't free and is best avoided.
1347     return Index == ~0u ? 2 : 0;
1348   }
1349   default:
1350     return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
1351   }
1352 }
1353 
1354 void R600TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1355                                           TTI::UnrollingPreferences &UP,
1356                                           OptimizationRemarkEmitter *ORE) {
1357   CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
1358 }
1359 
1360 void R600TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1361                                         TTI::PeelingPreferences &PP) {
1362   CommonTTI.getPeelingPreferences(L, SE, PP);
1363 }
1364