//===-- NVPTXTargetTransformInfo.cpp - NVPTX specific TTI -----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "NVPTXTargetTransformInfo.h"
#include "NVPTXUtilities.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/Support/Debug.h"
using namespace llvm;

#define DEBUG_TYPE "NVPTXtti"

// Whether the given intrinsic reads threadIdx.x/y/z.
static bool readsThreadIndex(const IntrinsicInst *II) {
  switch (II->getIntrinsicID()) {
  default: return false;
  case Intrinsic::nvvm_read_ptx_sreg_tid_x:
  case Intrinsic::nvvm_read_ptx_sreg_tid_y:
  case Intrinsic::nvvm_read_ptx_sreg_tid_z:
    return true;
  }
}

static bool readsLaneId(const IntrinsicInst *II) {
  return II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_laneid;
}

// Whether the given intrinsic is an atomic instruction in PTX.
static bool isNVVMAtomic(const IntrinsicInst *II) {
  switch (II->getIntrinsicID()) {
  default: return false;
  case Intrinsic::nvvm_atomic_load_inc_32:
  case Intrinsic::nvvm_atomic_load_dec_32:

  case Intrinsic::nvvm_atomic_add_gen_f_cta:
  case Intrinsic::nvvm_atomic_add_gen_f_sys:
  case Intrinsic::nvvm_atomic_add_gen_i_cta:
  case Intrinsic::nvvm_atomic_add_gen_i_sys:
  case Intrinsic::nvvm_atomic_and_gen_i_cta:
  case Intrinsic::nvvm_atomic_and_gen_i_sys:
  case Intrinsic::nvvm_atomic_cas_gen_i_cta:
  case Intrinsic::nvvm_atomic_cas_gen_i_sys:
  case Intrinsic::nvvm_atomic_dec_gen_i_cta:
  case Intrinsic::nvvm_atomic_dec_gen_i_sys:
  case Intrinsic::nvvm_atomic_inc_gen_i_cta:
  case Intrinsic::nvvm_atomic_inc_gen_i_sys:
  case Intrinsic::nvvm_atomic_max_gen_i_cta:
  case Intrinsic::nvvm_atomic_max_gen_i_sys:
  case Intrinsic::nvvm_atomic_min_gen_i_cta:
  case Intrinsic::nvvm_atomic_min_gen_i_sys:
  case Intrinsic::nvvm_atomic_or_gen_i_cta:
  case Intrinsic::nvvm_atomic_or_gen_i_sys:
  case Intrinsic::nvvm_atomic_exch_gen_i_cta:
  case Intrinsic::nvvm_atomic_exch_gen_i_sys:
  case Intrinsic::nvvm_atomic_xor_gen_i_cta:
  case Intrinsic::nvvm_atomic_xor_gen_i_sys:
    return true;
  }
}

bool NVPTXTTIImpl::isSourceOfDivergence(const Value *V) {
  // Without inter-procedural analysis, we conservatively assume that arguments
  // to __device__ functions are divergent.
  if (const Argument *Arg = dyn_cast<Argument>(V))
    return !isKernelFunction(*Arg->getParent());

  if (const Instruction *I = dyn_cast<Instruction>(V)) {
    // Without pointer analysis, we conservatively assume values loaded from
    // generic or local address space are divergent.
    if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
      unsigned AS = LI->getPointerAddressSpace();
      return AS == ADDRESS_SPACE_GENERIC || AS == ADDRESS_SPACE_LOCAL;
    }
    // Atomic instructions may cause divergence. Atomic instructions are
    // executed sequentially across all threads in a warp. Therefore, an
    // earlier executed thread may see different memory inputs than a later
    // executed thread. For example, suppose *a = 0 initially.
    //
    //   atom.global.add.s32 d, [a], 1
    //
    // returns 0 for the first thread that enters the critical region, and 1
    // for the second thread.
    if (I->isAtomic())
      return true;
    if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
      // Instructions that read threadIdx are obviously divergent.
      if (readsThreadIndex(II) || readsLaneId(II))
        return true;
      // Handle the NVPTX atomic intrinsics that cannot be represented as an
      // atomic IR instruction.
      if (isNVVMAtomic(II))
        return true;
    }
    // Conservatively consider the return value of function calls as divergent.
    // We could analyze callees with bodies more precisely using
    // inter-procedural analysis.
    if (isa<CallInst>(I))
      return true;
  }

  return false;
}

// Convert NVVM intrinsics to target-generic LLVM code where possible.
static Instruction *simplifyNvvmIntrinsic(IntrinsicInst *II,
                                          InstCombiner &IC) {
  // Each NVVM intrinsic we can simplify can be replaced with one of:
  //
  //  * an LLVM intrinsic,
  //  * an LLVM cast operation,
  //  * an LLVM binary operation, or
  //  * ad-hoc LLVM IR for the particular operation.

  // Some transformations are only valid when the module's
  // flush-denormals-to-zero (ftz) setting is true/false, whereas other
  // transformations are valid regardless of the module's ftz setting.
  enum FtzRequirementTy {
    FTZ_Any,       // Any ftz setting is ok.
    FTZ_MustBeOn,  // Transformation is valid only if ftz is on.
    FTZ_MustBeOff, // Transformation is valid only if ftz is off.
  };
  // Classes of NVVM intrinsics that can't be replaced one-to-one with a
  // target-generic intrinsic, cast op, or binary op but that we can
  // nonetheless simplify.
  enum SpecialCase {
    SPC_Reciprocal,
  };

  // SimplifyAction is a poor-man's variant (plus an additional flag) that
  // represents how to replace an NVVM intrinsic with target-generic LLVM IR.
  struct SimplifyAction {
    // Invariant: At most one of these Optionals has a value.
    Optional<Intrinsic::ID> IID;
    Optional<Instruction::CastOps> CastOp;
    Optional<Instruction::BinaryOps> BinaryOp;
    Optional<SpecialCase> Special;

    FtzRequirementTy FtzRequirement = FTZ_Any;
    // Denormal handling is guarded by different attributes depending on the
    // type (denormal-fp-math vs denormal-fp-math-f32), so take note of half
    // types.
    bool IsHalfTy = false;

    SimplifyAction() = default;

    SimplifyAction(Intrinsic::ID IID, FtzRequirementTy FtzReq,
                   bool IsHalfTy = false)
        : IID(IID), FtzRequirement(FtzReq), IsHalfTy(IsHalfTy) {}

    // Cast operations don't have anything to do with FTZ, so we skip that
    // argument.
    SimplifyAction(Instruction::CastOps CastOp) : CastOp(CastOp) {}

    SimplifyAction(Instruction::BinaryOps BinaryOp, FtzRequirementTy FtzReq)
        : BinaryOp(BinaryOp), FtzRequirement(FtzReq) {}

    SimplifyAction(SpecialCase Special, FtzRequirementTy FtzReq)
        : Special(Special), FtzRequirement(FtzReq) {}
  };

  // Try to generate a SimplifyAction describing how to replace our
  // IntrinsicInstr with target-generic LLVM IR.
  const SimplifyAction Action = [II]() -> SimplifyAction {
    switch (II->getIntrinsicID()) {
    // NVVM intrinsics that map directly to LLVM intrinsics.
    case Intrinsic::nvvm_ceil_d:
      return {Intrinsic::ceil, FTZ_Any};
    case Intrinsic::nvvm_ceil_f:
      return {Intrinsic::ceil, FTZ_MustBeOff};
    case Intrinsic::nvvm_ceil_ftz_f:
      return {Intrinsic::ceil, FTZ_MustBeOn};
    case Intrinsic::nvvm_fabs_d:
      return {Intrinsic::fabs, FTZ_Any};
    case Intrinsic::nvvm_fabs_f:
      return {Intrinsic::fabs, FTZ_MustBeOff};
    case Intrinsic::nvvm_fabs_ftz_f:
      return {Intrinsic::fabs, FTZ_MustBeOn};
    case Intrinsic::nvvm_floor_d:
      return {Intrinsic::floor, FTZ_Any};
    case Intrinsic::nvvm_floor_f:
      return {Intrinsic::floor, FTZ_MustBeOff};
    case Intrinsic::nvvm_floor_ftz_f:
      return {Intrinsic::floor, FTZ_MustBeOn};
    case Intrinsic::nvvm_fma_rn_d:
      return {Intrinsic::fma, FTZ_Any};
    case Intrinsic::nvvm_fma_rn_f:
      return {Intrinsic::fma, FTZ_MustBeOff};
    case Intrinsic::nvvm_fma_rn_ftz_f:
      return {Intrinsic::fma, FTZ_MustBeOn};
    case Intrinsic::nvvm_fma_rn_f16:
      return {Intrinsic::fma, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fma_rn_ftz_f16:
      return {Intrinsic::fma, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fma_rn_f16x2:
      return {Intrinsic::fma, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fma_rn_ftz_f16x2:
      return {Intrinsic::fma, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmax_d:
      return {Intrinsic::maxnum, FTZ_Any};
    case Intrinsic::nvvm_fmax_f:
      return {Intrinsic::maxnum, FTZ_MustBeOff};
    case Intrinsic::nvvm_fmax_ftz_f:
      return {Intrinsic::maxnum, FTZ_MustBeOn};
    case Intrinsic::nvvm_fmax_nan_f:
      return {Intrinsic::maximum, FTZ_MustBeOff};
    case Intrinsic::nvvm_fmax_ftz_nan_f:
      return {Intrinsic::maximum, FTZ_MustBeOn};
    case Intrinsic::nvvm_fmax_f16:
      return {Intrinsic::maxnum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmax_ftz_f16:
      return {Intrinsic::maxnum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmax_f16x2:
      return {Intrinsic::maxnum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmax_ftz_f16x2:
      return {Intrinsic::maxnum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmax_nan_f16:
      return {Intrinsic::maximum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmax_ftz_nan_f16:
      return {Intrinsic::maximum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmax_nan_f16x2:
      return {Intrinsic::maximum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmax_ftz_nan_f16x2:
      return {Intrinsic::maximum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmin_d:
      return {Intrinsic::minnum, FTZ_Any};
    case Intrinsic::nvvm_fmin_f:
      return {Intrinsic::minnum, FTZ_MustBeOff};
    case Intrinsic::nvvm_fmin_ftz_f:
      return {Intrinsic::minnum, FTZ_MustBeOn};
    case Intrinsic::nvvm_fmin_nan_f:
      return {Intrinsic::minimum, FTZ_MustBeOff};
    case Intrinsic::nvvm_fmin_ftz_nan_f:
      return {Intrinsic::minimum, FTZ_MustBeOn};
    case Intrinsic::nvvm_fmin_f16:
      return {Intrinsic::minnum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmin_ftz_f16:
      return {Intrinsic::minnum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmin_f16x2:
      return {Intrinsic::minnum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmin_ftz_f16x2:
      return {Intrinsic::minnum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmin_nan_f16:
      return {Intrinsic::minimum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmin_ftz_nan_f16:
      return {Intrinsic::minimum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmin_nan_f16x2:
      return {Intrinsic::minimum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmin_ftz_nan_f16x2:
      return {Intrinsic::minimum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_round_d:
      return {Intrinsic::round, FTZ_Any};
    case Intrinsic::nvvm_round_f:
      return {Intrinsic::round, FTZ_MustBeOff};
    case Intrinsic::nvvm_round_ftz_f:
      return {Intrinsic::round, FTZ_MustBeOn};
    case Intrinsic::nvvm_sqrt_rn_d:
      return {Intrinsic::sqrt, FTZ_Any};
    case Intrinsic::nvvm_sqrt_f:
      // nvvm_sqrt_f is a special case. For most intrinsics, foo_ftz_f is the
      // ftz version, and foo_f is the non-ftz version. But nvvm_sqrt_f adopts
      // the ftz-ness of the surrounding code. sqrt_rn_f and sqrt_rn_ftz_f are
      // the versions with explicit ftz-ness.
      return {Intrinsic::sqrt, FTZ_Any};
    case Intrinsic::nvvm_sqrt_rn_f:
      return {Intrinsic::sqrt, FTZ_MustBeOff};
    case Intrinsic::nvvm_sqrt_rn_ftz_f:
      return {Intrinsic::sqrt, FTZ_MustBeOn};
    case Intrinsic::nvvm_trunc_d:
      return {Intrinsic::trunc, FTZ_Any};
    case Intrinsic::nvvm_trunc_f:
      return {Intrinsic::trunc, FTZ_MustBeOff};
    case Intrinsic::nvvm_trunc_ftz_f:
      return {Intrinsic::trunc, FTZ_MustBeOn};

    // NVVM intrinsics that map to LLVM cast operations.
    //
    // Note that llvm's target-generic conversion operators correspond to the
    // rz (round to zero) versions of the nvvm conversion intrinsics, even
    // though most everything else here uses the rn (round to nearest even)
    // nvvm ops.
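    //
    // For example (illustrative only), @llvm.nvvm.f2i.rz(float %x) below is
    // rewritten to the cast instruction
    //   %r = fptosi float %x to i32
    // which likewise rounds toward zero.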
    case Intrinsic::nvvm_d2i_rz:
    case Intrinsic::nvvm_f2i_rz:
    case Intrinsic::nvvm_d2ll_rz:
    case Intrinsic::nvvm_f2ll_rz:
      return {Instruction::FPToSI};
    case Intrinsic::nvvm_d2ui_rz:
    case Intrinsic::nvvm_f2ui_rz:
    case Intrinsic::nvvm_d2ull_rz:
    case Intrinsic::nvvm_f2ull_rz:
      return {Instruction::FPToUI};
    case Intrinsic::nvvm_i2d_rz:
    case Intrinsic::nvvm_i2f_rz:
    case Intrinsic::nvvm_ll2d_rz:
    case Intrinsic::nvvm_ll2f_rz:
      return {Instruction::SIToFP};
    case Intrinsic::nvvm_ui2d_rz:
    case Intrinsic::nvvm_ui2f_rz:
    case Intrinsic::nvvm_ull2d_rz:
    case Intrinsic::nvvm_ull2f_rz:
      return {Instruction::UIToFP};

    // NVVM intrinsics that map to LLVM binary ops.
    case Intrinsic::nvvm_add_rn_d:
      return {Instruction::FAdd, FTZ_Any};
    case Intrinsic::nvvm_add_rn_f:
      return {Instruction::FAdd, FTZ_MustBeOff};
    case Intrinsic::nvvm_add_rn_ftz_f:
      return {Instruction::FAdd, FTZ_MustBeOn};
    case Intrinsic::nvvm_mul_rn_d:
      return {Instruction::FMul, FTZ_Any};
    case Intrinsic::nvvm_mul_rn_f:
      return {Instruction::FMul, FTZ_MustBeOff};
    case Intrinsic::nvvm_mul_rn_ftz_f:
      return {Instruction::FMul, FTZ_MustBeOn};
    case Intrinsic::nvvm_div_rn_d:
      return {Instruction::FDiv, FTZ_Any};
    case Intrinsic::nvvm_div_rn_f:
      return {Instruction::FDiv, FTZ_MustBeOff};
    case Intrinsic::nvvm_div_rn_ftz_f:
      return {Instruction::FDiv, FTZ_MustBeOn};

    // The remainder of cases are NVVM intrinsics that map to LLVM idioms, but
    // need special handling.
    //
    // We seem to be missing intrinsics for rcp.approx.{ftz.}f32, which is
    // just as well.
    case Intrinsic::nvvm_rcp_rn_d:
      return {SPC_Reciprocal, FTZ_Any};
    case Intrinsic::nvvm_rcp_rn_f:
      return {SPC_Reciprocal, FTZ_MustBeOff};
    case Intrinsic::nvvm_rcp_rn_ftz_f:
      return {SPC_Reciprocal, FTZ_MustBeOn};

    // We do not currently simplify intrinsics that give an approximate
    // answer. These include:
    //
    //   - nvvm_cos_approx_{f,ftz_f}
    //   - nvvm_ex2_approx_{d,f,ftz_f}
    //   - nvvm_lg2_approx_{d,f,ftz_f}
    //   - nvvm_sin_approx_{f,ftz_f}
    //   - nvvm_sqrt_approx_{f,ftz_f}
    //   - nvvm_rsqrt_approx_{d,f,ftz_f}
    //   - nvvm_div_approx_{ftz_d,ftz_f,f}
    //   - nvvm_rcp_approx_ftz_d
    //
    // Ideally we'd encode them as e.g. "fast call @llvm.cos", where "fast"
    // means that fastmath is enabled in the intrinsic. Unfortunately only
    // binary operators (currently) have a fastmath bit in SelectionDAG, so
    // this information gets lost and we can't select on it.
    //
    // TODO: div and rcp are lowered to a binary op, so we could in theory
    // lower them to "fast fdiv".

    default:
      return {};
    }
  }();

  // If Action.FtzRequirementTy is not satisfied by the module's ftz state, we
  // can bail out now. (Notice that in the case that IID is not an NVVM
  // intrinsic, we don't have to look up any module metadata, as
  // FtzRequirementTy will be FTZ_Any.)
  if (Action.FtzRequirement != FTZ_Any) {
    const char *AttrName =
        Action.IsHalfTy ? "denormal-fp-math" : "denormal-fp-math-f32";
    StringRef Attr =
        II->getFunction()->getFnAttribute(AttrName).getValueAsString();
    DenormalMode Mode = parseDenormalFPAttribute(Attr);
    bool FtzEnabled = Mode.Output != DenormalMode::IEEE;

    if (FtzEnabled != (Action.FtzRequirement == FTZ_MustBeOn))
      return nullptr;
  }
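  // For instance (illustrative), a function carrying
  // "denormal-fp-math-f32"="preserve-sign,preserve-sign" parses to a non-IEEE
  // output mode, so FtzEnabled is true and only FTZ_MustBeOn rewrites of f32
  // intrinsics survive the check above.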

  // Simplify to target-generic intrinsic.
  if (Action.IID) {
    SmallVector<Value *, 4> Args(II->args());
    // All the target-generic intrinsics currently of interest to us have one
    // type argument, equal to that of the nvvm intrinsic's argument.
    Type *Tys[] = {II->getArgOperand(0)->getType()};
    return CallInst::Create(
        Intrinsic::getDeclaration(II->getModule(), *Action.IID, Tys), Args);
  }
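  // For example (illustrative), @llvm.nvvm.ceil.f(float %x) becomes a call to
  // the overloaded @llvm.ceil.f32(float %x), provided the ftz check above
  // allowed the rewrite.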

  // Simplify to target-generic binary op.
  if (Action.BinaryOp)
    return BinaryOperator::Create(*Action.BinaryOp, II->getArgOperand(0),
                                  II->getArgOperand(1), II->getName());

  // Simplify to target-generic cast op.
  if (Action.CastOp)
    return CastInst::Create(*Action.CastOp, II->getArgOperand(0), II->getType(),
                            II->getName());

  // All that's left are the special cases.
  if (!Action.Special)
    return nullptr;

  switch (*Action.Special) {
  case SPC_Reciprocal:
    // Simplify reciprocal.
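    // (Illustrative) @llvm.nvvm.rcp.rn.f(float %x) becomes:
    //   %r = fdiv float 1.000000e+00, %x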
    return BinaryOperator::Create(
        Instruction::FDiv, ConstantFP::get(II->getArgOperand(0)->getType(), 1),
        II->getArgOperand(0), II->getName());
  }
  llvm_unreachable("All SpecialCase enumerators should be handled in switch.");
}

Optional<Instruction *>
NVPTXTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  if (Instruction *I = simplifyNvvmIntrinsic(&II, IC)) {
    return I;
  }
  return None;
}

InstructionCost NVPTXTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info,
    TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
    const Instruction *CxtI) {
  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  switch (ISD) {
  default:
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
                                         Opd2Info, Opd1PropInfo, Opd2PropInfo);
  case ISD::ADD:
  case ISD::MUL:
  case ISD::XOR:
  case ISD::OR:
  case ISD::AND:
    // The machine code (SASS) simulates an i64 with two i32. Therefore, we
    // estimate that arithmetic operations on i64 are twice as expensive as
    // those on types that can fit into one machine register.
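    // (Illustrative) a plain i64 add, for instance, is reported at twice the
    // legalized base cost computed below.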
    if (LT.second.SimpleTy == MVT::i64)
      return 2 * LT.first;
    // Delegate other cases to the basic TTI.
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
                                         Opd2Info, Opd1PropInfo, Opd2PropInfo);
  }
}

void NVPTXTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::UnrollingPreferences &UP,
                                           OptimizationRemarkEmitter *ORE) {
  BaseT::getUnrollingPreferences(L, SE, UP, ORE);

  // Enable partial unrolling and runtime unrolling, but reduce the
  // threshold. This partially unrolls small loops which are often
  // unrolled by the PTX to SASS compiler, and unrolling earlier can be
  // beneficial.
  UP.Partial = UP.Runtime = true;
  UP.PartialThreshold = UP.Threshold / 4;
}

void NVPTXTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}