1 //===-- SystemZTargetTransformInfo.cpp - SystemZ-specific TTI -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements a TargetTransformInfo analysis pass specific to the
10 // SystemZ target machine. It uses the target's detailed information to provide
11 // more precise answers to certain TTI queries, while letting the target
12 // independent and default TTI implementations handle the rest.
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "SystemZTargetTransformInfo.h"
17 #include "llvm/Analysis/TargetTransformInfo.h"
18 #include "llvm/CodeGen/BasicTTIImpl.h"
19 #include "llvm/CodeGen/CostTable.h"
20 #include "llvm/CodeGen/TargetLowering.h"
21 #include "llvm/IR/IntrinsicInst.h"
22 #include "llvm/Support/Debug.h"
23 using namespace llvm;
24 
25 #define DEBUG_TYPE "systemztti"
26 
27 //===----------------------------------------------------------------------===//
28 //
29 // SystemZ cost model.
30 //
31 //===----------------------------------------------------------------------===//
32 
33 int SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
34                                   TTI::TargetCostKind CostKind) {
35   assert(Ty->isIntegerTy());
36 
37   unsigned BitSize = Ty->getPrimitiveSizeInBits();
38   // There is no cost model for constants with a bit size of 0. Return TCC_Free
39   // here, so that constant hoisting will ignore this constant.
40   if (BitSize == 0)
41     return TTI::TCC_Free;
  // No cost model implemented yet for integers wider than 64 bits.
43   if (BitSize > 64)
44     return TTI::TCC_Free;
45 
46   if (Imm == 0)
47     return TTI::TCC_Free;
48 
49   if (Imm.getBitWidth() <= 64) {
50     // Constants loaded via lgfi.
51     if (isInt<32>(Imm.getSExtValue()))
52       return TTI::TCC_Basic;
53     // Constants loaded via llilf.
54     if (isUInt<32>(Imm.getZExtValue()))
55       return TTI::TCC_Basic;
56     // Constants loaded via llihf:
57     if ((Imm.getZExtValue() & 0xffffffff) == 0)
58       return TTI::TCC_Basic;
59 
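    // Any other constant needs a two-instruction sequence, e.g. an immediate
    // load of one 32-bit half followed by an insert of the other half.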
60     return 2 * TTI::TCC_Basic;
61   }
62 
63   return 4 * TTI::TCC_Basic;
64 }
65 
66 int SystemZTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
67                                   const APInt &Imm, Type *Ty,
68                                   TTI::TargetCostKind CostKind) {
69   assert(Ty->isIntegerTy());
70 
71   unsigned BitSize = Ty->getPrimitiveSizeInBits();
72   // There is no cost model for constants with a bit size of 0. Return TCC_Free
73   // here, so that constant hoisting will ignore this constant.
74   if (BitSize == 0)
75     return TTI::TCC_Free;
  // No cost model implemented yet for integers wider than 64 bits.
77   if (BitSize > 64)
78     return TTI::TCC_Free;
79 
80   switch (Opcode) {
81   default:
82     return TTI::TCC_Free;
83   case Instruction::GetElementPtr:
84     // Always hoist the base address of a GetElementPtr. This prevents the
85     // creation of new constants for every base constant that gets constant
86     // folded with the offset.
87     if (Idx == 0)
88       return 2 * TTI::TCC_Basic;
89     return TTI::TCC_Free;
90   case Instruction::Store:
91     if (Idx == 0 && Imm.getBitWidth() <= 64) {
      // Any 8-bit immediate store can be implemented via mvi.
93       if (BitSize == 8)
94         return TTI::TCC_Free;
95       // 16-bit immediate values can be stored via mvhhi/mvhi/mvghi.
96       if (isInt<16>(Imm.getSExtValue()))
97         return TTI::TCC_Free;
98     }
99     break;
100   case Instruction::ICmp:
101     if (Idx == 1 && Imm.getBitWidth() <= 64) {
102       // Comparisons against signed 32-bit immediates implemented via cgfi.
103       if (isInt<32>(Imm.getSExtValue()))
104         return TTI::TCC_Free;
105       // Comparisons against unsigned 32-bit immediates implemented via clgfi.
106       if (isUInt<32>(Imm.getZExtValue()))
107         return TTI::TCC_Free;
108     }
109     break;
110   case Instruction::Add:
111   case Instruction::Sub:
112     if (Idx == 1 && Imm.getBitWidth() <= 64) {
113       // We use algfi/slgfi to add/subtract 32-bit unsigned immediates.
114       if (isUInt<32>(Imm.getZExtValue()))
115         return TTI::TCC_Free;
116       // Or their negation, by swapping addition vs. subtraction.
117       if (isUInt<32>(-Imm.getSExtValue()))
118         return TTI::TCC_Free;
119     }
120     break;
121   case Instruction::Mul:
122     if (Idx == 1 && Imm.getBitWidth() <= 64) {
123       // We use msgfi to multiply by 32-bit signed immediates.
124       if (isInt<32>(Imm.getSExtValue()))
125         return TTI::TCC_Free;
126     }
127     break;
128   case Instruction::Or:
129   case Instruction::Xor:
130     if (Idx == 1 && Imm.getBitWidth() <= 64) {
131       // Masks supported by oilf/xilf.
132       if (isUInt<32>(Imm.getZExtValue()))
133         return TTI::TCC_Free;
134       // Masks supported by oihf/xihf.
135       if ((Imm.getZExtValue() & 0xffffffff) == 0)
136         return TTI::TCC_Free;
137     }
138     break;
139   case Instruction::And:
140     if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // Any 32-bit AND operation can be implemented via nilf.
142       if (BitSize <= 32)
143         return TTI::TCC_Free;
144       // 64-bit masks supported by nilf.
145       if (isUInt<32>(~Imm.getZExtValue()))
146         return TTI::TCC_Free;
147       // 64-bit masks supported by nilh.
148       if ((Imm.getZExtValue() & 0xffffffff) == 0xffffffff)
149         return TTI::TCC_Free;
150       // Some 64-bit AND operations can be implemented via risbg.
151       const SystemZInstrInfo *TII = ST->getInstrInfo();
152       unsigned Start, End;
153       if (TII->isRxSBGMask(Imm.getZExtValue(), BitSize, Start, End))
154         return TTI::TCC_Free;
155     }
156     break;
157   case Instruction::Shl:
158   case Instruction::LShr:
159   case Instruction::AShr:
160     // Always return TCC_Free for the shift value of a shift instruction.
161     if (Idx == 1)
162       return TTI::TCC_Free;
163     break;
164   case Instruction::UDiv:
165   case Instruction::SDiv:
166   case Instruction::URem:
167   case Instruction::SRem:
168   case Instruction::Trunc:
169   case Instruction::ZExt:
170   case Instruction::SExt:
171   case Instruction::IntToPtr:
172   case Instruction::PtrToInt:
173   case Instruction::BitCast:
174   case Instruction::PHI:
175   case Instruction::Call:
176   case Instruction::Select:
177   case Instruction::Ret:
178   case Instruction::Load:
179     break;
180   }
181 
182   return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind);
183 }
184 
185 int SystemZTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
186                                         const APInt &Imm, Type *Ty,
187                                         TTI::TargetCostKind CostKind) {
188   assert(Ty->isIntegerTy());
189 
190   unsigned BitSize = Ty->getPrimitiveSizeInBits();
191   // There is no cost model for constants with a bit size of 0. Return TCC_Free
192   // here, so that constant hoisting will ignore this constant.
193   if (BitSize == 0)
194     return TTI::TCC_Free;
  // No cost model implemented yet for integers wider than 64 bits.
196   if (BitSize > 64)
197     return TTI::TCC_Free;
198 
199   switch (IID) {
200   default:
201     return TTI::TCC_Free;
202   case Intrinsic::sadd_with_overflow:
203   case Intrinsic::uadd_with_overflow:
204   case Intrinsic::ssub_with_overflow:
205   case Intrinsic::usub_with_overflow:
206     // These get expanded to include a normal addition/subtraction.
207     if (Idx == 1 && Imm.getBitWidth() <= 64) {
208       if (isUInt<32>(Imm.getZExtValue()))
209         return TTI::TCC_Free;
210       if (isUInt<32>(-Imm.getSExtValue()))
211         return TTI::TCC_Free;
212     }
213     break;
214   case Intrinsic::smul_with_overflow:
215   case Intrinsic::umul_with_overflow:
216     // These get expanded to include a normal multiplication.
217     if (Idx == 1 && Imm.getBitWidth() <= 64) {
218       if (isInt<32>(Imm.getSExtValue()))
219         return TTI::TCC_Free;
220     }
221     break;
222   case Intrinsic::experimental_stackmap:
223     if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
224       return TTI::TCC_Free;
225     break;
226   case Intrinsic::experimental_patchpoint_void:
227   case Intrinsic::experimental_patchpoint_i64:
228     if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
229       return TTI::TCC_Free;
230     break;
231   }
232   return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind);
233 }
234 
235 TargetTransformInfo::PopcntSupportKind
236 SystemZTTIImpl::getPopcntSupport(unsigned TyWidth) {
237   assert(isPowerOf2_32(TyWidth) && "Type width must be power of 2");
238   if (ST->hasPopulationCount() && TyWidth <= 64)
239     return TTI::PSK_FastHardware;
240   return TTI::PSK_Software;
241 }
242 
243 void SystemZTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
244                                              TTI::UnrollingPreferences &UP) {
  // Find out if L contains a call, and estimate the number of stores it
  // contains (in machine instructions; memcpy/memset count as stores).
247   bool HasCall = false;
248   unsigned NumStores = 0;
249   for (auto &BB : L->blocks())
250     for (auto &I : *BB) {
251       if (isa<CallInst>(&I) || isa<InvokeInst>(&I)) {
252         if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
253           if (isLoweredToCall(F))
254             HasCall = true;
255           if (F->getIntrinsicID() == Intrinsic::memcpy ||
256               F->getIntrinsicID() == Intrinsic::memset)
257             NumStores++;
258         } else { // indirect call.
259           HasCall = true;
260         }
261       }
262       if (isa<StoreInst>(&I)) {
263         Type *MemAccessTy = I.getOperand(0)->getType();
264         NumStores += getMemoryOpCost(Instruction::Store, MemAccessTy, None, 0,
265                                      TTI::TCK_RecipThroughput);
266       }
267     }
268 
269   // The z13 processor will run out of store tags if too many stores
270   // are fed into it too quickly. Therefore make sure there are not
271   // too many stores in the resulting unrolled loop.
272   unsigned const Max = (NumStores ? (12 / NumStores) : UINT_MAX);
273 
274   if (HasCall) {
    // If the loop has any calls, only allow full unrolling.
276     UP.FullUnrollMaxCount = Max;
277     UP.MaxCount = 1;
278     return;
279   }
280 
281   UP.MaxCount = Max;
282   if (UP.MaxCount <= 1)
283     return;
284 
285   // Allow partial and runtime trip count unrolling.
286   UP.Partial = UP.Runtime = true;
287 
288   UP.PartialThreshold = 75;
289   UP.DefaultUnrollRuntimeCount = 4;
290 
291   // Allow expensive instructions in the pre-header of the loop.
292   UP.AllowExpensiveTripCount = true;
293 
294   UP.Force = true;
295 }
296 
298 bool SystemZTTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
299                                    TargetTransformInfo::LSRCost &C2) {
300   // SystemZ specific: check instruction count (first), and don't care about
301   // ImmCost, since offsets are checked explicitly.
302   return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
303                   C1.NumIVMuls, C1.NumBaseAdds,
304                   C1.ScaleCost, C1.SetupCost) <
305     std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
306              C2.NumIVMuls, C2.NumBaseAdds,
307              C2.ScaleCost, C2.SetupCost);
308 }
309 
310 unsigned SystemZTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
311   bool Vector = (ClassID == 1);
312   if (!Vector)
313     // Discount the stack pointer.  Also leave out %r0, since it can't
314     // be used in an address.
315     return 14;
316   if (ST->hasVector())
317     return 32;
318   return 0;
319 }
320 
321 unsigned SystemZTTIImpl::getRegisterBitWidth(bool Vector) const {
322   if (!Vector)
323     return 64;
324   if (ST->hasVector())
325     return 128;
326   return 0;
327 }
328 
329 unsigned SystemZTTIImpl::getMinPrefetchStride(unsigned NumMemAccesses,
330                                               unsigned NumStridedMemAccesses,
331                                               unsigned NumPrefetches,
332                                               bool HasCall) const {
333   // Don't prefetch a loop with many far apart accesses.
334   if (NumPrefetches > 16)
335     return UINT_MAX;
336 
337   // Emit prefetch instructions for smaller strides in cases where we think
338   // the hardware prefetcher might not be able to keep up.
339   if (NumStridedMemAccesses > 32 &&
340       NumStridedMemAccesses == NumMemAccesses && !HasCall)
341     return 1;
342 
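  // Otherwise require a fairly large stride before emitting software
  // prefetches; smaller strides are handled well by the hardware prefetcher.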
343   return ST->hasMiscellaneousExtensions3() ? 8192 : 2048;
344 }
345 
346 bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
347   EVT VT = TLI->getValueType(DL, DataType);
348   return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
349 }
350 
351 // Return the bit size for the scalar type or vector element
352 // type. getScalarSizeInBits() returns 0 for a pointer type.
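// Pointers are therefore counted as 64 bits, the GPR width on SystemZ.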
353 static unsigned getScalarSizeInBits(Type *Ty) {
354   unsigned Size =
355     (Ty->isPtrOrPtrVectorTy() ? 64U : Ty->getScalarSizeInBits());
356   assert(Size > 0 && "Element must have non-zero size.");
357   return Size;
358 }
359 
360 // getNumberOfParts() calls getTypeLegalizationCost() which splits the vector
361 // type until it is legal. This would e.g. return 4 for <6 x i64>, instead of
362 // 3.
363 static unsigned getNumVectorRegs(Type *Ty) {
364   auto *VTy = cast<FixedVectorType>(Ty);
365   unsigned WideBits = getScalarSizeInBits(Ty) * VTy->getNumElements();
366   assert(WideBits > 0 && "Could not compute size of vector");
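  // Round up to the number of full 128-bit vector registers needed.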
367   return ((WideBits % 128U) ? ((WideBits / 128U) + 1) : (WideBits / 128U));
368 }
369 
370 int SystemZTTIImpl::getArithmeticInstrCost(
371     unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
372     TTI::OperandValueKind Op1Info,
373     TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
374     TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
375     const Instruction *CxtI) {
376 
377   // TODO: Handle more cost kinds.
378   if (CostKind != TTI::TCK_RecipThroughput)
379     return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
380                                          Op2Info, Opd1PropInfo,
381                                          Opd2PropInfo, Args, CxtI);
382 
383   // TODO: return a good value for BB-VECTORIZER that includes the
384   // immediate loads, which we do not want to count for the loop
385   // vectorizer, since they are hopefully hoisted out of the loop. This
386   // would require a new parameter 'InLoop', but not sure if constant
387   // args are common enough to motivate this.
388 
389   unsigned ScalarBits = Ty->getScalarSizeInBits();
390 
  // There are three cases of division and remainder: dividing with a register
  // needs a divide instruction, dividing by a power-of-two constant can be
  // done with a sequence of shifts, and any other constant divisor needs a
  // multiply-and-shift sequence.
395   const unsigned DivInstrCost = 20;
396   const unsigned DivMulSeqCost = 10;
397   const unsigned SDivPow2Cost = 4;
398 
399   bool SignedDivRem =
400       Opcode == Instruction::SDiv || Opcode == Instruction::SRem;
401   bool UnsignedDivRem =
402       Opcode == Instruction::UDiv || Opcode == Instruction::URem;
403 
404   // Check for a constant divisor.
405   bool DivRemConst = false;
406   bool DivRemConstPow2 = false;
407   if ((SignedDivRem || UnsignedDivRem) && Args.size() == 2) {
408     if (const Constant *C = dyn_cast<Constant>(Args[1])) {
409       const ConstantInt *CVal =
410           (C->getType()->isVectorTy()
411                ? dyn_cast_or_null<const ConstantInt>(C->getSplatValue())
412                : dyn_cast<const ConstantInt>(C));
413       if (CVal != nullptr &&
414           (CVal->getValue().isPowerOf2() || (-CVal->getValue()).isPowerOf2()))
415         DivRemConstPow2 = true;
416       else
417         DivRemConst = true;
418     }
419   }
420 
421   if (!Ty->isVectorTy()) {
422     // These FP operations are supported with a dedicated instruction for
423     // float, double and fp128 (base implementation assumes float generally
424     // costs 2).
425     if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
426         Opcode == Instruction::FMul || Opcode == Instruction::FDiv)
427       return 1;
428 
429     // There is no native support for FRem.
430     if (Opcode == Instruction::FRem)
431       return LIBCALL_COST;
432 
    // Give a discount for some combined logical operations if supported.
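    // (The miscellaneous-extensions-3 facility provides combined operations
    // such as nand, nor, xnor and and/or-with-complement, which can fold
    // away one of the two logical instructions.)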
434     if (Args.size() == 2 && ST->hasMiscellaneousExtensions3()) {
435       if (Opcode == Instruction::Xor) {
436         for (const Value *A : Args) {
437           if (const Instruction *I = dyn_cast<Instruction>(A))
438             if (I->hasOneUse() &&
439                 (I->getOpcode() == Instruction::And ||
440                  I->getOpcode() == Instruction::Or ||
441                  I->getOpcode() == Instruction::Xor))
442               return 0;
443         }
444       }
445       else if (Opcode == Instruction::Or || Opcode == Instruction::And) {
446         for (const Value *A : Args) {
447           if (const Instruction *I = dyn_cast<Instruction>(A))
448             if (I->hasOneUse() && I->getOpcode() == Instruction::Xor)
449               return 0;
450         }
451       }
452     }
453 
454     // Or requires one instruction, although it has custom handling for i64.
455     if (Opcode == Instruction::Or)
456       return 1;
457 
458     if (Opcode == Instruction::Xor && ScalarBits == 1) {
459       if (ST->hasLoadStoreOnCond2())
460         return 5; // 2 * (li 0; loc 1); xor
461       return 7; // 2 * ipm sequences ; xor ; shift ; compare
462     }
463 
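    // Division by a power-of-2 constant is a single shift when unsigned, and
    // a short shift/add sequence when signed (to round towards zero).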
464     if (DivRemConstPow2)
465       return (SignedDivRem ? SDivPow2Cost : 1);
466     if (DivRemConst)
467       return DivMulSeqCost;
468     if (SignedDivRem || UnsignedDivRem)
469       return DivInstrCost;
470   }
471   else if (ST->hasVector()) {
472     auto *VTy = cast<FixedVectorType>(Ty);
473     unsigned VF = VTy->getNumElements();
474     unsigned NumVectors = getNumVectorRegs(Ty);
475 
476     // These vector operations are custom handled, but are still supported
477     // with one instruction per vector, regardless of element size.
478     if (Opcode == Instruction::Shl || Opcode == Instruction::LShr ||
479         Opcode == Instruction::AShr) {
480       return NumVectors;
481     }
482 
483     if (DivRemConstPow2)
484       return (NumVectors * (SignedDivRem ? SDivPow2Cost : 1));
485     if (DivRemConst)
486       return VF * DivMulSeqCost + getScalarizationOverhead(VTy, Args);
487     if ((SignedDivRem || UnsignedDivRem) && VF > 4)
488       // Temporary hack: disable high vectorization factors with integer
489       // division/remainder, which will get scalarized and handled with
      // GR128 registers. The machine scheduler is not clever enough to
      // avoid spilling yet.
492       return 1000;
493 
494     // These FP operations are supported with a single vector instruction for
495     // double (base implementation assumes float generally costs 2). For
496     // FP128, the scalar cost is 1, and there is no overhead since the values
497     // are already in scalar registers.
498     if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
499         Opcode == Instruction::FMul || Opcode == Instruction::FDiv) {
500       switch (ScalarBits) {
501       case 32: {
502         // The vector enhancements facility 1 provides v4f32 instructions.
503         if (ST->hasVectorEnhancements1())
504           return NumVectors;
        // Return the cost of multiple scalar invocations plus the cost of
506         // inserting and extracting the values.
507         unsigned ScalarCost =
508             getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
509         unsigned Cost = (VF * ScalarCost) + getScalarizationOverhead(VTy, Args);
        // FIXME: VF 2 for these FP operations is currently just as
        // expensive as for VF 4.
512         if (VF == 2)
513           Cost *= 2;
514         return Cost;
515       }
516       case 64:
517       case 128:
518         return NumVectors;
519       default:
520         break;
521       }
522     }
523 
524     // There is no native support for FRem.
525     if (Opcode == Instruction::FRem) {
526       unsigned Cost = (VF * LIBCALL_COST) + getScalarizationOverhead(VTy, Args);
527       // FIXME: VF 2 for float is currently just as expensive as for VF 4.
528       if (VF == 2 && ScalarBits == 32)
529         Cost *= 2;
530       return Cost;
531     }
532   }
533 
534   // Fallback to the default implementation.
535   return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
536                                        Opd1PropInfo, Opd2PropInfo, Args, CxtI);
537 }
538 
539 int SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
540                                    int Index, VectorType *SubTp) {
541   if (ST->hasVector()) {
542     unsigned NumVectors = getNumVectorRegs(Tp);
543 
544     // TODO: Since fp32 is expanded, the shuffle cost should always be 0.
545 
546     // FP128 values are always in scalar registers, so there is no work
547     // involved with a shuffle, except for broadcast. In that case register
548     // moves are done with a single instruction per element.
549     if (Tp->getScalarType()->isFP128Ty())
550       return (Kind == TargetTransformInfo::SK_Broadcast ? NumVectors - 1 : 0);
551 
552     switch (Kind) {
    case TargetTransformInfo::SK_ExtractSubvector:
554       // ExtractSubvector Index indicates start offset.
555 
      // Extracting a subvector starting at index 0 is a noop.
557       return (Index == 0 ? 0 : NumVectors);
558 
559     case TargetTransformInfo::SK_Broadcast:
560       // Loop vectorizer calls here to figure out the extra cost of
561       // broadcasting a loaded value to all elements of a vector. Since vlrep
562       // loads and replicates with a single instruction, adjust the returned
563       // value.
564       return NumVectors - 1;
565 
566     default:
567 
568       // SystemZ supports single instruction permutation / replication.
569       return NumVectors;
570     }
571   }
572 
573   return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
574 }
575 
576 // Return the log2 difference of the element sizes of the two vector types.
577 static unsigned getElSizeLog2Diff(Type *Ty0, Type *Ty1) {
578   unsigned Bits0 = Ty0->getScalarSizeInBits();
579   unsigned Bits1 = Ty1->getScalarSizeInBits();
580 
581   if (Bits1 >  Bits0)
582     return (Log2_32(Bits1) - Log2_32(Bits0));
583 
584   return (Log2_32(Bits0) - Log2_32(Bits1));
585 }
586 
587 // Return the number of instructions needed to truncate SrcTy to DstTy.
588 unsigned SystemZTTIImpl::
589 getVectorTruncCost(Type *SrcTy, Type *DstTy) {
590   assert (SrcTy->isVectorTy() && DstTy->isVectorTy());
591   assert (SrcTy->getPrimitiveSizeInBits() > DstTy->getPrimitiveSizeInBits() &&
592           "Packing must reduce size of vector type.");
593   assert(cast<FixedVectorType>(SrcTy)->getNumElements() ==
594              cast<FixedVectorType>(DstTy)->getNumElements() &&
595          "Packing should not change number of elements.");
596 
597   // TODO: Since fp32 is expanded, the extract cost should always be 0.
598 
599   unsigned NumParts = getNumVectorRegs(SrcTy);
600   if (NumParts <= 2)
601     // Up to 2 vector registers can be truncated efficiently with pack or
602     // permute. The latter requires an immediate mask to be loaded, which
603     // typically gets hoisted out of a loop.  TODO: return a good value for
604     // BB-VECTORIZER that includes the immediate loads, which we do not want
605     // to count for the loop vectorizer.
606     return 1;
607 
608   unsigned Cost = 0;
609   unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
610   unsigned VF = cast<FixedVectorType>(SrcTy)->getNumElements();
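  // Each halving of the element size is one packing step, and each step
  // needs one pack/permute per resulting vector register.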
611   for (unsigned P = 0; P < Log2Diff; ++P) {
612     if (NumParts > 1)
613       NumParts /= 2;
614     Cost += NumParts;
615   }
616 
  // Currently, isel emits a general mix of permute and pack instructions
  // that follows the cost computation above, except for this case, which
  // takes one instruction less:
620   if (VF == 8 && SrcTy->getScalarSizeInBits() == 64 &&
621       DstTy->getScalarSizeInBits() == 8)
622     Cost--;
623 
624   return Cost;
625 }
626 
627 // Return the cost of converting a vector bitmask produced by a compare
628 // (SrcTy), to the type of the select or extend instruction (DstTy).
629 unsigned SystemZTTIImpl::
630 getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy) {
631   assert (SrcTy->isVectorTy() && DstTy->isVectorTy() &&
632           "Should only be called with vector types.");
633 
634   unsigned PackCost = 0;
635   unsigned SrcScalarBits = SrcTy->getScalarSizeInBits();
636   unsigned DstScalarBits = DstTy->getScalarSizeInBits();
637   unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
638   if (SrcScalarBits > DstScalarBits)
639     // The bitmask will be truncated.
640     PackCost = getVectorTruncCost(SrcTy, DstTy);
641   else if (SrcScalarBits < DstScalarBits) {
642     unsigned DstNumParts = getNumVectorRegs(DstTy);
643     // Each vector select needs its part of the bitmask unpacked.
644     PackCost = Log2Diff * DstNumParts;
645     // Extra cost for moving part of mask before unpacking.
646     PackCost += DstNumParts - 1;
647   }
648 
649   return PackCost;
650 }
651 
652 // Return the type of the compared operands. This is needed to compute the
653 // cost for a Select / ZExt or SExt instruction.
654 static Type *getCmpOpsType(const Instruction *I, unsigned VF = 1) {
655   Type *OpTy = nullptr;
656   if (CmpInst *CI = dyn_cast<CmpInst>(I->getOperand(0)))
657     OpTy = CI->getOperand(0)->getType();
658   else if (Instruction *LogicI = dyn_cast<Instruction>(I->getOperand(0)))
659     if (LogicI->getNumOperands() == 2)
660       if (CmpInst *CI0 = dyn_cast<CmpInst>(LogicI->getOperand(0)))
661         if (isa<CmpInst>(LogicI->getOperand(1)))
662           OpTy = CI0->getOperand(0)->getType();
663 
664   if (OpTy != nullptr) {
665     if (VF == 1) {
666       assert (!OpTy->isVectorTy() && "Expected scalar type");
667       return OpTy;
668     }
    // Return the potentially vectorized type based on 'I' and 'VF'.  'I' may
    // be either scalar or already vectorized with the same or a smaller VF.
671     Type *ElTy = OpTy->getScalarType();
672     return FixedVectorType::get(ElTy, VF);
673   }
674 
675   return nullptr;
676 }
677 
// Get the cost of converting a boolean vector to a vector with the same
// width and element size as Dst, plus the cost of zero extending if needed.
680 unsigned SystemZTTIImpl::
681 getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst,
682                               const Instruction *I) {
683   auto *DstVTy = cast<FixedVectorType>(Dst);
684   unsigned VF = DstVTy->getNumElements();
685   unsigned Cost = 0;
  // If we know the width of the compared operands, get the cost of
  // converting it to match Dst. Otherwise assume the widths are the same.
688   Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
689   if (CmpOpTy != nullptr)
690     Cost = getVectorBitmaskConversionCost(CmpOpTy, Dst);
691   if (Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP)
692     // One 'vn' per dst vector with an immediate mask.
693     Cost += getNumVectorRegs(Dst);
694   return Cost;
695 }
696 
697 int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
698                                      TTI::TargetCostKind CostKind,
699                                      const Instruction *I) {
700   // FIXME: Can the logic below also be used for these cost kinds?
701   if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) {
702     int BaseCost = BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I);
703     return BaseCost == 0 ? BaseCost : 1;
704   }
705 
706   unsigned DstScalarBits = Dst->getScalarSizeInBits();
707   unsigned SrcScalarBits = Src->getScalarSizeInBits();
708 
709   if (!Src->isVectorTy()) {
710     assert (!Dst->isVectorTy());
711 
712     if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP) {
713       if (SrcScalarBits >= 32 ||
714           (I != nullptr && isa<LoadInst>(I->getOperand(0))))
715         return 1;
716       return SrcScalarBits > 1 ? 2 /*i8/i16 extend*/ : 5 /*branch seq.*/;
717     }
718 
719     if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
720         Src->isIntegerTy(1)) {
721       if (ST->hasLoadStoreOnCond2())
722         return 2; // li 0; loc 1
723 
      // This should be an extension of a compare i1 result, which is done
      // with ipm and a varying sequence of instructions.
726       unsigned Cost = 0;
727       if (Opcode == Instruction::SExt)
728         Cost = (DstScalarBits < 64 ? 3 : 4);
729       if (Opcode == Instruction::ZExt)
730         Cost = 3;
731       Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
732       if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
        // If operands of fp type were compared, this costs +1.
734         Cost++;
735       return Cost;
736     }
737   }
738   else if (ST->hasVector()) {
739     auto *SrcVecTy = cast<FixedVectorType>(Src);
740     auto *DstVecTy = cast<FixedVectorType>(Dst);
741     unsigned VF = SrcVecTy->getNumElements();
742     unsigned NumDstVectors = getNumVectorRegs(Dst);
743     unsigned NumSrcVectors = getNumVectorRegs(Src);
744 
745     if (Opcode == Instruction::Trunc) {
746       if (Src->getScalarSizeInBits() == Dst->getScalarSizeInBits())
747         return 0; // Check for NOOP conversions.
748       return getVectorTruncCost(Src, Dst);
749     }
750 
751     if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
752       if (SrcScalarBits >= 8) {
753         // ZExt/SExt will be handled with one unpack per doubling of width.
754         unsigned NumUnpacks = getElSizeLog2Diff(Src, Dst);
755 
        // For types that span multiple vector registers, some additional
        // instructions are used to set up the unpacking.
758         unsigned NumSrcVectorOps =
759           (NumUnpacks > 1 ? (NumDstVectors - NumSrcVectors)
760                           : (NumDstVectors / 2));
761 
762         return (NumUnpacks * NumDstVectors) + NumSrcVectorOps;
763       }
764       else if (SrcScalarBits == 1)
765         return getBoolVecToIntConversionCost(Opcode, Dst, I);
766     }
767 
768     if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP ||
769         Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) {
      // TODO: Fix the base implementation, which could simplify things a bit
      // here (it seems to miss differentiating scalar from vector types).
772 
773       // Only 64 bit vector conversions are natively supported before z15.
774       if (DstScalarBits == 64 || ST->hasVectorEnhancements2()) {
775         if (SrcScalarBits == DstScalarBits)
776           return NumDstVectors;
777 
778         if (SrcScalarBits == 1)
779           return getBoolVecToIntConversionCost(Opcode, Dst, I) + NumDstVectors;
780       }
781 
      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values. The base implementation does
      // not realize that float->int gets scalarized.
785       unsigned ScalarCost = getCastInstrCost(Opcode, Dst->getScalarType(),
786                                              Src->getScalarType(), CostKind);
787       unsigned TotCost = VF * ScalarCost;
788       bool NeedsInserts = true, NeedsExtracts = true;
789       // FP128 registers do not get inserted or extracted.
790       if (DstScalarBits == 128 &&
791           (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP))
792         NeedsInserts = false;
793       if (SrcScalarBits == 128 &&
794           (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
795         NeedsExtracts = false;
796 
797       TotCost += getScalarizationOverhead(SrcVecTy, false, NeedsExtracts);
798       TotCost += getScalarizationOverhead(DstVecTy, NeedsInserts, false);
799 
800       // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
801       if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
802         TotCost *= 2;
803 
804       return TotCost;
805     }
806 
807     if (Opcode == Instruction::FPTrunc) {
808       if (SrcScalarBits == 128)  // fp128 -> double/float + inserts of elements.
809         return VF /*ldxbr/lexbr*/ +
810                getScalarizationOverhead(DstVecTy, true, false);
811       else // double -> float
812         return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/);
813     }
814 
815     if (Opcode == Instruction::FPExt) {
816       if (SrcScalarBits == 32 && DstScalarBits == 64) {
817         // float -> double is very rare and currently unoptimized. Instead of
818         // using vldeb, which can do two at a time, all conversions are
819         // scalarized.
820         return VF * 2;
821       }
822       // -> fp128.  VF * lxdb/lxeb + extraction of elements.
823       return VF + getScalarizationOverhead(SrcVecTy, false, true);
824     }
825   }
826 
827   return BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I);
828 }
829 
// Scalar i8 / i16 operations are typically performed after first extending
// the operands to i32.
832 static unsigned getOperandsExtensionCost(const Instruction *I) {
833   unsigned ExtCost = 0;
834   for (Value *Op : I->operands())
    // A load of i8 or i16 sign/zero extends to i32, and a constant immediate
    // needs no extension either.
836     if (!isa<LoadInst>(Op) && !isa<ConstantInt>(Op))
837       ExtCost++;
838 
839   return ExtCost;
840 }
841 
842 int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
843                                        Type *CondTy,
844                                        TTI::TargetCostKind CostKind,
845                                        const Instruction *I) {
846   if (CostKind != TTI::TCK_RecipThroughput)
847     return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind);
848 
849   if (!ValTy->isVectorTy()) {
850     switch (Opcode) {
851     case Instruction::ICmp: {
      // A load with multiple users that is compared with 0 becomes Load and
      // Test. The load is then not foldable, so return 0 cost for the ICmp.
854       unsigned ScalarBits = ValTy->getScalarSizeInBits();
855       if (I != nullptr && ScalarBits >= 32)
856         if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
857           if (const ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1)))
858             if (!Ld->hasOneUse() && Ld->getParent() == I->getParent() &&
859                 C->getZExtValue() == 0)
860               return 0;
861 
862       unsigned Cost = 1;
863       if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16)
864         Cost += (I != nullptr ? getOperandsExtensionCost(I) : 2);
865       return Cost;
866     }
867     case Instruction::Select:
868       if (ValTy->isFloatingPointTy())
869         return 4; // No load on condition for FP - costs a conditional jump.
870       return 1; // Load On Condition / Select Register.
871     }
872   }
873   else if (ST->hasVector()) {
874     unsigned VF = cast<FixedVectorType>(ValTy)->getNumElements();
875 
876     // Called with a compare instruction.
877     if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
878       unsigned PredicateExtraCost = 0;
879       if (I != nullptr) {
880         // Some predicates cost one or two extra instructions.
881         switch (cast<CmpInst>(I)->getPredicate()) {
882         case CmpInst::Predicate::ICMP_NE:
883         case CmpInst::Predicate::ICMP_UGE:
884         case CmpInst::Predicate::ICMP_ULE:
885         case CmpInst::Predicate::ICMP_SGE:
886         case CmpInst::Predicate::ICMP_SLE:
887           PredicateExtraCost = 1;
888           break;
889         case CmpInst::Predicate::FCMP_ONE:
890         case CmpInst::Predicate::FCMP_ORD:
891         case CmpInst::Predicate::FCMP_UEQ:
892         case CmpInst::Predicate::FCMP_UNO:
893           PredicateExtraCost = 2;
894           break;
895         default:
896           break;
897         }
898       }
899 
      // Float is handled with 2*vmr[lh]f + 2*vldeb + vfchdb for each pair of
      // floats.  FIXME: <2 x float> generates the same code as <4 x float>.
902       unsigned CmpCostPerVector = (ValTy->getScalarType()->isFloatTy() ? 10 : 1);
903       unsigned NumVecs_cmp = getNumVectorRegs(ValTy);
904 
905       unsigned Cost = (NumVecs_cmp * (CmpCostPerVector + PredicateExtraCost));
906       return Cost;
907     }
908     else { // Called with a select instruction.
909       assert (Opcode == Instruction::Select);
910 
911       // We can figure out the extra cost of packing / unpacking if the
912       // instruction was passed and the compare instruction is found.
913       unsigned PackCost = 0;
914       Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
915       if (CmpOpTy != nullptr)
916         PackCost =
917           getVectorBitmaskConversionCost(CmpOpTy, ValTy);
918 
919       return getNumVectorRegs(ValTy) /*vsel*/ + PackCost;
920     }
921   }
922 
923   return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind);
924 }
925 
926 int SystemZTTIImpl::
927 getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
  // vlvgp will insert two GPRs into a vector register, so only count half
  // the number of instructions.
930   if (Opcode == Instruction::InsertElement && Val->isIntOrIntVectorTy(64))
931     return ((Index % 2 == 0) ? 1 : 0);
932 
933   if (Opcode == Instruction::ExtractElement) {
934     int Cost = ((getScalarSizeInBits(Val) == 1) ? 2 /*+test-under-mask*/ : 1);
935 
936     // Give a slight penalty for moving out of vector pipeline to FXU unit.
937     if (Index == 0 && Val->isIntOrIntVectorTy())
938       Cost += 1;
939 
940     return Cost;
941   }
942 
943   return BaseT::getVectorInstrCost(Opcode, Val, Index);
944 }
945 
946 // Check if a load may be folded as a memory operand in its user.
947 bool SystemZTTIImpl::
948 isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue) {
949   if (!Ld->hasOneUse())
950     return false;
951   FoldedValue = Ld;
952   const Instruction *UserI = cast<Instruction>(*Ld->user_begin());
953   unsigned LoadedBits = getScalarSizeInBits(Ld->getType());
954   unsigned TruncBits = 0;
955   unsigned SExtBits = 0;
956   unsigned ZExtBits = 0;
957   if (UserI->hasOneUse()) {
958     unsigned UserBits = UserI->getType()->getScalarSizeInBits();
959     if (isa<TruncInst>(UserI))
960       TruncBits = UserBits;
961     else if (isa<SExtInst>(UserI))
962       SExtBits = UserBits;
963     else if (isa<ZExtInst>(UserI))
964       ZExtBits = UserBits;
965   }
966   if (TruncBits || SExtBits || ZExtBits) {
967     FoldedValue = UserI;
968     UserI = cast<Instruction>(*UserI->user_begin());
969     // Load (single use) -> trunc/extend (single use) -> UserI
970   }
971   if ((UserI->getOpcode() == Instruction::Sub ||
972        UserI->getOpcode() == Instruction::SDiv ||
973        UserI->getOpcode() == Instruction::UDiv) &&
974       UserI->getOperand(1) != FoldedValue)
975     return false; // Not commutative, only RHS foldable.
  // LoadOrTruncBits holds the number of effectively loaded bits, but 0 if
  // the load was extended.
978   unsigned LoadOrTruncBits =
979       ((SExtBits || ZExtBits) ? 0 : (TruncBits ? TruncBits : LoadedBits));
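  // Note that the cases below deliberately fall through: each opcode accepts
  // the extension forms listed for it plus everything checked further down.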
980   switch (UserI->getOpcode()) {
981   case Instruction::Add: // SE: 16->32, 16/32->64, z14:16->64. ZE: 32->64
982   case Instruction::Sub:
983   case Instruction::ICmp:
984     if (LoadedBits == 32 && ZExtBits == 64)
985       return true;
986     LLVM_FALLTHROUGH;
987   case Instruction::Mul: // SE: 16->32, 32->64, z14:16->64
988     if (UserI->getOpcode() != Instruction::ICmp) {
989       if (LoadedBits == 16 &&
990           (SExtBits == 32 ||
991            (SExtBits == 64 && ST->hasMiscellaneousExtensions2())))
992         return true;
993       if (LoadOrTruncBits == 16)
994         return true;
995     }
996     LLVM_FALLTHROUGH;
997   case Instruction::SDiv:// SE: 32->64
998     if (LoadedBits == 32 && SExtBits == 64)
999       return true;
1000     LLVM_FALLTHROUGH;
1001   case Instruction::UDiv:
1002   case Instruction::And:
1003   case Instruction::Or:
1004   case Instruction::Xor:
    // This also makes sense for float operations, but is disabled for now
    // due to regressions.
1007     // case Instruction::FCmp:
1008     // case Instruction::FAdd:
1009     // case Instruction::FSub:
1010     // case Instruction::FMul:
1011     // case Instruction::FDiv:
1012 
    // All possible extensions of the loaded value were checked above.
1014 
1015     // Comparison between memory and immediate.
1016     if (UserI->getOpcode() == Instruction::ICmp)
1017       if (ConstantInt *CI = dyn_cast<ConstantInt>(UserI->getOperand(1)))
1018         if (isUInt<16>(CI->getZExtValue()))
1019           return true;
1020     return (LoadOrTruncBits == 32 || LoadOrTruncBits == 64);
1021     break;
1022   }
1023   return false;
1024 }
1025 
1026 static bool isBswapIntrinsicCall(const Value *V) {
1027   if (const Instruction *I = dyn_cast<Instruction>(V))
1028     if (auto *CI = dyn_cast<CallInst>(I))
1029       if (auto *F = CI->getCalledFunction())
1030         if (F->getIntrinsicID() == Intrinsic::bswap)
1031           return true;
1032   return false;
1033 }
1034 
1035 int SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1036                                     MaybeAlign Alignment, unsigned AddressSpace,
1037                                     TTI::TargetCostKind CostKind,
1038                                     const Instruction *I) {
1039   assert(!Src->isVoidTy() && "Invalid type");
1040 
1041   // TODO: Handle other cost kinds.
1042   if (CostKind != TTI::TCK_RecipThroughput)
1043     return 1;
1044 
1045   if (!Src->isVectorTy() && Opcode == Instruction::Load && I != nullptr) {
1046     // Store the load or its truncated or extended value in FoldedValue.
1047     const Instruction *FoldedValue = nullptr;
1048     if (isFoldableLoad(cast<LoadInst>(I), FoldedValue)) {
1049       const Instruction *UserI = cast<Instruction>(*FoldedValue->user_begin());
1050       assert (UserI->getNumOperands() == 2 && "Expected a binop.");
1051 
      // UserI can't fold two loads, so if both operands are foldable loads,
      // only the one feeding the first operand is treated as folded, i.e. a
      // cost of 0 is returned only half of the time.
1054       for (unsigned i = 0; i < 2; ++i) {
1055         if (UserI->getOperand(i) == FoldedValue)
1056           continue;
1057 
1058         if (Instruction *OtherOp = dyn_cast<Instruction>(UserI->getOperand(i))){
1059           LoadInst *OtherLoad = dyn_cast<LoadInst>(OtherOp);
1060           if (!OtherLoad &&
1061               (isa<TruncInst>(OtherOp) || isa<SExtInst>(OtherOp) ||
1062                isa<ZExtInst>(OtherOp)))
1063             OtherLoad = dyn_cast<LoadInst>(OtherOp->getOperand(0));
1064           if (OtherLoad && isFoldableLoad(OtherLoad, FoldedValue/*dummy*/))
1065             return i == 0; // Both operands foldable.
1066         }
1067       }
1068 
1069       return 0; // Only I is foldable in user.
1070     }
1071   }
1072 
1073   unsigned NumOps =
1074     (Src->isVectorTy() ? getNumVectorRegs(Src) : getNumberOfParts(Src));
1075 
1076   // Store/Load reversed saves one instruction.
1077   if (((!Src->isVectorTy() && NumOps == 1) || ST->hasVectorEnhancements2()) &&
1078       I != nullptr) {
1079     if (Opcode == Instruction::Load && I->hasOneUse()) {
1080       const Instruction *LdUser = cast<Instruction>(*I->user_begin());
1081       // In case of load -> bswap -> store, return normal cost for the load.
1082       if (isBswapIntrinsicCall(LdUser) &&
1083           (!LdUser->hasOneUse() || !isa<StoreInst>(*LdUser->user_begin())))
1084         return 0;
1085     }
1086     else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) {
1087       const Value *StoredVal = SI->getValueOperand();
1088       if (StoredVal->hasOneUse() && isBswapIntrinsicCall(StoredVal))
1089         return 0;
1090     }
1091   }
1092 
1093   if (Src->getScalarSizeInBits() == 128)
    // 128-bit scalars are held in a pair of 64-bit registers.
1095     NumOps *= 2;
1096 
  return NumOps;
1098 }
1099 
1100 // The generic implementation of getInterleavedMemoryOpCost() is based on
1101 // adding costs of the memory operations plus all the extracts and inserts
1102 // needed for using / defining the vector operands. The SystemZ version does
1103 // roughly the same but bases the computations on vector permutations
1104 // instead.
1105 int SystemZTTIImpl::getInterleavedMemoryOpCost(
1106     unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1107     Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1108     bool UseMaskForCond, bool UseMaskForGaps) {
1109   if (UseMaskForCond || UseMaskForGaps)
1110     return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1111                                              Alignment, AddressSpace, CostKind,
1112                                              UseMaskForCond, UseMaskForGaps);
1113   assert(isa<VectorType>(VecTy) &&
1114          "Expect a vector type for interleaved memory op");
1115 
1116   // Return the ceiling of dividing A by B.
1117   auto ceil = [](unsigned A, unsigned B) { return (A + B - 1) / B; };
1118 
1119   unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
1120   assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
1121   unsigned VF = NumElts / Factor;
1122   unsigned NumEltsPerVecReg = (128U / getScalarSizeInBits(VecTy));
1123   unsigned NumVectorMemOps = getNumVectorRegs(VecTy);
1124   unsigned NumPermutes = 0;
1125 
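  // For example, a factor-4 interleaved load of i32 with all indices used
  // (VecTy = <16 x i32>) needs 4 vector loads plus 3 vperms per extracted
  // value, for a total cost of 16.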
1126   if (Opcode == Instruction::Load) {
    // Loading interleave groups may have gaps, which may mean fewer
    // loads. Find out how many vectors will be loaded in total, and how many
    // of them each value will be in.
1130     BitVector UsedInsts(NumVectorMemOps, false);
1131     std::vector<BitVector> ValueVecs(Factor, BitVector(NumVectorMemOps, false));
1132     for (unsigned Index : Indices)
1133       for (unsigned Elt = 0; Elt < VF; ++Elt) {
1134         unsigned Vec = (Index + Elt * Factor) / NumEltsPerVecReg;
1135         UsedInsts.set(Vec);
1136         ValueVecs[Index].set(Vec);
1137       }
1138     NumVectorMemOps = UsedInsts.count();
1139 
1140     for (unsigned Index : Indices) {
      // Estimate that each loaded source vector containing this Index
      // requires one operation, except that vperm can handle two input
      // registers the first time for each dst vector.
1144       unsigned NumSrcVecs = ValueVecs[Index].count();
1145       unsigned NumDstVecs = ceil(VF * getScalarSizeInBits(VecTy), 128U);
1146       assert (NumSrcVecs >= NumDstVecs && "Expected at least as many sources");
1147       NumPermutes += std::max(1U, NumSrcVecs - NumDstVecs);
1148     }
1149   } else {
    // Estimate the permutes for each stored vector as the smaller of the
    // number of elements and the number of source vectors. Subtract one per
    // dst vector for vperm (see above).
1153     unsigned NumSrcVecs = std::min(NumEltsPerVecReg, Factor);
1154     unsigned NumDstVecs = NumVectorMemOps;
1155     assert (NumSrcVecs > 1 && "Expected at least two source vectors.");
1156     NumPermutes += (NumDstVecs * NumSrcVecs) - NumDstVecs;
1157   }
1158 
1159   // Cost of load/store operations and the permutations needed.
1160   return NumVectorMemOps + NumPermutes;
1161 }
1162 
1163 static int getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy) {
1164   if (RetTy->isVectorTy() && ID == Intrinsic::bswap)
1165     return getNumVectorRegs(RetTy); // VPERM
1166   return -1;
1167 }
1168 
1169 int SystemZTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1170                                           TTI::TargetCostKind CostKind) {
1171   int Cost = getVectorIntrinsicInstrCost(ICA.getID(), ICA.getReturnType());
1172   if (Cost != -1)
1173     return Cost;
1174   return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1175 }
1176