1 //===-- SystemZTargetTransformInfo.cpp - SystemZ-specific TTI -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements a TargetTransformInfo analysis pass specific to the
10 // SystemZ target machine. It uses the target's detailed information to provide
11 // more precise answers to certain TTI queries, while letting the target
12 // independent and default TTI implementations handle the rest.
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "SystemZTargetTransformInfo.h"
17 #include "llvm/Analysis/TargetTransformInfo.h"
18 #include "llvm/CodeGen/BasicTTIImpl.h"
19 #include "llvm/CodeGen/CostTable.h"
20 #include "llvm/CodeGen/TargetLowering.h"
21 #include "llvm/IR/IntrinsicInst.h"
22 #include "llvm/Support/Debug.h"
23 using namespace llvm;
24 
25 #define DEBUG_TYPE "systemztti"
26 
27 //===----------------------------------------------------------------------===//
28 //
29 // SystemZ cost model.
30 //
31 //===----------------------------------------------------------------------===//
32 
33 int SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
34                                   TTI::TargetCostKind CostKind) {
35   assert(Ty->isIntegerTy());
36 
37   unsigned BitSize = Ty->getPrimitiveSizeInBits();
38   // There is no cost model for constants with a bit size of 0. Return TCC_Free
39   // here, so that constant hoisting will ignore this constant.
40   if (BitSize == 0)
41     return TTI::TCC_Free;
  // There is no cost model yet for operations on integers wider than 64 bits.
43   if (BitSize > 64)
44     return TTI::TCC_Free;
45 
46   if (Imm == 0)
47     return TTI::TCC_Free;
48 
49   if (Imm.getBitWidth() <= 64) {
50     // Constants loaded via lgfi.
51     if (isInt<32>(Imm.getSExtValue()))
52       return TTI::TCC_Basic;
53     // Constants loaded via llilf.
54     if (isUInt<32>(Imm.getZExtValue()))
55       return TTI::TCC_Basic;
      // Constants loaded via llihf.
57     if ((Imm.getZExtValue() & 0xffffffff) == 0)
58       return TTI::TCC_Basic;
59 
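    // Any other 64-bit constant needs two instructions, e.g. loading the low
    // and high halves separately (llilf + iihf).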
60     return 2 * TTI::TCC_Basic;
61   }
62 
63   return 4 * TTI::TCC_Basic;
64 }
65 
66 int SystemZTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
67                                   const APInt &Imm, Type *Ty,
68                                   TTI::TargetCostKind CostKind) {
69   assert(Ty->isIntegerTy());
70 
71   unsigned BitSize = Ty->getPrimitiveSizeInBits();
72   // There is no cost model for constants with a bit size of 0. Return TCC_Free
73   // here, so that constant hoisting will ignore this constant.
74   if (BitSize == 0)
75     return TTI::TCC_Free;
  // There is no cost model yet for operations on integers wider than 64 bits.
77   if (BitSize > 64)
78     return TTI::TCC_Free;
79 
80   switch (Opcode) {
81   default:
82     return TTI::TCC_Free;
83   case Instruction::GetElementPtr:
84     // Always hoist the base address of a GetElementPtr. This prevents the
85     // creation of new constants for every base constant that gets constant
86     // folded with the offset.
87     if (Idx == 0)
88       return 2 * TTI::TCC_Basic;
89     return TTI::TCC_Free;
90   case Instruction::Store:
91     if (Idx == 0 && Imm.getBitWidth() <= 64) {
      // Any 8-bit immediate store can be implemented via mvi.
93       if (BitSize == 8)
94         return TTI::TCC_Free;
95       // 16-bit immediate values can be stored via mvhhi/mvhi/mvghi.
96       if (isInt<16>(Imm.getSExtValue()))
97         return TTI::TCC_Free;
98     }
99     break;
100   case Instruction::ICmp:
101     if (Idx == 1 && Imm.getBitWidth() <= 64) {
102       // Comparisons against signed 32-bit immediates implemented via cgfi.
103       if (isInt<32>(Imm.getSExtValue()))
104         return TTI::TCC_Free;
105       // Comparisons against unsigned 32-bit immediates implemented via clgfi.
106       if (isUInt<32>(Imm.getZExtValue()))
107         return TTI::TCC_Free;
108     }
109     break;
110   case Instruction::Add:
111   case Instruction::Sub:
112     if (Idx == 1 && Imm.getBitWidth() <= 64) {
113       // We use algfi/slgfi to add/subtract 32-bit unsigned immediates.
114       if (isUInt<32>(Imm.getZExtValue()))
115         return TTI::TCC_Free;
116       // Or their negation, by swapping addition vs. subtraction.
117       if (isUInt<32>(-Imm.getSExtValue()))
118         return TTI::TCC_Free;
119     }
120     break;
121   case Instruction::Mul:
122     if (Idx == 1 && Imm.getBitWidth() <= 64) {
123       // We use msgfi to multiply by 32-bit signed immediates.
124       if (isInt<32>(Imm.getSExtValue()))
125         return TTI::TCC_Free;
126     }
127     break;
128   case Instruction::Or:
129   case Instruction::Xor:
130     if (Idx == 1 && Imm.getBitWidth() <= 64) {
131       // Masks supported by oilf/xilf.
132       if (isUInt<32>(Imm.getZExtValue()))
133         return TTI::TCC_Free;
134       // Masks supported by oihf/xihf.
135       if ((Imm.getZExtValue() & 0xffffffff) == 0)
136         return TTI::TCC_Free;
137     }
138     break;
139   case Instruction::And:
140     if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // Any 32-bit AND operation can be implemented via nilf.
142       if (BitSize <= 32)
143         return TTI::TCC_Free;
144       // 64-bit masks supported by nilf.
145       if (isUInt<32>(~Imm.getZExtValue()))
146         return TTI::TCC_Free;
      // 64-bit masks supported by nihf.
148       if ((Imm.getZExtValue() & 0xffffffff) == 0xffffffff)
149         return TTI::TCC_Free;
150       // Some 64-bit AND operations can be implemented via risbg.
151       const SystemZInstrInfo *TII = ST->getInstrInfo();
152       unsigned Start, End;
153       if (TII->isRxSBGMask(Imm.getZExtValue(), BitSize, Start, End))
154         return TTI::TCC_Free;
155     }
156     break;
157   case Instruction::Shl:
158   case Instruction::LShr:
159   case Instruction::AShr:
160     // Always return TCC_Free for the shift value of a shift instruction.
161     if (Idx == 1)
162       return TTI::TCC_Free;
163     break;
164   case Instruction::UDiv:
165   case Instruction::SDiv:
166   case Instruction::URem:
167   case Instruction::SRem:
168   case Instruction::Trunc:
169   case Instruction::ZExt:
170   case Instruction::SExt:
171   case Instruction::IntToPtr:
172   case Instruction::PtrToInt:
173   case Instruction::BitCast:
174   case Instruction::PHI:
175   case Instruction::Call:
176   case Instruction::Select:
177   case Instruction::Ret:
178   case Instruction::Load:
179     break;
180   }
181 
182   return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind);
183 }
184 
185 int SystemZTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
186                                         const APInt &Imm, Type *Ty,
187                                         TTI::TargetCostKind CostKind) {
188   assert(Ty->isIntegerTy());
189 
190   unsigned BitSize = Ty->getPrimitiveSizeInBits();
191   // There is no cost model for constants with a bit size of 0. Return TCC_Free
192   // here, so that constant hoisting will ignore this constant.
193   if (BitSize == 0)
194     return TTI::TCC_Free;
  // There is no cost model yet for operations on integers wider than 64 bits.
196   if (BitSize > 64)
197     return TTI::TCC_Free;
198 
199   switch (IID) {
200   default:
201     return TTI::TCC_Free;
202   case Intrinsic::sadd_with_overflow:
203   case Intrinsic::uadd_with_overflow:
204   case Intrinsic::ssub_with_overflow:
205   case Intrinsic::usub_with_overflow:
206     // These get expanded to include a normal addition/subtraction.
207     if (Idx == 1 && Imm.getBitWidth() <= 64) {
208       if (isUInt<32>(Imm.getZExtValue()))
209         return TTI::TCC_Free;
210       if (isUInt<32>(-Imm.getSExtValue()))
211         return TTI::TCC_Free;
212     }
213     break;
214   case Intrinsic::smul_with_overflow:
215   case Intrinsic::umul_with_overflow:
216     // These get expanded to include a normal multiplication.
217     if (Idx == 1 && Imm.getBitWidth() <= 64) {
218       if (isInt<32>(Imm.getSExtValue()))
219         return TTI::TCC_Free;
220     }
221     break;
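  // For stackmaps and patchpoints, the leading meta operands (ID, shadow/patch
  // size, target, argument count) are free, as is any immediate fitting in a
  // signed 64-bit value.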
222   case Intrinsic::experimental_stackmap:
223     if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
224       return TTI::TCC_Free;
225     break;
226   case Intrinsic::experimental_patchpoint_void:
227   case Intrinsic::experimental_patchpoint_i64:
228     if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
229       return TTI::TCC_Free;
230     break;
231   }
232   return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind);
233 }
234 
235 TargetTransformInfo::PopcntSupportKind
236 SystemZTTIImpl::getPopcntSupport(unsigned TyWidth) {
237   assert(isPowerOf2_32(TyWidth) && "Type width must be power of 2");
238   if (ST->hasPopulationCount() && TyWidth <= 64)
239     return TTI::PSK_FastHardware;
240   return TTI::PSK_Software;
241 }
242 
243 void SystemZTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
244                                              TTI::UnrollingPreferences &UP) {
245   // Find out if L contains a call, what the machine instruction count
246   // estimate is, and how many stores there are.
247   bool HasCall = false;
248   unsigned NumStores = 0;
249   for (auto &BB : L->blocks())
250     for (auto &I : *BB) {
251       if (isa<CallInst>(&I) || isa<InvokeInst>(&I)) {
252         if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
253           if (isLoweredToCall(F))
254             HasCall = true;
255           if (F->getIntrinsicID() == Intrinsic::memcpy ||
256               F->getIntrinsicID() == Intrinsic::memset)
257             NumStores++;
258         } else { // indirect call.
259           HasCall = true;
260         }
261       }
262       if (isa<StoreInst>(&I)) {
263         Type *MemAccessTy = I.getOperand(0)->getType();
264         NumStores += getMemoryOpCost(Instruction::Store, MemAccessTy, None, 0,
265                                      TTI::TCK_RecipThroughput);
266       }
267     }
268 
269   // The z13 processor will run out of store tags if too many stores
270   // are fed into it too quickly. Therefore make sure there are not
271   // too many stores in the resulting unrolled loop.
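  // E.g. a loop body containing 3 stores will be unrolled at most 12 / 3 = 4
  // times.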
272   unsigned const Max = (NumStores ? (12 / NumStores) : UINT_MAX);
273 
274   if (HasCall) {
    // If the loop contains a call, only allow full unrolling (limited to Max);
    // do not enable partial or runtime unrolling.
276     UP.FullUnrollMaxCount = Max;
277     UP.MaxCount = 1;
278     return;
279   }
280 
281   UP.MaxCount = Max;
282   if (UP.MaxCount <= 1)
283     return;
284 
285   // Allow partial and runtime trip count unrolling.
286   UP.Partial = UP.Runtime = true;
287 
288   UP.PartialThreshold = 75;
289   UP.DefaultUnrollRuntimeCount = 4;
290 
291   // Allow expensive instructions in the pre-header of the loop.
292   UP.AllowExpensiveTripCount = true;
293 
294   UP.Force = true;
295 }
296 
297 
298 bool SystemZTTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
299                                    TargetTransformInfo::LSRCost &C2) {
300   // SystemZ specific: check instruction count (first), and don't care about
301   // ImmCost, since offsets are checked explicitly.
302   return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
303                   C1.NumIVMuls, C1.NumBaseAdds,
304                   C1.ScaleCost, C1.SetupCost) <
305     std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
306              C2.NumIVMuls, C2.NumBaseAdds,
307              C2.ScaleCost, C2.SetupCost);
308 }
309 
310 unsigned SystemZTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
311   bool Vector = (ClassID == 1);
312   if (!Vector)
313     // Discount the stack pointer.  Also leave out %r0, since it can't
314     // be used in an address.
315     return 14;
316   if (ST->hasVector())
317     return 32;
318   return 0;
319 }
320 
321 unsigned SystemZTTIImpl::getRegisterBitWidth(bool Vector) const {
322   if (!Vector)
323     return 64;
324   if (ST->hasVector())
325     return 128;
326   return 0;
327 }
328 
329 unsigned SystemZTTIImpl::getMinPrefetchStride(unsigned NumMemAccesses,
330                                               unsigned NumStridedMemAccesses,
331                                               unsigned NumPrefetches,
332                                               bool HasCall) const {
333   // Don't prefetch a loop with many far apart accesses.
334   if (NumPrefetches > 16)
335     return UINT_MAX;
336 
337   // Emit prefetch instructions for smaller strides in cases where we think
338   // the hardware prefetcher might not be able to keep up.
339   if (NumStridedMemAccesses > 32 &&
340       NumStridedMemAccesses == NumMemAccesses && !HasCall)
341     return 1;
342 
343   return ST->hasMiscellaneousExtensions3() ? 8192 : 2048;
344 }
345 
346 bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
347   EVT VT = TLI->getValueType(DL, DataType);
348   return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
349 }
350 
351 // Return the bit size for the scalar type or vector element
352 // type. getScalarSizeInBits() returns 0 for a pointer type.
353 static unsigned getScalarSizeInBits(Type *Ty) {
354   unsigned Size =
355     (Ty->isPtrOrPtrVectorTy() ? 64U : Ty->getScalarSizeInBits());
356   assert(Size > 0 && "Element must have non-zero size.");
357   return Size;
358 }
359 
// Return the number of 128-bit vector registers needed to hold a vector type.
// This is used instead of getNumberOfParts(), which calls
// getTypeLegalizationCost() and splits the vector type until it is legal; that
// would e.g. return 4 for <6 x i64> instead of 3.
363 static unsigned getNumVectorRegs(Type *Ty) {
364   assert(Ty->isVectorTy() && "Expected vector type");
365   unsigned WideBits =
366       getScalarSizeInBits(Ty) * cast<VectorType>(Ty)->getNumElements();
367   assert(WideBits > 0 && "Could not compute size of vector");
368   return ((WideBits % 128U) ? ((WideBits / 128U) + 1) : (WideBits / 128U));
369 }
370 
371 int SystemZTTIImpl::getArithmeticInstrCost(
372     unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
373     TTI::OperandValueKind Op1Info,
374     TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
375     TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
376     const Instruction *CxtI) {
377 
378   // TODO: return a good value for BB-VECTORIZER that includes the
379   // immediate loads, which we do not want to count for the loop
380   // vectorizer, since they are hopefully hoisted out of the loop. This
381   // would require a new parameter 'InLoop', but not sure if constant
382   // args are common enough to motivate this.
383 
384   unsigned ScalarBits = Ty->getScalarSizeInBits();
385 
  // There are three cases of division and remainder: dividing by a register
387   // needs a divide instruction. A divisor which is a power of two constant
388   // can be implemented with a sequence of shifts. Any other constant needs a
389   // multiply and shifts.
390   const unsigned DivInstrCost = 20;
391   const unsigned DivMulSeqCost = 10;
392   const unsigned SDivPow2Cost = 4;
393 
394   bool SignedDivRem =
395       Opcode == Instruction::SDiv || Opcode == Instruction::SRem;
396   bool UnsignedDivRem =
397       Opcode == Instruction::UDiv || Opcode == Instruction::URem;
398 
399   // Check for a constant divisor.
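  // E.g. a divisor of 8 (or -8 for a signed divide) lowers to shifts, while a
  // divisor such as 10 needs the multiply-and-shift sequence.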
400   bool DivRemConst = false;
401   bool DivRemConstPow2 = false;
402   if ((SignedDivRem || UnsignedDivRem) && Args.size() == 2) {
403     if (const Constant *C = dyn_cast<Constant>(Args[1])) {
404       const ConstantInt *CVal =
405           (C->getType()->isVectorTy()
406                ? dyn_cast_or_null<const ConstantInt>(C->getSplatValue())
407                : dyn_cast<const ConstantInt>(C));
408       if (CVal != nullptr &&
409           (CVal->getValue().isPowerOf2() || (-CVal->getValue()).isPowerOf2()))
410         DivRemConstPow2 = true;
411       else
412         DivRemConst = true;
413     }
414   }
415 
416   if (!Ty->isVectorTy()) {
417     // These FP operations are supported with a dedicated instruction for
418     // float, double and fp128 (base implementation assumes float generally
419     // costs 2).
420     if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
421         Opcode == Instruction::FMul || Opcode == Instruction::FDiv)
422       return 1;
423 
424     // There is no native support for FRem.
425     if (Opcode == Instruction::FRem)
426       return LIBCALL_COST;
427 
428     // Give discount for some combined logical operations if supported.
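    // (Miscellaneous-extensions-3 provides combined logical operations such as
    // NAND, NOR, NXOR and AND/OR WITH COMPLEMENT, so one of the two fused IR
    // operations becomes free.)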
429     if (Args.size() == 2 && ST->hasMiscellaneousExtensions3()) {
430       if (Opcode == Instruction::Xor) {
431         for (const Value *A : Args) {
432           if (const Instruction *I = dyn_cast<Instruction>(A))
433             if (I->hasOneUse() &&
434                 (I->getOpcode() == Instruction::And ||
435                  I->getOpcode() == Instruction::Or ||
436                  I->getOpcode() == Instruction::Xor))
437               return 0;
438         }
439       }
440       else if (Opcode == Instruction::Or || Opcode == Instruction::And) {
441         for (const Value *A : Args) {
442           if (const Instruction *I = dyn_cast<Instruction>(A))
443             if (I->hasOneUse() && I->getOpcode() == Instruction::Xor)
444               return 0;
445         }
446       }
447     }
448 
449     // Or requires one instruction, although it has custom handling for i64.
450     if (Opcode == Instruction::Or)
451       return 1;
452 
453     if (Opcode == Instruction::Xor && ScalarBits == 1) {
454       if (ST->hasLoadStoreOnCond2())
455         return 5; // 2 * (li 0; loc 1); xor
456       return 7; // 2 * ipm sequences ; xor ; shift ; compare
457     }
458 
459     if (DivRemConstPow2)
460       return (SignedDivRem ? SDivPow2Cost : 1);
461     if (DivRemConst)
462       return DivMulSeqCost;
463     if (SignedDivRem || UnsignedDivRem)
464       return DivInstrCost;
465   }
466   else if (ST->hasVector()) {
467     auto *VTy = cast<VectorType>(Ty);
468     unsigned VF = VTy->getNumElements();
469     unsigned NumVectors = getNumVectorRegs(Ty);
470 
471     // These vector operations are custom handled, but are still supported
472     // with one instruction per vector, regardless of element size.
473     if (Opcode == Instruction::Shl || Opcode == Instruction::LShr ||
474         Opcode == Instruction::AShr) {
475       return NumVectors;
476     }
477 
478     if (DivRemConstPow2)
479       return (NumVectors * (SignedDivRem ? SDivPow2Cost : 1));
480     if (DivRemConst)
481       return VF * DivMulSeqCost + getScalarizationOverhead(VTy, Args);
482     if ((SignedDivRem || UnsignedDivRem) && VF > 4)
483       // Temporary hack: disable high vectorization factors with integer
484       // division/remainder, which will get scalarized and handled with
485       // GR128 registers. The mischeduler is not clever enough to avoid
486       // spilling yet.
487       return 1000;
488 
489     // These FP operations are supported with a single vector instruction for
490     // double (base implementation assumes float generally costs 2). For
491     // FP128, the scalar cost is 1, and there is no overhead since the values
492     // are already in scalar registers.
493     if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
494         Opcode == Instruction::FMul || Opcode == Instruction::FDiv) {
495       switch (ScalarBits) {
496       case 32: {
497         // The vector enhancements facility 1 provides v4f32 instructions.
498         if (ST->hasVectorEnhancements1())
499           return NumVectors;
        // Return the cost of multiple scalar invocations plus the cost of
501         // inserting and extracting the values.
502         unsigned ScalarCost =
503             getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
504         unsigned Cost = (VF * ScalarCost) + getScalarizationOverhead(VTy, Args);
505         // FIXME: VF 2 for these FP operations are currently just as
506         // expensive as for VF 4.
507         if (VF == 2)
508           Cost *= 2;
509         return Cost;
510       }
511       case 64:
512       case 128:
513         return NumVectors;
514       default:
515         break;
516       }
517     }
518 
519     // There is no native support for FRem.
520     if (Opcode == Instruction::FRem) {
521       unsigned Cost = (VF * LIBCALL_COST) + getScalarizationOverhead(VTy, Args);
522       // FIXME: VF 2 for float is currently just as expensive as for VF 4.
523       if (VF == 2 && ScalarBits == 32)
524         Cost *= 2;
525       return Cost;
526     }
527   }
528 
529   // Fallback to the default implementation.
530   return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
531                                        Opd1PropInfo, Opd2PropInfo, Args, CxtI);
532 }
533 
534 int SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
535                                    int Index, VectorType *SubTp) {
536   if (ST->hasVector()) {
537     unsigned NumVectors = getNumVectorRegs(Tp);
538 
539     // TODO: Since fp32 is expanded, the shuffle cost should always be 0.
540 
541     // FP128 values are always in scalar registers, so there is no work
542     // involved with a shuffle, except for broadcast. In that case register
543     // moves are done with a single instruction per element.
544     if (Tp->getScalarType()->isFP128Ty())
545       return (Kind == TargetTransformInfo::SK_Broadcast ? NumVectors - 1 : 0);
546 
547     switch (Kind) {
    case TargetTransformInfo::SK_ExtractSubvector:
549       // ExtractSubvector Index indicates start offset.
550 
      // Extracting a subvector from the first index is a no-op.
552       return (Index == 0 ? 0 : NumVectors);
553 
554     case TargetTransformInfo::SK_Broadcast:
      // The loop vectorizer calls here to figure out the extra cost of
556       // broadcasting a loaded value to all elements of a vector. Since vlrep
557       // loads and replicates with a single instruction, adjust the returned
558       // value.
559       return NumVectors - 1;
560 
561     default:
562 
563       // SystemZ supports single instruction permutation / replication.
564       return NumVectors;
565     }
566   }
567 
568   return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
569 }
570 
571 // Return the log2 difference of the element sizes of the two vector types.
572 static unsigned getElSizeLog2Diff(Type *Ty0, Type *Ty1) {
573   unsigned Bits0 = Ty0->getScalarSizeInBits();
574   unsigned Bits1 = Ty1->getScalarSizeInBits();
575 
576   if (Bits1 >  Bits0)
577     return (Log2_32(Bits1) - Log2_32(Bits0));
578 
579   return (Log2_32(Bits0) - Log2_32(Bits1));
580 }
581 
582 // Return the number of instructions needed to truncate SrcTy to DstTy.
583 unsigned SystemZTTIImpl::
584 getVectorTruncCost(Type *SrcTy, Type *DstTy) {
585   assert (SrcTy->isVectorTy() && DstTy->isVectorTy());
586   assert (SrcTy->getPrimitiveSizeInBits() > DstTy->getPrimitiveSizeInBits() &&
587           "Packing must reduce size of vector type.");
588   assert(cast<VectorType>(SrcTy)->getNumElements() ==
589              cast<VectorType>(DstTy)->getNumElements() &&
590          "Packing should not change number of elements.");
591 
592   // TODO: Since fp32 is expanded, the extract cost should always be 0.
593 
594   unsigned NumParts = getNumVectorRegs(SrcTy);
595   if (NumParts <= 2)
596     // Up to 2 vector registers can be truncated efficiently with pack or
597     // permute. The latter requires an immediate mask to be loaded, which
598     // typically gets hoisted out of a loop.  TODO: return a good value for
599     // BB-VECTORIZER that includes the immediate loads, which we do not want
600     // to count for the loop vectorizer.
601     return 1;
602 
603   unsigned Cost = 0;
604   unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
605   unsigned VF = cast<VectorType>(SrcTy)->getNumElements();
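  // Each log2 step packs the current parts, halving their number down to one;
  // e.g. <8 x i64> -> <8 x i8> adds 2 + 1 + 1 here, and the special case below
  // then subtracts one.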
606   for (unsigned P = 0; P < Log2Diff; ++P) {
607     if (NumParts > 1)
608       NumParts /= 2;
609     Cost += NumParts;
610   }
611 
  // Currently, isel outputs a general mix of permute and pack instructions that
  // follows the cost computation above, except for this case, which is one
  // instruction less:
615   if (VF == 8 && SrcTy->getScalarSizeInBits() == 64 &&
616       DstTy->getScalarSizeInBits() == 8)
617     Cost--;
618 
619   return Cost;
620 }
621 
622 // Return the cost of converting a vector bitmask produced by a compare
623 // (SrcTy), to the type of the select or extend instruction (DstTy).
624 unsigned SystemZTTIImpl::
625 getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy) {
626   assert (SrcTy->isVectorTy() && DstTy->isVectorTy() &&
627           "Should only be called with vector types.");
628 
629   unsigned PackCost = 0;
630   unsigned SrcScalarBits = SrcTy->getScalarSizeInBits();
631   unsigned DstScalarBits = DstTy->getScalarSizeInBits();
632   unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
633   if (SrcScalarBits > DstScalarBits)
634     // The bitmask will be truncated.
635     PackCost = getVectorTruncCost(SrcTy, DstTy);
636   else if (SrcScalarBits < DstScalarBits) {
637     unsigned DstNumParts = getNumVectorRegs(DstTy);
638     // Each vector select needs its part of the bitmask unpacked.
639     PackCost = Log2Diff * DstNumParts;
640     // Extra cost for moving part of mask before unpacking.
641     PackCost += DstNumParts - 1;
642   }
643 
644   return PackCost;
645 }
646 
647 // Return the type of the compared operands. This is needed to compute the
648 // cost for a Select / ZExt or SExt instruction.
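// E.g. for a select fed by (fcmp olt double %a, %b) this returns double, or
// <VF x double> when queried for a vectorized context.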
649 static Type *getCmpOpsType(const Instruction *I, unsigned VF = 1) {
650   Type *OpTy = nullptr;
651   if (CmpInst *CI = dyn_cast<CmpInst>(I->getOperand(0)))
652     OpTy = CI->getOperand(0)->getType();
653   else if (Instruction *LogicI = dyn_cast<Instruction>(I->getOperand(0)))
654     if (LogicI->getNumOperands() == 2)
655       if (CmpInst *CI0 = dyn_cast<CmpInst>(LogicI->getOperand(0)))
656         if (isa<CmpInst>(LogicI->getOperand(1)))
657           OpTy = CI0->getOperand(0)->getType();
658 
659   if (OpTy != nullptr) {
660     if (VF == 1) {
661       assert (!OpTy->isVectorTy() && "Expected scalar type");
662       return OpTy;
663     }
664     // Return the potentially vectorized type based on 'I' and 'VF'.  'I' may
    // be either scalar or already vectorized with the same or a lesser VF.
666     Type *ElTy = OpTy->getScalarType();
667     return VectorType::get(ElTy, VF);
668   }
669 
670   return nullptr;
671 }
672 
// Get the cost of converting a boolean vector to a vector with the same width
674 // and element size as Dst, plus the cost of zero extending if needed.
675 unsigned SystemZTTIImpl::
676 getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst,
677                               const Instruction *I) {
678   assert (Dst->isVectorTy());
679   unsigned VF = cast<VectorType>(Dst)->getNumElements();
680   unsigned Cost = 0;
  // If the widths of the compared operands are known, get any cost of
  // converting them to match Dst. Otherwise assume the widths match.
683   Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
684   if (CmpOpTy != nullptr)
685     Cost = getVectorBitmaskConversionCost(CmpOpTy, Dst);
686   if (Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP)
687     // One 'vn' per dst vector with an immediate mask.
688     Cost += getNumVectorRegs(Dst);
689   return Cost;
690 }
691 
692 int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
693                                      TTI::TargetCostKind CostKind,
694                                      const Instruction *I) {
695   unsigned DstScalarBits = Dst->getScalarSizeInBits();
696   unsigned SrcScalarBits = Src->getScalarSizeInBits();
697 
698   if (!Src->isVectorTy()) {
699     assert (!Dst->isVectorTy());
700 
701     if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP) {
702       if (SrcScalarBits >= 32 ||
703           (I != nullptr && isa<LoadInst>(I->getOperand(0))))
704         return 1;
705       return SrcScalarBits > 1 ? 2 /*i8/i16 extend*/ : 5 /*branch seq.*/;
706     }
707 
708     if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
709         Src->isIntegerTy(1)) {
710       if (ST->hasLoadStoreOnCond2())
711         return 2; // li 0; loc 1
712 
713       // This should be extension of a compare i1 result, which is done with
714       // ipm and a varying sequence of instructions.
715       unsigned Cost = 0;
716       if (Opcode == Instruction::SExt)
717         Cost = (DstScalarBits < 64 ? 3 : 4);
718       if (Opcode == Instruction::ZExt)
719         Cost = 3;
720       Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
721       if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
        // If operands of an fp type were compared, this costs +1.
723         Cost++;
724       return Cost;
725     }
726   }
727   else if (ST->hasVector()) {
728     auto *SrcVecTy = cast<VectorType>(Src);
729     auto *DstVecTy = cast<VectorType>(Dst);
730     unsigned VF = SrcVecTy->getNumElements();
731     unsigned NumDstVectors = getNumVectorRegs(Dst);
732     unsigned NumSrcVectors = getNumVectorRegs(Src);
733 
734     if (Opcode == Instruction::Trunc) {
735       if (Src->getScalarSizeInBits() == Dst->getScalarSizeInBits())
736         return 0; // Check for NOOP conversions.
737       return getVectorTruncCost(Src, Dst);
738     }
739 
740     if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
741       if (SrcScalarBits >= 8) {
742         // ZExt/SExt will be handled with one unpack per doubling of width.
743         unsigned NumUnpacks = getElSizeLog2Diff(Src, Dst);
744 
        // For types that span multiple vector registers, some additional
746         // instructions are used to setup the unpacking.
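        // E.g. sext <4 x i16> to <4 x i64> needs two unpacks for each of the
        // two destination vectors plus one extra source operation: 2 * 2 + 1 = 5.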
747         unsigned NumSrcVectorOps =
748           (NumUnpacks > 1 ? (NumDstVectors - NumSrcVectors)
749                           : (NumDstVectors / 2));
750 
751         return (NumUnpacks * NumDstVectors) + NumSrcVectorOps;
752       }
753       else if (SrcScalarBits == 1)
754         return getBoolVecToIntConversionCost(Opcode, Dst, I);
755     }
756 
757     if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP ||
758         Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) {
759       // TODO: Fix base implementation which could simplify things a bit here
760       // (seems to miss on differentiating on scalar/vector types).
761 
762       // Only 64 bit vector conversions are natively supported before z15.
763       if (DstScalarBits == 64 || ST->hasVectorEnhancements2()) {
764         if (SrcScalarBits == DstScalarBits)
765           return NumDstVectors;
766 
767         if (SrcScalarBits == 1)
768           return getBoolVecToIntConversionCost(Opcode, Dst, I) + NumDstVectors;
769       }
770 
      // Return the cost of multiple scalar invocations plus the cost of
772       // inserting and extracting the values. Base implementation does not
773       // realize float->int gets scalarized.
774       unsigned ScalarCost = getCastInstrCost(Opcode, Dst->getScalarType(),
775                                              Src->getScalarType(), CostKind);
776       unsigned TotCost = VF * ScalarCost;
777       bool NeedsInserts = true, NeedsExtracts = true;
778       // FP128 registers do not get inserted or extracted.
779       if (DstScalarBits == 128 &&
780           (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP))
781         NeedsInserts = false;
782       if (SrcScalarBits == 128 &&
783           (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
784         NeedsExtracts = false;
785 
786       TotCost += getScalarizationOverhead(SrcVecTy, false, NeedsExtracts);
787       TotCost += getScalarizationOverhead(DstVecTy, NeedsInserts, false);
788 
789       // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
790       if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
791         TotCost *= 2;
792 
793       return TotCost;
794     }
795 
796     if (Opcode == Instruction::FPTrunc) {
797       if (SrcScalarBits == 128)  // fp128 -> double/float + inserts of elements.
798         return VF /*ldxbr/lexbr*/ +
799                getScalarizationOverhead(DstVecTy, true, false);
800       else // double -> float
801         return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/);
802     }
803 
804     if (Opcode == Instruction::FPExt) {
805       if (SrcScalarBits == 32 && DstScalarBits == 64) {
806         // float -> double is very rare and currently unoptimized. Instead of
807         // using vldeb, which can do two at a time, all conversions are
808         // scalarized.
809         return VF * 2;
810       }
811       // -> fp128.  VF * lxdb/lxeb + extraction of elements.
812       return VF + getScalarizationOverhead(SrcVecTy, false, true);
813     }
814   }
815 
816   return BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I);
817 }
818 
// Scalar i8 / i16 operations are typically performed after first extending
// the operands to i32.
821 static unsigned getOperandsExtensionCost(const Instruction *I) {
822   unsigned ExtCost = 0;
823   for (Value *Op : I->operands())
824     // A load of i8 or i16 sign/zero extends to i32.
825     if (!isa<LoadInst>(Op) && !isa<ConstantInt>(Op))
826       ExtCost++;
827 
828   return ExtCost;
829 }
830 
831 int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
832                                        Type *CondTy,
833                                        TTI::TargetCostKind CostKind,
834                                        const Instruction *I) {
835   if (!ValTy->isVectorTy()) {
836     switch (Opcode) {
837     case Instruction::ICmp: {
      // A loaded value with multiple users that is compared with 0 becomes a
      // Load and Test. The load is then not foldable, so return 0 cost for the
      // ICmp.
840       unsigned ScalarBits = ValTy->getScalarSizeInBits();
841       if (I != nullptr && ScalarBits >= 32)
842         if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
843           if (const ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1)))
844             if (!Ld->hasOneUse() && Ld->getParent() == I->getParent() &&
845                 C->getZExtValue() == 0)
846               return 0;
847 
848       unsigned Cost = 1;
849       if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16)
850         Cost += (I != nullptr ? getOperandsExtensionCost(I) : 2);
851       return Cost;
852     }
853     case Instruction::Select:
854       if (ValTy->isFloatingPointTy())
855         return 4; // No load on condition for FP - costs a conditional jump.
856       return 1; // Load On Condition / Select Register.
857     }
858   }
859   else if (ST->hasVector()) {
860     unsigned VF = cast<VectorType>(ValTy)->getNumElements();
861 
862     // Called with a compare instruction.
863     if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
864       unsigned PredicateExtraCost = 0;
865       if (I != nullptr) {
866         // Some predicates cost one or two extra instructions.
867         switch (cast<CmpInst>(I)->getPredicate()) {
868         case CmpInst::Predicate::ICMP_NE:
869         case CmpInst::Predicate::ICMP_UGE:
870         case CmpInst::Predicate::ICMP_ULE:
871         case CmpInst::Predicate::ICMP_SGE:
872         case CmpInst::Predicate::ICMP_SLE:
873           PredicateExtraCost = 1;
874           break;
875         case CmpInst::Predicate::FCMP_ONE:
876         case CmpInst::Predicate::FCMP_ORD:
877         case CmpInst::Predicate::FCMP_UEQ:
878         case CmpInst::Predicate::FCMP_UNO:
879           PredicateExtraCost = 2;
880           break;
881         default:
882           break;
883         }
884       }
885 
886       // Float is handled with 2*vmr[lh]f + 2*vldeb + vfchdb for each pair of
887       // floats.  FIXME: <2 x float> generates same code as <4 x float>.
888       unsigned CmpCostPerVector = (ValTy->getScalarType()->isFloatTy() ? 10 : 1);
889       unsigned NumVecs_cmp = getNumVectorRegs(ValTy);
890 
891       unsigned Cost = (NumVecs_cmp * (CmpCostPerVector + PredicateExtraCost));
892       return Cost;
893     }
894     else { // Called with a select instruction.
895       assert (Opcode == Instruction::Select);
896 
897       // We can figure out the extra cost of packing / unpacking if the
898       // instruction was passed and the compare instruction is found.
899       unsigned PackCost = 0;
900       Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
901       if (CmpOpTy != nullptr)
902         PackCost =
903           getVectorBitmaskConversionCost(CmpOpTy, ValTy);
904 
905       return getNumVectorRegs(ValTy) /*vsel*/ + PackCost;
906     }
907   }
908 
909   return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind);
910 }
911 
912 int SystemZTTIImpl::
913 getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
914   // vlvgp will insert two grs into a vector register, so only count half the
915   // number of instructions.
916   if (Opcode == Instruction::InsertElement && Val->isIntOrIntVectorTy(64))
917     return ((Index % 2 == 0) ? 1 : 0);
918 
919   if (Opcode == Instruction::ExtractElement) {
920     int Cost = ((getScalarSizeInBits(Val) == 1) ? 2 /*+test-under-mask*/ : 1);
921 
922     // Give a slight penalty for moving out of vector pipeline to FXU unit.
923     if (Index == 0 && Val->isIntOrIntVectorTy())
924       Cost += 1;
925 
926     return Cost;
927   }
928 
929   return BaseT::getVectorInstrCost(Opcode, Val, Index);
930 }
931 
932 // Check if a load may be folded as a memory operand in its user.
933 bool SystemZTTIImpl::
934 isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue) {
935   if (!Ld->hasOneUse())
936     return false;
937   FoldedValue = Ld;
938   const Instruction *UserI = cast<Instruction>(*Ld->user_begin());
939   unsigned LoadedBits = getScalarSizeInBits(Ld->getType());
940   unsigned TruncBits = 0;
941   unsigned SExtBits = 0;
942   unsigned ZExtBits = 0;
943   if (UserI->hasOneUse()) {
944     unsigned UserBits = UserI->getType()->getScalarSizeInBits();
945     if (isa<TruncInst>(UserI))
946       TruncBits = UserBits;
947     else if (isa<SExtInst>(UserI))
948       SExtBits = UserBits;
949     else if (isa<ZExtInst>(UserI))
950       ZExtBits = UserBits;
951   }
952   if (TruncBits || SExtBits || ZExtBits) {
953     FoldedValue = UserI;
954     UserI = cast<Instruction>(*UserI->user_begin());
955     // Load (single use) -> trunc/extend (single use) -> UserI
956   }
957   if ((UserI->getOpcode() == Instruction::Sub ||
958        UserI->getOpcode() == Instruction::SDiv ||
959        UserI->getOpcode() == Instruction::UDiv) &&
960       UserI->getOperand(1) != FoldedValue)
961     return false; // Not commutative, only RHS foldable.
  // LoadOrTruncBits holds the number of effectively loaded bits, but 0 if the
  // load was extended.
964   unsigned LoadOrTruncBits =
965       ((SExtBits || ZExtBits) ? 0 : (TruncBits ? TruncBits : LoadedBits));
966   switch (UserI->getOpcode()) {
967   case Instruction::Add: // SE: 16->32, 16/32->64, z14:16->64. ZE: 32->64
968   case Instruction::Sub:
969   case Instruction::ICmp:
970     if (LoadedBits == 32 && ZExtBits == 64)
971       return true;
972     LLVM_FALLTHROUGH;
973   case Instruction::Mul: // SE: 16->32, 32->64, z14:16->64
974     if (UserI->getOpcode() != Instruction::ICmp) {
975       if (LoadedBits == 16 &&
976           (SExtBits == 32 ||
977            (SExtBits == 64 && ST->hasMiscellaneousExtensions2())))
978         return true;
979       if (LoadOrTruncBits == 16)
980         return true;
981     }
982     LLVM_FALLTHROUGH;
983   case Instruction::SDiv:// SE: 32->64
984     if (LoadedBits == 32 && SExtBits == 64)
985       return true;
986     LLVM_FALLTHROUGH;
987   case Instruction::UDiv:
988   case Instruction::And:
989   case Instruction::Or:
990   case Instruction::Xor:
991     // This also makes sense for float operations, but disabled for now due
992     // to regressions.
993     // case Instruction::FCmp:
994     // case Instruction::FAdd:
995     // case Instruction::FSub:
996     // case Instruction::FMul:
997     // case Instruction::FDiv:
998 
999     // All possible extensions of memory checked above.
1000 
1001     // Comparison between memory and immediate.
1002     if (UserI->getOpcode() == Instruction::ICmp)
1003       if (ConstantInt *CI = dyn_cast<ConstantInt>(UserI->getOperand(1)))
1004         if (isUInt<16>(CI->getZExtValue()))
1005           return true;
1006     return (LoadOrTruncBits == 32 || LoadOrTruncBits == 64);
1007     break;
1008   }
1009   return false;
1010 }
1011 
1012 static bool isBswapIntrinsicCall(const Value *V) {
1013   if (const Instruction *I = dyn_cast<Instruction>(V))
1014     if (auto *CI = dyn_cast<CallInst>(I))
1015       if (auto *F = CI->getCalledFunction())
1016         if (F->getIntrinsicID() == Intrinsic::bswap)
1017           return true;
1018   return false;
1019 }
1020 
1021 int SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1022                                     MaybeAlign Alignment, unsigned AddressSpace,
1023                                     TTI::TargetCostKind CostKind,
1024                                     const Instruction *I) {
1025   assert(!Src->isVoidTy() && "Invalid type");
1026 
1027   if (!Src->isVectorTy() && Opcode == Instruction::Load && I != nullptr) {
1028     // Store the load or its truncated or extended value in FoldedValue.
1029     const Instruction *FoldedValue = nullptr;
1030     if (isFoldableLoad(cast<LoadInst>(I), FoldedValue)) {
1031       const Instruction *UserI = cast<Instruction>(*FoldedValue->user_begin());
1032       assert (UserI->getNumOperands() == 2 && "Expected a binop.");
1033 
1034       // UserI can't fold two loads, so in that case return 0 cost only
1035       // half of the time.
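      // (Returning 'i == 0' gives cost 1 for one of the two loads and cost 0
      // for the other, i.e. 0.5 on average.)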
1036       for (unsigned i = 0; i < 2; ++i) {
1037         if (UserI->getOperand(i) == FoldedValue)
1038           continue;
1039 
1040         if (Instruction *OtherOp = dyn_cast<Instruction>(UserI->getOperand(i))){
1041           LoadInst *OtherLoad = dyn_cast<LoadInst>(OtherOp);
1042           if (!OtherLoad &&
1043               (isa<TruncInst>(OtherOp) || isa<SExtInst>(OtherOp) ||
1044                isa<ZExtInst>(OtherOp)))
1045             OtherLoad = dyn_cast<LoadInst>(OtherOp->getOperand(0));
1046           if (OtherLoad && isFoldableLoad(OtherLoad, FoldedValue/*dummy*/))
1047             return i == 0; // Both operands foldable.
1048         }
1049       }
1050 
1051       return 0; // Only I is foldable in user.
1052     }
1053   }
1054 
1055   unsigned NumOps =
1056     (Src->isVectorTy() ? getNumVectorRegs(Src) : getNumberOfParts(Src));
1057 
1058   // Store/Load reversed saves one instruction.
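  // (A byte-swapping load/store like lrv(g)/strv(g), or vlbr/vstbr with
  // vector-enhancements-2, performs the bswap as part of the memory access.)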
1059   if (((!Src->isVectorTy() && NumOps == 1) || ST->hasVectorEnhancements2()) &&
1060       I != nullptr) {
1061     if (Opcode == Instruction::Load && I->hasOneUse()) {
1062       const Instruction *LdUser = cast<Instruction>(*I->user_begin());
1063       // In case of load -> bswap -> store, return normal cost for the load.
1064       if (isBswapIntrinsicCall(LdUser) &&
1065           (!LdUser->hasOneUse() || !isa<StoreInst>(*LdUser->user_begin())))
1066         return 0;
1067     }
1068     else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) {
1069       const Value *StoredVal = SI->getValueOperand();
1070       if (StoredVal->hasOneUse() && isBswapIntrinsicCall(StoredVal))
1071         return 0;
1072     }
1073   }
1074 
1075   if (Src->getScalarSizeInBits() == 128)
1076     // 128 bit scalars are held in a pair of two 64 bit registers.
1077     NumOps *= 2;
1078 
1079   return  NumOps;
1080 }
1081 
1082 // The generic implementation of getInterleavedMemoryOpCost() is based on
1083 // adding costs of the memory operations plus all the extracts and inserts
1084 // needed for using / defining the vector operands. The SystemZ version does
1085 // roughly the same but bases the computations on vector permutations
1086 // instead.
1087 int SystemZTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
1088                                                unsigned Factor,
1089                                                ArrayRef<unsigned> Indices,
1090                                                unsigned Alignment,
1091                                                unsigned AddressSpace,
1092                                                TTI::TargetCostKind CostKind,
1093                                                bool UseMaskForCond,
1094                                                bool UseMaskForGaps) {
1095   if (UseMaskForCond || UseMaskForGaps)
1096     return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1097                                              Alignment, AddressSpace, CostKind,
1098                                              UseMaskForCond, UseMaskForGaps);
1099   assert(isa<VectorType>(VecTy) &&
1100          "Expect a vector type for interleaved memory op");
1101 
1102   // Return the ceiling of dividing A by B.
1103   auto ceil = [](unsigned A, unsigned B) { return (A + B - 1) / B; };
1104 
1105   unsigned NumElts = cast<VectorType>(VecTy)->getNumElements();
1106   assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
1107   unsigned VF = NumElts / Factor;
1108   unsigned NumEltsPerVecReg = (128U / getScalarSizeInBits(VecTy));
1109   unsigned NumVectorMemOps = getNumVectorRegs(VecTy);
1110   unsigned NumPermutes = 0;
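  // E.g. an interleaved load of <8 x i32> with Factor 2 and both indices used
  // costs 2 vector loads plus 2 permutes (one per index).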
1111 
1112   if (Opcode == Instruction::Load) {
    // Loading interleave groups may have gaps, which may mean fewer
    // loads. Find out how many vectors will be loaded in total, and how many
    // of them each extracted value resides in.
1116     BitVector UsedInsts(NumVectorMemOps, false);
1117     std::vector<BitVector> ValueVecs(Factor, BitVector(NumVectorMemOps, false));
1118     for (unsigned Index : Indices)
1119       for (unsigned Elt = 0; Elt < VF; ++Elt) {
1120         unsigned Vec = (Index + Elt * Factor) / NumEltsPerVecReg;
1121         UsedInsts.set(Vec);
1122         ValueVecs[Index].set(Vec);
1123       }
1124     NumVectorMemOps = UsedInsts.count();
1125 
1126     for (unsigned Index : Indices) {
1127       // Estimate that each loaded source vector containing this Index
1128       // requires one operation, except that vperm can handle two input
      // registers the first time for each dst vector.
1130       unsigned NumSrcVecs = ValueVecs[Index].count();
1131       unsigned NumDstVecs = ceil(VF * getScalarSizeInBits(VecTy), 128U);
1132       assert (NumSrcVecs >= NumDstVecs && "Expected at least as many sources");
1133       NumPermutes += std::max(1U, NumSrcVecs - NumDstVecs);
1134     }
1135   } else {
1136     // Estimate the permutes for each stored vector as the smaller of the
1137     // number of elements and the number of source vectors. Subtract one per
    // dst vector for vperm (see above).
1139     unsigned NumSrcVecs = std::min(NumEltsPerVecReg, Factor);
1140     unsigned NumDstVecs = NumVectorMemOps;
1141     assert (NumSrcVecs > 1 && "Expected at least two source vectors.");
1142     NumPermutes += (NumDstVecs * NumSrcVecs) - NumDstVecs;
1143   }
1144 
1145   // Cost of load/store operations and the permutations needed.
1146   return NumVectorMemOps + NumPermutes;
1147 }
1148 
1149 static int getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy) {
1150   if (RetTy->isVectorTy() && ID == Intrinsic::bswap)
1151     return getNumVectorRegs(RetTy); // VPERM
1152   return -1;
1153 }
1154 
1155 int SystemZTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
1156                                           ArrayRef<Value *> Args,
1157                                           FastMathFlags FMF, unsigned VF,
1158                                           TTI::TargetCostKind CostKind,
1159                                           const Instruction *I) {
1160   int Cost = getVectorIntrinsicInstrCost(ID, RetTy);
1161   if (Cost != -1)
1162     return Cost;
1163   return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF, CostKind, I);
1164 }
1165 
1166 int SystemZTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
1167                                           ArrayRef<Type *> Tys,
1168                                           FastMathFlags FMF,
1169                                           unsigned ScalarizationCostPassed,
1170                                           TTI::TargetCostKind CostKind,
1171                                           const Instruction *I) {
1172   int Cost = getVectorIntrinsicInstrCost(ID, RetTy);
1173   if (Cost != -1)
1174     return Cost;
1175   return BaseT::getIntrinsicInstrCost(ID, RetTy, Tys, FMF,
1176                                       ScalarizationCostPassed, CostKind, I);
1177 }
1178