1 //===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 /// \file
10 /// This file implements a TargetTransformInfo analysis pass specific to the
11 /// X86 target machine. It uses the target's detailed information to provide
12 /// more precise answers to certain TTI queries, while letting the target
13 /// independent and default TTI implementations handle the rest.
14 ///
15 //===----------------------------------------------------------------------===//
16 /// About Cost Model numbers used below it's necessary to say the following:
17 /// the numbers correspond to some "generic" X86 CPU instead of usage of
18 /// concrete CPU model. Usually the numbers correspond to CPU where the feature
19 /// apeared at the first time. For example, if we do Subtarget.hasSSE42() in
20 /// the lookups below the cost is based on Nehalem as that was the first CPU
21 /// to support that feature level and thus has most likely the worst case cost.
22 /// Some examples of other technologies/CPUs:
23 ///   SSE 3   - Pentium4 / Athlon64
24 ///   SSE 4.1 - Penryn
25 ///   SSE 4.2 - Nehalem
26 ///   AVX     - Sandy Bridge
27 ///   AVX2    - Haswell
28 ///   AVX-512 - Xeon Phi / Skylake
29 /// And some examples of instruction target dependent costs (latency)
30 ///                   divss     sqrtss          rsqrtss
31 ///   AMD K7            11-16     19              3
32 ///   Piledriver        9-24      13-15           5
33 ///   Jaguar            14        16              2
34 ///   Pentium II,III    18        30              2
35 ///   Nehalem           7-14      7-18            3
36 ///   Haswell           10-13     11              5
37 /// TODO: Develop and implement  the target dependent cost model and
38 /// specialize cost numbers for different Cost Model Targets such as throughput,
39 /// code size, latency and uop count.
40 //===----------------------------------------------------------------------===//
41 
42 #include "X86TargetTransformInfo.h"
43 #include "llvm/Analysis/TargetTransformInfo.h"
44 #include "llvm/CodeGen/BasicTTIImpl.h"
45 #include "llvm/IR/IntrinsicInst.h"
46 #include "llvm/Support/Debug.h"
47 #include "llvm/Target/CostTable.h"
48 #include "llvm/Target/TargetLowering.h"
49 
50 using namespace llvm;
51 
52 #define DEBUG_TYPE "x86tti"
53 
54 //===----------------------------------------------------------------------===//
55 //
56 // X86 cost model.
57 //
58 //===----------------------------------------------------------------------===//
59 
60 TargetTransformInfo::PopcntSupportKind
61 X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
62   assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
63   // TODO: Currently the __builtin_popcount() implementation using SSE3
64   //   instructions is inefficient. Once the problem is fixed, we should
65   //   call ST->hasSSE3() instead of ST->hasPOPCNT().
66   return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
67 }
68 
69 unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) {
70   if (Vector && !ST->hasSSE1())
71     return 0;
72 
73   if (ST->is64Bit()) {
74     if (Vector && ST->hasAVX512())
75       return 32;
76     return 16;
77   }
78   return 8;
79 }
80 
81 unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) const {
82   if (Vector) {
83     if (ST->hasAVX512())
84       return 512;
85     if (ST->hasAVX())
86       return 256;
87     if (ST->hasSSE1())
88       return 128;
89     return 0;
90   }
91 
92   if (ST->is64Bit())
93     return 64;
94 
95   return 32;
96 }
97 
98 unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
99   return getRegisterBitWidth(true);
100 }
101 
102 unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
103   // If the loop will not be vectorized, don't interleave the loop.
104   // Let regular unroll to unroll the loop, which saves the overflow
105   // check and memory check cost.
106   if (VF == 1)
107     return 1;
108 
109   if (ST->isAtom())
110     return 1;
111 
112   // Sandybridge and Haswell have multiple execution ports and pipelined
113   // vector units.
114   if (ST->hasAVX())
115     return 4;
116 
117   return 2;
118 }
119 
120 int X86TTIImpl::getArithmeticInstrCost(
121     unsigned Opcode, Type *Ty,
122     TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
123     TTI::OperandValueProperties Opd1PropInfo,
124     TTI::OperandValueProperties Opd2PropInfo,
125     ArrayRef<const Value *> Args) {
126   // Legalize the type.
127   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
128 
129   int ISD = TLI->InstructionOpcodeToISD(Opcode);
130   assert(ISD && "Invalid opcode");
131 
132   static const CostTblEntry SLMCostTable[] = {
133     { ISD::MUL,  MVT::v4i32, 11 }, // pmulld
134     { ISD::MUL,  MVT::v8i16, 2  }, // pmullw
135     { ISD::MUL,  MVT::v16i8, 14 }, // extend/pmullw/trunc sequence.
136     { ISD::FMUL, MVT::f64,   2  }, // mulsd
137     { ISD::FMUL, MVT::v2f64, 4  }, // mulpd
138     { ISD::FMUL, MVT::v4f32, 2  }, // mulps
139     { ISD::FDIV, MVT::f32,   17 }, // divss
140     { ISD::FDIV, MVT::v4f32, 39 }, // divps
141     { ISD::FDIV, MVT::f64,   32 }, // divsd
142     { ISD::FDIV, MVT::v2f64, 69 }, // divpd
143     { ISD::FADD, MVT::v2f64, 2  }, // addpd
144     { ISD::FSUB, MVT::v2f64, 2  }, // subpd
145     // v2i64/v4i64 mul is custom lowered as a series of long:
146     // multiplies(3), shifts(3) and adds(2)
147     // slm muldq version throughput is 2 and addq throughput 4
148     // thus: 3X2 (muldq throughput) + 3X1 (shift throuput) +
149     //       3X4 (addq throughput) = 17
150     { ISD::MUL,  MVT::v2i64, 17 },
151     // slm addq\subq throughput is 4
152     { ISD::ADD,  MVT::v2i64, 4  },
153     { ISD::SUB,  MVT::v2i64, 4  },
154   };
155 
156   if (ST->isSLM()) {
157     if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
158       // Check if the operands can be shrinked into a smaller datatype.
159       bool Op1Signed = false;
160       unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
161       bool Op2Signed = false;
162       unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
163 
164       bool signedMode = Op1Signed | Op2Signed;
165       unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
166 
167       if (OpMinSize <= 7)
168         return LT.first * 3; // pmullw/sext
169       if (!signedMode && OpMinSize <= 8)
170         return LT.first * 3; // pmullw/zext
171       if (OpMinSize <= 15)
172         return LT.first * 5; // pmullw/pmulhw/pshuf
173       if (!signedMode && OpMinSize <= 16)
174         return LT.first * 5; // pmullw/pmulhw/pshuf
175     }
176     if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
177                                             LT.second)) {
178       return LT.first * Entry->Cost;
179     }
180   }
181 
182   if (ISD == ISD::SDIV &&
183       Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
184       Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
185     // On X86, vector signed division by constants power-of-two are
186     // normally expanded to the sequence SRA + SRL + ADD + SRA.
187     // The OperandValue properties many not be same as that of previous
188     // operation;conservatively assume OP_None.
189     int Cost = 2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info,
190                                           Op2Info, TargetTransformInfo::OP_None,
191                                           TargetTransformInfo::OP_None);
192     Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
193                                    TargetTransformInfo::OP_None,
194                                    TargetTransformInfo::OP_None);
195     Cost += getArithmeticInstrCost(Instruction::Add, Ty, Op1Info, Op2Info,
196                                    TargetTransformInfo::OP_None,
197                                    TargetTransformInfo::OP_None);
198 
199     return Cost;
200   }
201 
202   static const CostTblEntry AVX512BWUniformConstCostTable[] = {
203     { ISD::SHL,  MVT::v64i8,   2 }, // psllw + pand.
204     { ISD::SRL,  MVT::v64i8,   2 }, // psrlw + pand.
205     { ISD::SRA,  MVT::v64i8,   4 }, // psrlw, pand, pxor, psubb.
206 
207     { ISD::SDIV, MVT::v32i16,  6 }, // vpmulhw sequence
208     { ISD::UDIV, MVT::v32i16,  6 }, // vpmulhuw sequence
209   };
210 
211   if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
212       ST->hasBWI()) {
213     if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD,
214                                             LT.second))
215       return LT.first * Entry->Cost;
216   }
217 
218   static const CostTblEntry AVX512UniformConstCostTable[] = {
219     { ISD::SRA,  MVT::v2i64,   1 },
220     { ISD::SRA,  MVT::v4i64,   1 },
221     { ISD::SRA,  MVT::v8i64,   1 },
222 
223     { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
224     { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
225   };
226 
227   if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
228       ST->hasAVX512()) {
229     if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD,
230                                             LT.second))
231       return LT.first * Entry->Cost;
232   }
233 
234   static const CostTblEntry AVX2UniformConstCostTable[] = {
235     { ISD::SHL,  MVT::v32i8,   2 }, // psllw + pand.
236     { ISD::SRL,  MVT::v32i8,   2 }, // psrlw + pand.
237     { ISD::SRA,  MVT::v32i8,   4 }, // psrlw, pand, pxor, psubb.
238 
239     { ISD::SRA,  MVT::v4i64,   4 }, // 2 x psrad + shuffle.
240 
241     { ISD::SDIV, MVT::v16i16,  6 }, // vpmulhw sequence
242     { ISD::UDIV, MVT::v16i16,  6 }, // vpmulhuw sequence
243     { ISD::SDIV, MVT::v8i32,  15 }, // vpmuldq sequence
244     { ISD::UDIV, MVT::v8i32,  15 }, // vpmuludq sequence
245   };
246 
247   if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
248       ST->hasAVX2()) {
249     if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD,
250                                             LT.second))
251       return LT.first * Entry->Cost;
252   }
253 
254   static const CostTblEntry SSE2UniformConstCostTable[] = {
255     { ISD::SHL,  MVT::v16i8,     2 }, // psllw + pand.
256     { ISD::SRL,  MVT::v16i8,     2 }, // psrlw + pand.
257     { ISD::SRA,  MVT::v16i8,     4 }, // psrlw, pand, pxor, psubb.
258 
259     { ISD::SHL,  MVT::v32i8,   4+2 }, // 2*(psllw + pand) + split.
260     { ISD::SRL,  MVT::v32i8,   4+2 }, // 2*(psrlw + pand) + split.
261     { ISD::SRA,  MVT::v32i8,   8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.
262 
263     { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
264     { ISD::SDIV, MVT::v8i16,     6 }, // pmulhw sequence
265     { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split.
266     { ISD::UDIV, MVT::v8i16,     6 }, // pmulhuw sequence
267     { ISD::SDIV, MVT::v8i32,  38+2 }, // 2*pmuludq sequence + split.
268     { ISD::SDIV, MVT::v4i32,    19 }, // pmuludq sequence
269     { ISD::UDIV, MVT::v8i32,  30+2 }, // 2*pmuludq sequence + split.
270     { ISD::UDIV, MVT::v4i32,    15 }, // pmuludq sequence
271   };
272 
273   if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
274       ST->hasSSE2()) {
275     // pmuldq sequence.
276     if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
277       return LT.first * 32;
278     if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
279       return LT.first * 15;
280 
281     // XOP has faster vXi8 shifts.
282     if ((ISD != ISD::SHL && ISD != ISD::SRL && ISD != ISD::SRA) ||
283         !ST->hasXOP())
284       if (const auto *Entry =
285               CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
286         return LT.first * Entry->Cost;
287   }
288 
289   static const CostTblEntry AVX2UniformCostTable[] = {
290     // Uniform splats are cheaper for the following instructions.
291     { ISD::SHL,  MVT::v16i16, 1 }, // psllw.
292     { ISD::SRL,  MVT::v16i16, 1 }, // psrlw.
293     { ISD::SRA,  MVT::v16i16, 1 }, // psraw.
294   };
295 
296   if (ST->hasAVX2() &&
297       ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
298        (Op2Info == TargetTransformInfo::OK_UniformValue))) {
299     if (const auto *Entry =
300             CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
301       return LT.first * Entry->Cost;
302   }
303 
304   static const CostTblEntry SSE2UniformCostTable[] = {
305     // Uniform splats are cheaper for the following instructions.
306     { ISD::SHL,  MVT::v8i16,  1 }, // psllw.
307     { ISD::SHL,  MVT::v4i32,  1 }, // pslld
308     { ISD::SHL,  MVT::v2i64,  1 }, // psllq.
309 
310     { ISD::SRL,  MVT::v8i16,  1 }, // psrlw.
311     { ISD::SRL,  MVT::v4i32,  1 }, // psrld.
312     { ISD::SRL,  MVT::v2i64,  1 }, // psrlq.
313 
314     { ISD::SRA,  MVT::v8i16,  1 }, // psraw.
315     { ISD::SRA,  MVT::v4i32,  1 }, // psrad.
316   };
317 
318   if (ST->hasSSE2() &&
319       ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
320        (Op2Info == TargetTransformInfo::OK_UniformValue))) {
321     if (const auto *Entry =
322             CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
323       return LT.first * Entry->Cost;
324   }
325 
326   static const CostTblEntry AVX512DQCostTable[] = {
327     { ISD::MUL,  MVT::v2i64, 1 },
328     { ISD::MUL,  MVT::v4i64, 1 },
329     { ISD::MUL,  MVT::v8i64, 1 }
330   };
331 
332   // Look for AVX512DQ lowering tricks for custom cases.
333   if (ST->hasDQI())
334     if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
335       return LT.first * Entry->Cost;
336 
337   static const CostTblEntry AVX512BWCostTable[] = {
338     { ISD::SHL,   MVT::v8i16,      1 }, // vpsllvw
339     { ISD::SRL,   MVT::v8i16,      1 }, // vpsrlvw
340     { ISD::SRA,   MVT::v8i16,      1 }, // vpsravw
341 
342     { ISD::SHL,   MVT::v16i16,     1 }, // vpsllvw
343     { ISD::SRL,   MVT::v16i16,     1 }, // vpsrlvw
344     { ISD::SRA,   MVT::v16i16,     1 }, // vpsravw
345 
346     { ISD::SHL,   MVT::v32i16,     1 }, // vpsllvw
347     { ISD::SRL,   MVT::v32i16,     1 }, // vpsrlvw
348     { ISD::SRA,   MVT::v32i16,     1 }, // vpsravw
349 
350     { ISD::SHL,   MVT::v64i8,     11 }, // vpblendvb sequence.
351     { ISD::SRL,   MVT::v64i8,     11 }, // vpblendvb sequence.
352     { ISD::SRA,   MVT::v64i8,     24 }, // vpblendvb sequence.
353 
354     { ISD::MUL,   MVT::v64i8,     11 }, // extend/pmullw/trunc sequence.
355     { ISD::MUL,   MVT::v32i8,      4 }, // extend/pmullw/trunc sequence.
356     { ISD::MUL,   MVT::v16i8,      4 }, // extend/pmullw/trunc sequence.
357 
358     // Vectorizing division is a bad idea. See the SSE2 table for more comments.
359     { ISD::SDIV,  MVT::v64i8,  64*20 },
360     { ISD::SDIV,  MVT::v32i16, 32*20 },
361     { ISD::UDIV,  MVT::v64i8,  64*20 },
362     { ISD::UDIV,  MVT::v32i16, 32*20 }
363   };
364 
365   // Look for AVX512BW lowering tricks for custom cases.
366   if (ST->hasBWI())
367     if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
368       return LT.first * Entry->Cost;
369 
370   static const CostTblEntry AVX512CostTable[] = {
371     { ISD::SHL,     MVT::v16i32,     1 },
372     { ISD::SRL,     MVT::v16i32,     1 },
373     { ISD::SRA,     MVT::v16i32,     1 },
374 
375     { ISD::SHL,     MVT::v8i64,      1 },
376     { ISD::SRL,     MVT::v8i64,      1 },
377 
378     { ISD::SRA,     MVT::v2i64,      1 },
379     { ISD::SRA,     MVT::v4i64,      1 },
380     { ISD::SRA,     MVT::v8i64,      1 },
381 
382     { ISD::MUL,     MVT::v32i8,     13 }, // extend/pmullw/trunc sequence.
383     { ISD::MUL,     MVT::v16i8,      5 }, // extend/pmullw/trunc sequence.
384     { ISD::MUL,     MVT::v16i32,     1 }, // pmulld
385     { ISD::MUL,     MVT::v8i64,      8 }, // 3*pmuludq/3*shift/2*add
386 
387     // Vectorizing division is a bad idea. See the SSE2 table for more comments.
388     { ISD::SDIV,    MVT::v16i32, 16*20 },
389     { ISD::SDIV,    MVT::v8i64,   8*20 },
390     { ISD::UDIV,    MVT::v16i32, 16*20 },
391     { ISD::UDIV,    MVT::v8i64,   8*20 }
392   };
393 
394   if (ST->hasAVX512())
395     if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
396       return LT.first * Entry->Cost;
397 
398   static const CostTblEntry AVX2ShiftCostTable[] = {
399     // Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to
400     // customize them to detect the cases where shift amount is a scalar one.
401     { ISD::SHL,     MVT::v4i32,    1 },
402     { ISD::SRL,     MVT::v4i32,    1 },
403     { ISD::SRA,     MVT::v4i32,    1 },
404     { ISD::SHL,     MVT::v8i32,    1 },
405     { ISD::SRL,     MVT::v8i32,    1 },
406     { ISD::SRA,     MVT::v8i32,    1 },
407     { ISD::SHL,     MVT::v2i64,    1 },
408     { ISD::SRL,     MVT::v2i64,    1 },
409     { ISD::SHL,     MVT::v4i64,    1 },
410     { ISD::SRL,     MVT::v4i64,    1 },
411   };
412 
413   // Look for AVX2 lowering tricks.
414   if (ST->hasAVX2()) {
415     if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
416         (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
417          Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
418       // On AVX2, a packed v16i16 shift left by a constant build_vector
419       // is lowered into a vector multiply (vpmullw).
420       return LT.first;
421 
422     if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
423       return LT.first * Entry->Cost;
424   }
425 
426   static const CostTblEntry XOPShiftCostTable[] = {
427     // 128bit shifts take 1cy, but right shifts require negation beforehand.
428     { ISD::SHL,     MVT::v16i8,    1 },
429     { ISD::SRL,     MVT::v16i8,    2 },
430     { ISD::SRA,     MVT::v16i8,    2 },
431     { ISD::SHL,     MVT::v8i16,    1 },
432     { ISD::SRL,     MVT::v8i16,    2 },
433     { ISD::SRA,     MVT::v8i16,    2 },
434     { ISD::SHL,     MVT::v4i32,    1 },
435     { ISD::SRL,     MVT::v4i32,    2 },
436     { ISD::SRA,     MVT::v4i32,    2 },
437     { ISD::SHL,     MVT::v2i64,    1 },
438     { ISD::SRL,     MVT::v2i64,    2 },
439     { ISD::SRA,     MVT::v2i64,    2 },
440     // 256bit shifts require splitting if AVX2 didn't catch them above.
441     { ISD::SHL,     MVT::v32i8,  2+2 },
442     { ISD::SRL,     MVT::v32i8,  4+2 },
443     { ISD::SRA,     MVT::v32i8,  4+2 },
444     { ISD::SHL,     MVT::v16i16, 2+2 },
445     { ISD::SRL,     MVT::v16i16, 4+2 },
446     { ISD::SRA,     MVT::v16i16, 4+2 },
447     { ISD::SHL,     MVT::v8i32,  2+2 },
448     { ISD::SRL,     MVT::v8i32,  4+2 },
449     { ISD::SRA,     MVT::v8i32,  4+2 },
450     { ISD::SHL,     MVT::v4i64,  2+2 },
451     { ISD::SRL,     MVT::v4i64,  4+2 },
452     { ISD::SRA,     MVT::v4i64,  4+2 },
453   };
454 
455   // Look for XOP lowering tricks.
456   if (ST->hasXOP())
457     if (const auto *Entry = CostTableLookup(XOPShiftCostTable, ISD, LT.second))
458       return LT.first * Entry->Cost;
459 
460   static const CostTblEntry SSE2UniformShiftCostTable[] = {
461     // Uniform splats are cheaper for the following instructions.
462     { ISD::SHL,  MVT::v16i16, 2+2 }, // 2*psllw + split.
463     { ISD::SHL,  MVT::v8i32,  2+2 }, // 2*pslld + split.
464     { ISD::SHL,  MVT::v4i64,  2+2 }, // 2*psllq + split.
465 
466     { ISD::SRL,  MVT::v16i16, 2+2 }, // 2*psrlw + split.
467     { ISD::SRL,  MVT::v8i32,  2+2 }, // 2*psrld + split.
468     { ISD::SRL,  MVT::v4i64,  2+2 }, // 2*psrlq + split.
469 
470     { ISD::SRA,  MVT::v16i16, 2+2 }, // 2*psraw + split.
471     { ISD::SRA,  MVT::v8i32,  2+2 }, // 2*psrad + split.
472     { ISD::SRA,  MVT::v2i64,    4 }, // 2*psrad + shuffle.
473     { ISD::SRA,  MVT::v4i64,  8+2 }, // 2*(2*psrad + shuffle) + split.
474   };
475 
476   if (ST->hasSSE2() &&
477       ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
478        (Op2Info == TargetTransformInfo::OK_UniformValue))) {
479 
480     // Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table.
481     if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2())
482       return LT.first * 4; // 2*psrad + shuffle.
483 
484     if (const auto *Entry =
485             CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second))
486       return LT.first * Entry->Cost;
487   }
488 
489   if (ISD == ISD::SHL &&
490       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
491     MVT VT = LT.second;
492     // Vector shift left by non uniform constant can be lowered
493     // into vector multiply.
494     if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
495         ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
496       ISD = ISD::MUL;
497   }
498 
499   static const CostTblEntry AVX2CostTable[] = {
500     { ISD::SHL,  MVT::v32i8,     11 }, // vpblendvb sequence.
501     { ISD::SHL,  MVT::v16i16,    10 }, // extend/vpsrlvd/pack sequence.
502 
503     { ISD::SRL,  MVT::v32i8,     11 }, // vpblendvb sequence.
504     { ISD::SRL,  MVT::v16i16,    10 }, // extend/vpsrlvd/pack sequence.
505 
506     { ISD::SRA,  MVT::v32i8,     24 }, // vpblendvb sequence.
507     { ISD::SRA,  MVT::v16i16,    10 }, // extend/vpsravd/pack sequence.
508     { ISD::SRA,  MVT::v2i64,      4 }, // srl/xor/sub sequence.
509     { ISD::SRA,  MVT::v4i64,      4 }, // srl/xor/sub sequence.
510 
511     { ISD::SUB,  MVT::v32i8,      1 }, // psubb
512     { ISD::ADD,  MVT::v32i8,      1 }, // paddb
513     { ISD::SUB,  MVT::v16i16,     1 }, // psubw
514     { ISD::ADD,  MVT::v16i16,     1 }, // paddw
515     { ISD::SUB,  MVT::v8i32,      1 }, // psubd
516     { ISD::ADD,  MVT::v8i32,      1 }, // paddd
517     { ISD::SUB,  MVT::v4i64,      1 }, // psubq
518     { ISD::ADD,  MVT::v4i64,      1 }, // paddq
519 
520     { ISD::MUL,  MVT::v32i8,     17 }, // extend/pmullw/trunc sequence.
521     { ISD::MUL,  MVT::v16i8,      7 }, // extend/pmullw/trunc sequence.
522     { ISD::MUL,  MVT::v16i16,     1 }, // pmullw
523     { ISD::MUL,  MVT::v8i32,      1 }, // pmulld
524     { ISD::MUL,  MVT::v4i64,      8 }, // 3*pmuludq/3*shift/2*add
525 
526     { ISD::FDIV, MVT::f32,        7 }, // Haswell from http://www.agner.org/
527     { ISD::FDIV, MVT::v4f32,      7 }, // Haswell from http://www.agner.org/
528     { ISD::FDIV, MVT::v8f32,     14 }, // Haswell from http://www.agner.org/
529     { ISD::FDIV, MVT::f64,       14 }, // Haswell from http://www.agner.org/
530     { ISD::FDIV, MVT::v2f64,     14 }, // Haswell from http://www.agner.org/
531     { ISD::FDIV, MVT::v4f64,     28 }, // Haswell from http://www.agner.org/
532   };
533 
534   // Look for AVX2 lowering tricks for custom cases.
535   if (ST->hasAVX2())
536     if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
537       return LT.first * Entry->Cost;
538 
539   static const CostTblEntry AVX1CostTable[] = {
540     // We don't have to scalarize unsupported ops. We can issue two half-sized
541     // operations and we only need to extract the upper YMM half.
542     // Two ops + 1 extract + 1 insert = 4.
543     { ISD::MUL,     MVT::v16i16,     4 },
544     { ISD::MUL,     MVT::v8i32,      4 },
545     { ISD::SUB,     MVT::v32i8,      4 },
546     { ISD::ADD,     MVT::v32i8,      4 },
547     { ISD::SUB,     MVT::v16i16,     4 },
548     { ISD::ADD,     MVT::v16i16,     4 },
549     { ISD::SUB,     MVT::v8i32,      4 },
550     { ISD::ADD,     MVT::v8i32,      4 },
551     { ISD::SUB,     MVT::v4i64,      4 },
552     { ISD::ADD,     MVT::v4i64,      4 },
553 
554     // A v4i64 multiply is custom lowered as two split v2i64 vectors that then
555     // are lowered as a series of long multiplies(3), shifts(3) and adds(2)
556     // Because we believe v4i64 to be a legal type, we must also include the
557     // extract+insert in the cost table. Therefore, the cost here is 18
558     // instead of 8.
559     { ISD::MUL,     MVT::v4i64,     18 },
560 
561     { ISD::MUL,     MVT::v32i8,     26 }, // extend/pmullw/trunc sequence.
562 
563     { ISD::FDIV,    MVT::f32,       14 }, // SNB from http://www.agner.org/
564     { ISD::FDIV,    MVT::v4f32,     14 }, // SNB from http://www.agner.org/
565     { ISD::FDIV,    MVT::v8f32,     28 }, // SNB from http://www.agner.org/
566     { ISD::FDIV,    MVT::f64,       22 }, // SNB from http://www.agner.org/
567     { ISD::FDIV,    MVT::v2f64,     22 }, // SNB from http://www.agner.org/
568     { ISD::FDIV,    MVT::v4f64,     44 }, // SNB from http://www.agner.org/
569 
570     // Vectorizing division is a bad idea. See the SSE2 table for more comments.
571     { ISD::SDIV,    MVT::v32i8,  32*20 },
572     { ISD::SDIV,    MVT::v16i16, 16*20 },
573     { ISD::SDIV,    MVT::v8i32,   8*20 },
574     { ISD::SDIV,    MVT::v4i64,   4*20 },
575     { ISD::UDIV,    MVT::v32i8,  32*20 },
576     { ISD::UDIV,    MVT::v16i16, 16*20 },
577     { ISD::UDIV,    MVT::v8i32,   8*20 },
578     { ISD::UDIV,    MVT::v4i64,   4*20 },
579   };
580 
581   if (ST->hasAVX())
582     if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
583       return LT.first * Entry->Cost;
584 
585   static const CostTblEntry SSE42CostTable[] = {
586     { ISD::FDIV,  MVT::f32,   14 }, // Nehalem from http://www.agner.org/
587     { ISD::FDIV,  MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
588     { ISD::FDIV,  MVT::f64,   22 }, // Nehalem from http://www.agner.org/
589     { ISD::FDIV,  MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/
590   };
591 
592   if (ST->hasSSE42())
593     if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
594       return LT.first * Entry->Cost;
595 
596   static const CostTblEntry SSE41CostTable[] = {
597     { ISD::SHL,  MVT::v16i8,      11 }, // pblendvb sequence.
598     { ISD::SHL,  MVT::v32i8,  2*11+2 }, // pblendvb sequence + split.
599     { ISD::SHL,  MVT::v8i16,      14 }, // pblendvb sequence.
600     { ISD::SHL,  MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
601     { ISD::SHL,  MVT::v4i32,       4 }, // pslld/paddd/cvttps2dq/pmulld
602     { ISD::SHL,  MVT::v8i32,   2*4+2 }, // pslld/paddd/cvttps2dq/pmulld + split
603 
604     { ISD::SRL,  MVT::v16i8,      12 }, // pblendvb sequence.
605     { ISD::SRL,  MVT::v32i8,  2*12+2 }, // pblendvb sequence + split.
606     { ISD::SRL,  MVT::v8i16,      14 }, // pblendvb sequence.
607     { ISD::SRL,  MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
608     { ISD::SRL,  MVT::v4i32,      11 }, // Shift each lane + blend.
609     { ISD::SRL,  MVT::v8i32,  2*11+2 }, // Shift each lane + blend + split.
610 
611     { ISD::SRA,  MVT::v16i8,      24 }, // pblendvb sequence.
612     { ISD::SRA,  MVT::v32i8,  2*24+2 }, // pblendvb sequence + split.
613     { ISD::SRA,  MVT::v8i16,      14 }, // pblendvb sequence.
614     { ISD::SRA,  MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
615     { ISD::SRA,  MVT::v4i32,      12 }, // Shift each lane + blend.
616     { ISD::SRA,  MVT::v8i32,  2*12+2 }, // Shift each lane + blend + split.
617 
618     { ISD::MUL,  MVT::v4i32,       1 }  // pmulld
619   };
620 
621   if (ST->hasSSE41())
622     if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
623       return LT.first * Entry->Cost;
624 
625   static const CostTblEntry SSE2CostTable[] = {
626     // We don't correctly identify costs of casts because they are marked as
627     // custom.
628     { ISD::SHL,  MVT::v16i8,      26 }, // cmpgtb sequence.
629     { ISD::SHL,  MVT::v8i16,      32 }, // cmpgtb sequence.
630     { ISD::SHL,  MVT::v4i32,     2*5 }, // We optimized this using mul.
631     { ISD::SHL,  MVT::v2i64,       4 }, // splat+shuffle sequence.
632     { ISD::SHL,  MVT::v4i64,   2*4+2 }, // splat+shuffle sequence + split.
633 
634     { ISD::SRL,  MVT::v16i8,      26 }, // cmpgtb sequence.
635     { ISD::SRL,  MVT::v8i16,      32 }, // cmpgtb sequence.
636     { ISD::SRL,  MVT::v4i32,      16 }, // Shift each lane + blend.
637     { ISD::SRL,  MVT::v2i64,       4 }, // splat+shuffle sequence.
638     { ISD::SRL,  MVT::v4i64,   2*4+2 }, // splat+shuffle sequence + split.
639 
640     { ISD::SRA,  MVT::v16i8,      54 }, // unpacked cmpgtb sequence.
641     { ISD::SRA,  MVT::v8i16,      32 }, // cmpgtb sequence.
642     { ISD::SRA,  MVT::v4i32,      16 }, // Shift each lane + blend.
643     { ISD::SRA,  MVT::v2i64,      12 }, // srl/xor/sub sequence.
644     { ISD::SRA,  MVT::v4i64,  2*12+2 }, // srl/xor/sub sequence+split.
645 
646     { ISD::MUL,  MVT::v16i8,      12 }, // extend/pmullw/trunc sequence.
647     { ISD::MUL,  MVT::v8i16,       1 }, // pmullw
648     { ISD::MUL,  MVT::v4i32,       6 }, // 3*pmuludq/4*shuffle
649     { ISD::MUL,  MVT::v2i64,       8 }, // 3*pmuludq/3*shift/2*add
650 
651     { ISD::FDIV, MVT::f32,        23 }, // Pentium IV from http://www.agner.org/
652     { ISD::FDIV, MVT::v4f32,      39 }, // Pentium IV from http://www.agner.org/
653     { ISD::FDIV, MVT::f64,        38 }, // Pentium IV from http://www.agner.org/
654     { ISD::FDIV, MVT::v2f64,      69 }, // Pentium IV from http://www.agner.org/
655 
656     // It is not a good idea to vectorize division. We have to scalarize it and
657     // in the process we will often end up having to spilling regular
658     // registers. The overhead of division is going to dominate most kernels
659     // anyways so try hard to prevent vectorization of division - it is
660     // generally a bad idea. Assume somewhat arbitrarily that we have to be able
661     // to hide "20 cycles" for each lane.
662     { ISD::SDIV,  MVT::v16i8,  16*20 },
663     { ISD::SDIV,  MVT::v8i16,   8*20 },
664     { ISD::SDIV,  MVT::v4i32,   4*20 },
665     { ISD::SDIV,  MVT::v2i64,   2*20 },
666     { ISD::UDIV,  MVT::v16i8,  16*20 },
667     { ISD::UDIV,  MVT::v8i16,   8*20 },
668     { ISD::UDIV,  MVT::v4i32,   4*20 },
669     { ISD::UDIV,  MVT::v2i64,   2*20 },
670   };
671 
672   if (ST->hasSSE2())
673     if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
674       return LT.first * Entry->Cost;
675 
676   static const CostTblEntry SSE1CostTable[] = {
677     { ISD::FDIV, MVT::f32,   17 }, // Pentium III from http://www.agner.org/
678     { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/
679   };
680 
681   if (ST->hasSSE1())
682     if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
683       return LT.first * Entry->Cost;
684 
685   // Fallback to the default implementation.
686   return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info);
687 }
688 
689 int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
690                                Type *SubTp) {
691   // 64-bit packed float vectors (v2f32) are widened to type v4f32.
692   // 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
693   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
694 
695   // For Broadcasts we are splatting the first element from the first input
696   // register, so only need to reference that input and all the output
697   // registers are the same.
698   if (Kind == TTI::SK_Broadcast)
699     LT.first = 1;
700 
701   // We are going to permute multiple sources and the result will be in multiple
702   // destinations. Providing an accurate cost only for splits where the element
703   // type remains the same.
704   if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
705     MVT LegalVT = LT.second;
706     if (LegalVT.getVectorElementType().getSizeInBits() ==
707             Tp->getVectorElementType()->getPrimitiveSizeInBits() &&
708         LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) {
709 
710       unsigned VecTySize = DL.getTypeStoreSize(Tp);
711       unsigned LegalVTSize = LegalVT.getStoreSize();
712       // Number of source vectors after legalization:
713       unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
714       // Number of destination vectors after legalization:
715       unsigned NumOfDests = LT.first;
716 
717       Type *SingleOpTy = VectorType::get(Tp->getVectorElementType(),
718                                          LegalVT.getVectorNumElements());
719 
720       unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
721       return NumOfShuffles *
722              getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr);
723     }
724 
725     return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
726   }
727 
728   // For 2-input shuffles, we must account for splitting the 2 inputs into many.
729   if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
730     // We assume that source and destination have the same vector type.
731     int NumOfDests = LT.first;
732     int NumOfShufflesPerDest = LT.first * 2 - 1;
733     LT.first = NumOfDests * NumOfShufflesPerDest;
734   }
735 
736   static const CostTblEntry AVX512VBMIShuffleTbl[] = {
737     { TTI::SK_Reverse,          MVT::v64i8,  1 }, // vpermb
738     { TTI::SK_Reverse,          MVT::v32i8,  1 }, // vpermb
739 
740     { TTI::SK_PermuteSingleSrc, MVT::v64i8,  1 }, // vpermb
741     { TTI::SK_PermuteSingleSrc, MVT::v32i8,  1 }, // vpermb
742 
743     { TTI::SK_PermuteTwoSrc,    MVT::v64i8,  1 }, // vpermt2b
744     { TTI::SK_PermuteTwoSrc,    MVT::v32i8,  1 }, // vpermt2b
745     { TTI::SK_PermuteTwoSrc,    MVT::v16i8,  1 }  // vpermt2b
746   };
747 
748   if (ST->hasVBMI())
749     if (const auto *Entry =
750             CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
751       return LT.first * Entry->Cost;
752 
753   static const CostTblEntry AVX512BWShuffleTbl[] = {
754     { TTI::SK_Broadcast,        MVT::v32i16, 1 }, // vpbroadcastw
755     { TTI::SK_Broadcast,        MVT::v64i8,  1 }, // vpbroadcastb
756 
757     { TTI::SK_Reverse,          MVT::v32i16, 1 }, // vpermw
758     { TTI::SK_Reverse,          MVT::v16i16, 1 }, // vpermw
759     { TTI::SK_Reverse,          MVT::v64i8,  2 }, // pshufb + vshufi64x2
760 
761     { TTI::SK_PermuteSingleSrc, MVT::v32i16, 1 }, // vpermw
762     { TTI::SK_PermuteSingleSrc, MVT::v16i16, 1 }, // vpermw
763     { TTI::SK_PermuteSingleSrc, MVT::v8i16,  1 }, // vpermw
764     { TTI::SK_PermuteSingleSrc, MVT::v64i8,  8 }, // extend to v32i16
765     { TTI::SK_PermuteSingleSrc, MVT::v32i8,  3 }, // vpermw + zext/trunc
766 
767     { TTI::SK_PermuteTwoSrc,    MVT::v32i16, 1 }, // vpermt2w
768     { TTI::SK_PermuteTwoSrc,    MVT::v16i16, 1 }, // vpermt2w
769     { TTI::SK_PermuteTwoSrc,    MVT::v8i16,  1 }, // vpermt2w
770     { TTI::SK_PermuteTwoSrc,    MVT::v32i8,  3 }, // zext + vpermt2w + trunc
771     { TTI::SK_PermuteTwoSrc,    MVT::v64i8, 19 }, // 6 * v32i8 + 1
772     { TTI::SK_PermuteTwoSrc,    MVT::v16i8,  3 }  // zext + vpermt2w + trunc
773   };
774 
775   if (ST->hasBWI())
776     if (const auto *Entry =
777             CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
778       return LT.first * Entry->Cost;
779 
780   static const CostTblEntry AVX512ShuffleTbl[] = {
781     { TTI::SK_Broadcast,        MVT::v8f64,  1 }, // vbroadcastpd
782     { TTI::SK_Broadcast,        MVT::v16f32, 1 }, // vbroadcastps
783     { TTI::SK_Broadcast,        MVT::v8i64,  1 }, // vpbroadcastq
784     { TTI::SK_Broadcast,        MVT::v16i32, 1 }, // vpbroadcastd
785 
786     { TTI::SK_Reverse,          MVT::v8f64,  1 }, // vpermpd
787     { TTI::SK_Reverse,          MVT::v16f32, 1 }, // vpermps
788     { TTI::SK_Reverse,          MVT::v8i64,  1 }, // vpermq
789     { TTI::SK_Reverse,          MVT::v16i32, 1 }, // vpermd
790 
791     { TTI::SK_PermuteSingleSrc, MVT::v8f64,  1 }, // vpermpd
792     { TTI::SK_PermuteSingleSrc, MVT::v4f64,  1 }, // vpermpd
793     { TTI::SK_PermuteSingleSrc, MVT::v2f64,  1 }, // vpermpd
794     { TTI::SK_PermuteSingleSrc, MVT::v16f32, 1 }, // vpermps
795     { TTI::SK_PermuteSingleSrc, MVT::v8f32,  1 }, // vpermps
796     { TTI::SK_PermuteSingleSrc, MVT::v4f32,  1 }, // vpermps
797     { TTI::SK_PermuteSingleSrc, MVT::v8i64,  1 }, // vpermq
798     { TTI::SK_PermuteSingleSrc, MVT::v4i64,  1 }, // vpermq
799     { TTI::SK_PermuteSingleSrc, MVT::v2i64,  1 }, // vpermq
800     { TTI::SK_PermuteSingleSrc, MVT::v16i32, 1 }, // vpermd
801     { TTI::SK_PermuteSingleSrc, MVT::v8i32,  1 }, // vpermd
802     { TTI::SK_PermuteSingleSrc, MVT::v4i32,  1 }, // vpermd
803     { TTI::SK_PermuteSingleSrc, MVT::v16i8,  1 }, // pshufb
804 
805     { TTI::SK_PermuteTwoSrc,    MVT::v8f64,  1 }, // vpermt2pd
806     { TTI::SK_PermuteTwoSrc,    MVT::v16f32, 1 }, // vpermt2ps
807     { TTI::SK_PermuteTwoSrc,    MVT::v8i64,  1 }, // vpermt2q
808     { TTI::SK_PermuteTwoSrc,    MVT::v16i32, 1 }, // vpermt2d
809     { TTI::SK_PermuteTwoSrc,    MVT::v4f64,  1 }, // vpermt2pd
810     { TTI::SK_PermuteTwoSrc,    MVT::v8f32,  1 }, // vpermt2ps
811     { TTI::SK_PermuteTwoSrc,    MVT::v4i64,  1 }, // vpermt2q
812     { TTI::SK_PermuteTwoSrc,    MVT::v8i32,  1 }, // vpermt2d
813     { TTI::SK_PermuteTwoSrc,    MVT::v2f64,  1 }, // vpermt2pd
814     { TTI::SK_PermuteTwoSrc,    MVT::v4f32,  1 }, // vpermt2ps
815     { TTI::SK_PermuteTwoSrc,    MVT::v2i64,  1 }, // vpermt2q
816     { TTI::SK_PermuteTwoSrc,    MVT::v4i32,  1 }  // vpermt2d
817   };
818 
819   if (ST->hasAVX512())
820     if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
821       return LT.first * Entry->Cost;
822 
823   static const CostTblEntry AVX2ShuffleTbl[] = {
824     { TTI::SK_Broadcast, MVT::v4f64,  1 }, // vbroadcastpd
825     { TTI::SK_Broadcast, MVT::v8f32,  1 }, // vbroadcastps
826     { TTI::SK_Broadcast, MVT::v4i64,  1 }, // vpbroadcastq
827     { TTI::SK_Broadcast, MVT::v8i32,  1 }, // vpbroadcastd
828     { TTI::SK_Broadcast, MVT::v16i16, 1 }, // vpbroadcastw
829     { TTI::SK_Broadcast, MVT::v32i8,  1 }, // vpbroadcastb
830 
831     { TTI::SK_Reverse,   MVT::v4f64,  1 }, // vpermpd
832     { TTI::SK_Reverse,   MVT::v8f32,  1 }, // vpermps
833     { TTI::SK_Reverse,   MVT::v4i64,  1 }, // vpermq
834     { TTI::SK_Reverse,   MVT::v8i32,  1 }, // vpermd
835     { TTI::SK_Reverse,   MVT::v16i16, 2 }, // vperm2i128 + pshufb
836     { TTI::SK_Reverse,   MVT::v32i8,  2 }, // vperm2i128 + pshufb
837 
838     { TTI::SK_Alternate, MVT::v16i16, 1 }, // vpblendw
839     { TTI::SK_Alternate, MVT::v32i8,  1 }, // vpblendvb
840 
841     { TTI::SK_PermuteSingleSrc, MVT::v4i64,  1 }, // vpermq
842     { TTI::SK_PermuteSingleSrc, MVT::v8i32,  1 }, // vpermd
843     { TTI::SK_PermuteSingleSrc, MVT::v16i16, 4 }, // vperm2i128 + 2 * vpshufb
844                                                   // + vpblendvb
845     { TTI::SK_PermuteSingleSrc, MVT::v32i8,  4 }  // vperm2i128 + 2 * vpshufb
846                                                   // + vpblendvb
847   };
848 
849   if (ST->hasAVX2())
850     if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
851       return LT.first * Entry->Cost;
852 
853   static const CostTblEntry AVX1ShuffleTbl[] = {
854     { TTI::SK_Broadcast, MVT::v4f64,  2 }, // vperm2f128 + vpermilpd
855     { TTI::SK_Broadcast, MVT::v8f32,  2 }, // vperm2f128 + vpermilps
856     { TTI::SK_Broadcast, MVT::v4i64,  2 }, // vperm2f128 + vpermilpd
857     { TTI::SK_Broadcast, MVT::v8i32,  2 }, // vperm2f128 + vpermilps
858     { TTI::SK_Broadcast, MVT::v16i16, 3 }, // vpshuflw + vpshufd + vinsertf128
859     { TTI::SK_Broadcast, MVT::v32i8,  2 }, // vpshufb + vinsertf128
860 
861     { TTI::SK_Reverse,   MVT::v4f64,  2 }, // vperm2f128 + vpermilpd
862     { TTI::SK_Reverse,   MVT::v8f32,  2 }, // vperm2f128 + vpermilps
863     { TTI::SK_Reverse,   MVT::v4i64,  2 }, // vperm2f128 + vpermilpd
864     { TTI::SK_Reverse,   MVT::v8i32,  2 }, // vperm2f128 + vpermilps
865     { TTI::SK_Reverse,   MVT::v16i16, 4 }, // vextractf128 + 2*pshufb
866                                            // + vinsertf128
867     { TTI::SK_Reverse,   MVT::v32i8,  4 }, // vextractf128 + 2*pshufb
868                                            // + vinsertf128
869 
870     { TTI::SK_Alternate, MVT::v4i64,  1 }, // vblendpd
871     { TTI::SK_Alternate, MVT::v4f64,  1 }, // vblendpd
872     { TTI::SK_Alternate, MVT::v8i32,  1 }, // vblendps
873     { TTI::SK_Alternate, MVT::v8f32,  1 }, // vblendps
874     { TTI::SK_Alternate, MVT::v16i16, 3 }, // vpand + vpandn + vpor
875     { TTI::SK_Alternate, MVT::v32i8,  3 }  // vpand + vpandn + vpor
876   };
877 
878   if (ST->hasAVX())
879     if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
880       return LT.first * Entry->Cost;
881 
882   static const CostTblEntry SSE41ShuffleTbl[] = {
883     { TTI::SK_Alternate, MVT::v2i64,  1 }, // pblendw
884     { TTI::SK_Alternate, MVT::v2f64,  1 }, // movsd
885     { TTI::SK_Alternate, MVT::v4i32,  1 }, // pblendw
886     { TTI::SK_Alternate, MVT::v4f32,  1 }, // blendps
887     { TTI::SK_Alternate, MVT::v8i16,  1 }, // pblendw
888     { TTI::SK_Alternate, MVT::v16i8,  1 }  // pblendvb
889   };
890 
891   if (ST->hasSSE41())
892     if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
893       return LT.first * Entry->Cost;
894 
895   static const CostTblEntry SSSE3ShuffleTbl[] = {
896     { TTI::SK_Broadcast, MVT::v8i16,  1 }, // pshufb
897     { TTI::SK_Broadcast, MVT::v16i8,  1 }, // pshufb
898 
899     { TTI::SK_Reverse,   MVT::v8i16,  1 }, // pshufb
900     { TTI::SK_Reverse,   MVT::v16i8,  1 }, // pshufb
901 
902     { TTI::SK_Alternate, MVT::v8i16,  3 }, // pshufb + pshufb + por
903     { TTI::SK_Alternate, MVT::v16i8,  3 }, // pshufb + pshufb + por
904 
905     { TTI::SK_PermuteSingleSrc, MVT::v8i16, 1 }, // pshufb
906     { TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 }  // pshufb
907   };
908 
909   if (ST->hasSSSE3())
910     if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
911       return LT.first * Entry->Cost;
912 
913   static const CostTblEntry SSE2ShuffleTbl[] = {
914     { TTI::SK_Broadcast, MVT::v2f64,  1 }, // shufpd
915     { TTI::SK_Broadcast, MVT::v2i64,  1 }, // pshufd
916     { TTI::SK_Broadcast, MVT::v4i32,  1 }, // pshufd
917     { TTI::SK_Broadcast, MVT::v8i16,  2 }, // pshuflw  + pshufd
918     { TTI::SK_Broadcast, MVT::v16i8,  3 }, // unpck + pshuflw + pshufd
919 
920     { TTI::SK_Reverse,   MVT::v2f64,  1 }, // shufpd
921     { TTI::SK_Reverse,   MVT::v2i64,  1 }, // pshufd
922     { TTI::SK_Reverse,   MVT::v4i32,  1 }, // pshufd
923     { TTI::SK_Reverse,   MVT::v8i16,  3 }, // pshuflw + pshufhw  + pshufd
924     { TTI::SK_Reverse,   MVT::v16i8,  9 }, // 2*pshuflw + 2*pshufhw
925                                            // + 2*pshufd + 2*unpck + packus
926 
927     { TTI::SK_Alternate, MVT::v2i64,  1 }, // movsd
928     { TTI::SK_Alternate, MVT::v2f64,  1 }, // movsd
929     { TTI::SK_Alternate, MVT::v4i32,  2 }, // 2*shufps
930     { TTI::SK_Alternate, MVT::v8i16,  3 }, // pand + pandn + por
931     { TTI::SK_Alternate, MVT::v16i8,  3 }, // pand + pandn + por
932 
933     { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // pshufd
934     { TTI::SK_PermuteSingleSrc, MVT::v4i32, 1 }  // pshufd
935   };
936 
937   if (ST->hasSSE2())
938     if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
939       return LT.first * Entry->Cost;
940 
941   static const CostTblEntry SSE1ShuffleTbl[] = {
942     { TTI::SK_Broadcast, MVT::v4f32,  1 }, // shufps
943     { TTI::SK_Reverse,   MVT::v4f32,  1 }, // shufps
944     { TTI::SK_Alternate, MVT::v4f32,  2 }  // 2*shufps
945   };
946 
947   if (ST->hasSSE1())
948     if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
949       return LT.first * Entry->Cost;
950 
951   return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
952 }
953 
954 int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
955                                  const Instruction *I) {
956   int ISD = TLI->InstructionOpcodeToISD(Opcode);
957   assert(ISD && "Invalid opcode");
958 
959   // FIXME: Need a better design of the cost table to handle non-simple types of
960   // potential massive combinations (elem_num x src_type x dst_type).
961 
962   static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
963     { ISD::SINT_TO_FP,  MVT::v2f32,  MVT::v2i64,  1 },
964     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  1 },
965     { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i64,  1 },
966     { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i64,  1 },
967     { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i64,  1 },
968     { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i64,  1 },
969 
970     { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i64,  1 },
971     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  1 },
972     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i64,  1 },
973     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i64,  1 },
974     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i64,  1 },
975     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i64,  1 },
976 
977     { ISD::FP_TO_SINT,  MVT::v2i64,  MVT::v2f32,  1 },
978     { ISD::FP_TO_SINT,  MVT::v4i64,  MVT::v4f32,  1 },
979     { ISD::FP_TO_SINT,  MVT::v8i64,  MVT::v8f32,  1 },
980     { ISD::FP_TO_SINT,  MVT::v2i64,  MVT::v2f64,  1 },
981     { ISD::FP_TO_SINT,  MVT::v4i64,  MVT::v4f64,  1 },
982     { ISD::FP_TO_SINT,  MVT::v8i64,  MVT::v8f64,  1 },
983 
984     { ISD::FP_TO_UINT,  MVT::v2i64,  MVT::v2f32,  1 },
985     { ISD::FP_TO_UINT,  MVT::v4i64,  MVT::v4f32,  1 },
986     { ISD::FP_TO_UINT,  MVT::v8i64,  MVT::v8f32,  1 },
987     { ISD::FP_TO_UINT,  MVT::v2i64,  MVT::v2f64,  1 },
988     { ISD::FP_TO_UINT,  MVT::v4i64,  MVT::v4f64,  1 },
989     { ISD::FP_TO_UINT,  MVT::v8i64,  MVT::v8f64,  1 },
990   };
991 
992   // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
993   // 256-bit wide vectors.
994 
995   static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
996     { ISD::FP_EXTEND, MVT::v8f64,   MVT::v8f32,  1 },
997     { ISD::FP_EXTEND, MVT::v8f64,   MVT::v16f32, 3 },
998     { ISD::FP_ROUND,  MVT::v8f32,   MVT::v8f64,  1 },
999 
1000     { ISD::TRUNCATE,  MVT::v16i8,   MVT::v16i32, 1 },
1001     { ISD::TRUNCATE,  MVT::v16i16,  MVT::v16i32, 1 },
1002     { ISD::TRUNCATE,  MVT::v8i16,   MVT::v8i64,  1 },
1003     { ISD::TRUNCATE,  MVT::v8i32,   MVT::v8i64,  1 },
1004 
1005     // v16i1 -> v16i32 - load + broadcast
1006     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1,  2 },
1007     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1,  2 },
1008     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8,  1 },
1009     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8,  1 },
1010     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
1011     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
1012     { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i16,  1 },
1013     { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i16,  1 },
1014     { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i32,  1 },
1015     { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i32,  1 },
1016 
1017     { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i1,   4 },
1018     { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i1,  3 },
1019     { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i8,   2 },
1020     { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i8,  2 },
1021     { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i16,  2 },
1022     { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i16, 2 },
1023     { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i32, 1 },
1024     { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  1 },
1025     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i64, 26 },
1026     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i64, 26 },
1027 
1028     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i1,   4 },
1029     { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i1,  3 },
1030     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i8,   2 },
1031     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i8,   2 },
1032     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i8,   2 },
1033     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i8,   2 },
1034     { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i8,  2 },
1035     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i16,  5 },
1036     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i16,  2 },
1037     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  2 },
1038     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i16,  2 },
1039     { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i16, 2 },
1040     { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  2 },
1041     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i32,  1 },
1042     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  1 },
1043     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  1 },
1044     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  1 },
1045     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  1 },
1046     { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i32, 1 },
1047     { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i64,  5 },
1048     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  5 },
1049     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i64, 12 },
1050     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i64, 26 },
1051 
1052     { ISD::FP_TO_UINT,  MVT::v2i32,  MVT::v2f32,  1 },
1053     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  1 },
1054     { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f32,  1 },
1055     { ISD::FP_TO_UINT,  MVT::v16i32, MVT::v16f32, 1 },
1056   };
1057 
1058   static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
1059     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   3 },
1060     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   3 },
1061     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   3 },
1062     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   3 },
1063     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i8,   3 },
1064     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i8,   3 },
1065     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i8,   3 },
1066     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i8,   3 },
1067     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8,  1 },
1068     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8,  1 },
1069     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i16,  3 },
1070     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i16,  3 },
1071     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  1 },
1072     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  1 },
1073     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32,  1 },
1074     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32,  1 },
1075 
1076     { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i64,  2 },
1077     { ISD::TRUNCATE,    MVT::v4i16,  MVT::v4i64,  2 },
1078     { ISD::TRUNCATE,    MVT::v4i32,  MVT::v4i64,  2 },
1079     { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i32,  2 },
1080     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  2 },
1081     { ISD::TRUNCATE,    MVT::v8i32,  MVT::v8i64,  4 },
1082 
1083     { ISD::FP_EXTEND,   MVT::v8f64,  MVT::v8f32,  3 },
1084     { ISD::FP_ROUND,    MVT::v8f32,  MVT::v8f64,  3 },
1085 
1086     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  8 },
1087   };
1088 
1089   static const TypeConversionCostTblEntry AVXConversionTbl[] = {
1090     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,  6 },
1091     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,  4 },
1092     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,  7 },
1093     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,  4 },
1094     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i8,  6 },
1095     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i8,  4 },
1096     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i8,  7 },
1097     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i8,  4 },
1098     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
1099     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
1100     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i16, 6 },
1101     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i16, 3 },
1102     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16, 4 },
1103     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16, 4 },
1104     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32, 4 },
1105     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32, 4 },
1106 
1107     { ISD::TRUNCATE,    MVT::v16i8, MVT::v16i16, 4 },
1108     { ISD::TRUNCATE,    MVT::v8i8,  MVT::v8i32,  4 },
1109     { ISD::TRUNCATE,    MVT::v8i16, MVT::v8i32,  5 },
1110     { ISD::TRUNCATE,    MVT::v4i8,  MVT::v4i64,  4 },
1111     { ISD::TRUNCATE,    MVT::v4i16, MVT::v4i64,  4 },
1112     { ISD::TRUNCATE,    MVT::v4i32, MVT::v4i64,  4 },
1113     { ISD::TRUNCATE,    MVT::v8i32, MVT::v8i64,  9 },
1114 
1115     { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i1,  3 },
1116     { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i1,  3 },
1117     { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i1,  8 },
1118     { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i8,  3 },
1119     { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i8,  3 },
1120     { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i8,  8 },
1121     { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i16, 3 },
1122     { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i16, 3 },
1123     { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i16, 5 },
1124     { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },
1125     { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i32, 1 },
1126     { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i32, 1 },
1127 
1128     { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i1,  7 },
1129     { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i1,  7 },
1130     { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i1,  6 },
1131     { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i8,  2 },
1132     { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i8,  2 },
1133     { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i8,  5 },
1134     { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
1135     { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i16, 2 },
1136     { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i16, 5 },
1137     { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 6 },
1138     { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i32, 6 },
1139     { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i32, 6 },
1140     { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i32, 9 },
1141     // The generic code to compute the scalar overhead is currently broken.
1142     // Workaround this limitation by estimating the scalarization overhead
1143     // here. We have roughly 10 instructions per scalar element.
1144     // Multiply that by the vector width.
1145     // FIXME: remove that when PR19268 is fixed.
1146     { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i64, 10 },
1147     { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i64, 20 },
1148     { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i64, 13 },
1149     { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i64, 13 },
1150 
1151     { ISD::FP_TO_SINT,  MVT::v4i8,  MVT::v4f32, 1 },
1152     { ISD::FP_TO_SINT,  MVT::v8i8,  MVT::v8f32, 7 },
1153     // This node is expanded into scalarized operations but BasicTTI is overly
1154     // optimistic estimating its cost.  It computes 3 per element (one
1155     // vector-extract, one scalar conversion and one vector-insert).  The
1156     // problem is that the inserts form a read-modify-write chain so latency
1157     // should be factored in too.  Inflating the cost per element by 1.
1158     { ISD::FP_TO_UINT,  MVT::v8i32, MVT::v8f32, 8*4 },
1159     { ISD::FP_TO_UINT,  MVT::v4i32, MVT::v4f64, 4*4 },
1160 
1161     { ISD::FP_EXTEND,   MVT::v4f64,  MVT::v4f32,  1 },
1162     { ISD::FP_ROUND,    MVT::v4f32,  MVT::v4f64,  1 },
1163   };
1164 
1165   static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
1166     { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8,    2 },
1167     { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8,    2 },
1168     { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16,   2 },
1169     { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16,   2 },
1170     { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32,   2 },
1171     { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32,   2 },
1172 
1173     { ISD::ZERO_EXTEND, MVT::v4i16,  MVT::v4i8,   1 },
1174     { ISD::SIGN_EXTEND, MVT::v4i16,  MVT::v4i8,   2 },
1175     { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i8,   1 },
1176     { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i8,   1 },
1177     { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v8i8,   1 },
1178     { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v8i8,   1 },
1179     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i8,   2 },
1180     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i8,   2 },
1181     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8,  2 },
1182     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8,  2 },
1183     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8,  4 },
1184     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8,  4 },
1185     { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i16,  1 },
1186     { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i16,  1 },
1187     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  2 },
1188     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  2 },
1189     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4 },
1190     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4 },
1191 
1192     { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i16,  2 },
1193     { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i16,  1 },
1194     { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i32,  1 },
1195     { ISD::TRUNCATE,    MVT::v4i16,  MVT::v4i32,  1 },
1196     { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i32,  3 },
1197     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  3 },
1198     { ISD::TRUNCATE,    MVT::v16i16, MVT::v16i32, 6 },
1199 
1200   };
1201 
1202   static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
1203     // These are somewhat magic numbers justified by looking at the output of
1204     // Intel's IACA, running some kernels and making sure when we take
1205     // legalization into account the throughput will be overestimated.
1206     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
1207     { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
1208     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
1209     { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
1210     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
1211     { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
1212     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
1213     { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
1214 
1215     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
1216     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
1217     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
1218     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
1219     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
1220     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 8 },
1221     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
1222     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
1223 
1224     { ISD::FP_TO_SINT,  MVT::v2i32,  MVT::v2f64,  3 },
1225 
1226     { ISD::ZERO_EXTEND, MVT::v4i16,  MVT::v4i8,   1 },
1227     { ISD::SIGN_EXTEND, MVT::v4i16,  MVT::v4i8,   6 },
1228     { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i8,   2 },
1229     { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i8,   3 },
1230     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i8,   4 },
1231     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i8,   8 },
1232     { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v8i8,   1 },
1233     { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v8i8,   2 },
1234     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i8,   6 },
1235     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i8,   6 },
1236     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8,  3 },
1237     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8,  4 },
1238     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8,  9 },
1239     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8,  12 },
1240     { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i16,  1 },
1241     { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i16,  2 },
1242     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i16,  3 },
1243     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i16,  10 },
1244     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  3 },
1245     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  4 },
1246     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 6 },
1247     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 8 },
1248     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32,  3 },
1249     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32,  5 },
1250 
1251     { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i16,  4 },
1252     { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i16,  2 },
1253     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i16, 3 },
1254     { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i32,  3 },
1255     { ISD::TRUNCATE,    MVT::v4i16,  MVT::v4i32,  3 },
1256     { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i32,  4 },
1257     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i32, 7 },
1258     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  5 },
1259     { ISD::TRUNCATE,    MVT::v16i16, MVT::v16i32, 10 },
1260   };
1261 
1262   std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
1263   std::pair<int, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst);
1264 
1265   if (ST->hasSSE2() && !ST->hasAVX()) {
1266     if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
1267                                                    LTDest.second, LTSrc.second))
1268       return LTSrc.first * Entry->Cost;
1269   }
1270 
1271   EVT SrcTy = TLI->getValueType(DL, Src);
1272   EVT DstTy = TLI->getValueType(DL, Dst);
1273 
1274   // The function getSimpleVT only handles simple value types.
1275   if (!SrcTy.isSimple() || !DstTy.isSimple())
1276     return BaseT::getCastInstrCost(Opcode, Dst, Src);
1277 
1278   if (ST->hasDQI())
1279     if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD,
1280                                                    DstTy.getSimpleVT(),
1281                                                    SrcTy.getSimpleVT()))
1282       return Entry->Cost;
1283 
1284   if (ST->hasAVX512())
1285     if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD,
1286                                                    DstTy.getSimpleVT(),
1287                                                    SrcTy.getSimpleVT()))
1288       return Entry->Cost;
1289 
1290   if (ST->hasAVX2()) {
1291     if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
1292                                                    DstTy.getSimpleVT(),
1293                                                    SrcTy.getSimpleVT()))
1294       return Entry->Cost;
1295   }
1296 
1297   if (ST->hasAVX()) {
1298     if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
1299                                                    DstTy.getSimpleVT(),
1300                                                    SrcTy.getSimpleVT()))
1301       return Entry->Cost;
1302   }
1303 
1304   if (ST->hasSSE41()) {
1305     if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
1306                                                    DstTy.getSimpleVT(),
1307                                                    SrcTy.getSimpleVT()))
1308       return Entry->Cost;
1309   }
1310 
1311   if (ST->hasSSE2()) {
1312     if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
1313                                                    DstTy.getSimpleVT(),
1314                                                    SrcTy.getSimpleVT()))
1315       return Entry->Cost;
1316   }
1317 
1318   return BaseT::getCastInstrCost(Opcode, Dst, Src);
1319 }
1320 
1321 int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
1322                                    const Instruction *I) {
1323   // Legalize the type.
1324   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
1325 
1326   MVT MTy = LT.second;
1327 
1328   int ISD = TLI->InstructionOpcodeToISD(Opcode);
1329   assert(ISD && "Invalid opcode");
1330 
1331   static const CostTblEntry SSE2CostTbl[] = {
1332     { ISD::SETCC,   MVT::v2i64,   8 },
1333     { ISD::SETCC,   MVT::v4i32,   1 },
1334     { ISD::SETCC,   MVT::v8i16,   1 },
1335     { ISD::SETCC,   MVT::v16i8,   1 },
1336   };
1337 
1338   static const CostTblEntry SSE42CostTbl[] = {
1339     { ISD::SETCC,   MVT::v2f64,   1 },
1340     { ISD::SETCC,   MVT::v4f32,   1 },
1341     { ISD::SETCC,   MVT::v2i64,   1 },
1342   };
1343 
1344   static const CostTblEntry AVX1CostTbl[] = {
1345     { ISD::SETCC,   MVT::v4f64,   1 },
1346     { ISD::SETCC,   MVT::v8f32,   1 },
1347     // AVX1 does not support 8-wide integer compare.
1348     { ISD::SETCC,   MVT::v4i64,   4 },
1349     { ISD::SETCC,   MVT::v8i32,   4 },
1350     { ISD::SETCC,   MVT::v16i16,  4 },
1351     { ISD::SETCC,   MVT::v32i8,   4 },
1352   };
1353 
1354   static const CostTblEntry AVX2CostTbl[] = {
1355     { ISD::SETCC,   MVT::v4i64,   1 },
1356     { ISD::SETCC,   MVT::v8i32,   1 },
1357     { ISD::SETCC,   MVT::v16i16,  1 },
1358     { ISD::SETCC,   MVT::v32i8,   1 },
1359   };
1360 
1361   static const CostTblEntry AVX512CostTbl[] = {
1362     { ISD::SETCC,   MVT::v8i64,   1 },
1363     { ISD::SETCC,   MVT::v16i32,  1 },
1364     { ISD::SETCC,   MVT::v8f64,   1 },
1365     { ISD::SETCC,   MVT::v16f32,  1 },
1366   };
1367 
1368   if (ST->hasAVX512())
1369     if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
1370       return LT.first * Entry->Cost;
1371 
1372   if (ST->hasAVX2())
1373     if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
1374       return LT.first * Entry->Cost;
1375 
1376   if (ST->hasAVX())
1377     if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
1378       return LT.first * Entry->Cost;
1379 
1380   if (ST->hasSSE42())
1381     if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
1382       return LT.first * Entry->Cost;
1383 
1384   if (ST->hasSSE2())
1385     if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
1386       return LT.first * Entry->Cost;
1387 
1388   return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
1389 }
1390 
1391 unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; }
1392 
1393 int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
1394                                       ArrayRef<Type *> Tys, FastMathFlags FMF,
1395                                       unsigned ScalarizationCostPassed) {
1396   // Costs should match the codegen from:
1397   // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
1398   // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
1399   // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
1400   // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
1401   // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
1402   static const CostTblEntry AVX512CDCostTbl[] = {
1403     { ISD::CTLZ,       MVT::v8i64,   1 },
1404     { ISD::CTLZ,       MVT::v16i32,  1 },
1405     { ISD::CTLZ,       MVT::v32i16,  8 },
1406     { ISD::CTLZ,       MVT::v64i8,  20 },
1407     { ISD::CTLZ,       MVT::v4i64,   1 },
1408     { ISD::CTLZ,       MVT::v8i32,   1 },
1409     { ISD::CTLZ,       MVT::v16i16,  4 },
1410     { ISD::CTLZ,       MVT::v32i8,  10 },
1411     { ISD::CTLZ,       MVT::v2i64,   1 },
1412     { ISD::CTLZ,       MVT::v4i32,   1 },
1413     { ISD::CTLZ,       MVT::v8i16,   4 },
1414     { ISD::CTLZ,       MVT::v16i8,   4 },
1415   };
1416   static const CostTblEntry AVX512BWCostTbl[] = {
1417     { ISD::BITREVERSE, MVT::v8i64,   5 },
1418     { ISD::BITREVERSE, MVT::v16i32,  5 },
1419     { ISD::BITREVERSE, MVT::v32i16,  5 },
1420     { ISD::BITREVERSE, MVT::v64i8,   5 },
1421     { ISD::CTLZ,       MVT::v8i64,  23 },
1422     { ISD::CTLZ,       MVT::v16i32, 22 },
1423     { ISD::CTLZ,       MVT::v32i16, 18 },
1424     { ISD::CTLZ,       MVT::v64i8,  17 },
1425     { ISD::CTPOP,      MVT::v8i64,   7 },
1426     { ISD::CTPOP,      MVT::v16i32, 11 },
1427     { ISD::CTPOP,      MVT::v32i16,  9 },
1428     { ISD::CTPOP,      MVT::v64i8,   6 },
1429     { ISD::CTTZ,       MVT::v8i64,  10 },
1430     { ISD::CTTZ,       MVT::v16i32, 14 },
1431     { ISD::CTTZ,       MVT::v32i16, 12 },
1432     { ISD::CTTZ,       MVT::v64i8,   9 },
1433   };
1434   static const CostTblEntry AVX512CostTbl[] = {
1435     { ISD::BITREVERSE, MVT::v8i64,  36 },
1436     { ISD::BITREVERSE, MVT::v16i32, 24 },
1437     { ISD::CTLZ,       MVT::v8i64,  29 },
1438     { ISD::CTLZ,       MVT::v16i32, 35 },
1439     { ISD::CTPOP,      MVT::v8i64,  16 },
1440     { ISD::CTPOP,      MVT::v16i32, 24 },
1441     { ISD::CTTZ,       MVT::v8i64,  20 },
1442     { ISD::CTTZ,       MVT::v16i32, 28 },
1443   };
1444   static const CostTblEntry XOPCostTbl[] = {
1445     { ISD::BITREVERSE, MVT::v4i64,   4 },
1446     { ISD::BITREVERSE, MVT::v8i32,   4 },
1447     { ISD::BITREVERSE, MVT::v16i16,  4 },
1448     { ISD::BITREVERSE, MVT::v32i8,   4 },
1449     { ISD::BITREVERSE, MVT::v2i64,   1 },
1450     { ISD::BITREVERSE, MVT::v4i32,   1 },
1451     { ISD::BITREVERSE, MVT::v8i16,   1 },
1452     { ISD::BITREVERSE, MVT::v16i8,   1 },
1453     { ISD::BITREVERSE, MVT::i64,     3 },
1454     { ISD::BITREVERSE, MVT::i32,     3 },
1455     { ISD::BITREVERSE, MVT::i16,     3 },
1456     { ISD::BITREVERSE, MVT::i8,      3 }
1457   };
1458   static const CostTblEntry AVX2CostTbl[] = {
1459     { ISD::BITREVERSE, MVT::v4i64,   5 },
1460     { ISD::BITREVERSE, MVT::v8i32,   5 },
1461     { ISD::BITREVERSE, MVT::v16i16,  5 },
1462     { ISD::BITREVERSE, MVT::v32i8,   5 },
1463     { ISD::BSWAP,      MVT::v4i64,   1 },
1464     { ISD::BSWAP,      MVT::v8i32,   1 },
1465     { ISD::BSWAP,      MVT::v16i16,  1 },
1466     { ISD::CTLZ,       MVT::v4i64,  23 },
1467     { ISD::CTLZ,       MVT::v8i32,  18 },
1468     { ISD::CTLZ,       MVT::v16i16, 14 },
1469     { ISD::CTLZ,       MVT::v32i8,   9 },
1470     { ISD::CTPOP,      MVT::v4i64,   7 },
1471     { ISD::CTPOP,      MVT::v8i32,  11 },
1472     { ISD::CTPOP,      MVT::v16i16,  9 },
1473     { ISD::CTPOP,      MVT::v32i8,   6 },
1474     { ISD::CTTZ,       MVT::v4i64,  10 },
1475     { ISD::CTTZ,       MVT::v8i32,  14 },
1476     { ISD::CTTZ,       MVT::v16i16, 12 },
1477     { ISD::CTTZ,       MVT::v32i8,   9 },
1478     { ISD::FSQRT,      MVT::f32,     7 }, // Haswell from http://www.agner.org/
1479     { ISD::FSQRT,      MVT::v4f32,   7 }, // Haswell from http://www.agner.org/
1480     { ISD::FSQRT,      MVT::v8f32,  14 }, // Haswell from http://www.agner.org/
1481     { ISD::FSQRT,      MVT::f64,    14 }, // Haswell from http://www.agner.org/
1482     { ISD::FSQRT,      MVT::v2f64,  14 }, // Haswell from http://www.agner.org/
1483     { ISD::FSQRT,      MVT::v4f64,  28 }, // Haswell from http://www.agner.org/
1484   };
1485   static const CostTblEntry AVX1CostTbl[] = {
1486     { ISD::BITREVERSE, MVT::v4i64,  12 }, // 2 x 128-bit Op + extract/insert
1487     { ISD::BITREVERSE, MVT::v8i32,  12 }, // 2 x 128-bit Op + extract/insert
1488     { ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert
1489     { ISD::BITREVERSE, MVT::v32i8,  12 }, // 2 x 128-bit Op + extract/insert
1490     { ISD::BSWAP,      MVT::v4i64,   4 },
1491     { ISD::BSWAP,      MVT::v8i32,   4 },
1492     { ISD::BSWAP,      MVT::v16i16,  4 },
1493     { ISD::CTLZ,       MVT::v4i64,  48 }, // 2 x 128-bit Op + extract/insert
1494     { ISD::CTLZ,       MVT::v8i32,  38 }, // 2 x 128-bit Op + extract/insert
1495     { ISD::CTLZ,       MVT::v16i16, 30 }, // 2 x 128-bit Op + extract/insert
1496     { ISD::CTLZ,       MVT::v32i8,  20 }, // 2 x 128-bit Op + extract/insert
1497     { ISD::CTPOP,      MVT::v4i64,  16 }, // 2 x 128-bit Op + extract/insert
1498     { ISD::CTPOP,      MVT::v8i32,  24 }, // 2 x 128-bit Op + extract/insert
1499     { ISD::CTPOP,      MVT::v16i16, 20 }, // 2 x 128-bit Op + extract/insert
1500     { ISD::CTPOP,      MVT::v32i8,  14 }, // 2 x 128-bit Op + extract/insert
1501     { ISD::CTTZ,       MVT::v4i64,  22 }, // 2 x 128-bit Op + extract/insert
1502     { ISD::CTTZ,       MVT::v8i32,  30 }, // 2 x 128-bit Op + extract/insert
1503     { ISD::CTTZ,       MVT::v16i16, 26 }, // 2 x 128-bit Op + extract/insert
1504     { ISD::CTTZ,       MVT::v32i8,  20 }, // 2 x 128-bit Op + extract/insert
1505     { ISD::FSQRT,      MVT::f32,    14 }, // SNB from http://www.agner.org/
1506     { ISD::FSQRT,      MVT::v4f32,  14 }, // SNB from http://www.agner.org/
1507     { ISD::FSQRT,      MVT::v8f32,  28 }, // SNB from http://www.agner.org/
1508     { ISD::FSQRT,      MVT::f64,    21 }, // SNB from http://www.agner.org/
1509     { ISD::FSQRT,      MVT::v2f64,  21 }, // SNB from http://www.agner.org/
1510     { ISD::FSQRT,      MVT::v4f64,  43 }, // SNB from http://www.agner.org/
1511   };
1512   static const CostTblEntry SSE42CostTbl[] = {
1513     { ISD::FSQRT,      MVT::f32,    18 }, // Nehalem from http://www.agner.org/
1514     { ISD::FSQRT,      MVT::v4f32,  18 }, // Nehalem from http://www.agner.org/
1515   };
1516   static const CostTblEntry SSSE3CostTbl[] = {
1517     { ISD::BITREVERSE, MVT::v2i64,   5 },
1518     { ISD::BITREVERSE, MVT::v4i32,   5 },
1519     { ISD::BITREVERSE, MVT::v8i16,   5 },
1520     { ISD::BITREVERSE, MVT::v16i8,   5 },
1521     { ISD::BSWAP,      MVT::v2i64,   1 },
1522     { ISD::BSWAP,      MVT::v4i32,   1 },
1523     { ISD::BSWAP,      MVT::v8i16,   1 },
1524     { ISD::CTLZ,       MVT::v2i64,  23 },
1525     { ISD::CTLZ,       MVT::v4i32,  18 },
1526     { ISD::CTLZ,       MVT::v8i16,  14 },
1527     { ISD::CTLZ,       MVT::v16i8,   9 },
1528     { ISD::CTPOP,      MVT::v2i64,   7 },
1529     { ISD::CTPOP,      MVT::v4i32,  11 },
1530     { ISD::CTPOP,      MVT::v8i16,   9 },
1531     { ISD::CTPOP,      MVT::v16i8,   6 },
1532     { ISD::CTTZ,       MVT::v2i64,  10 },
1533     { ISD::CTTZ,       MVT::v4i32,  14 },
1534     { ISD::CTTZ,       MVT::v8i16,  12 },
1535     { ISD::CTTZ,       MVT::v16i8,   9 }
1536   };
1537   static const CostTblEntry SSE2CostTbl[] = {
1538     { ISD::BITREVERSE, MVT::v2i64,  29 },
1539     { ISD::BITREVERSE, MVT::v4i32,  27 },
1540     { ISD::BITREVERSE, MVT::v8i16,  27 },
1541     { ISD::BITREVERSE, MVT::v16i8,  20 },
1542     { ISD::BSWAP,      MVT::v2i64,   7 },
1543     { ISD::BSWAP,      MVT::v4i32,   7 },
1544     { ISD::BSWAP,      MVT::v8i16,   7 },
1545     { ISD::CTLZ,       MVT::v2i64,  25 },
1546     { ISD::CTLZ,       MVT::v4i32,  26 },
1547     { ISD::CTLZ,       MVT::v8i16,  20 },
1548     { ISD::CTLZ,       MVT::v16i8,  17 },
1549     { ISD::CTPOP,      MVT::v2i64,  12 },
1550     { ISD::CTPOP,      MVT::v4i32,  15 },
1551     { ISD::CTPOP,      MVT::v8i16,  13 },
1552     { ISD::CTPOP,      MVT::v16i8,  10 },
1553     { ISD::CTTZ,       MVT::v2i64,  14 },
1554     { ISD::CTTZ,       MVT::v4i32,  18 },
1555     { ISD::CTTZ,       MVT::v8i16,  16 },
1556     { ISD::CTTZ,       MVT::v16i8,  13 },
1557     { ISD::FSQRT,      MVT::f64,    32 }, // Nehalem from http://www.agner.org/
1558     { ISD::FSQRT,      MVT::v2f64,  32 }, // Nehalem from http://www.agner.org/
1559   };
1560   static const CostTblEntry SSE1CostTbl[] = {
1561     { ISD::FSQRT,      MVT::f32,    28 }, // Pentium III from http://www.agner.org/
1562     { ISD::FSQRT,      MVT::v4f32,  56 }, // Pentium III from http://www.agner.org/
1563   };
1564   static const CostTblEntry X64CostTbl[] = { // 64-bit targets
1565     { ISD::BITREVERSE, MVT::i64,    14 }
1566   };
1567   static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
1568     { ISD::BITREVERSE, MVT::i32,    14 },
1569     { ISD::BITREVERSE, MVT::i16,    14 },
1570     { ISD::BITREVERSE, MVT::i8,     11 }
1571   };
1572 
1573   unsigned ISD = ISD::DELETED_NODE;
1574   switch (IID) {
1575   default:
1576     break;
1577   case Intrinsic::bitreverse:
1578     ISD = ISD::BITREVERSE;
1579     break;
1580   case Intrinsic::bswap:
1581     ISD = ISD::BSWAP;
1582     break;
1583   case Intrinsic::ctlz:
1584     ISD = ISD::CTLZ;
1585     break;
1586   case Intrinsic::ctpop:
1587     ISD = ISD::CTPOP;
1588     break;
1589   case Intrinsic::cttz:
1590     ISD = ISD::CTTZ;
1591     break;
1592   case Intrinsic::sqrt:
1593     ISD = ISD::FSQRT;
1594     break;
1595   }
1596 
1597   // Legalize the type.
1598   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
1599   MVT MTy = LT.second;
1600 
1601   // Attempt to lookup cost.
1602   if (ST->hasCDI())
1603     if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
1604       return LT.first * Entry->Cost;
1605 
1606   if (ST->hasBWI())
1607     if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
1608       return LT.first * Entry->Cost;
1609 
1610   if (ST->hasAVX512())
1611     if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
1612       return LT.first * Entry->Cost;
1613 
1614   if (ST->hasXOP())
1615     if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
1616       return LT.first * Entry->Cost;
1617 
1618   if (ST->hasAVX2())
1619     if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
1620       return LT.first * Entry->Cost;
1621 
1622   if (ST->hasAVX())
1623     if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
1624       return LT.first * Entry->Cost;
1625 
1626   if (ST->hasSSE42())
1627     if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
1628       return LT.first * Entry->Cost;
1629 
1630   if (ST->hasSSSE3())
1631     if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
1632       return LT.first * Entry->Cost;
1633 
1634   if (ST->hasSSE2())
1635     if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
1636       return LT.first * Entry->Cost;
1637 
1638   if (ST->hasSSE1())
1639     if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
1640       return LT.first * Entry->Cost;
1641 
1642   if (ST->is64Bit())
1643     if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
1644       return LT.first * Entry->Cost;
1645 
1646   if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
1647     return LT.first * Entry->Cost;
1648 
1649   return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF, ScalarizationCostPassed);
1650 }
1651 
1652 int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
1653                      ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) {
1654   return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF, VF);
1655 }
1656 
1657 int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
1658   assert(Val->isVectorTy() && "This must be a vector type");
1659 
1660   Type *ScalarType = Val->getScalarType();
1661 
1662   if (Index != -1U) {
1663     // Legalize the type.
1664     std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
1665 
1666     // This type is legalized to a scalar type.
1667     if (!LT.second.isVector())
1668       return 0;
1669 
1670     // The type may be split. Normalize the index to the new type.
1671     unsigned Width = LT.second.getVectorNumElements();
1672     Index = Index % Width;
1673 
1674     // Floating point scalars are already located in index #0.
1675     if (ScalarType->isFloatingPointTy() && Index == 0)
1676       return 0;
1677   }
1678 
1679   // Add to the base cost if we know that the extracted element of a vector is
1680   // destined to be moved to and used in the integer register file.
1681   int RegisterFileMoveCost = 0;
1682   if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
1683     RegisterFileMoveCost = 1;
1684 
1685   return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
1686 }
1687 
1688 int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
1689                                 unsigned AddressSpace, const Instruction *I) {
1690   // Handle non-power-of-two vectors such as <3 x float>
1691   if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
1692     unsigned NumElem = VTy->getVectorNumElements();
1693 
1694     // Handle a few common cases:
1695     // <3 x float>
1696     if (NumElem == 3 && VTy->getScalarSizeInBits() == 32)
1697       // Cost = 64 bit store + extract + 32 bit store.
1698       return 3;
1699 
1700     // <3 x double>
1701     if (NumElem == 3 && VTy->getScalarSizeInBits() == 64)
1702       // Cost = 128 bit store + unpack + 64 bit store.
1703       return 3;
1704 
1705     // Assume that all other non-power-of-two numbers are scalarized.
1706     if (!isPowerOf2_32(NumElem)) {
1707       int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment,
1708                                         AddressSpace);
1709       int SplitCost = getScalarizationOverhead(Src, Opcode == Instruction::Load,
1710                                                Opcode == Instruction::Store);
1711       return NumElem * Cost + SplitCost;
1712     }
1713   }
1714 
1715   // Legalize the type.
1716   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
1717   assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1718          "Invalid Opcode");
1719 
1720   // Each load/store unit costs 1.
1721   int Cost = LT.first * 1;
1722 
1723   // This isn't exactly right. We're using slow unaligned 32-byte accesses as a
1724   // proxy for a double-pumped AVX memory interface such as on Sandybridge.
1725   if (LT.second.getStoreSize() == 32 && ST->isUnalignedMem32Slow())
1726     Cost *= 2;
1727 
1728   return Cost;
1729 }
1730 
1731 int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
1732                                       unsigned Alignment,
1733                                       unsigned AddressSpace) {
1734   VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy);
1735   if (!SrcVTy)
1736     // To calculate scalar take the regular cost, without mask
1737     return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace);
1738 
1739   unsigned NumElem = SrcVTy->getVectorNumElements();
1740   VectorType *MaskTy =
1741     VectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
1742   if ((Opcode == Instruction::Load && !isLegalMaskedLoad(SrcVTy)) ||
1743       (Opcode == Instruction::Store && !isLegalMaskedStore(SrcVTy)) ||
1744       !isPowerOf2_32(NumElem)) {
1745     // Scalarization
1746     int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true);
1747     int ScalarCompareCost = getCmpSelInstrCost(
1748         Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr);
1749     int BranchCost = getCFInstrCost(Instruction::Br);
1750     int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
1751 
1752     int ValueSplitCost = getScalarizationOverhead(
1753         SrcVTy, Opcode == Instruction::Load, Opcode == Instruction::Store);
1754     int MemopCost =
1755         NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
1756                                          Alignment, AddressSpace);
1757     return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
1758   }
1759 
1760   // Legalize the type.
1761   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy);
1762   auto VT = TLI->getValueType(DL, SrcVTy);
1763   int Cost = 0;
1764   if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
1765       LT.second.getVectorNumElements() == NumElem)
1766     // Promotion requires expand/truncate for data and a shuffle for mask.
1767     Cost += getShuffleCost(TTI::SK_Alternate, SrcVTy, 0, nullptr) +
1768             getShuffleCost(TTI::SK_Alternate, MaskTy, 0, nullptr);
1769 
1770   else if (LT.second.getVectorNumElements() > NumElem) {
1771     VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(),
1772                                             LT.second.getVectorNumElements());
1773     // Expanding requires fill mask with zeroes
1774     Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy);
1775   }
1776   if (!ST->hasAVX512())
1777     return Cost + LT.first*4; // Each maskmov costs 4
1778 
1779   // AVX-512 masked load/store is cheapper
1780   return Cost+LT.first;
1781 }
1782 
1783 int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
1784                                           const SCEV *Ptr) {
1785   // Address computations in vectorized code with non-consecutive addresses will
1786   // likely result in more instructions compared to scalar code where the
1787   // computation can more often be merged into the index mode. The resulting
1788   // extra micro-ops can significantly decrease throughput.
1789   unsigned NumVectorInstToHideOverhead = 10;
1790 
1791   // Cost modeling of Strided Access Computation is hidden by the indexing
1792   // modes of X86 regardless of the stride value. We dont believe that there
1793   // is a difference between constant strided access in gerenal and constant
1794   // strided value which is less than or equal to 64.
1795   // Even in the case of (loop invariant) stride whose value is not known at
1796   // compile time, the address computation will not incur more than one extra
1797   // ADD instruction.
1798   if (Ty->isVectorTy() && SE) {
1799     if (!BaseT::isStridedAccess(Ptr))
1800       return NumVectorInstToHideOverhead;
1801     if (!BaseT::getConstantStrideStep(SE, Ptr))
1802       return 1;
1803   }
1804 
1805   return BaseT::getAddressComputationCost(Ty, SE, Ptr);
1806 }
1807 
1808 int X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy,
1809                                  bool IsPairwise) {
1810 
1811   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
1812 
1813   MVT MTy = LT.second;
1814 
1815   int ISD = TLI->InstructionOpcodeToISD(Opcode);
1816   assert(ISD && "Invalid opcode");
1817 
1818   // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
1819   // and make it as the cost.
1820 
1821   static const CostTblEntry SSE42CostTblPairWise[] = {
1822     { ISD::FADD,  MVT::v2f64,   2 },
1823     { ISD::FADD,  MVT::v4f32,   4 },
1824     { ISD::ADD,   MVT::v2i64,   2 },      // The data reported by the IACA tool is "1.6".
1825     { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "3.5".
1826     { ISD::ADD,   MVT::v8i16,   5 },
1827   };
1828 
1829   static const CostTblEntry AVX1CostTblPairWise[] = {
1830     { ISD::FADD,  MVT::v4f32,   4 },
1831     { ISD::FADD,  MVT::v4f64,   5 },
1832     { ISD::FADD,  MVT::v8f32,   7 },
1833     { ISD::ADD,   MVT::v2i64,   1 },      // The data reported by the IACA tool is "1.5".
1834     { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "3.5".
1835     { ISD::ADD,   MVT::v4i64,   5 },      // The data reported by the IACA tool is "4.8".
1836     { ISD::ADD,   MVT::v8i16,   5 },
1837     { ISD::ADD,   MVT::v8i32,   5 },
1838   };
1839 
1840   static const CostTblEntry SSE42CostTblNoPairWise[] = {
1841     { ISD::FADD,  MVT::v2f64,   2 },
1842     { ISD::FADD,  MVT::v4f32,   4 },
1843     { ISD::ADD,   MVT::v2i64,   2 },      // The data reported by the IACA tool is "1.6".
1844     { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "3.3".
1845     { ISD::ADD,   MVT::v8i16,   4 },      // The data reported by the IACA tool is "4.3".
1846   };
1847 
1848   static const CostTblEntry AVX1CostTblNoPairWise[] = {
1849     { ISD::FADD,  MVT::v4f32,   3 },
1850     { ISD::FADD,  MVT::v4f64,   3 },
1851     { ISD::FADD,  MVT::v8f32,   4 },
1852     { ISD::ADD,   MVT::v2i64,   1 },      // The data reported by the IACA tool is "1.5".
1853     { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "2.8".
1854     { ISD::ADD,   MVT::v4i64,   3 },
1855     { ISD::ADD,   MVT::v8i16,   4 },
1856     { ISD::ADD,   MVT::v8i32,   5 },
1857   };
1858 
1859   if (IsPairwise) {
1860     if (ST->hasAVX())
1861       if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
1862         return LT.first * Entry->Cost;
1863 
1864     if (ST->hasSSE42())
1865       if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
1866         return LT.first * Entry->Cost;
1867   } else {
1868     if (ST->hasAVX())
1869       if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
1870         return LT.first * Entry->Cost;
1871 
1872     if (ST->hasSSE42())
1873       if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
1874         return LT.first * Entry->Cost;
1875   }
1876 
1877   return BaseT::getReductionCost(Opcode, ValTy, IsPairwise);
1878 }
1879 
1880 /// \brief Calculate the cost of materializing a 64-bit value. This helper
1881 /// method might only calculate a fraction of a larger immediate. Therefore it
1882 /// is valid to return a cost of ZERO.
1883 int X86TTIImpl::getIntImmCost(int64_t Val) {
1884   if (Val == 0)
1885     return TTI::TCC_Free;
1886 
1887   if (isInt<32>(Val))
1888     return TTI::TCC_Basic;
1889 
1890   return 2 * TTI::TCC_Basic;
1891 }
1892 
1893 int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
1894   assert(Ty->isIntegerTy());
1895 
1896   unsigned BitSize = Ty->getPrimitiveSizeInBits();
1897   if (BitSize == 0)
1898     return ~0U;
1899 
1900   // Never hoist constants larger than 128bit, because this might lead to
1901   // incorrect code generation or assertions in codegen.
1902   // Fixme: Create a cost model for types larger than i128 once the codegen
1903   // issues have been fixed.
1904   if (BitSize > 128)
1905     return TTI::TCC_Free;
1906 
1907   if (Imm == 0)
1908     return TTI::TCC_Free;
1909 
1910   // Sign-extend all constants to a multiple of 64-bit.
1911   APInt ImmVal = Imm;
1912   if (BitSize & 0x3f)
1913     ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
1914 
1915   // Split the constant into 64-bit chunks and calculate the cost for each
1916   // chunk.
1917   int Cost = 0;
1918   for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
1919     APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
1920     int64_t Val = Tmp.getSExtValue();
1921     Cost += getIntImmCost(Val);
1922   }
1923   // We need at least one instruction to materialize the constant.
1924   return std::max(1, Cost);
1925 }
1926 
1927 int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
1928                               Type *Ty) {
1929   assert(Ty->isIntegerTy());
1930 
1931   unsigned BitSize = Ty->getPrimitiveSizeInBits();
1932   // There is no cost model for constants with a bit size of 0. Return TCC_Free
1933   // here, so that constant hoisting will ignore this constant.
1934   if (BitSize == 0)
1935     return TTI::TCC_Free;
1936 
1937   unsigned ImmIdx = ~0U;
1938   switch (Opcode) {
1939   default:
1940     return TTI::TCC_Free;
1941   case Instruction::GetElementPtr:
1942     // Always hoist the base address of a GetElementPtr. This prevents the
1943     // creation of new constants for every base constant that gets constant
1944     // folded with the offset.
1945     if (Idx == 0)
1946       return 2 * TTI::TCC_Basic;
1947     return TTI::TCC_Free;
1948   case Instruction::Store:
1949     ImmIdx = 0;
1950     break;
1951   case Instruction::ICmp:
1952     // This is an imperfect hack to prevent constant hoisting of
1953     // compares that might be trying to check if a 64-bit value fits in
1954     // 32-bits. The backend can optimize these cases using a right shift by 32.
1955     // Ideally we would check the compare predicate here. There also other
1956     // similar immediates the backend can use shifts for.
1957     if (Idx == 1 && Imm.getBitWidth() == 64) {
1958       uint64_t ImmVal = Imm.getZExtValue();
1959       if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
1960         return TTI::TCC_Free;
1961     }
1962     ImmIdx = 1;
1963     break;
1964   case Instruction::And:
1965     // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
1966     // by using a 32-bit operation with implicit zero extension. Detect such
1967     // immediates here as the normal path expects bit 31 to be sign extended.
1968     if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue()))
1969       return TTI::TCC_Free;
1970     LLVM_FALLTHROUGH;
1971   case Instruction::Add:
1972   case Instruction::Sub:
1973   case Instruction::Mul:
1974   case Instruction::UDiv:
1975   case Instruction::SDiv:
1976   case Instruction::URem:
1977   case Instruction::SRem:
1978   case Instruction::Or:
1979   case Instruction::Xor:
1980     ImmIdx = 1;
1981     break;
1982   // Always return TCC_Free for the shift value of a shift instruction.
1983   case Instruction::Shl:
1984   case Instruction::LShr:
1985   case Instruction::AShr:
1986     if (Idx == 1)
1987       return TTI::TCC_Free;
1988     break;
1989   case Instruction::Trunc:
1990   case Instruction::ZExt:
1991   case Instruction::SExt:
1992   case Instruction::IntToPtr:
1993   case Instruction::PtrToInt:
1994   case Instruction::BitCast:
1995   case Instruction::PHI:
1996   case Instruction::Call:
1997   case Instruction::Select:
1998   case Instruction::Ret:
1999   case Instruction::Load:
2000     break;
2001   }
2002 
2003   if (Idx == ImmIdx) {
2004     int NumConstants = (BitSize + 63) / 64;
2005     int Cost = X86TTIImpl::getIntImmCost(Imm, Ty);
2006     return (Cost <= NumConstants * TTI::TCC_Basic)
2007                ? static_cast<int>(TTI::TCC_Free)
2008                : Cost;
2009   }
2010 
2011   return X86TTIImpl::getIntImmCost(Imm, Ty);
2012 }
2013 
2014 int X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
2015                               Type *Ty) {
2016   assert(Ty->isIntegerTy());
2017 
2018   unsigned BitSize = Ty->getPrimitiveSizeInBits();
2019   // There is no cost model for constants with a bit size of 0. Return TCC_Free
2020   // here, so that constant hoisting will ignore this constant.
2021   if (BitSize == 0)
2022     return TTI::TCC_Free;
2023 
2024   switch (IID) {
2025   default:
2026     return TTI::TCC_Free;
2027   case Intrinsic::sadd_with_overflow:
2028   case Intrinsic::uadd_with_overflow:
2029   case Intrinsic::ssub_with_overflow:
2030   case Intrinsic::usub_with_overflow:
2031   case Intrinsic::smul_with_overflow:
2032   case Intrinsic::umul_with_overflow:
2033     if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
2034       return TTI::TCC_Free;
2035     break;
2036   case Intrinsic::experimental_stackmap:
2037     if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
2038       return TTI::TCC_Free;
2039     break;
2040   case Intrinsic::experimental_patchpoint_void:
2041   case Intrinsic::experimental_patchpoint_i64:
2042     if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
2043       return TTI::TCC_Free;
2044     break;
2045   }
2046   return X86TTIImpl::getIntImmCost(Imm, Ty);
2047 }
2048 
2049 // Return an average cost of Gather / Scatter instruction, maybe improved later
2050 int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
2051                                 unsigned Alignment, unsigned AddressSpace) {
2052 
2053   assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
2054   unsigned VF = SrcVTy->getVectorNumElements();
2055 
2056   // Try to reduce index size from 64 bit (default for GEP)
2057   // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
2058   // operation will use 16 x 64 indices which do not fit in a zmm and needs
2059   // to split. Also check that the base pointer is the same for all lanes,
2060   // and that there's at most one variable index.
2061   auto getIndexSizeInBits = [](Value *Ptr, const DataLayout& DL) {
2062     unsigned IndexSize = DL.getPointerSizeInBits();
2063     GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
2064     if (IndexSize < 64 || !GEP)
2065       return IndexSize;
2066 
2067     unsigned NumOfVarIndices = 0;
2068     Value *Ptrs = GEP->getPointerOperand();
2069     if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
2070       return IndexSize;
2071     for (unsigned i = 1; i < GEP->getNumOperands(); ++i) {
2072       if (isa<Constant>(GEP->getOperand(i)))
2073         continue;
2074       Type *IndxTy = GEP->getOperand(i)->getType();
2075       if (IndxTy->isVectorTy())
2076         IndxTy = IndxTy->getVectorElementType();
2077       if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
2078           !isa<SExtInst>(GEP->getOperand(i))) ||
2079          ++NumOfVarIndices > 1)
2080         return IndexSize; // 64
2081     }
2082     return (unsigned)32;
2083   };
2084 
2085 
2086   // Trying to reduce IndexSize to 32 bits for vector 16.
2087   // By default the IndexSize is equal to pointer size.
2088   unsigned IndexSize = (VF >= 16) ? getIndexSizeInBits(Ptr, DL) :
2089     DL.getPointerSizeInBits();
2090 
2091   Type *IndexVTy = VectorType::get(IntegerType::get(SrcVTy->getContext(),
2092                                                     IndexSize), VF);
2093   std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy);
2094   std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy);
2095   int SplitFactor = std::max(IdxsLT.first, SrcLT.first);
2096   if (SplitFactor > 1) {
2097     // Handle splitting of vector of pointers
2098     Type *SplitSrcTy = VectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
2099     return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment,
2100                                          AddressSpace);
2101   }
2102 
2103   // The gather / scatter cost is given by Intel architects. It is a rough
2104   // number since we are looking at one instruction in a time.
2105   const int GSOverhead = 2;
2106   return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
2107                                            Alignment, AddressSpace);
2108 }
2109 
2110 /// Return the cost of full scalarization of gather / scatter operation.
2111 ///
2112 /// Opcode - Load or Store instruction.
2113 /// SrcVTy - The type of the data vector that should be gathered or scattered.
2114 /// VariableMask - The mask is non-constant at compile time.
2115 /// Alignment - Alignment for one element.
2116 /// AddressSpace - pointer[s] address space.
2117 ///
2118 int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
2119                                 bool VariableMask, unsigned Alignment,
2120                                 unsigned AddressSpace) {
2121   unsigned VF = SrcVTy->getVectorNumElements();
2122 
2123   int MaskUnpackCost = 0;
2124   if (VariableMask) {
2125     VectorType *MaskTy =
2126       VectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
2127     MaskUnpackCost = getScalarizationOverhead(MaskTy, false, true);
2128     int ScalarCompareCost =
2129       getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()),
2130                          nullptr);
2131     int BranchCost = getCFInstrCost(Instruction::Br);
2132     MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
2133   }
2134 
2135   // The cost of the scalar loads/stores.
2136   int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
2137                                           Alignment, AddressSpace);
2138 
2139   int InsertExtractCost = 0;
2140   if (Opcode == Instruction::Load)
2141     for (unsigned i = 0; i < VF; ++i)
2142       // Add the cost of inserting each scalar load into the vector
2143       InsertExtractCost +=
2144         getVectorInstrCost(Instruction::InsertElement, SrcVTy, i);
2145   else
2146     for (unsigned i = 0; i < VF; ++i)
2147       // Add the cost of extracting each element out of the data vector
2148       InsertExtractCost +=
2149         getVectorInstrCost(Instruction::ExtractElement, SrcVTy, i);
2150 
2151   return MemoryOpCost + MaskUnpackCost + InsertExtractCost;
2152 }
2153 
2154 /// Calculate the cost of Gather / Scatter operation
2155 int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
2156                                        Value *Ptr, bool VariableMask,
2157                                        unsigned Alignment) {
2158   assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
2159   unsigned VF = SrcVTy->getVectorNumElements();
2160   PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
2161   if (!PtrTy && Ptr->getType()->isVectorTy())
2162     PtrTy = dyn_cast<PointerType>(Ptr->getType()->getVectorElementType());
2163   assert(PtrTy && "Unexpected type for Ptr argument");
2164   unsigned AddressSpace = PtrTy->getAddressSpace();
2165 
2166   bool Scalarize = false;
2167   if ((Opcode == Instruction::Load && !isLegalMaskedGather(SrcVTy)) ||
2168       (Opcode == Instruction::Store && !isLegalMaskedScatter(SrcVTy)))
2169     Scalarize = true;
2170   // Gather / Scatter for vector 2 is not profitable on KNL / SKX
2171   // Vector-4 of gather/scatter instruction does not exist on KNL.
2172   // We can extend it to 8 elements, but zeroing upper bits of
2173   // the mask vector will add more instructions. Right now we give the scalar
2174   // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction
2175   // is better in the VariableMask case.
2176   if (VF == 2 || (VF == 4 && !ST->hasVLX()))
2177     Scalarize = true;
2178 
2179   if (Scalarize)
2180     return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
2181                            AddressSpace);
2182 
2183   return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
2184 }
2185 
2186 bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
2187   Type *ScalarTy = DataTy->getScalarType();
2188   int DataWidth = isa<PointerType>(ScalarTy) ?
2189     DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
2190 
2191   return ((DataWidth == 32 || DataWidth == 64) && ST->hasAVX()) ||
2192          ((DataWidth == 8 || DataWidth == 16) && ST->hasBWI());
2193 }
2194 
2195 bool X86TTIImpl::isLegalMaskedStore(Type *DataType) {
2196   return isLegalMaskedLoad(DataType);
2197 }
2198 
2199 bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
2200   // This function is called now in two cases: from the Loop Vectorizer
2201   // and from the Scalarizer.
2202   // When the Loop Vectorizer asks about legality of the feature,
2203   // the vectorization factor is not calculated yet. The Loop Vectorizer
2204   // sends a scalar type and the decision is based on the width of the
2205   // scalar element.
2206   // Later on, the cost model will estimate usage this intrinsic based on
2207   // the vector type.
2208   // The Scalarizer asks again about legality. It sends a vector type.
2209   // In this case we can reject non-power-of-2 vectors.
2210   if (isa<VectorType>(DataTy) && !isPowerOf2_32(DataTy->getVectorNumElements()))
2211     return false;
2212   Type *ScalarTy = DataTy->getScalarType();
2213   int DataWidth = isa<PointerType>(ScalarTy) ?
2214     DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
2215 
2216   // AVX-512 allows gather and scatter
2217   return (DataWidth == 32 || DataWidth == 64) && ST->hasAVX512();
2218 }
2219 
2220 bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) {
2221   return isLegalMaskedGather(DataType);
2222 }
2223 
2224 bool X86TTIImpl::areInlineCompatible(const Function *Caller,
2225                                      const Function *Callee) const {
2226   const TargetMachine &TM = getTLI()->getTargetMachine();
2227 
2228   // Work this as a subsetting of subtarget features.
2229   const FeatureBitset &CallerBits =
2230       TM.getSubtargetImpl(*Caller)->getFeatureBits();
2231   const FeatureBitset &CalleeBits =
2232       TM.getSubtargetImpl(*Callee)->getFeatureBits();
2233 
2234   // FIXME: This is likely too limiting as it will include subtarget features
2235   // that we might not care about for inlining, but it is conservatively
2236   // correct.
2237   return (CallerBits & CalleeBits) == CalleeBits;
2238 }
2239 
2240 bool X86TTIImpl::expandMemCmp(Instruction *I, unsigned &MaxLoadSize) {
2241   // TODO: We can increase these based on available vector ops.
2242   MaxLoadSize = ST->is64Bit() ? 8 : 4;
2243   return true;
2244 }
2245 
2246 bool X86TTIImpl::enableInterleavedAccessVectorization() {
2247   // TODO: We expect this to be beneficial regardless of arch,
2248   // but there are currently some unexplained performance artifacts on Atom.
2249   // As a temporary solution, disable on Atom.
2250   return !(ST->isAtom());
2251 }
2252 
2253 // Get estimation for interleaved load/store operations for AVX2.
2254 // \p Factor is the interleaved-access factor (stride) - number of
2255 // (interleaved) elements in the group.
2256 // \p Indices contains the indices for a strided load: when the
2257 // interleaved load has gaps they indicate which elements are used.
2258 // If Indices is empty (or if the number of indices is equal to the size
2259 // of the interleaved-access as given in \p Factor) the access has no gaps.
2260 //
2261 // As opposed to AVX-512, AVX2 does not have generic shuffles that allow
2262 // computing the cost using a generic formula as a function of generic
2263 // shuffles. We therefore use a lookup table instead, filled according to
2264 // the instruction sequences that codegen currently generates.
2265 int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
2266                                                unsigned Factor,
2267                                                ArrayRef<unsigned> Indices,
2268                                                unsigned Alignment,
2269                                                unsigned AddressSpace) {
2270 
2271   // We currently Support only fully-interleaved groups, with no gaps.
2272   // TODO: Support also strided loads (interleaved-groups with gaps).
2273   if (Indices.size() && Indices.size() != Factor)
2274     return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
2275                                              Alignment, AddressSpace);
2276 
2277   // VecTy for interleave memop is <VF*Factor x Elt>.
2278   // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
2279   // VecTy = <12 x i32>.
2280   MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
2281 
2282   // This function can be called with VecTy=<6xi128>, Factor=3, in which case
2283   // the VF=2, while v2i128 is an unsupported MVT vector type
2284   // (see MachineValueType.h::getVectorVT()).
2285   if (!LegalVT.isVector())
2286     return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
2287                                              Alignment, AddressSpace);
2288 
2289   unsigned VF = VecTy->getVectorNumElements() / Factor;
2290   Type *ScalarTy = VecTy->getVectorElementType();
2291 
2292   // Calculate the number of memory operations (NumOfMemOps), required
2293   // for load/store the VecTy.
2294   unsigned VecTySize = DL.getTypeStoreSize(VecTy);
2295   unsigned LegalVTSize = LegalVT.getStoreSize();
2296   unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
2297 
2298   // Get the cost of one memory operation.
2299   Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
2300                                         LegalVT.getVectorNumElements());
2301   unsigned MemOpCost =
2302       getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
2303 
2304   VectorType *VT = VectorType::get(ScalarTy, VF);
2305   EVT ETy = TLI->getValueType(DL, VT);
2306   if (!ETy.isSimple())
2307     return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
2308                                              Alignment, AddressSpace);
2309 
2310   // TODO: Complete for other data-types and strides.
2311   // Each combination of Stride, ElementTy and VF results in a different
2312   // sequence; The cost tables are therefore accessed with:
2313   // Factor (stride) and VectorType=VFxElemType.
2314   // The Cost accounts only for the shuffle sequence;
2315   // The cost of the loads/stores is accounted for separately.
2316   //
2317   static const CostTblEntry AVX2InterleavedLoadTbl[] = {
2318     { 3, MVT::v2i8,  10 }, //(load 6i8 and)  deinterleave into 3 x 2i8
2319     { 3, MVT::v4i8,  4 },  //(load 12i8 and) deinterleave into 3 x 4i8
2320     { 3, MVT::v8i8,  9 },  //(load 24i8 and) deinterleave into 3 x 8i8
2321     { 3, MVT::v16i8, 18},  //(load 48i8 and) deinterleave into 3 x 16i8
2322     { 3, MVT::v32i8, 42 }, //(load 96i8 and) deinterleave into 3 x 32i8
2323 
2324     { 4, MVT::v2i8,  12 }, //(load 8i8 and)   deinterleave into 4 x 2i8
2325     { 4, MVT::v4i8,  4 },  //(load 16i8 and)  deinterleave into 4 x 4i8
2326     { 4, MVT::v8i8,  20 }, //(load 32i8 and)  deinterleave into 4 x 8i8
2327     { 4, MVT::v16i8, 39 }, //(load 64i8 and)  deinterleave into 4 x 16i8
2328     { 4, MVT::v32i8, 80 }  //(load 128i8 and) deinterleave into 4 x 32i8
2329   };
2330 
2331   static const CostTblEntry AVX2InterleavedStoreTbl[] = {
2332     { 3, MVT::v2i8,  7 },  //interleave 3 x 2i8  into 6i8 (and store)
2333     { 3, MVT::v4i8,  8 },  //interleave 3 x 4i8  into 12i8 (and store)
2334     { 3, MVT::v8i8,  11 }, //interleave 3 x 8i8  into 24i8 (and store)
2335     { 3, MVT::v16i8, 17 }, //interleave 3 x 16i8 into 48i8 (and store)
2336     { 3, MVT::v32i8, 32 }, //interleave 3 x 32i8 into 96i8 (and store)
2337 
2338     { 4, MVT::v2i8,  12 }, //interleave 4 x 2i8  into 8i8 (and store)
2339     { 4, MVT::v4i8,  9 },  //interleave 4 x 4i8  into 16i8 (and store)
2340     { 4, MVT::v8i8,  16 }, //interleave 4 x 8i8  into 32i8 (and store)
2341     { 4, MVT::v16i8, 20 }, //interleave 4 x 16i8 into 64i8 (and store)
2342     { 4, MVT::v32i8, 40 }  //interleave 4 x 32i8 into 128i8 (and store)
2343   };
2344 
2345   if (Opcode == Instruction::Load) {
2346     if (const auto *Entry =
2347             CostTableLookup(AVX2InterleavedLoadTbl, Factor, ETy.getSimpleVT()))
2348       return NumOfMemOps * MemOpCost + Entry->Cost;
2349   } else {
2350     assert(Opcode == Instruction::Store &&
2351            "Expected Store Instruction at this  point");
2352     if (const auto *Entry =
2353             CostTableLookup(AVX2InterleavedStoreTbl, Factor, ETy.getSimpleVT()))
2354       return NumOfMemOps * MemOpCost + Entry->Cost;
2355   }
2356 
2357   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
2358                                            Alignment, AddressSpace);
2359 }
2360 
2361 // Get estimation for interleaved load/store operations and strided load.
2362 // \p Indices contains indices for strided load.
2363 // \p Factor - the factor of interleaving.
2364 // AVX-512 provides 3-src shuffles that significantly reduces the cost.
2365 int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
2366                                                  unsigned Factor,
2367                                                  ArrayRef<unsigned> Indices,
2368                                                  unsigned Alignment,
2369                                                  unsigned AddressSpace) {
2370 
2371   // VecTy for interleave memop is <VF*Factor x Elt>.
2372   // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
2373   // VecTy = <12 x i32>.
2374 
2375   // Calculate the number of memory operations (NumOfMemOps), required
2376   // for load/store the VecTy.
2377   MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
2378   unsigned VecTySize = DL.getTypeStoreSize(VecTy);
2379   unsigned LegalVTSize = LegalVT.getStoreSize();
2380   unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
2381 
2382   // Get the cost of one memory operation.
2383   Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
2384                                         LegalVT.getVectorNumElements());
2385   unsigned MemOpCost =
2386       getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
2387 
2388   if (Opcode == Instruction::Load) {
2389     // Kind of shuffle depends on number of loaded values.
2390     // If we load the entire data in one register, we can use a 1-src shuffle.
2391     // Otherwise, we'll merge 2 sources in each operation.
2392     TTI::ShuffleKind ShuffleKind =
2393         (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
2394 
2395     unsigned ShuffleCost =
2396         getShuffleCost(ShuffleKind, SingleMemOpTy, 0, nullptr);
2397 
2398     unsigned NumOfLoadsInInterleaveGrp =
2399         Indices.size() ? Indices.size() : Factor;
2400     Type *ResultTy = VectorType::get(VecTy->getVectorElementType(),
2401                                      VecTy->getVectorNumElements() / Factor);
2402     unsigned NumOfResults =
2403         getTLI()->getTypeLegalizationCost(DL, ResultTy).first *
2404         NumOfLoadsInInterleaveGrp;
2405 
2406     // About a half of the loads may be folded in shuffles when we have only
2407     // one result. If we have more than one result, we do not fold loads at all.
2408     unsigned NumOfUnfoldedLoads =
2409         NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
2410 
2411     // Get a number of shuffle operations per result.
2412     unsigned NumOfShufflesPerResult =
2413         std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
2414 
2415     // The SK_MergeTwoSrc shuffle clobbers one of src operands.
2416     // When we have more than one destination, we need additional instructions
2417     // to keep sources.
2418     unsigned NumOfMoves = 0;
2419     if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
2420       NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
2421 
2422     int Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
2423                NumOfUnfoldedLoads * MemOpCost + NumOfMoves;
2424 
2425     return Cost;
2426   }
2427 
2428   // Store.
2429   assert(Opcode == Instruction::Store &&
2430          "Expected Store Instruction at this  point");
2431 
2432   // There is no strided stores meanwhile. And store can't be folded in
2433   // shuffle.
2434   unsigned NumOfSources = Factor; // The number of values to be merged.
2435   unsigned ShuffleCost =
2436       getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, 0, nullptr);
2437   unsigned NumOfShufflesPerStore = NumOfSources - 1;
2438 
2439   // The SK_MergeTwoSrc shuffle clobbers one of src operands.
2440   // We need additional instructions to keep sources.
2441   unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
2442   int Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
2443              NumOfMoves;
2444   return Cost;
2445 }
2446 
2447 int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
2448                                            unsigned Factor,
2449                                            ArrayRef<unsigned> Indices,
2450                                            unsigned Alignment,
2451                                            unsigned AddressSpace) {
2452   auto isSupportedOnAVX512 = [](Type *VecTy, bool &RequiresBW) {
2453     RequiresBW = false;
2454     Type *EltTy = VecTy->getVectorElementType();
2455     if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
2456         EltTy->isIntegerTy(32) || EltTy->isPointerTy())
2457       return true;
2458     if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8)) {
2459       RequiresBW = true;
2460       return true;
2461     }
2462     return false;
2463   };
2464   bool RequiresBW;
2465   bool HasAVX512Solution = isSupportedOnAVX512(VecTy, RequiresBW);
2466   if (ST->hasAVX512() && HasAVX512Solution && (!RequiresBW || ST->hasBWI()))
2467     return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices,
2468                                             Alignment, AddressSpace);
2469   if (ST->hasAVX2())
2470     return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices,
2471                                           Alignment, AddressSpace);
2472 
2473   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
2474                                            Alignment, AddressSpace);
2475 }
2476