1 //===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 /// \file
10 /// This file implements a TargetTransformInfo analysis pass specific to the
11 /// X86 target machine. It uses the target's detailed information to provide
12 /// more precise answers to certain TTI queries, while letting the target
13 /// independent and default TTI implementations handle the rest.
14 ///
15 //===----------------------------------------------------------------------===//
16 /// About Cost Model numbers used below it's necessary to say the following:
17 /// the numbers correspond to some "generic" X86 CPU instead of usage of
/// concrete CPU model. Usually the numbers correspond to the CPU where the
/// feature first appeared. For example, if we do Subtarget.hasSSE42() in
20 /// the lookups below the cost is based on Nehalem as that was the first CPU
21 /// to support that feature level and thus has most likely the worst case cost.
22 /// Some examples of other technologies/CPUs:
23 ///   SSE 3   - Pentium4 / Athlon64
24 ///   SSE 4.1 - Penryn
25 ///   SSE 4.2 - Nehalem
26 ///   AVX     - Sandy Bridge
27 ///   AVX2    - Haswell
28 ///   AVX-512 - Xeon Phi / Skylake
29 /// And some examples of instruction target dependent costs (latency)
30 ///                   divss     sqrtss          rsqrtss
31 ///   AMD K7            11-16     19              3
32 ///   Piledriver        9-24      13-15           5
33 ///   Jaguar            14        16              2
34 ///   Pentium II,III    18        30              2
35 ///   Nehalem           7-14      7-18            3
36 ///   Haswell           10-13     11              5
/// TODO: Develop and implement the target dependent cost model and
38 /// specialize cost numbers for different Cost Model Targets such as throughput,
39 /// code size, latency and uop count.
40 //===----------------------------------------------------------------------===//
41 
42 #include "X86TargetTransformInfo.h"
43 #include "llvm/Analysis/TargetTransformInfo.h"
44 #include "llvm/CodeGen/BasicTTIImpl.h"
45 #include "llvm/IR/IntrinsicInst.h"
46 #include "llvm/Support/Debug.h"
47 #include "llvm/Target/CostTable.h"
48 #include "llvm/Target/TargetLowering.h"
49 
50 using namespace llvm;
51 
52 #define DEBUG_TYPE "x86tti"
53 
54 //===----------------------------------------------------------------------===//
55 //
56 // X86 cost model.
57 //
58 //===----------------------------------------------------------------------===//
59 
60 TargetTransformInfo::PopcntSupportKind
61 X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
62   assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
63   // TODO: Currently the __builtin_popcount() implementation using SSE3
64   //   instructions is inefficient. Once the problem is fixed, we should
65   //   call ST->hasSSE3() instead of ST->hasPOPCNT().
66   return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
67 }
68 
69 unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) {
70   if (Vector && !ST->hasSSE1())
71     return 0;
72 
73   if (ST->is64Bit()) {
74     if (Vector && ST->hasAVX512())
75       return 32;
76     return 16;
77   }
78   return 8;
79 }
80 
81 unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) const {
82   if (Vector) {
83     if (ST->hasAVX512())
84       return 512;
85     if (ST->hasAVX())
86       return 256;
87     if (ST->hasSSE1())
88       return 128;
89     return 0;
90   }
91 
92   if (ST->is64Bit())
93     return 64;
94 
95   return 32;
96 }
97 
// The register width usable for vectorized loads/stores simply matches the
// widest native vector register; the address-space argument is ignored.
unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
  return getRegisterBitWidth(true);
}
101 
102 unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
103   // If the loop will not be vectorized, don't interleave the loop.
104   // Let regular unroll to unroll the loop, which saves the overflow
105   // check and memory check cost.
106   if (VF == 1)
107     return 1;
108 
109   if (ST->isAtom())
110     return 1;
111 
112   // Sandybridge and Haswell have multiple execution ports and pipelined
113   // vector units.
114   if (ST->hasAVX())
115     return 4;
116 
117   return 2;
118 }
119 
// Cost of an arithmetic instruction: legalize the type, then consult
// per-subtarget cost tables ordered from the most specific feature level
// (SLM, AVX-512BW, AVX-512, AVX2, XOP, ...) down to SSE1, falling back to
// the target-independent implementation when no table entry matches.
// NOTE: the ordering of the table lookups below is load-bearing; earlier,
// more specific tables intentionally shadow later, generic ones.
int X86TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty,
    TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
    TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo,
    ArrayRef<const Value *> Args) {
  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  static const CostTblEntry SLMCostTable[] = {
    { ISD::MUL,  MVT::v4i32, 11 }, // pmulld
    { ISD::MUL,  MVT::v8i16, 2  }, // pmullw
    { ISD::MUL,  MVT::v16i8, 14 }, // extend/pmullw/trunc sequence.
    { ISD::FMUL, MVT::f64,   2  }, // mulsd
    { ISD::FMUL, MVT::v2f64, 4  }, // mulpd
    { ISD::FMUL, MVT::v4f32, 2  }, // mulps
    { ISD::FDIV, MVT::f32,   17 }, // divss
    { ISD::FDIV, MVT::v4f32, 39 }, // divps
    { ISD::FDIV, MVT::f64,   32 }, // divsd
    { ISD::FDIV, MVT::v2f64, 69 }, // divpd
    { ISD::FADD, MVT::v2f64, 2  }, // addpd
    { ISD::FSUB, MVT::v2f64, 2  }, // subpd
    // v2i64/v4i64 mul is custom lowered as a series of long
    // multiplies(3), shifts(3) and adds(2).
    // slm muldq version throughput is 2
    { ISD::MUL,  MVT::v2i64, 11 },
  };

  if (ST->isSLM()) {
    if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
      // Check if the operands can be shrunk into a smaller datatype.
      bool Op1Signed = false;
      unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
      bool Op2Signed = false;
      unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);

      bool signedMode = Op1Signed | Op2Signed;
      unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);

      if (OpMinSize <= 7)
        return LT.first * 3; // pmullw/sext
      if (!signedMode && OpMinSize <= 8)
        return LT.first * 3; // pmullw/zext
      if (OpMinSize <= 15)
        return LT.first * 5; // pmullw/pmulhw/pshuf
      if (!signedMode && OpMinSize <= 16)
        return LT.first * 5; // pmullw/pmulhw/pshuf
    }
    if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
                                            LT.second)) {
      return LT.first * Entry->Cost;
    }
  }

  if (ISD == ISD::SDIV &&
      Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
    // On X86, vector signed division by constants power-of-two are
    // normally expanded to the sequence SRA + SRL + ADD + SRA.
    // The OperandValue properties may not be the same as those of the
    // previous operation; conservatively assume OP_None.
    int Cost = 2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info,
                                          Op2Info, TargetTransformInfo::OP_None,
                                          TargetTransformInfo::OP_None);
    Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
                                   TargetTransformInfo::OP_None,
                                   TargetTransformInfo::OP_None);
    Cost += getArithmeticInstrCost(Instruction::Add, Ty, Op1Info, Op2Info,
                                   TargetTransformInfo::OP_None,
                                   TargetTransformInfo::OP_None);

    return Cost;
  }

  static const CostTblEntry AVX512BWUniformConstCostTable[] = {
    { ISD::SHL,  MVT::v64i8,   2 }, // psllw + pand.
    { ISD::SRL,  MVT::v64i8,   2 }, // psrlw + pand.
    { ISD::SRA,  MVT::v64i8,   4 }, // psrlw, pand, pxor, psubb.

    { ISD::SDIV, MVT::v32i16,  6 }, // vpmulhw sequence
    { ISD::UDIV, MVT::v32i16,  6 }, // vpmulhuw sequence
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasBWI()) {
    if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512UniformConstCostTable[] = {
    { ISD::SRA,  MVT::v2i64,   1 },
    { ISD::SRA,  MVT::v4i64,   1 },
    { ISD::SRA,  MVT::v8i64,   1 },

    { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
    { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasAVX512()) {
    if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX2UniformConstCostTable[] = {
    { ISD::SHL,  MVT::v32i8,   2 }, // psllw + pand.
    { ISD::SRL,  MVT::v32i8,   2 }, // psrlw + pand.
    { ISD::SRA,  MVT::v32i8,   4 }, // psrlw, pand, pxor, psubb.

    { ISD::SRA,  MVT::v4i64,   4 }, // 2 x psrad + shuffle.

    { ISD::SDIV, MVT::v16i16,  6 }, // vpmulhw sequence
    { ISD::UDIV, MVT::v16i16,  6 }, // vpmulhuw sequence
    { ISD::SDIV, MVT::v8i32,  15 }, // vpmuldq sequence
    { ISD::UDIV, MVT::v8i32,  15 }, // vpmuludq sequence
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasAVX2()) {
    if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2UniformConstCostTable[] = {
    { ISD::SHL,  MVT::v16i8,     2 }, // psllw + pand.
    { ISD::SRL,  MVT::v16i8,     2 }, // psrlw + pand.
    { ISD::SRA,  MVT::v16i8,     4 }, // psrlw, pand, pxor, psubb.

    { ISD::SHL,  MVT::v32i8,   4+2 }, // 2*(psllw + pand) + split.
    { ISD::SRL,  MVT::v32i8,   4+2 }, // 2*(psrlw + pand) + split.
    { ISD::SRA,  MVT::v32i8,   8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.

    { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
    { ISD::SDIV, MVT::v8i16,     6 }, // pmulhw sequence
    { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split.
    { ISD::UDIV, MVT::v8i16,     6 }, // pmulhuw sequence
    { ISD::SDIV, MVT::v8i32,  38+2 }, // 2*pmuludq sequence + split.
    { ISD::SDIV, MVT::v4i32,    19 }, // pmuludq sequence
    { ISD::UDIV, MVT::v8i32,  30+2 }, // 2*pmuludq sequence + split.
    { ISD::UDIV, MVT::v4i32,    15 }, // pmuludq sequence
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasSSE2()) {
    // pmuldq sequence.
    if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
      return LT.first * 32;
    if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
      return LT.first * 15;

    // XOP has faster vXi8 shifts.
    if ((ISD != ISD::SHL && ISD != ISD::SRL && ISD != ISD::SRA) ||
        !ST->hasXOP())
      if (const auto *Entry =
              CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
        return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL,  MVT::v16i16, 1 }, // psllw.
    { ISD::SRL,  MVT::v16i16, 1 }, // psrlw.
    { ISD::SRA,  MVT::v16i16, 1 }, // psraw.
  };

  if (ST->hasAVX2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {
    if (const auto *Entry =
            CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL,  MVT::v8i16,  1 }, // psllw.
    { ISD::SHL,  MVT::v4i32,  1 }, // pslld
    { ISD::SHL,  MVT::v2i64,  1 }, // psllq.

    { ISD::SRL,  MVT::v8i16,  1 }, // psrlw.
    { ISD::SRL,  MVT::v4i32,  1 }, // psrld.
    { ISD::SRL,  MVT::v2i64,  1 }, // psrlq.

    { ISD::SRA,  MVT::v8i16,  1 }, // psraw.
    { ISD::SRA,  MVT::v4i32,  1 }, // psrad.
  };

  if (ST->hasSSE2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {
    if (const auto *Entry =
            CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512DQCostTable[] = {
    { ISD::MUL,  MVT::v2i64, 1 },
    { ISD::MUL,  MVT::v4i64, 1 },
    { ISD::MUL,  MVT::v8i64, 1 }
  };

  // Look for AVX512DQ lowering tricks for custom cases.
  if (ST->hasDQI())
    if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512BWCostTable[] = {
    { ISD::SHL,   MVT::v8i16,      1 }, // vpsllvw
    { ISD::SRL,   MVT::v8i16,      1 }, // vpsrlvw
    { ISD::SRA,   MVT::v8i16,      1 }, // vpsravw

    { ISD::SHL,   MVT::v16i16,     1 }, // vpsllvw
    { ISD::SRL,   MVT::v16i16,     1 }, // vpsrlvw
    { ISD::SRA,   MVT::v16i16,     1 }, // vpsravw

    { ISD::SHL,   MVT::v32i16,     1 }, // vpsllvw
    { ISD::SRL,   MVT::v32i16,     1 }, // vpsrlvw
    { ISD::SRA,   MVT::v32i16,     1 }, // vpsravw

    { ISD::SHL,   MVT::v64i8,     11 }, // vpblendvb sequence.
    { ISD::SRL,   MVT::v64i8,     11 }, // vpblendvb sequence.
    { ISD::SRA,   MVT::v64i8,     24 }, // vpblendvb sequence.

    { ISD::MUL,   MVT::v64i8,     11 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,   MVT::v32i8,      4 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,   MVT::v16i8,      4 }, // extend/pmullw/trunc sequence.

    // Vectorizing division is a bad idea. See the SSE2 table for more comments.
    { ISD::SDIV,  MVT::v64i8,  64*20 },
    { ISD::SDIV,  MVT::v32i16, 32*20 },
    { ISD::UDIV,  MVT::v64i8,  64*20 },
    { ISD::UDIV,  MVT::v32i16, 32*20 }
  };

  // Look for AVX512BW lowering tricks for custom cases.
  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512CostTable[] = {
    { ISD::SHL,     MVT::v16i32,     1 },
    { ISD::SRL,     MVT::v16i32,     1 },
    { ISD::SRA,     MVT::v16i32,     1 },

    { ISD::SHL,     MVT::v8i64,      1 },
    { ISD::SRL,     MVT::v8i64,      1 },

    { ISD::SRA,     MVT::v2i64,      1 },
    { ISD::SRA,     MVT::v4i64,      1 },
    { ISD::SRA,     MVT::v8i64,      1 },

    { ISD::MUL,     MVT::v32i8,     13 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,     MVT::v16i8,      5 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,     MVT::v16i32,     1 }, // pmulld
    { ISD::MUL,     MVT::v8i64,      8 }, // 3*pmuludq/3*shift/2*add

    // Vectorizing division is a bad idea. See the SSE2 table for more comments.
    { ISD::SDIV,    MVT::v16i32, 16*20 },
    { ISD::SDIV,    MVT::v8i64,   8*20 },
    { ISD::UDIV,    MVT::v16i32, 16*20 },
    { ISD::UDIV,    MVT::v8i64,   8*20 }
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX2ShiftCostTable[] = {
    // Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to
    // customize them to detect the cases where shift amount is a scalar one.
    { ISD::SHL,     MVT::v4i32,    1 },
    { ISD::SRL,     MVT::v4i32,    1 },
    { ISD::SRA,     MVT::v4i32,    1 },
    { ISD::SHL,     MVT::v8i32,    1 },
    { ISD::SRL,     MVT::v8i32,    1 },
    { ISD::SRA,     MVT::v8i32,    1 },
    { ISD::SHL,     MVT::v2i64,    1 },
    { ISD::SRL,     MVT::v2i64,    1 },
    { ISD::SHL,     MVT::v4i64,    1 },
    { ISD::SRL,     MVT::v4i64,    1 },
  };

  // Look for AVX2 lowering tricks.
  if (ST->hasAVX2()) {
    if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
      // On AVX2, a packed v16i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return LT.first;

    if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry XOPShiftCostTable[] = {
    // 128bit shifts take 1cy, but right shifts require negation beforehand.
    { ISD::SHL,     MVT::v16i8,    1 },
    { ISD::SRL,     MVT::v16i8,    2 },
    { ISD::SRA,     MVT::v16i8,    2 },
    { ISD::SHL,     MVT::v8i16,    1 },
    { ISD::SRL,     MVT::v8i16,    2 },
    { ISD::SRA,     MVT::v8i16,    2 },
    { ISD::SHL,     MVT::v4i32,    1 },
    { ISD::SRL,     MVT::v4i32,    2 },
    { ISD::SRA,     MVT::v4i32,    2 },
    { ISD::SHL,     MVT::v2i64,    1 },
    { ISD::SRL,     MVT::v2i64,    2 },
    { ISD::SRA,     MVT::v2i64,    2 },
    // 256bit shifts require splitting if AVX2 didn't catch them above.
    { ISD::SHL,     MVT::v32i8,  2+2 },
    { ISD::SRL,     MVT::v32i8,  4+2 },
    { ISD::SRA,     MVT::v32i8,  4+2 },
    { ISD::SHL,     MVT::v16i16, 2+2 },
    { ISD::SRL,     MVT::v16i16, 4+2 },
    { ISD::SRA,     MVT::v16i16, 4+2 },
    { ISD::SHL,     MVT::v8i32,  2+2 },
    { ISD::SRL,     MVT::v8i32,  4+2 },
    { ISD::SRA,     MVT::v8i32,  4+2 },
    { ISD::SHL,     MVT::v4i64,  2+2 },
    { ISD::SRL,     MVT::v4i64,  4+2 },
    { ISD::SRA,     MVT::v4i64,  4+2 },
  };

  // Look for XOP lowering tricks.
  if (ST->hasXOP())
    if (const auto *Entry = CostTableLookup(XOPShiftCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE2UniformShiftCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL,  MVT::v16i16, 2+2 }, // 2*psllw + split.
    { ISD::SHL,  MVT::v8i32,  2+2 }, // 2*pslld + split.
    { ISD::SHL,  MVT::v4i64,  2+2 }, // 2*psllq + split.

    { ISD::SRL,  MVT::v16i16, 2+2 }, // 2*psrlw + split.
    { ISD::SRL,  MVT::v8i32,  2+2 }, // 2*psrld + split.
    { ISD::SRL,  MVT::v4i64,  2+2 }, // 2*psrlq + split.

    { ISD::SRA,  MVT::v16i16, 2+2 }, // 2*psraw + split.
    { ISD::SRA,  MVT::v8i32,  2+2 }, // 2*psrad + split.
    { ISD::SRA,  MVT::v2i64,    4 }, // 2*psrad + shuffle.
    { ISD::SRA,  MVT::v4i64,  8+2 }, // 2*(2*psrad + shuffle) + split.
  };

  if (ST->hasSSE2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {

    // Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table.
    if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2())
      return LT.first * 4; // 2*psrad + shuffle.

    if (const auto *Entry =
            CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  if (ISD == ISD::SHL &&
      Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
    MVT VT = LT.second;
    // Vector shift left by non uniform constant can be lowered
    // into vector multiply.
    if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
        ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
      ISD = ISD::MUL;
  }

  static const CostTblEntry AVX2CostTable[] = {
    { ISD::SHL,  MVT::v32i8,     11 }, // vpblendvb sequence.
    { ISD::SHL,  MVT::v16i16,    10 }, // extend/vpsrlvd/pack sequence.

    { ISD::SRL,  MVT::v32i8,     11 }, // vpblendvb sequence.
    { ISD::SRL,  MVT::v16i16,    10 }, // extend/vpsrlvd/pack sequence.

    { ISD::SRA,  MVT::v32i8,     24 }, // vpblendvb sequence.
    { ISD::SRA,  MVT::v16i16,    10 }, // extend/vpsravd/pack sequence.
    { ISD::SRA,  MVT::v2i64,      4 }, // srl/xor/sub sequence.
    { ISD::SRA,  MVT::v4i64,      4 }, // srl/xor/sub sequence.

    { ISD::SUB,  MVT::v32i8,      1 }, // psubb
    { ISD::ADD,  MVT::v32i8,      1 }, // paddb
    { ISD::SUB,  MVT::v16i16,     1 }, // psubw
    { ISD::ADD,  MVT::v16i16,     1 }, // paddw
    { ISD::SUB,  MVT::v8i32,      1 }, // psubd
    { ISD::ADD,  MVT::v8i32,      1 }, // paddd
    { ISD::SUB,  MVT::v4i64,      1 }, // psubq
    { ISD::ADD,  MVT::v4i64,      1 }, // paddq

    { ISD::MUL,  MVT::v32i8,     17 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,  MVT::v16i8,      7 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,  MVT::v16i16,     1 }, // pmullw
    { ISD::MUL,  MVT::v8i32,      1 }, // pmulld
    { ISD::MUL,  MVT::v4i64,      8 }, // 3*pmuludq/3*shift/2*add

    { ISD::FDIV, MVT::f32,        7 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,      7 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32,     14 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::f64,       14 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,     14 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64,     28 }, // Haswell from http://www.agner.org/
  };

  // Look for AVX2 lowering tricks for custom cases.
  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX1CostTable[] = {
    // We don't have to scalarize unsupported ops. We can issue two half-sized
    // operations and we only need to extract the upper YMM half.
    // Two ops + 1 extract + 1 insert = 4.
    { ISD::MUL,     MVT::v16i16,     4 },
    { ISD::MUL,     MVT::v8i32,      4 },
    { ISD::SUB,     MVT::v32i8,      4 },
    { ISD::ADD,     MVT::v32i8,      4 },
    { ISD::SUB,     MVT::v16i16,     4 },
    { ISD::ADD,     MVT::v16i16,     4 },
    { ISD::SUB,     MVT::v8i32,      4 },
    { ISD::ADD,     MVT::v8i32,      4 },
    { ISD::SUB,     MVT::v4i64,      4 },
    { ISD::ADD,     MVT::v4i64,      4 },

    // A v4i64 multiply is custom lowered as two split v2i64 vectors that then
    // are lowered as a series of long multiplies(3), shifts(3) and adds(2)
    // Because we believe v4i64 to be a legal type, we must also include the
    // extract+insert in the cost table. Therefore, the cost here is 18
    // instead of 8.
    { ISD::MUL,     MVT::v4i64,     18 },

    { ISD::MUL,     MVT::v32i8,     26 }, // extend/pmullw/trunc sequence.

    { ISD::FDIV,    MVT::f32,       14 }, // SNB from http://www.agner.org/
    { ISD::FDIV,    MVT::v4f32,     14 }, // SNB from http://www.agner.org/
    { ISD::FDIV,    MVT::v8f32,     28 }, // SNB from http://www.agner.org/
    { ISD::FDIV,    MVT::f64,       22 }, // SNB from http://www.agner.org/
    { ISD::FDIV,    MVT::v2f64,     22 }, // SNB from http://www.agner.org/
    { ISD::FDIV,    MVT::v4f64,     44 }, // SNB from http://www.agner.org/

    // Vectorizing division is a bad idea. See the SSE2 table for more comments.
    { ISD::SDIV,    MVT::v32i8,  32*20 },
    { ISD::SDIV,    MVT::v16i16, 16*20 },
    { ISD::SDIV,    MVT::v8i32,   8*20 },
    { ISD::SDIV,    MVT::v4i64,   4*20 },
    { ISD::UDIV,    MVT::v32i8,  32*20 },
    { ISD::UDIV,    MVT::v16i16, 16*20 },
    { ISD::UDIV,    MVT::v8i32,   8*20 },
    { ISD::UDIV,    MVT::v4i64,   4*20 },
  };

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE42CostTable[] = {
    { ISD::FDIV,  MVT::f32,   14 }, // Nehalem from http://www.agner.org/
    { ISD::FDIV,  MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
    { ISD::FDIV,  MVT::f64,   22 }, // Nehalem from http://www.agner.org/
    { ISD::FDIV,  MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/
  };

  if (ST->hasSSE42())
    if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE41CostTable[] = {
    { ISD::SHL,  MVT::v16i8,      11 }, // pblendvb sequence.
    { ISD::SHL,  MVT::v32i8,  2*11+2 }, // pblendvb sequence + split.
    { ISD::SHL,  MVT::v8i16,      14 }, // pblendvb sequence.
    { ISD::SHL,  MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
    { ISD::SHL,  MVT::v4i32,       4 }, // pslld/paddd/cvttps2dq/pmulld
    { ISD::SHL,  MVT::v8i32,   2*4+2 }, // pslld/paddd/cvttps2dq/pmulld + split

    { ISD::SRL,  MVT::v16i8,      12 }, // pblendvb sequence.
    { ISD::SRL,  MVT::v32i8,  2*12+2 }, // pblendvb sequence + split.
    { ISD::SRL,  MVT::v8i16,      14 }, // pblendvb sequence.
    { ISD::SRL,  MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
    { ISD::SRL,  MVT::v4i32,      11 }, // Shift each lane + blend.
    { ISD::SRL,  MVT::v8i32,  2*11+2 }, // Shift each lane + blend + split.

    { ISD::SRA,  MVT::v16i8,      24 }, // pblendvb sequence.
    { ISD::SRA,  MVT::v32i8,  2*24+2 }, // pblendvb sequence + split.
    { ISD::SRA,  MVT::v8i16,      14 }, // pblendvb sequence.
    { ISD::SRA,  MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
    { ISD::SRA,  MVT::v4i32,      12 }, // Shift each lane + blend.
    { ISD::SRA,  MVT::v8i32,  2*12+2 }, // Shift each lane + blend + split.

    { ISD::MUL,  MVT::v4i32,       1 }  // pmulld
  };

  if (ST->hasSSE41())
    if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE2CostTable[] = {
    // We don't correctly identify costs of casts because they are marked as
    // custom.
    { ISD::SHL,  MVT::v16i8,      26 }, // cmpgtb sequence.
    { ISD::SHL,  MVT::v8i16,      32 }, // cmpgtb sequence.
    { ISD::SHL,  MVT::v4i32,     2*5 }, // We optimized this using mul.
    { ISD::SHL,  MVT::v2i64,       4 }, // splat+shuffle sequence.
    { ISD::SHL,  MVT::v4i64,   2*4+2 }, // splat+shuffle sequence + split.

    { ISD::SRL,  MVT::v16i8,      26 }, // cmpgtb sequence.
    { ISD::SRL,  MVT::v8i16,      32 }, // cmpgtb sequence.
    { ISD::SRL,  MVT::v4i32,      16 }, // Shift each lane + blend.
    { ISD::SRL,  MVT::v2i64,       4 }, // splat+shuffle sequence.
    { ISD::SRL,  MVT::v4i64,   2*4+2 }, // splat+shuffle sequence + split.

    { ISD::SRA,  MVT::v16i8,      54 }, // unpacked cmpgtb sequence.
    { ISD::SRA,  MVT::v8i16,      32 }, // cmpgtb sequence.
    { ISD::SRA,  MVT::v4i32,      16 }, // Shift each lane + blend.
    { ISD::SRA,  MVT::v2i64,      12 }, // srl/xor/sub sequence.
    { ISD::SRA,  MVT::v4i64,  2*12+2 }, // srl/xor/sub sequence+split.

    { ISD::MUL,  MVT::v16i8,      12 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,  MVT::v8i16,       1 }, // pmullw
    { ISD::MUL,  MVT::v4i32,       6 }, // 3*pmuludq/4*shuffle
    { ISD::MUL,  MVT::v2i64,       8 }, // 3*pmuludq/3*shift/2*add

    { ISD::FDIV, MVT::f32,        23 }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,      39 }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::f64,        38 }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,      69 }, // Pentium IV from http://www.agner.org/

    // It is not a good idea to vectorize division. We have to scalarize it and
    // in the process we will often end up having to spill regular
    // registers. The overhead of division is going to dominate most kernels
    // anyways so try hard to prevent vectorization of division - it is
    // generally a bad idea. Assume somewhat arbitrarily that we have to be able
    // to hide "20 cycles" for each lane.
    { ISD::SDIV,  MVT::v16i8,  16*20 },
    { ISD::SDIV,  MVT::v8i16,   8*20 },
    { ISD::SDIV,  MVT::v4i32,   4*20 },
    { ISD::SDIV,  MVT::v2i64,   2*20 },
    { ISD::UDIV,  MVT::v16i8,  16*20 },
    { ISD::UDIV,  MVT::v8i16,   8*20 },
    { ISD::UDIV,  MVT::v4i32,   4*20 },
    { ISD::UDIV,  MVT::v2i64,   2*20 },
  };

  if (ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE1CostTable[] = {
    { ISD::FDIV, MVT::f32,   17 }, // Pentium III from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/
  };

  if (ST->hasSSE1())
    if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  // Fallback to the default implementation.
  return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info);
}
683 
// Return the cost of a vector shuffle of kind \p Kind on type \p Tp.
// The type is first legalized; special handling adjusts the register count
// for broadcasts and for shuffles that must be split across multiple legal
// registers. After that, per-ISA cost tables are consulted newest-feature
// first, and the first table containing a matching (Kind, legal type) entry
// supplies the per-register cost, scaled by the number of legal registers.
int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                               Type *SubTp) {
  // 64-bit packed float vectors (v2f32) are widened to type v4f32.
  // 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);

  // For Broadcasts we are splatting the first element from the first input
  // register, so only need to reference that input and all the output
  // registers are the same.
  if (Kind == TTI::SK_Broadcast)
    LT.first = 1;

  // We are going to permute multiple sources and the result will be in multiple
  // destinations. Providing an accurate cost only for splits where the element
  // type remains the same.
  if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
    MVT LegalVT = LT.second;
    if (LegalVT.getVectorElementType().getSizeInBits() ==
            Tp->getVectorElementType()->getPrimitiveSizeInBits() &&
        LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) {

      unsigned VecTySize = DL.getTypeStoreSize(Tp);
      unsigned LegalVTSize = LegalVT.getStoreSize();
      // Number of source vectors after legalization:
      unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
      // Number of destination vectors after legalization:
      unsigned NumOfDests = LT.first;

      Type *SingleOpTy = VectorType::get(Tp->getVectorElementType(),
                                         LegalVT.getVectorNumElements());

      // Each destination register can draw elements from any pair of source
      // registers, so model the split as (NumOfSrcs - 1) two-input shuffles
      // per destination, costed recursively on the legal single-register type.
      unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
      return NumOfShuffles *
             getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr);
    }

    // Element type changed during legalization: no accurate model here,
    // defer to the generic implementation.
    return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
  }

  // For 2-input shuffles, we must account for splitting the 2 inputs into many.
  if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
    // We assume that source and destination have the same vector type.
    // Each of the NumOfDests result registers may gather elements from any of
    // the 2 * LT.first legalized input registers, which takes
    // (2 * LT.first - 1) chained two-input shuffles to assemble.
    int NumOfDests = LT.first;
    int NumOfShufflesPerDest = LT.first * 2 - 1;
    LT.first = NumOfDests * NumOfShufflesPerDest;
  }

  // From here on, walk the per-ISA cost tables from the most recent feature
  // set down to SSE1; the first matching entry wins, so each table only needs
  // entries where the newer ISA changes the cost.
  static const CostTblEntry AVX512VBMIShuffleTbl[] = {
    { TTI::SK_Reverse,          MVT::v64i8,  1 }, // vpermb
    { TTI::SK_Reverse,          MVT::v32i8,  1 }, // vpermb

    { TTI::SK_PermuteSingleSrc, MVT::v64i8,  1 }, // vpermb
    { TTI::SK_PermuteSingleSrc, MVT::v32i8,  1 }, // vpermb

    { TTI::SK_PermuteTwoSrc,    MVT::v64i8,  1 }, // vpermt2b
    { TTI::SK_PermuteTwoSrc,    MVT::v32i8,  1 }, // vpermt2b
    { TTI::SK_PermuteTwoSrc,    MVT::v16i8,  1 }  // vpermt2b
  };

  if (ST->hasVBMI())
    if (const auto *Entry =
            CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512BWShuffleTbl[] = {
    { TTI::SK_Broadcast,        MVT::v32i16, 1 }, // vpbroadcastw
    { TTI::SK_Broadcast,        MVT::v64i8,  1 }, // vpbroadcastb

    { TTI::SK_Reverse,          MVT::v32i16, 1 }, // vpermw
    { TTI::SK_Reverse,          MVT::v16i16, 1 }, // vpermw
    { TTI::SK_Reverse,          MVT::v64i8,  2 }, // pshufb + vshufi64x2

    { TTI::SK_PermuteSingleSrc, MVT::v32i16, 1 }, // vpermw
    { TTI::SK_PermuteSingleSrc, MVT::v16i16, 1 }, // vpermw
    { TTI::SK_PermuteSingleSrc, MVT::v8i16,  1 }, // vpermw
    { TTI::SK_PermuteSingleSrc, MVT::v64i8,  8 }, // extend to v32i16
    { TTI::SK_PermuteSingleSrc, MVT::v32i8,  3 }, // vpermw + zext/trunc

    { TTI::SK_PermuteTwoSrc,    MVT::v32i16, 1 }, // vpermt2w
    { TTI::SK_PermuteTwoSrc,    MVT::v16i16, 1 }, // vpermt2w
    { TTI::SK_PermuteTwoSrc,    MVT::v8i16,  1 }, // vpermt2w
    { TTI::SK_PermuteTwoSrc,    MVT::v32i8,  3 }, // zext + vpermt2w + trunc
    { TTI::SK_PermuteTwoSrc,    MVT::v64i8, 19 }, // 6 * v32i8 + 1
    { TTI::SK_PermuteTwoSrc,    MVT::v16i8,  3 }  // zext + vpermt2w + trunc
  };

  if (ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512ShuffleTbl[] = {
    { TTI::SK_Broadcast,        MVT::v8f64,  1 }, // vbroadcastpd
    { TTI::SK_Broadcast,        MVT::v16f32, 1 }, // vbroadcastps
    { TTI::SK_Broadcast,        MVT::v8i64,  1 }, // vpbroadcastq
    { TTI::SK_Broadcast,        MVT::v16i32, 1 }, // vpbroadcastd

    { TTI::SK_Reverse,          MVT::v8f64,  1 }, // vpermpd
    { TTI::SK_Reverse,          MVT::v16f32, 1 }, // vpermps
    { TTI::SK_Reverse,          MVT::v8i64,  1 }, // vpermq
    { TTI::SK_Reverse,          MVT::v16i32, 1 }, // vpermd

    { TTI::SK_PermuteSingleSrc, MVT::v8f64,  1 }, // vpermpd
    { TTI::SK_PermuteSingleSrc, MVT::v4f64,  1 }, // vpermpd
    { TTI::SK_PermuteSingleSrc, MVT::v2f64,  1 }, // vpermpd
    { TTI::SK_PermuteSingleSrc, MVT::v16f32, 1 }, // vpermps
    { TTI::SK_PermuteSingleSrc, MVT::v8f32,  1 }, // vpermps
    { TTI::SK_PermuteSingleSrc, MVT::v4f32,  1 }, // vpermps
    { TTI::SK_PermuteSingleSrc, MVT::v8i64,  1 }, // vpermq
    { TTI::SK_PermuteSingleSrc, MVT::v4i64,  1 }, // vpermq
    { TTI::SK_PermuteSingleSrc, MVT::v2i64,  1 }, // vpermq
    { TTI::SK_PermuteSingleSrc, MVT::v16i32, 1 }, // vpermd
    { TTI::SK_PermuteSingleSrc, MVT::v8i32,  1 }, // vpermd
    { TTI::SK_PermuteSingleSrc, MVT::v4i32,  1 }, // vpermd
    { TTI::SK_PermuteSingleSrc, MVT::v16i8,  1 }, // pshufb

    { TTI::SK_PermuteTwoSrc,    MVT::v8f64,  1 }, // vpermt2pd
    { TTI::SK_PermuteTwoSrc,    MVT::v16f32, 1 }, // vpermt2ps
    { TTI::SK_PermuteTwoSrc,    MVT::v8i64,  1 }, // vpermt2q
    { TTI::SK_PermuteTwoSrc,    MVT::v16i32, 1 }, // vpermt2d
    { TTI::SK_PermuteTwoSrc,    MVT::v4f64,  1 }, // vpermt2pd
    { TTI::SK_PermuteTwoSrc,    MVT::v8f32,  1 }, // vpermt2ps
    { TTI::SK_PermuteTwoSrc,    MVT::v4i64,  1 }, // vpermt2q
    { TTI::SK_PermuteTwoSrc,    MVT::v8i32,  1 }, // vpermt2d
    { TTI::SK_PermuteTwoSrc,    MVT::v2f64,  1 }, // vpermt2pd
    { TTI::SK_PermuteTwoSrc,    MVT::v4f32,  1 }, // vpermt2ps
    { TTI::SK_PermuteTwoSrc,    MVT::v2i64,  1 }, // vpermt2q
    { TTI::SK_PermuteTwoSrc,    MVT::v4i32,  1 }  // vpermt2d
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX2ShuffleTbl[] = {
    { TTI::SK_Broadcast, MVT::v4f64,  1 }, // vbroadcastpd
    { TTI::SK_Broadcast, MVT::v8f32,  1 }, // vbroadcastps
    { TTI::SK_Broadcast, MVT::v4i64,  1 }, // vpbroadcastq
    { TTI::SK_Broadcast, MVT::v8i32,  1 }, // vpbroadcastd
    { TTI::SK_Broadcast, MVT::v16i16, 1 }, // vpbroadcastw
    { TTI::SK_Broadcast, MVT::v32i8,  1 }, // vpbroadcastb

    { TTI::SK_Reverse,   MVT::v4f64,  1 }, // vpermpd
    { TTI::SK_Reverse,   MVT::v8f32,  1 }, // vpermps
    { TTI::SK_Reverse,   MVT::v4i64,  1 }, // vpermq
    { TTI::SK_Reverse,   MVT::v8i32,  1 }, // vpermd
    { TTI::SK_Reverse,   MVT::v16i16, 2 }, // vperm2i128 + pshufb
    { TTI::SK_Reverse,   MVT::v32i8,  2 }, // vperm2i128 + pshufb

    { TTI::SK_Alternate, MVT::v16i16, 1 }, // vpblendw
    { TTI::SK_Alternate, MVT::v32i8,  1 }, // vpblendvb

    { TTI::SK_PermuteSingleSrc, MVT::v4i64,  1 }, // vpermq
    { TTI::SK_PermuteSingleSrc, MVT::v8i32,  1 }, // vpermd
    { TTI::SK_PermuteSingleSrc, MVT::v16i16, 4 }, // vperm2i128 + 2 * vpshufb
                                                  // + vpblendvb
    { TTI::SK_PermuteSingleSrc, MVT::v32i8,  4 }  // vperm2i128 + 2 * vpshufb
                                                  // + vpblendvb
  };

  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX1ShuffleTbl[] = {
    { TTI::SK_Broadcast, MVT::v4f64,  2 }, // vperm2f128 + vpermilpd
    { TTI::SK_Broadcast, MVT::v8f32,  2 }, // vperm2f128 + vpermilps
    { TTI::SK_Broadcast, MVT::v4i64,  2 }, // vperm2f128 + vpermilpd
    { TTI::SK_Broadcast, MVT::v8i32,  2 }, // vperm2f128 + vpermilps
    { TTI::SK_Broadcast, MVT::v16i16, 3 }, // vpshuflw + vpshufd + vinsertf128
    { TTI::SK_Broadcast, MVT::v32i8,  2 }, // vpshufb + vinsertf128

    { TTI::SK_Reverse,   MVT::v4f64,  2 }, // vperm2f128 + vpermilpd
    { TTI::SK_Reverse,   MVT::v8f32,  2 }, // vperm2f128 + vpermilps
    { TTI::SK_Reverse,   MVT::v4i64,  2 }, // vperm2f128 + vpermilpd
    { TTI::SK_Reverse,   MVT::v8i32,  2 }, // vperm2f128 + vpermilps
    { TTI::SK_Reverse,   MVT::v16i16, 4 }, // vextractf128 + 2*pshufb
                                           // + vinsertf128
    { TTI::SK_Reverse,   MVT::v32i8,  4 }, // vextractf128 + 2*pshufb
                                           // + vinsertf128

    { TTI::SK_Alternate, MVT::v4i64,  1 }, // vblendpd
    { TTI::SK_Alternate, MVT::v4f64,  1 }, // vblendpd
    { TTI::SK_Alternate, MVT::v8i32,  1 }, // vblendps
    { TTI::SK_Alternate, MVT::v8f32,  1 }, // vblendps
    { TTI::SK_Alternate, MVT::v16i16, 3 }, // vpand + vpandn + vpor
    { TTI::SK_Alternate, MVT::v32i8,  3 }  // vpand + vpandn + vpor
  };

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE41ShuffleTbl[] = {
    { TTI::SK_Alternate, MVT::v2i64,  1 }, // pblendw
    { TTI::SK_Alternate, MVT::v2f64,  1 }, // movsd
    { TTI::SK_Alternate, MVT::v4i32,  1 }, // pblendw
    { TTI::SK_Alternate, MVT::v4f32,  1 }, // blendps
    { TTI::SK_Alternate, MVT::v8i16,  1 }, // pblendw
    { TTI::SK_Alternate, MVT::v16i8,  1 }  // pblendvb
  };

  if (ST->hasSSE41())
    if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSSE3ShuffleTbl[] = {
    { TTI::SK_Broadcast, MVT::v8i16,  1 }, // pshufb
    { TTI::SK_Broadcast, MVT::v16i8,  1 }, // pshufb

    { TTI::SK_Reverse,   MVT::v8i16,  1 }, // pshufb
    { TTI::SK_Reverse,   MVT::v16i8,  1 }, // pshufb

    { TTI::SK_Alternate, MVT::v8i16,  3 }, // pshufb + pshufb + por
    { TTI::SK_Alternate, MVT::v16i8,  3 }, // pshufb + pshufb + por

    { TTI::SK_PermuteSingleSrc, MVT::v8i16, 1 }, // pshufb
    { TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 }  // pshufb
  };

  if (ST->hasSSSE3())
    if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE2ShuffleTbl[] = {
    { TTI::SK_Broadcast, MVT::v2f64,  1 }, // shufpd
    { TTI::SK_Broadcast, MVT::v2i64,  1 }, // pshufd
    { TTI::SK_Broadcast, MVT::v4i32,  1 }, // pshufd
    { TTI::SK_Broadcast, MVT::v8i16,  2 }, // pshuflw  + pshufd
    { TTI::SK_Broadcast, MVT::v16i8,  3 }, // unpck + pshuflw + pshufd

    { TTI::SK_Reverse,   MVT::v2f64,  1 }, // shufpd
    { TTI::SK_Reverse,   MVT::v2i64,  1 }, // pshufd
    { TTI::SK_Reverse,   MVT::v4i32,  1 }, // pshufd
    { TTI::SK_Reverse,   MVT::v8i16,  3 }, // pshuflw + pshufhw  + pshufd
    { TTI::SK_Reverse,   MVT::v16i8,  9 }, // 2*pshuflw + 2*pshufhw
                                           // + 2*pshufd + 2*unpck + packus

    { TTI::SK_Alternate, MVT::v2i64,  1 }, // movsd
    { TTI::SK_Alternate, MVT::v2f64,  1 }, // movsd
    { TTI::SK_Alternate, MVT::v4i32,  2 }, // 2*shufps
    { TTI::SK_Alternate, MVT::v8i16,  3 }, // pand + pandn + por
    { TTI::SK_Alternate, MVT::v16i8,  3 }, // pand + pandn + por

    { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // pshufd
    { TTI::SK_PermuteSingleSrc, MVT::v4i32, 1 }  // pshufd
  };

  if (ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE1ShuffleTbl[] = {
    { TTI::SK_Broadcast, MVT::v4f32,  1 }, // shufps
    { TTI::SK_Reverse,   MVT::v4f32,  1 }, // shufps
    { TTI::SK_Alternate, MVT::v4f32,  2 }  // 2*shufps
  };

  if (ST->hasSSE1())
    if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  // No table matched: fall back to the target-independent estimate.
  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}
948 
949 int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
950                                  const Instruction *I) {
951   int ISD = TLI->InstructionOpcodeToISD(Opcode);
952   assert(ISD && "Invalid opcode");
953 
954   // FIXME: Need a better design of the cost table to handle non-simple types of
955   // potential massive combinations (elem_num x src_type x dst_type).
956 
957   static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
958     { ISD::SINT_TO_FP,  MVT::v2f32,  MVT::v2i64,  1 },
959     { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  1 },
960     { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i64,  1 },
961     { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i64,  1 },
962     { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i64,  1 },
963     { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i64,  1 },
964 
965     { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i64,  1 },
966     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  1 },
967     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i64,  1 },
968     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i64,  1 },
969     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i64,  1 },
970     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i64,  1 },
971 
972     { ISD::FP_TO_SINT,  MVT::v2i64,  MVT::v2f32,  1 },
973     { ISD::FP_TO_SINT,  MVT::v4i64,  MVT::v4f32,  1 },
974     { ISD::FP_TO_SINT,  MVT::v8i64,  MVT::v8f32,  1 },
975     { ISD::FP_TO_SINT,  MVT::v2i64,  MVT::v2f64,  1 },
976     { ISD::FP_TO_SINT,  MVT::v4i64,  MVT::v4f64,  1 },
977     { ISD::FP_TO_SINT,  MVT::v8i64,  MVT::v8f64,  1 },
978 
979     { ISD::FP_TO_UINT,  MVT::v2i64,  MVT::v2f32,  1 },
980     { ISD::FP_TO_UINT,  MVT::v4i64,  MVT::v4f32,  1 },
981     { ISD::FP_TO_UINT,  MVT::v8i64,  MVT::v8f32,  1 },
982     { ISD::FP_TO_UINT,  MVT::v2i64,  MVT::v2f64,  1 },
983     { ISD::FP_TO_UINT,  MVT::v4i64,  MVT::v4f64,  1 },
984     { ISD::FP_TO_UINT,  MVT::v8i64,  MVT::v8f64,  1 },
985   };
986 
987   // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
988   // 256-bit wide vectors.
989 
990   static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
991     { ISD::FP_EXTEND, MVT::v8f64,   MVT::v8f32,  1 },
992     { ISD::FP_EXTEND, MVT::v8f64,   MVT::v16f32, 3 },
993     { ISD::FP_ROUND,  MVT::v8f32,   MVT::v8f64,  1 },
994 
995     { ISD::TRUNCATE,  MVT::v16i8,   MVT::v16i32, 1 },
996     { ISD::TRUNCATE,  MVT::v16i16,  MVT::v16i32, 1 },
997     { ISD::TRUNCATE,  MVT::v8i16,   MVT::v8i64,  1 },
998     { ISD::TRUNCATE,  MVT::v8i32,   MVT::v8i64,  1 },
999 
1000     // v16i1 -> v16i32 - load + broadcast
1001     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1,  2 },
1002     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1,  2 },
1003     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8,  1 },
1004     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8,  1 },
1005     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
1006     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
1007     { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i16,  1 },
1008     { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i16,  1 },
1009     { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i32,  1 },
1010     { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i32,  1 },
1011 
1012     { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i1,   4 },
1013     { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i1,  3 },
1014     { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i8,   2 },
1015     { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i8,  2 },
1016     { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i16,  2 },
1017     { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i16, 2 },
1018     { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i32, 1 },
1019     { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  1 },
1020     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i64, 26 },
1021     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i64, 26 },
1022 
1023     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i1,   4 },
1024     { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i1,  3 },
1025     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i8,   2 },
1026     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i8,   2 },
1027     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i8,   2 },
1028     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i8,   2 },
1029     { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i8,  2 },
1030     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i16,  5 },
1031     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i16,  2 },
1032     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  2 },
1033     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i16,  2 },
1034     { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i16, 2 },
1035     { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  2 },
1036     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i32,  1 },
1037     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  1 },
1038     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  1 },
1039     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  1 },
1040     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  1 },
1041     { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i32, 1 },
1042     { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i64,  5 },
1043     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  5 },
1044     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i64, 12 },
1045     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i64, 26 },
1046 
1047     { ISD::FP_TO_UINT,  MVT::v2i32,  MVT::v2f32,  1 },
1048     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  1 },
1049     { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f32,  1 },
1050     { ISD::FP_TO_UINT,  MVT::v16i32, MVT::v16f32, 1 },
1051   };
1052 
1053   static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
1054     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   3 },
1055     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   3 },
1056     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   3 },
1057     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   3 },
1058     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i8,   3 },
1059     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i8,   3 },
1060     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i8,   3 },
1061     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i8,   3 },
1062     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8,  1 },
1063     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8,  1 },
1064     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i16,  3 },
1065     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i16,  3 },
1066     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  1 },
1067     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  1 },
1068     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32,  1 },
1069     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32,  1 },
1070 
1071     { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i64,  2 },
1072     { ISD::TRUNCATE,    MVT::v4i16,  MVT::v4i64,  2 },
1073     { ISD::TRUNCATE,    MVT::v4i32,  MVT::v4i64,  2 },
1074     { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i32,  2 },
1075     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  2 },
1076     { ISD::TRUNCATE,    MVT::v8i32,  MVT::v8i64,  4 },
1077 
1078     { ISD::FP_EXTEND,   MVT::v8f64,  MVT::v8f32,  3 },
1079     { ISD::FP_ROUND,    MVT::v8f32,  MVT::v8f64,  3 },
1080 
1081     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  8 },
1082   };
1083 
1084   static const TypeConversionCostTblEntry AVXConversionTbl[] = {
1085     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,  6 },
1086     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,  4 },
1087     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,  7 },
1088     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,  4 },
1089     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i8,  6 },
1090     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i8,  4 },
1091     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i8,  7 },
1092     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i8,  4 },
1093     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
1094     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
1095     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i16, 6 },
1096     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i16, 3 },
1097     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16, 4 },
1098     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16, 4 },
1099     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32, 4 },
1100     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32, 4 },
1101 
1102     { ISD::TRUNCATE,    MVT::v16i8, MVT::v16i16, 4 },
1103     { ISD::TRUNCATE,    MVT::v8i8,  MVT::v8i32,  4 },
1104     { ISD::TRUNCATE,    MVT::v8i16, MVT::v8i32,  5 },
1105     { ISD::TRUNCATE,    MVT::v4i8,  MVT::v4i64,  4 },
1106     { ISD::TRUNCATE,    MVT::v4i16, MVT::v4i64,  4 },
1107     { ISD::TRUNCATE,    MVT::v4i32, MVT::v4i64,  4 },
1108     { ISD::TRUNCATE,    MVT::v8i32, MVT::v8i64,  9 },
1109 
1110     { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i1,  3 },
1111     { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i1,  3 },
1112     { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i1,  8 },
1113     { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i8,  3 },
1114     { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i8,  3 },
1115     { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i8,  8 },
1116     { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i16, 3 },
1117     { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i16, 3 },
1118     { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i16, 5 },
1119     { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },
1120     { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i32, 1 },
1121     { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i32, 1 },
1122 
1123     { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i1,  7 },
1124     { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i1,  7 },
1125     { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i1,  6 },
1126     { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i8,  2 },
1127     { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i8,  2 },
1128     { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i8,  5 },
1129     { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
1130     { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i16, 2 },
1131     { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i16, 5 },
1132     { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 6 },
1133     { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i32, 6 },
1134     { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i32, 6 },
1135     { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i32, 9 },
1136     // The generic code to compute the scalar overhead is currently broken.
1137     // Workaround this limitation by estimating the scalarization overhead
1138     // here. We have roughly 10 instructions per scalar element.
1139     // Multiply that by the vector width.
1140     // FIXME: remove that when PR19268 is fixed.
1141     { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i64, 10 },
1142     { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i64, 20 },
1143     { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i64, 13 },
1144     { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i64, 13 },
1145 
1146     { ISD::FP_TO_SINT,  MVT::v4i8,  MVT::v4f32, 1 },
1147     { ISD::FP_TO_SINT,  MVT::v8i8,  MVT::v8f32, 7 },
1148     // This node is expanded into scalarized operations but BasicTTI is overly
1149     // optimistic estimating its cost.  It computes 3 per element (one
1150     // vector-extract, one scalar conversion and one vector-insert).  The
1151     // problem is that the inserts form a read-modify-write chain so latency
1152     // should be factored in too.  Inflating the cost per element by 1.
1153     { ISD::FP_TO_UINT,  MVT::v8i32, MVT::v8f32, 8*4 },
1154     { ISD::FP_TO_UINT,  MVT::v4i32, MVT::v4f64, 4*4 },
1155 
1156     { ISD::FP_EXTEND,   MVT::v4f64,  MVT::v4f32,  1 },
1157     { ISD::FP_ROUND,    MVT::v4f32,  MVT::v4f64,  1 },
1158   };
1159 
1160   static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
1161     { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8,    2 },
1162     { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8,    2 },
1163     { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16,   2 },
1164     { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16,   2 },
1165     { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32,   2 },
1166     { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32,   2 },
1167 
1168     { ISD::ZERO_EXTEND, MVT::v4i16,  MVT::v4i8,   1 },
1169     { ISD::SIGN_EXTEND, MVT::v4i16,  MVT::v4i8,   2 },
1170     { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i8,   1 },
1171     { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i8,   1 },
1172     { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v8i8,   1 },
1173     { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v8i8,   1 },
1174     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i8,   2 },
1175     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i8,   2 },
1176     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8,  2 },
1177     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8,  2 },
1178     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8,  4 },
1179     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8,  4 },
1180     { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i16,  1 },
1181     { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i16,  1 },
1182     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  2 },
1183     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  2 },
1184     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4 },
1185     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4 },
1186 
1187     { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i16,  2 },
1188     { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i16,  1 },
1189     { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i32,  1 },
1190     { ISD::TRUNCATE,    MVT::v4i16,  MVT::v4i32,  1 },
1191     { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i32,  3 },
1192     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  3 },
1193     { ISD::TRUNCATE,    MVT::v16i16, MVT::v16i32, 6 },
1194 
1195   };
1196 
1197   static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
1198     // These are somewhat magic numbers justified by looking at the output of
1199     // Intel's IACA, running some kernels and making sure when we take
1200     // legalization into account the throughput will be overestimated.
1201     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
1202     { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
1203     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
1204     { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
1205     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
1206     { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
1207     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
1208     { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
1209 
1210     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
1211     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
1212     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
1213     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
1214     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
1215     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 8 },
1216     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
1217     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
1218 
1219     { ISD::FP_TO_SINT,  MVT::v2i32,  MVT::v2f64,  3 },
1220 
1221     { ISD::ZERO_EXTEND, MVT::v4i16,  MVT::v4i8,   1 },
1222     { ISD::SIGN_EXTEND, MVT::v4i16,  MVT::v4i8,   6 },
1223     { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i8,   2 },
1224     { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i8,   3 },
1225     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i8,   4 },
1226     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i8,   8 },
1227     { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v8i8,   1 },
1228     { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v8i8,   2 },
1229     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i8,   6 },
1230     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i8,   6 },
1231     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8,  3 },
1232     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8,  4 },
1233     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8,  9 },
1234     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8,  12 },
1235     { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i16,  1 },
1236     { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i16,  2 },
1237     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i16,  3 },
1238     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i16,  10 },
1239     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  3 },
1240     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  4 },
1241     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 6 },
1242     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 8 },
1243     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32,  3 },
1244     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32,  5 },
1245 
1246     { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i16,  4 },
1247     { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i16,  2 },
1248     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i16, 3 },
1249     { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i32,  3 },
1250     { ISD::TRUNCATE,    MVT::v4i16,  MVT::v4i32,  3 },
1251     { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i32,  4 },
1252     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i32, 7 },
1253     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  5 },
1254     { ISD::TRUNCATE,    MVT::v16i16, MVT::v16i32, 10 },
1255   };
1256 
1257   std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
1258   std::pair<int, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst);
1259 
1260   if (ST->hasSSE2() && !ST->hasAVX()) {
1261     if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
1262                                                    LTDest.second, LTSrc.second))
1263       return LTSrc.first * Entry->Cost;
1264   }
1265 
1266   EVT SrcTy = TLI->getValueType(DL, Src);
1267   EVT DstTy = TLI->getValueType(DL, Dst);
1268 
1269   // The function getSimpleVT only handles simple value types.
1270   if (!SrcTy.isSimple() || !DstTy.isSimple())
1271     return BaseT::getCastInstrCost(Opcode, Dst, Src);
1272 
1273   if (ST->hasDQI())
1274     if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD,
1275                                                    DstTy.getSimpleVT(),
1276                                                    SrcTy.getSimpleVT()))
1277       return Entry->Cost;
1278 
1279   if (ST->hasAVX512())
1280     if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD,
1281                                                    DstTy.getSimpleVT(),
1282                                                    SrcTy.getSimpleVT()))
1283       return Entry->Cost;
1284 
1285   if (ST->hasAVX2()) {
1286     if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
1287                                                    DstTy.getSimpleVT(),
1288                                                    SrcTy.getSimpleVT()))
1289       return Entry->Cost;
1290   }
1291 
1292   if (ST->hasAVX()) {
1293     if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
1294                                                    DstTy.getSimpleVT(),
1295                                                    SrcTy.getSimpleVT()))
1296       return Entry->Cost;
1297   }
1298 
1299   if (ST->hasSSE41()) {
1300     if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
1301                                                    DstTy.getSimpleVT(),
1302                                                    SrcTy.getSimpleVT()))
1303       return Entry->Cost;
1304   }
1305 
1306   if (ST->hasSSE2()) {
1307     if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
1308                                                    DstTy.getSimpleVT(),
1309                                                    SrcTy.getSimpleVT()))
1310       return Entry->Cost;
1311   }
1312 
1313   return BaseT::getCastInstrCost(Opcode, Dst, Src);
1314 }
1315 
// Cost of a vector compare (SETCC) or select, after type legalization.
// Table values model a "generic" CPU for each feature level (see the file
// header comment); types missing from every table fall back to the base
// implementation.
int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
                                   const Instruction *I) {
  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);

  MVT MTy = LT.second;

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  static const CostTblEntry SSE2CostTbl[] = {
    { ISD::SETCC,   MVT::v2i64,   8 },
    { ISD::SETCC,   MVT::v4i32,   1 },
    { ISD::SETCC,   MVT::v8i16,   1 },
    { ISD::SETCC,   MVT::v16i8,   1 },
  };

  static const CostTblEntry SSE42CostTbl[] = {
    { ISD::SETCC,   MVT::v2f64,   1 },
    { ISD::SETCC,   MVT::v4f32,   1 },
    { ISD::SETCC,   MVT::v2i64,   1 },
  };

  static const CostTblEntry AVX1CostTbl[] = {
    { ISD::SETCC,   MVT::v4f64,   1 },
    { ISD::SETCC,   MVT::v8f32,   1 },
    // AVX1 does not support 8-wide integer compare.
    { ISD::SETCC,   MVT::v4i64,   4 },
    { ISD::SETCC,   MVT::v8i32,   4 },
    { ISD::SETCC,   MVT::v16i16,  4 },
    { ISD::SETCC,   MVT::v32i8,   4 },
  };

  static const CostTblEntry AVX2CostTbl[] = {
    { ISD::SETCC,   MVT::v4i64,   1 },
    { ISD::SETCC,   MVT::v8i32,   1 },
    { ISD::SETCC,   MVT::v16i16,  1 },
    { ISD::SETCC,   MVT::v32i8,   1 },
  };

  static const CostTblEntry AVX512CostTbl[] = {
    { ISD::SETCC,   MVT::v8i64,   1 },
    { ISD::SETCC,   MVT::v16i32,  1 },
    { ISD::SETCC,   MVT::v8f64,   1 },
    { ISD::SETCC,   MVT::v16f32,  1 },
  };

  // Query the most capable feature level first so a type present in several
  // tables picks up its cheapest achievable cost.
  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasSSE42())
    if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
}
1385 
// Largest element size, in bytes, usable by element-wise atomic memory
// intrinsics on X86: 16 bytes (128 bits).
unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; }
1387 
// Cost of a recognized intrinsic call (bitreverse, bswap, ctlz, ctpop, cttz,
// sqrt) per subtarget feature level. Unrecognized intrinsics, and types
// missing from every applicable table, defer to the base implementation.
int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
                                      ArrayRef<Type *> Tys, FastMathFlags FMF,
                                      unsigned ScalarizationCostPassed) {
  // Costs should match the codegen from:
  // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
  // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
  // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
  // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
  // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
  static const CostTblEntry AVX512CDCostTbl[] = {
    { ISD::CTLZ,       MVT::v8i64,   1 },
    { ISD::CTLZ,       MVT::v16i32,  1 },
    { ISD::CTLZ,       MVT::v32i16,  8 },
    { ISD::CTLZ,       MVT::v64i8,  20 },
    { ISD::CTLZ,       MVT::v4i64,   1 },
    { ISD::CTLZ,       MVT::v8i32,   1 },
    { ISD::CTLZ,       MVT::v16i16,  4 },
    { ISD::CTLZ,       MVT::v32i8,  10 },
    { ISD::CTLZ,       MVT::v2i64,   1 },
    { ISD::CTLZ,       MVT::v4i32,   1 },
    { ISD::CTLZ,       MVT::v8i16,   4 },
    { ISD::CTLZ,       MVT::v16i8,   4 },
  };
  static const CostTblEntry AVX512BWCostTbl[] = {
    { ISD::BITREVERSE, MVT::v8i64,   5 },
    { ISD::BITREVERSE, MVT::v16i32,  5 },
    { ISD::BITREVERSE, MVT::v32i16,  5 },
    { ISD::BITREVERSE, MVT::v64i8,   5 },
    { ISD::CTLZ,       MVT::v8i64,  23 },
    { ISD::CTLZ,       MVT::v16i32, 22 },
    { ISD::CTLZ,       MVT::v32i16, 18 },
    { ISD::CTLZ,       MVT::v64i8,  17 },
    { ISD::CTPOP,      MVT::v8i64,   7 },
    { ISD::CTPOP,      MVT::v16i32, 11 },
    { ISD::CTPOP,      MVT::v32i16,  9 },
    { ISD::CTPOP,      MVT::v64i8,   6 },
    { ISD::CTTZ,       MVT::v8i64,  10 },
    { ISD::CTTZ,       MVT::v16i32, 14 },
    { ISD::CTTZ,       MVT::v32i16, 12 },
    { ISD::CTTZ,       MVT::v64i8,   9 },
  };
  static const CostTblEntry AVX512CostTbl[] = {
    { ISD::BITREVERSE, MVT::v8i64,  36 },
    { ISD::BITREVERSE, MVT::v16i32, 24 },
    { ISD::CTLZ,       MVT::v8i64,  29 },
    { ISD::CTLZ,       MVT::v16i32, 35 },
    { ISD::CTPOP,      MVT::v8i64,  16 },
    { ISD::CTPOP,      MVT::v16i32, 24 },
    { ISD::CTTZ,       MVT::v8i64,  20 },
    { ISD::CTTZ,       MVT::v16i32, 28 },
  };
  static const CostTblEntry XOPCostTbl[] = {
    { ISD::BITREVERSE, MVT::v4i64,   4 },
    { ISD::BITREVERSE, MVT::v8i32,   4 },
    { ISD::BITREVERSE, MVT::v16i16,  4 },
    { ISD::BITREVERSE, MVT::v32i8,   4 },
    { ISD::BITREVERSE, MVT::v2i64,   1 },
    { ISD::BITREVERSE, MVT::v4i32,   1 },
    { ISD::BITREVERSE, MVT::v8i16,   1 },
    { ISD::BITREVERSE, MVT::v16i8,   1 },
    { ISD::BITREVERSE, MVT::i64,     3 },
    { ISD::BITREVERSE, MVT::i32,     3 },
    { ISD::BITREVERSE, MVT::i16,     3 },
    { ISD::BITREVERSE, MVT::i8,      3 }
  };
  static const CostTblEntry AVX2CostTbl[] = {
    { ISD::BITREVERSE, MVT::v4i64,   5 },
    { ISD::BITREVERSE, MVT::v8i32,   5 },
    { ISD::BITREVERSE, MVT::v16i16,  5 },
    { ISD::BITREVERSE, MVT::v32i8,   5 },
    { ISD::BSWAP,      MVT::v4i64,   1 },
    { ISD::BSWAP,      MVT::v8i32,   1 },
    { ISD::BSWAP,      MVT::v16i16,  1 },
    { ISD::CTLZ,       MVT::v4i64,  23 },
    { ISD::CTLZ,       MVT::v8i32,  18 },
    { ISD::CTLZ,       MVT::v16i16, 14 },
    { ISD::CTLZ,       MVT::v32i8,   9 },
    { ISD::CTPOP,      MVT::v4i64,   7 },
    { ISD::CTPOP,      MVT::v8i32,  11 },
    { ISD::CTPOP,      MVT::v16i16,  9 },
    { ISD::CTPOP,      MVT::v32i8,   6 },
    { ISD::CTTZ,       MVT::v4i64,  10 },
    { ISD::CTTZ,       MVT::v8i32,  14 },
    { ISD::CTTZ,       MVT::v16i16, 12 },
    { ISD::CTTZ,       MVT::v32i8,   9 },
    { ISD::FSQRT,      MVT::f32,     7 }, // Haswell from http://www.agner.org/
    { ISD::FSQRT,      MVT::v4f32,   7 }, // Haswell from http://www.agner.org/
    { ISD::FSQRT,      MVT::v8f32,  14 }, // Haswell from http://www.agner.org/
    { ISD::FSQRT,      MVT::f64,    14 }, // Haswell from http://www.agner.org/
    { ISD::FSQRT,      MVT::v2f64,  14 }, // Haswell from http://www.agner.org/
    { ISD::FSQRT,      MVT::v4f64,  28 }, // Haswell from http://www.agner.org/
  };
  static const CostTblEntry AVX1CostTbl[] = {
    { ISD::BITREVERSE, MVT::v4i64,  12 }, // 2 x 128-bit Op + extract/insert
    { ISD::BITREVERSE, MVT::v8i32,  12 }, // 2 x 128-bit Op + extract/insert
    { ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert
    { ISD::BITREVERSE, MVT::v32i8,  12 }, // 2 x 128-bit Op + extract/insert
    { ISD::BSWAP,      MVT::v4i64,   4 },
    { ISD::BSWAP,      MVT::v8i32,   4 },
    { ISD::BSWAP,      MVT::v16i16,  4 },
    { ISD::CTLZ,       MVT::v4i64,  48 }, // 2 x 128-bit Op + extract/insert
    { ISD::CTLZ,       MVT::v8i32,  38 }, // 2 x 128-bit Op + extract/insert
    { ISD::CTLZ,       MVT::v16i16, 30 }, // 2 x 128-bit Op + extract/insert
    { ISD::CTLZ,       MVT::v32i8,  20 }, // 2 x 128-bit Op + extract/insert
    { ISD::CTPOP,      MVT::v4i64,  16 }, // 2 x 128-bit Op + extract/insert
    { ISD::CTPOP,      MVT::v8i32,  24 }, // 2 x 128-bit Op + extract/insert
    { ISD::CTPOP,      MVT::v16i16, 20 }, // 2 x 128-bit Op + extract/insert
    { ISD::CTPOP,      MVT::v32i8,  14 }, // 2 x 128-bit Op + extract/insert
    { ISD::CTTZ,       MVT::v4i64,  22 }, // 2 x 128-bit Op + extract/insert
    { ISD::CTTZ,       MVT::v8i32,  30 }, // 2 x 128-bit Op + extract/insert
    { ISD::CTTZ,       MVT::v16i16, 26 }, // 2 x 128-bit Op + extract/insert
    { ISD::CTTZ,       MVT::v32i8,  20 }, // 2 x 128-bit Op + extract/insert
    { ISD::FSQRT,      MVT::f32,    14 }, // SNB from http://www.agner.org/
    { ISD::FSQRT,      MVT::v4f32,  14 }, // SNB from http://www.agner.org/
    { ISD::FSQRT,      MVT::v8f32,  28 }, // SNB from http://www.agner.org/
    { ISD::FSQRT,      MVT::f64,    21 }, // SNB from http://www.agner.org/
    { ISD::FSQRT,      MVT::v2f64,  21 }, // SNB from http://www.agner.org/
    { ISD::FSQRT,      MVT::v4f64,  43 }, // SNB from http://www.agner.org/
  };
  static const CostTblEntry SSE42CostTbl[] = {
    { ISD::FSQRT,      MVT::f32,    18 }, // Nehalem from http://www.agner.org/
    { ISD::FSQRT,      MVT::v4f32,  18 }, // Nehalem from http://www.agner.org/
  };
  static const CostTblEntry SSSE3CostTbl[] = {
    { ISD::BITREVERSE, MVT::v2i64,   5 },
    { ISD::BITREVERSE, MVT::v4i32,   5 },
    { ISD::BITREVERSE, MVT::v8i16,   5 },
    { ISD::BITREVERSE, MVT::v16i8,   5 },
    { ISD::BSWAP,      MVT::v2i64,   1 },
    { ISD::BSWAP,      MVT::v4i32,   1 },
    { ISD::BSWAP,      MVT::v8i16,   1 },
    { ISD::CTLZ,       MVT::v2i64,  23 },
    { ISD::CTLZ,       MVT::v4i32,  18 },
    { ISD::CTLZ,       MVT::v8i16,  14 },
    { ISD::CTLZ,       MVT::v16i8,   9 },
    { ISD::CTPOP,      MVT::v2i64,   7 },
    { ISD::CTPOP,      MVT::v4i32,  11 },
    { ISD::CTPOP,      MVT::v8i16,   9 },
    { ISD::CTPOP,      MVT::v16i8,   6 },
    { ISD::CTTZ,       MVT::v2i64,  10 },
    { ISD::CTTZ,       MVT::v4i32,  14 },
    { ISD::CTTZ,       MVT::v8i16,  12 },
    { ISD::CTTZ,       MVT::v16i8,   9 }
  };
  static const CostTblEntry SSE2CostTbl[] = {
    { ISD::BITREVERSE, MVT::v2i64,  29 },
    { ISD::BITREVERSE, MVT::v4i32,  27 },
    { ISD::BITREVERSE, MVT::v8i16,  27 },
    { ISD::BITREVERSE, MVT::v16i8,  20 },
    { ISD::BSWAP,      MVT::v2i64,   7 },
    { ISD::BSWAP,      MVT::v4i32,   7 },
    { ISD::BSWAP,      MVT::v8i16,   7 },
    { ISD::CTLZ,       MVT::v2i64,  25 },
    { ISD::CTLZ,       MVT::v4i32,  26 },
    { ISD::CTLZ,       MVT::v8i16,  20 },
    { ISD::CTLZ,       MVT::v16i8,  17 },
    { ISD::CTPOP,      MVT::v2i64,  12 },
    { ISD::CTPOP,      MVT::v4i32,  15 },
    { ISD::CTPOP,      MVT::v8i16,  13 },
    { ISD::CTPOP,      MVT::v16i8,  10 },
    { ISD::CTTZ,       MVT::v2i64,  14 },
    { ISD::CTTZ,       MVT::v4i32,  18 },
    { ISD::CTTZ,       MVT::v8i16,  16 },
    { ISD::CTTZ,       MVT::v16i8,  13 },
    { ISD::FSQRT,      MVT::f64,    32 }, // Nehalem from http://www.agner.org/
    { ISD::FSQRT,      MVT::v2f64,  32 }, // Nehalem from http://www.agner.org/
  };
  static const CostTblEntry SSE1CostTbl[] = {
    { ISD::FSQRT,      MVT::f32,    28 }, // Pentium III from http://www.agner.org/
    { ISD::FSQRT,      MVT::v4f32,  56 }, // Pentium III from http://www.agner.org/
  };
  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
    { ISD::BITREVERSE, MVT::i64,    14 }
  };
  static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
    { ISD::BITREVERSE, MVT::i32,    14 },
    { ISD::BITREVERSE, MVT::i16,    14 },
    { ISD::BITREVERSE, MVT::i8,     11 }
  };

  // Map the intrinsic to the ISD opcode used as the table key. Intrinsics
  // not handled here keep ISD::DELETED_NODE, miss every table, and fall
  // through to BaseT at the end.
  unsigned ISD = ISD::DELETED_NODE;
  switch (IID) {
  default:
    break;
  case Intrinsic::bitreverse:
    ISD = ISD::BITREVERSE;
    break;
  case Intrinsic::bswap:
    ISD = ISD::BSWAP;
    break;
  case Intrinsic::ctlz:
    ISD = ISD::CTLZ;
    break;
  case Intrinsic::ctpop:
    ISD = ISD::CTPOP;
    break;
  case Intrinsic::cttz:
    ISD = ISD::CTTZ;
    break;
  case Intrinsic::sqrt:
    ISD = ISD::FSQRT;
    break;
  }

  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
  MVT MTy = LT.second;

  // Attempt to lookup cost, querying the most specific feature set first so
  // a type present in several tables gets its cheapest achievable cost.
  if (ST->hasCDI())
    if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasXOP())
    if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasSSE42())
    if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasSSSE3())
    if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasSSE1())
    if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->is64Bit())
    if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
    return LT.first * Entry->Cost;

  return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF, ScalarizationCostPassed);
}
1646 
// Value-argument overload: no X86-specific modeling here, defer directly to
// the base implementation.
int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
                     ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) {
  return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF, VF);
}
1651 
1652 int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
1653   assert(Val->isVectorTy() && "This must be a vector type");
1654 
1655   Type *ScalarType = Val->getScalarType();
1656 
1657   if (Index != -1U) {
1658     // Legalize the type.
1659     std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
1660 
1661     // This type is legalized to a scalar type.
1662     if (!LT.second.isVector())
1663       return 0;
1664 
1665     // The type may be split. Normalize the index to the new type.
1666     unsigned Width = LT.second.getVectorNumElements();
1667     Index = Index % Width;
1668 
1669     // Floating point scalars are already located in index #0.
1670     if (ScalarType->isFloatingPointTy() && Index == 0)
1671       return 0;
1672   }
1673 
1674   // Add to the base cost if we know that the extracted element of a vector is
1675   // destined to be moved to and used in the integer register file.
1676   int RegisterFileMoveCost = 0;
1677   if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
1678     RegisterFileMoveCost = 1;
1679 
1680   return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
1681 }
1682 
1683 int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
1684                                 unsigned AddressSpace, const Instruction *I) {
1685   // Handle non-power-of-two vectors such as <3 x float>
1686   if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
1687     unsigned NumElem = VTy->getVectorNumElements();
1688 
1689     // Handle a few common cases:
1690     // <3 x float>
1691     if (NumElem == 3 && VTy->getScalarSizeInBits() == 32)
1692       // Cost = 64 bit store + extract + 32 bit store.
1693       return 3;
1694 
1695     // <3 x double>
1696     if (NumElem == 3 && VTy->getScalarSizeInBits() == 64)
1697       // Cost = 128 bit store + unpack + 64 bit store.
1698       return 3;
1699 
1700     // Assume that all other non-power-of-two numbers are scalarized.
1701     if (!isPowerOf2_32(NumElem)) {
1702       int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment,
1703                                         AddressSpace);
1704       int SplitCost = getScalarizationOverhead(Src, Opcode == Instruction::Load,
1705                                                Opcode == Instruction::Store);
1706       return NumElem * Cost + SplitCost;
1707     }
1708   }
1709 
1710   // Legalize the type.
1711   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
1712   assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1713          "Invalid Opcode");
1714 
1715   // Each load/store unit costs 1.
1716   int Cost = LT.first * 1;
1717 
1718   // This isn't exactly right. We're using slow unaligned 32-byte accesses as a
1719   // proxy for a double-pumped AVX memory interface such as on Sandybridge.
1720   if (LT.second.getStoreSize() == 32 && ST->isUnalignedMem32Slow())
1721     Cost *= 2;
1722 
1723   return Cost;
1724 }
1725 
// Cost of a masked load/store of SrcTy. If the subtarget has no legal masked
// operation for the type (or the element count is not a power of two), the
// operation is modeled as full scalarization: per-element compare + branch +
// scalar memop, plus the overhead of splitting the value and mask vectors.
// Otherwise the cost is per-legalized-maskmov, plus shuffle costs when the
// data/mask need promotion or widening.
int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
                                      unsigned Alignment,
                                      unsigned AddressSpace) {
  VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy);
  if (!SrcVTy)
    // To calculate scalar take the regular cost, without mask
    return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace);

  unsigned NumElem = SrcVTy->getVectorNumElements();
  // The mask is modeled as a vector of i8, one lane per data element.
  VectorType *MaskTy =
    VectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
  if ((Opcode == Instruction::Load && !isLegalMaskedLoad(SrcVTy)) ||
      (Opcode == Instruction::Store && !isLegalMaskedStore(SrcVTy)) ||
      !isPowerOf2_32(NumElem)) {
    // Scalarization: extract each mask lane, test it, branch, and do a
    // scalar memory op for each data element.
    int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true);
    int ScalarCompareCost = getCmpSelInstrCost(
        Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr);
    int BranchCost = getCFInstrCost(Instruction::Br);
    int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);

    int ValueSplitCost = getScalarizationOverhead(
        SrcVTy, Opcode == Instruction::Load, Opcode == Instruction::Store);
    int MemopCost =
        NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
                                         Alignment, AddressSpace);
    return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
  }

  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy);
  auto VT = TLI->getValueType(DL, SrcVTy);
  int Cost = 0;
  if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
      LT.second.getVectorNumElements() == NumElem)
    // Promotion requires expand/truncate for data and a shuffle for mask.
    Cost += getShuffleCost(TTI::SK_Alternate, SrcVTy, 0, nullptr) +
            getShuffleCost(TTI::SK_Alternate, MaskTy, 0, nullptr);

  else if (LT.second.getVectorNumElements() > NumElem) {
    VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(),
                                            LT.second.getVectorNumElements());
    // Expanding requires filling the mask with zeroes.
    Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy);
  }
  if (!ST->hasAVX512())
    return Cost + LT.first*4; // Each maskmov costs 4


  // AVX-512 masked load/store is cheaper
  return Cost+LT.first;
}
1777 
1778 int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
1779                                           const SCEV *Ptr) {
1780   // Address computations in vectorized code with non-consecutive addresses will
1781   // likely result in more instructions compared to scalar code where the
1782   // computation can more often be merged into the index mode. The resulting
1783   // extra micro-ops can significantly decrease throughput.
1784   unsigned NumVectorInstToHideOverhead = 10;
1785 
1786   // Cost modeling of Strided Access Computation is hidden by the indexing
1787   // modes of X86 regardless of the stride value. We dont believe that there
1788   // is a difference between constant strided access in gerenal and constant
1789   // strided value which is less than or equal to 64.
1790   // Even in the case of (loop invariant) stride whose value is not known at
1791   // compile time, the address computation will not incur more than one extra
1792   // ADD instruction.
1793   if (Ty->isVectorTy() && SE) {
1794     if (!BaseT::isStridedAccess(Ptr))
1795       return NumVectorInstToHideOverhead;
1796     if (!BaseT::getConstantStrideStep(SE, Ptr))
1797       return 1;
1798   }
1799 
1800   return BaseT::getAddressComputationCost(Ty, SE, Ptr);
1801 }
1802 
// Cost of a horizontal (pairwise or tree) reduction of ValTy with the given
// arithmetic opcode. Table values were measured with the Intel Architecture
// Code Analyzer (IACA); unlisted type/opcode pairs defer to BaseT.
int X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy,
                                 bool IsPairwise) {

  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);

  MVT MTy = LT.second;

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
  // and make it as the cost.

  static const CostTblEntry SSE42CostTblPairWise[] = {
    { ISD::FADD,  MVT::v2f64,   2 },
    { ISD::FADD,  MVT::v4f32,   4 },
    { ISD::ADD,   MVT::v2i64,   2 },      // The data reported by the IACA tool is "1.6".
    { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "3.5".
    { ISD::ADD,   MVT::v8i16,   5 },
  };

  static const CostTblEntry AVX1CostTblPairWise[] = {
    { ISD::FADD,  MVT::v4f32,   4 },
    { ISD::FADD,  MVT::v4f64,   5 },
    { ISD::FADD,  MVT::v8f32,   7 },
    { ISD::ADD,   MVT::v2i64,   1 },      // The data reported by the IACA tool is "1.5".
    { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "3.5".
    { ISD::ADD,   MVT::v4i64,   5 },      // The data reported by the IACA tool is "4.8".
    { ISD::ADD,   MVT::v8i16,   5 },
    { ISD::ADD,   MVT::v8i32,   5 },
  };

  static const CostTblEntry SSE42CostTblNoPairWise[] = {
    { ISD::FADD,  MVT::v2f64,   2 },
    { ISD::FADD,  MVT::v4f32,   4 },
    { ISD::ADD,   MVT::v2i64,   2 },      // The data reported by the IACA tool is "1.6".
    { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "3.3".
    { ISD::ADD,   MVT::v8i16,   4 },      // The data reported by the IACA tool is "4.3".
  };

  static const CostTblEntry AVX1CostTblNoPairWise[] = {
    { ISD::FADD,  MVT::v4f32,   3 },
    { ISD::FADD,  MVT::v4f64,   3 },
    { ISD::FADD,  MVT::v8f32,   4 },
    { ISD::ADD,   MVT::v2i64,   1 },      // The data reported by the IACA tool is "1.5".
    { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "2.8".
    { ISD::ADD,   MVT::v4i64,   3 },
    { ISD::ADD,   MVT::v8i16,   4 },
    { ISD::ADD,   MVT::v8i32,   5 },
  };

  // Pairwise and tree reductions have separate tables; within each, prefer
  // the newest feature level available on the subtarget.
  if (IsPairwise) {
    if (ST->hasAVX())
      if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
        return LT.first * Entry->Cost;

    if (ST->hasSSE42())
      if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
        return LT.first * Entry->Cost;
  } else {
    if (ST->hasAVX())
      if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
        return LT.first * Entry->Cost;

    if (ST->hasSSE42())
      if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
        return LT.first * Entry->Cost;
  }

  return BaseT::getReductionCost(Opcode, ValTy, IsPairwise);
}
1874 
1875 /// \brief Calculate the cost of materializing a 64-bit value. This helper
1876 /// method might only calculate a fraction of a larger immediate. Therefore it
1877 /// is valid to return a cost of ZERO.
1878 int X86TTIImpl::getIntImmCost(int64_t Val) {
1879   if (Val == 0)
1880     return TTI::TCC_Free;
1881 
1882   if (isInt<32>(Val))
1883     return TTI::TCC_Basic;
1884 
1885   return 2 * TTI::TCC_Basic;
1886 }
1887 
1888 int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
1889   assert(Ty->isIntegerTy());
1890 
1891   unsigned BitSize = Ty->getPrimitiveSizeInBits();
1892   if (BitSize == 0)
1893     return ~0U;
1894 
1895   // Never hoist constants larger than 128bit, because this might lead to
1896   // incorrect code generation or assertions in codegen.
1897   // Fixme: Create a cost model for types larger than i128 once the codegen
1898   // issues have been fixed.
1899   if (BitSize > 128)
1900     return TTI::TCC_Free;
1901 
1902   if (Imm == 0)
1903     return TTI::TCC_Free;
1904 
1905   // Sign-extend all constants to a multiple of 64-bit.
1906   APInt ImmVal = Imm;
1907   if (BitSize & 0x3f)
1908     ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
1909 
1910   // Split the constant into 64-bit chunks and calculate the cost for each
1911   // chunk.
1912   int Cost = 0;
1913   for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
1914     APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
1915     int64_t Val = Tmp.getSExtValue();
1916     Cost += getIntImmCost(Val);
1917   }
1918   // We need at least one instruction to materialize the constant.
1919   return std::max(1, Cost);
1920 }
1921 
// Cost of immediate Imm appearing as operand Idx of an instruction with the
// given IR Opcode. Returns TCC_Free for immediates the instruction can
// encode directly, so constant hoisting leaves them in place.
int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
                              Type *Ty) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  // Which operand index is the immediate this opcode can fold? ~0U means
  // "none"; the final Idx == ImmIdx test then fails and the plain
  // materialization cost is charged.
  unsigned ImmIdx = ~0U;
  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr. This prevents the
    // creation of new constants for every base constant that gets constant
    // folded with the offset.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::Store:
    ImmIdx = 0;
    break;
  case Instruction::ICmp:
    // This is an imperfect hack to prevent constant hoisting of
    // compares that might be trying to check if a 64-bit value fits in
    // 32-bits. The backend can optimize these cases using a right shift by 32.
    // Ideally we would check the compare predicate here. There also other
    // similar immediates the backend can use shifts for.
    if (Idx == 1 && Imm.getBitWidth() == 64) {
      uint64_t ImmVal = Imm.getZExtValue();
      if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
        return TTI::TCC_Free;
    }
    ImmIdx = 1;
    break;
  case Instruction::And:
    // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
    // by using a 32-bit operation with implicit zero extension. Detect such
    // immediates here as the normal path expects bit 31 to be sign extended.
    if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue()))
      return TTI::TCC_Free;
    LLVM_FALLTHROUGH;
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::Or:
  case Instruction::Xor:
    ImmIdx = 1;
    break;
  // Always return TCC_Free for the shift value of a shift instruction.
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    if (Idx == 1)
      return TTI::TCC_Free;
    break;
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    break;
  }

  if (Idx == ImmIdx) {
    // The immediate sits in a foldable operand slot: it is free when it is
    // no more expensive than materializing one 64-bit chunk per constant.
    int NumConstants = (BitSize + 63) / 64;
    int Cost = X86TTIImpl::getIntImmCost(Imm, Ty);
    return (Cost <= NumConstants * TTI::TCC_Basic)
               ? static_cast<int>(TTI::TCC_Free)
               : Cost;
  }

  return X86TTIImpl::getIntImmCost(Imm, Ty);
}
2008 
2009 int X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
2010                               Type *Ty) {
2011   assert(Ty->isIntegerTy());
2012 
2013   unsigned BitSize = Ty->getPrimitiveSizeInBits();
2014   // There is no cost model for constants with a bit size of 0. Return TCC_Free
2015   // here, so that constant hoisting will ignore this constant.
2016   if (BitSize == 0)
2017     return TTI::TCC_Free;
2018 
2019   switch (IID) {
2020   default:
2021     return TTI::TCC_Free;
2022   case Intrinsic::sadd_with_overflow:
2023   case Intrinsic::uadd_with_overflow:
2024   case Intrinsic::ssub_with_overflow:
2025   case Intrinsic::usub_with_overflow:
2026   case Intrinsic::smul_with_overflow:
2027   case Intrinsic::umul_with_overflow:
2028     if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
2029       return TTI::TCC_Free;
2030     break;
2031   case Intrinsic::experimental_stackmap:
2032     if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
2033       return TTI::TCC_Free;
2034     break;
2035   case Intrinsic::experimental_patchpoint_void:
2036   case Intrinsic::experimental_patchpoint_i64:
2037     if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
2038       return TTI::TCC_Free;
2039     break;
2040   }
2041   return X86TTIImpl::getIntImmCost(Imm, Ty);
2042 }
2043 
// Return an average cost of Gather / Scatter instruction, maybe improved later
int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
                                unsigned Alignment, unsigned AddressSpace) {

  assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
  unsigned VF = SrcVTy->getVectorNumElements();

  // Try to reduce index size from 64 bit (default for GEP)
  // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
  // operation will use 16 x 64 indices which do not fit in a zmm and needs
  // to split. Also check that the base pointer is the same for all lanes,
  // and that there's at most one variable index.
  //
  // Returns 32 only when all GEP indices are constants, sign-extended from
  // 32 bits, or at most one variable index; otherwise returns the pointer
  // width (64 on targets where reduction matters).
  auto getIndexSizeInBits = [](Value *Ptr, const DataLayout& DL) {
    unsigned IndexSize = DL.getPointerSizeInBits();
    GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
    // Nothing to reduce for a sub-64-bit pointer, or a non-GEP pointer.
    if (IndexSize < 64 || !GEP)
      return IndexSize;

    unsigned NumOfVarIndices = 0;
    Value *Ptrs = GEP->getPointerOperand();
    // A vector of bases must be a splat (same base for all lanes).
    if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
      return IndexSize;
    for (unsigned i = 1; i < GEP->getNumOperands(); ++i) {
      if (isa<Constant>(GEP->getOperand(i)))
        continue;
      Type *IndxTy = GEP->getOperand(i)->getType();
      if (IndxTy->isVectorTy())
        IndxTy = IndxTy->getVectorElementType();
      // A genuinely 64-bit (not sign-extended-from-32) index, or a second
      // variable index, defeats the reduction.
      if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
          !isa<SExtInst>(GEP->getOperand(i))) ||
         ++NumOfVarIndices > 1)
        return IndexSize; // 64
    }
    return (unsigned)32;
  };


  // Trying to reduce IndexSize to 32 bits for vector 16.
  // By default the IndexSize is equal to pointer size.
  unsigned IndexSize = (VF >= 16) ? getIndexSizeInBits(Ptr, DL) :
    DL.getPointerSizeInBits();

  Type *IndexVTy = VectorType::get(IntegerType::get(SrcVTy->getContext(),
                                                    IndexSize), VF);
  std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy);
  std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy);
  // If either the index or data vector must be split, cost the operation as
  // SplitFactor narrower gathers/scatters (recursively).
  int SplitFactor = std::max(IdxsLT.first, SrcLT.first);
  if (SplitFactor > 1) {
    // Handle splitting of vector of pointers
    Type *SplitSrcTy = VectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
    return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment,
                                         AddressSpace);
  }

  // The gather / scatter cost is given by Intel architects. It is a rough
  // number since we are looking at one instruction in a time.
  const int GSOverhead = 2;
  return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
                                           Alignment, AddressSpace);
}
2104 
2105 /// Return the cost of full scalarization of gather / scatter operation.
2106 ///
2107 /// Opcode - Load or Store instruction.
2108 /// SrcVTy - The type of the data vector that should be gathered or scattered.
2109 /// VariableMask - The mask is non-constant at compile time.
2110 /// Alignment - Alignment for one element.
2111 /// AddressSpace - pointer[s] address space.
2112 ///
2113 int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
2114                                 bool VariableMask, unsigned Alignment,
2115                                 unsigned AddressSpace) {
2116   unsigned VF = SrcVTy->getVectorNumElements();
2117 
2118   int MaskUnpackCost = 0;
2119   if (VariableMask) {
2120     VectorType *MaskTy =
2121       VectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
2122     MaskUnpackCost = getScalarizationOverhead(MaskTy, false, true);
2123     int ScalarCompareCost =
2124       getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()),
2125                          nullptr);
2126     int BranchCost = getCFInstrCost(Instruction::Br);
2127     MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
2128   }
2129 
2130   // The cost of the scalar loads/stores.
2131   int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
2132                                           Alignment, AddressSpace);
2133 
2134   int InsertExtractCost = 0;
2135   if (Opcode == Instruction::Load)
2136     for (unsigned i = 0; i < VF; ++i)
2137       // Add the cost of inserting each scalar load into the vector
2138       InsertExtractCost +=
2139         getVectorInstrCost(Instruction::InsertElement, SrcVTy, i);
2140   else
2141     for (unsigned i = 0; i < VF; ++i)
2142       // Add the cost of extracting each element out of the data vector
2143       InsertExtractCost +=
2144         getVectorInstrCost(Instruction::ExtractElement, SrcVTy, i);
2145 
2146   return MemoryOpCost + MaskUnpackCost + InsertExtractCost;
2147 }
2148 
2149 /// Calculate the cost of Gather / Scatter operation
2150 int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
2151                                        Value *Ptr, bool VariableMask,
2152                                        unsigned Alignment) {
2153   assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
2154   unsigned VF = SrcVTy->getVectorNumElements();
2155   PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
2156   if (!PtrTy && Ptr->getType()->isVectorTy())
2157     PtrTy = dyn_cast<PointerType>(Ptr->getType()->getVectorElementType());
2158   assert(PtrTy && "Unexpected type for Ptr argument");
2159   unsigned AddressSpace = PtrTy->getAddressSpace();
2160 
2161   bool Scalarize = false;
2162   if ((Opcode == Instruction::Load && !isLegalMaskedGather(SrcVTy)) ||
2163       (Opcode == Instruction::Store && !isLegalMaskedScatter(SrcVTy)))
2164     Scalarize = true;
2165   // Gather / Scatter for vector 2 is not profitable on KNL / SKX
2166   // Vector-4 of gather/scatter instruction does not exist on KNL.
2167   // We can extend it to 8 elements, but zeroing upper bits of
2168   // the mask vector will add more instructions. Right now we give the scalar
2169   // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction
2170   // is better in the VariableMask case.
2171   if (VF == 2 || (VF == 4 && !ST->hasVLX()))
2172     Scalarize = true;
2173 
2174   if (Scalarize)
2175     return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
2176                            AddressSpace);
2177 
2178   return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
2179 }
2180 
2181 bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
2182   Type *ScalarTy = DataTy->getScalarType();
2183   int DataWidth = isa<PointerType>(ScalarTy) ?
2184     DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
2185 
2186   return ((DataWidth == 32 || DataWidth == 64) && ST->hasAVX()) ||
2187          ((DataWidth == 8 || DataWidth == 16) && ST->hasBWI());
2188 }
2189 
2190 bool X86TTIImpl::isLegalMaskedStore(Type *DataType) {
2191   return isLegalMaskedLoad(DataType);
2192 }
2193 
2194 bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
2195   // This function is called now in two cases: from the Loop Vectorizer
2196   // and from the Scalarizer.
2197   // When the Loop Vectorizer asks about legality of the feature,
2198   // the vectorization factor is not calculated yet. The Loop Vectorizer
2199   // sends a scalar type and the decision is based on the width of the
2200   // scalar element.
2201   // Later on, the cost model will estimate usage this intrinsic based on
2202   // the vector type.
2203   // The Scalarizer asks again about legality. It sends a vector type.
2204   // In this case we can reject non-power-of-2 vectors.
2205   if (isa<VectorType>(DataTy) && !isPowerOf2_32(DataTy->getVectorNumElements()))
2206     return false;
2207   Type *ScalarTy = DataTy->getScalarType();
2208   int DataWidth = isa<PointerType>(ScalarTy) ?
2209     DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
2210 
2211   // AVX-512 allows gather and scatter
2212   return (DataWidth == 32 || DataWidth == 64) && ST->hasAVX512();
2213 }
2214 
2215 bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) {
2216   return isLegalMaskedGather(DataType);
2217 }
2218 
2219 bool X86TTIImpl::areInlineCompatible(const Function *Caller,
2220                                      const Function *Callee) const {
2221   const TargetMachine &TM = getTLI()->getTargetMachine();
2222 
2223   // Work this as a subsetting of subtarget features.
2224   const FeatureBitset &CallerBits =
2225       TM.getSubtargetImpl(*Caller)->getFeatureBits();
2226   const FeatureBitset &CalleeBits =
2227       TM.getSubtargetImpl(*Callee)->getFeatureBits();
2228 
2229   // FIXME: This is likely too limiting as it will include subtarget features
2230   // that we might not care about for inlining, but it is conservatively
2231   // correct.
2232   return (CallerBits & CalleeBits) == CalleeBits;
2233 }
2234 
2235 bool X86TTIImpl::expandMemCmp(Instruction *I, unsigned &MaxLoadSize) {
2236   // TODO: We can increase these based on available vector ops.
2237   MaxLoadSize = ST->is64Bit() ? 8 : 4;
2238   return true;
2239 }
2240 
2241 bool X86TTIImpl::enableInterleavedAccessVectorization() {
2242   // TODO: We expect this to be beneficial regardless of arch,
2243   // but there are currently some unexplained performance artifacts on Atom.
2244   // As a temporary solution, disable on Atom.
2245   return !(ST->isAtom());
2246 }
2247 
// Get estimation for interleaved load/store operations for AVX2.
// \p Factor is the interleaved-access factor (stride) - number of
// (interleaved) elements in the group.
// \p Indices contains the indices for a strided load: when the
// interleaved load has gaps they indicate which elements are used.
// If Indices is empty (or if the number of indices is equal to the size
// of the interleaved-access as given in \p Factor) the access has no gaps.
//
// As opposed to AVX-512, AVX2 does not have generic shuffles that allow
// computing the cost using a generic formula as a function of generic
// shuffles. We therefore use a lookup table instead, filled according to
// the instruction sequences that codegen currently generates.
//
// The returned cost is NumOfMemOps * MemOpCost (the wide loads/stores)
// plus a table entry covering the de/interleaving shuffle sequence; any
// (Factor, type) combination not in the tables falls back to the generic
// scalarized estimate in BaseT.
int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
                                               unsigned Factor,
                                               ArrayRef<unsigned> Indices,
                                               unsigned Alignment,
                                               unsigned AddressSpace) {

  // We currently Support only fully-interleaved groups, with no gaps.
  // TODO: Support also strided loads (interleaved-groups with gaps).
  if (Indices.size() && Indices.size() != Factor)
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace);

  // VecTy for interleave memop is <VF*Factor x Elt>.
  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
  // VecTy = <12 x i32>.
  MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;

  // This function can be called with VecTy=<6xi128>, Factor=3, in which case
  // the VF=2, while v2i128 is an unsupported MVT vector type
  // (see MachineValueType.h::getVectorVT()).
  if (!LegalVT.isVector())
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace);

  // VF is the number of elements in each of the Factor interleaved vectors.
  unsigned VF = VecTy->getVectorNumElements() / Factor;
  Type *ScalarTy = VecTy->getVectorElementType();

  // Calculate the number of memory operations (NumOfMemOps), required
  // for load/store the VecTy.
  unsigned VecTySize = DL.getTypeStoreSize(VecTy);
  unsigned LegalVTSize = LegalVT.getStoreSize();
  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;

  // Get the cost of one memory operation.
  Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
                                        LegalVT.getVectorNumElements());
  unsigned MemOpCost =
      getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);

  // The cost tables are keyed on the per-member type VF x Elt, which must
  // map onto a simple machine value type for the lookup to make sense.
  VectorType *VT = VectorType::get(ScalarTy, VF);
  EVT ETy = TLI->getValueType(DL, VT);
  if (!ETy.isSimple())
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace);

  // TODO: Complete for other data-types and strides.
  // Each combination of Stride, ElementTy and VF results in a different
  // sequence; The cost tables are therefore accessed with:
  // Factor (stride) and VectorType=VFxElemType.
  // The Cost accounts only for the shuffle sequence;
  // The cost of the loads/stores is accounted for separately.
  //
  static const CostTblEntry AVX2InterleavedLoadTbl[] = {
    { 3, MVT::v2i8,  10 }, //(load 6i8 and)  deinterleave into 3 x 2i8
    { 3, MVT::v4i8,  4 },  //(load 12i8 and) deinterleave into 3 x 4i8
    { 3, MVT::v8i8,  9 },  //(load 24i8 and) deinterleave into 3 x 8i8
    { 3, MVT::v16i8, 18},  //(load 48i8 and) deinterleave into 3 x 16i8
    { 3, MVT::v32i8, 42 }, //(load 96i8 and) deinterleave into 3 x 32i8

    { 4, MVT::v2i8,  12 }, //(load 8i8 and)   deinterleave into 4 x 2i8
    { 4, MVT::v4i8,  4 },  //(load 16i8 and)  deinterleave into 4 x 4i8
    { 4, MVT::v8i8,  20 }, //(load 32i8 and)  deinterleave into 4 x 8i8
    { 4, MVT::v16i8, 39 }, //(load 64i8 and)  deinterleave into 4 x 16i8
    { 4, MVT::v32i8, 80 }  //(load 128i8 and) deinterleave into 4 x 32i8
  };

  static const CostTblEntry AVX2InterleavedStoreTbl[] = {
    { 3, MVT::v2i8,  7 },  //interleave 3 x 2i8  into 6i8 (and store)
    { 3, MVT::v4i8,  8 },  //interleave 3 x 4i8  into 12i8 (and store)
    { 3, MVT::v8i8,  11 }, //interleave 3 x 8i8  into 24i8 (and store)
    { 3, MVT::v16i8, 17 }, //interleave 3 x 16i8 into 48i8 (and store)
    { 3, MVT::v32i8, 32 }, //interleave 3 x 32i8 into 96i8 (and store)

    { 4, MVT::v2i8,  12 }, //interleave 4 x 2i8  into 8i8 (and store)
    { 4, MVT::v4i8,  9 },  //interleave 4 x 4i8  into 16i8 (and store)
    { 4, MVT::v8i8,  16 }, //interleave 4 x 8i8  into 32i8 (and store)
    { 4, MVT::v16i8, 20 }, //interleave 4 x 16i8 into 64i8 (and store)
    { 4, MVT::v32i8, 40 }  //interleave 4 x 32i8 into 128i8 (and store)
  };

  if (Opcode == Instruction::Load) {
    if (const auto *Entry =
            CostTableLookup(AVX2InterleavedLoadTbl, Factor, ETy.getSimpleVT()))
      return NumOfMemOps * MemOpCost + Entry->Cost;
  } else {
    assert(Opcode == Instruction::Store &&
           "Expected Store Instruction at this  point");
    if (const auto *Entry =
            CostTableLookup(AVX2InterleavedStoreTbl, Factor, ETy.getSimpleVT()))
      return NumOfMemOps * MemOpCost + Entry->Cost;
  }

  // No table entry for this (Factor, type): use the generic estimate.
  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                           Alignment, AddressSpace);
}
2355 
// Get estimation for interleaved load/store operations and strided load.
// \p Indices contains indices for strided load.
// \p Factor - the factor of interleaving.
// AVX-512 provides 3-src shuffles that significantly reduces the cost.
//
// Unlike the AVX2 path, the cost here is computed from a generic formula:
// memory-op cost plus a number of shuffles (and register moves) derived
// from how the type legalizes.
int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
                                                 unsigned Factor,
                                                 ArrayRef<unsigned> Indices,
                                                 unsigned Alignment,
                                                 unsigned AddressSpace) {

  // VecTy for interleave memop is <VF*Factor x Elt>.
  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
  // VecTy = <12 x i32>.

  // Calculate the number of memory operations (NumOfMemOps), required
  // for load/store the VecTy.
  MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
  unsigned VecTySize = DL.getTypeStoreSize(VecTy);
  unsigned LegalVTSize = LegalVT.getStoreSize();
  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;

  // Get the cost of one memory operation.
  Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
                                        LegalVT.getVectorNumElements());
  unsigned MemOpCost =
      getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);

  if (Opcode == Instruction::Load) {
    // Kind of shuffle depends on number of loaded values.
    // If we load the entire data in one register, we can use a 1-src shuffle.
    // Otherwise, we'll merge 2 sources in each operation.
    TTI::ShuffleKind ShuffleKind =
        (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;

    unsigned ShuffleCost =
        getShuffleCost(ShuffleKind, SingleMemOpTy, 0, nullptr);

    // For a strided load only the requested members (Indices) are produced;
    // with no indices, all Factor members are loaded.
    unsigned NumOfLoadsInInterleaveGrp =
        Indices.size() ? Indices.size() : Factor;
    // ResultTy is the type of one de-interleaved member: <VF x Elt>.
    Type *ResultTy = VectorType::get(VecTy->getVectorElementType(),
                                     VecTy->getVectorNumElements() / Factor);
    // Total result registers = legalized registers per member * members.
    unsigned NumOfResults =
        getTLI()->getTypeLegalizationCost(DL, ResultTy).first *
        NumOfLoadsInInterleaveGrp;

    // About a half of the loads may be folded in shuffles when we have only
    // one result. If we have more than one result, we do not fold loads at all.
    unsigned NumOfUnfoldedLoads =
        NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;

    // Get a number of shuffle operations per result.
    unsigned NumOfShufflesPerResult =
        std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));

    // The SK_MergeTwoSrc shuffle clobbers one of src operands.
    // When we have more than one destination, we need additional instructions
    // to keep sources.
    unsigned NumOfMoves = 0;
    if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
      NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;

    int Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
               NumOfUnfoldedLoads * MemOpCost + NumOfMoves;

    return Cost;
  }

  // Store.
  assert(Opcode == Instruction::Store &&
         "Expected Store Instruction at this  point");

  // There is no strided stores meanwhile. And store can't be folded in
  // shuffle.
  unsigned NumOfSources = Factor; // The number of values to be merged.
  unsigned ShuffleCost =
      getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, 0, nullptr);
  // Merging Factor sources takes Factor-1 two-source shuffles per store.
  unsigned NumOfShufflesPerStore = NumOfSources - 1;

  // The SK_MergeTwoSrc shuffle clobbers one of src operands.
  // We need additional instructions to keep sources.
  unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
  int Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
             NumOfMoves;
  return Cost;
}
2441 
2442 int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
2443                                            unsigned Factor,
2444                                            ArrayRef<unsigned> Indices,
2445                                            unsigned Alignment,
2446                                            unsigned AddressSpace) {
2447   auto isSupportedOnAVX512 = [](Type *VecTy, bool &RequiresBW) {
2448     RequiresBW = false;
2449     Type *EltTy = VecTy->getVectorElementType();
2450     if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
2451         EltTy->isIntegerTy(32) || EltTy->isPointerTy())
2452       return true;
2453     if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8)) {
2454       RequiresBW = true;
2455       return true;
2456     }
2457     return false;
2458   };
2459   bool RequiresBW;
2460   bool HasAVX512Solution = isSupportedOnAVX512(VecTy, RequiresBW);
2461   if (ST->hasAVX512() && HasAVX512Solution && (!RequiresBW || ST->hasBWI()))
2462     return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices,
2463                                             Alignment, AddressSpace);
2464   if (ST->hasAVX2())
2465     return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices,
2466                                           Alignment, AddressSpace);
2467 
2468   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
2469                                            Alignment, AddressSpace);
2470 }
2471