//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//

#include "X86.h"
#include "X86TargetMachine.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/CostTable.h"
#include "llvm/Target/TargetLowering.h"
using namespace llvm;

#define DEBUG_TYPE "x86tti"

// Declare the pass initialization routine locally as target-specific passes
// don't have a target-wide initialization entry point, and so we rely on the
// pass constructor initialization.
namespace llvm {
void initializeX86TTIPass(PassRegistry &);
}

static cl::opt<bool>
UsePartialUnrolling("x86-use-partial-unrolling", cl::init(true),
  cl::desc("Use partial unrolling for some X86 targets"), cl::Hidden);
static cl::opt<unsigned>
PartialUnrollingThreshold("x86-partial-unrolling-threshold", cl::init(0),
  cl::desc("Threshold for X86 partial unrolling"), cl::Hidden);
static cl::opt<unsigned>
PartialUnrollingMaxBranches("x86-partial-max-branches", cl::init(2),
  cl::desc("Threshold for taken branches in X86 partial unrolling"),
  cl::Hidden);

namespace {

class X86TTI final : public ImmutablePass, public TargetTransformInfo {
  const X86Subtarget *ST;
  const X86TargetLowering *TLI;

  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
  /// are set if the result needs to be inserted and/or extracted from vectors.
  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;

public:
  X86TTI() : ImmutablePass(ID), ST(0), TLI(0) {
    llvm_unreachable("This pass cannot be directly constructed");
  }

  X86TTI(const X86TargetMachine *TM)
      : ImmutablePass(ID), ST(TM->getSubtargetImpl()),
        TLI(TM->getTargetLowering()) {
    initializeX86TTIPass(*PassRegistry::getPassRegistry());
  }

  void initializePass() override {
    pushTTIStack(this);
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    TargetTransformInfo::getAnalysisUsage(AU);
  }

  /// Pass identification.
  static char ID;

  /// Provide necessary pointer adjustments for the two base classes.
  void *getAdjustedAnalysisPointer(const void *ID) override {
    if (ID == &TargetTransformInfo::ID)
      return (TargetTransformInfo*)this;
    return this;
  }

  /// \name Scalar TTI Implementations
  /// @{
  PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;
  void getUnrollingPreferences(Loop *L,
                               UnrollingPreferences &UP) const override;

  /// @}

  /// \name Vector TTI Implementations
  /// @{

  unsigned getNumberOfRegisters(bool Vector) const override;
  unsigned getRegisterBitWidth(bool Vector) const override;
  unsigned getMaximumUnrollFactor() const override;
  unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind,
                                  OperandValueKind) const override;
  unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
                          int Index, Type *SubTp) const override;
  unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
                            Type *Src) const override;
  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                              Type *CondTy) const override;
  unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
                              unsigned Index) const override;
  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                           unsigned AddressSpace) const override;

  unsigned getAddressComputationCost(Type *PtrTy,
                                     bool IsComplex) const override;

  unsigned getReductionCost(unsigned Opcode, Type *Ty,
                            bool IsPairwiseForm) const override;

  unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override;

  unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
                         Type *Ty) const override;
  unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                         Type *Ty) const override;

  /// @}
};

} // end anonymous namespace

INITIALIZE_AG_PASS(X86TTI, TargetTransformInfo, "x86tti",
                   "X86 Target Transform Info", true, true, false)
char X86TTI::ID = 0;

ImmutablePass *
llvm::createX86TargetTransformInfoPass(const X86TargetMachine *TM) {
  return new X86TTI(TM);
}


//===----------------------------------------------------------------------===//
//
// X86 cost model.
//
//===----------------------------------------------------------------------===//

X86TTI::PopcntSupportKind X86TTI::getPopcntSupport(unsigned TyWidth) const {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  // TODO: Currently the __builtin_popcount() implementation using SSE3
  //   instructions is inefficient. Once the problem is fixed, we should
  //   call ST->hasSSE3() instead of ST->hasPOPCNT().
  return ST->hasPOPCNT() ? PSK_FastHardware : PSK_Software;
}

void X86TTI::getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const {
  if (!UsePartialUnrolling)
    return;
  // According to the Intel 64 and IA-32 Architectures Optimization Reference
  // Manual, Intel Core models and later have a loop stream detector (and
  // associated uop queue) that can benefit from partial unrolling.
  // The relevant requirements are:
  //  - The loop must have no more than 4 (8 for Nehalem and later) branches
  //    taken, and none of them may be calls.
  //  - The loop can have no more than 18 (28 for Nehalem and later) uops.

  // According to the Software Optimization Guide for AMD Family 15h
  // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
  // and loop buffer which can benefit from partial unrolling.
  // The relevant requirements are:
  //  - The loop must have fewer than 16 branches
  //  - The loop must have less than 40 uops in all executed loop branches

  unsigned MaxBranches, MaxOps;
  if (PartialUnrollingThreshold.getNumOccurrences() > 0) {
    MaxBranches = PartialUnrollingMaxBranches;
    MaxOps = PartialUnrollingThreshold;
  } else if (ST->isAtom()) {
    // On the Atom, the throughput for taken branches is 2 cycles. For small
    // simple loops, expand by a small factor to hide the backedge cost.
    MaxBranches = 2;
    MaxOps = 10;
  } else if (ST->hasFSGSBase() && ST->hasXOP() /* Steamroller and later */) {
    MaxBranches = 16;
    MaxOps = 40;
  } else if (ST->hasFMA4() /* Any other recent AMD */) {
    return;
  } else if (ST->hasAVX() || ST->hasSSE42() /* Nehalem and later */) {
    MaxBranches = 8;
    MaxOps = 28;
  } else if (ST->hasSSSE3() /* Intel Core */) {
    MaxBranches = 4;
    MaxOps = 18;
  } else {
    return;
  }

  // Scan the loop: don't unroll loops with calls, and count the potential
  // number of taken branches (this is somewhat conservative because we're
  // counting all block transitions as potential branches while in reality
  // some of these will become implicit via block placement).
  unsigned MaxDepth = 0;
  for (df_iterator<BasicBlock*> DI = df_begin(L->getHeader()),
       DE = df_end(L->getHeader()); DI != DE;) {
    if (!L->contains(*DI)) {
      DI.skipChildren();
      continue;
    }

    MaxDepth = std::max(MaxDepth, DI.getPathLength());
    if (MaxDepth > MaxBranches)
      return;

    for (BasicBlock::iterator I = DI->begin(), IE = DI->end(); I != IE; ++I)
      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
        ImmutableCallSite CS(I);
        if (const Function *F = CS.getCalledFunction()) {
          if (!isLoweredToCall(F))
            continue;
        }

        return;
      }

    ++DI;
  }

  // Enable runtime and partial unrolling up to the specified size.
  UP.Partial = UP.Runtime = true;
  UP.PartialThreshold = UP.PartialOptSizeThreshold = MaxOps;

  // Set the maximum count based on the loop depth. The maximum number of
  // branches taken in a loop (including the backedge) is equal to the maximum
  // loop depth (the DFS path length from the loop header to any block in the
  // loop). When the loop is unrolled, this depth (except for the backedge
  // itself) is multiplied by the unrolling factor. This new unrolled depth
  // must be less than the target-specific maximum branch count (which limits
  // the number of taken branches in the uop buffer).
  if (MaxDepth > 1)
    UP.MaxCount = (MaxBranches-1)/(MaxDepth-1);
}

unsigned X86TTI::getNumberOfRegisters(bool Vector) const {
  if (Vector && !ST->hasSSE1())
    return 0;

  if (ST->is64Bit())
    return 16;
  return 8;
}

unsigned X86TTI::getRegisterBitWidth(bool Vector) const {
  if (Vector) {
    if (ST->hasAVX()) return 256;
    if (ST->hasSSE1()) return 128;
    return 0;
  }

  if (ST->is64Bit())
    return 64;
  return 32;
}

unsigned X86TTI::getMaximumUnrollFactor() const {
  if (ST->isAtom())
    return 1;

  // Sandybridge and Haswell have multiple execution ports and pipelined
  // vector units.
  if (ST->hasAVX())
    return 4;

  return 2;
}

unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
                                        OperandValueKind Op1Info,
                                        OperandValueKind Op2Info) const {
  // Legalize the type.
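  // Note (added for clarity): getTypeLegalizationCost returns the legalized
  // type (LT.second) and a factor (LT.first) that roughly corresponds to the
  // number of legal-sized pieces the original type is split into; the
  // per-entry table costs below are scaled by that factor.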
  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  static const CostTblEntry<MVT::SimpleValueType> AVX2CostTable[] = {
    // Shifts on v4i64/v8i32 on AVX2 are legal even though we declare them as
    // custom, so that we can detect the cases where the shift amount is a
    // scalar.
    { ISD::SHL,  MVT::v4i32,      1 },
    { ISD::SRL,  MVT::v4i32,      1 },
    { ISD::SRA,  MVT::v4i32,      1 },
    { ISD::SHL,  MVT::v8i32,      1 },
    { ISD::SRL,  MVT::v8i32,      1 },
    { ISD::SRA,  MVT::v8i32,      1 },
    { ISD::SHL,  MVT::v2i64,      1 },
    { ISD::SRL,  MVT::v2i64,      1 },
    { ISD::SHL,  MVT::v4i64,      1 },
    { ISD::SRL,  MVT::v4i64,      1 },

    { ISD::SHL,  MVT::v32i8,     42 }, // cmpeqb sequence.
    { ISD::SHL,  MVT::v16i16, 16*10 }, // Scalarized.

    { ISD::SRL,  MVT::v32i8,  32*10 }, // Scalarized.
    { ISD::SRL,  MVT::v16i16,  8*10 }, // Scalarized.

    { ISD::SRA,  MVT::v32i8,  32*10 }, // Scalarized.
    { ISD::SRA,  MVT::v16i16, 16*10 }, // Scalarized.
    { ISD::SRA,  MVT::v4i64,   4*10 }, // Scalarized.

    // Vectorizing division is a bad idea. See the SSE2 table for more
    // comments.
    { ISD::SDIV, MVT::v32i8,  32*20 },
    { ISD::SDIV, MVT::v16i16, 16*20 },
    { ISD::SDIV, MVT::v8i32,   8*20 },
    { ISD::SDIV, MVT::v4i64,   4*20 },
    { ISD::UDIV, MVT::v32i8,  32*20 },
    { ISD::UDIV, MVT::v16i16, 16*20 },
    { ISD::UDIV, MVT::v8i32,   8*20 },
    { ISD::UDIV, MVT::v4i64,   4*20 },
  };

  // Look for AVX2 lowering tricks.
  if (ST->hasAVX2()) {
    if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
      // On AVX2, a packed v16i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return LT.first;

    int Idx = CostTableLookup(AVX2CostTable, ISD, LT.second);
    if (Idx != -1)
      return LT.first * AVX2CostTable[Idx].Cost;
  }

  static const CostTblEntry<MVT::SimpleValueType>
  SSE2UniformConstCostTable[] = {
    // We don't correctly identify costs of casts because they are marked as
    // custom.
    // Constant splats are cheaper for the following instructions.
    { ISD::SHL,  MVT::v16i8,  1 }, // psllw.
    { ISD::SHL,  MVT::v8i16,  1 }, // psllw.
    { ISD::SHL,  MVT::v4i32,  1 }, // pslld.
    { ISD::SHL,  MVT::v2i64,  1 }, // psllq.

    { ISD::SRL,  MVT::v16i8,  1 }, // psrlw.
    { ISD::SRL,  MVT::v8i16,  1 }, // psrlw.
    { ISD::SRL,  MVT::v4i32,  1 }, // psrld.
    { ISD::SRL,  MVT::v2i64,  1 }, // psrlq.

    { ISD::SRA,  MVT::v16i8,  4 }, // psrlw, pand, pxor, psubb.
    { ISD::SRA,  MVT::v8i16,  1 }, // psraw.
    { ISD::SRA,  MVT::v4i32,  1 }, // psrad.
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasSSE2()) {
    int Idx = CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second);
    if (Idx != -1)
      return LT.first * SSE2UniformConstCostTable[Idx].Cost;
  }

  if (ISD == ISD::SHL &&
      Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
    EVT VT = LT.second;
    if ((VT == MVT::v8i16 && ST->hasSSE2()) ||
        (VT == MVT::v4i32 && ST->hasSSE41()))
      // A vector shift left by a non-uniform constant can be lowered
      // into a vector multiply (pmullw/pmulld).
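      // (A shift left by a constant c is a multiply by 2^c, so the constant
      // build_vector of shift amounts becomes a constant build_vector of
      // multipliers.)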
      return LT.first;
    if (VT == MVT::v4i32 && ST->hasSSE2())
      // A vector shift left by a non-uniform constant is converted
      // into a vector multiply; the new multiply is eventually
      // lowered into a sequence of shuffles and 2 x pmuludq.
      ISD = ISD::MUL;
  }

  static const CostTblEntry<MVT::SimpleValueType> SSE2CostTable[] = {
    // We don't correctly identify costs of casts because they are marked as
    // custom.
    // For some cases, where the shift amount is a scalar, we would be able
    // to generate better code. Unfortunately, when this is the case the
    // value (the splat) will get hoisted out of the loop, thereby making it
    // invisible to ISel. The cost model must return worst case assumptions
    // because it is used for vectorization and we don't want to make
    // vectorized code worse than scalar code.
    { ISD::SHL,  MVT::v16i8,     30 }, // cmpeqb sequence.
    { ISD::SHL,  MVT::v8i16,   8*10 }, // Scalarized.
    { ISD::SHL,  MVT::v4i32,    2*5 }, // We optimized this using mul.
    { ISD::SHL,  MVT::v2i64,   2*10 }, // Scalarized.
    { ISD::SHL,  MVT::v4i64,   4*10 }, // Scalarized.

    { ISD::SRL,  MVT::v16i8,  16*10 }, // Scalarized.
    { ISD::SRL,  MVT::v8i16,   8*10 }, // Scalarized.
    { ISD::SRL,  MVT::v4i32,   4*10 }, // Scalarized.
    { ISD::SRL,  MVT::v2i64,   2*10 }, // Scalarized.

    { ISD::SRA,  MVT::v16i8,  16*10 }, // Scalarized.
    { ISD::SRA,  MVT::v8i16,   8*10 }, // Scalarized.
    { ISD::SRA,  MVT::v4i32,   4*10 }, // Scalarized.
    { ISD::SRA,  MVT::v2i64,   2*10 }, // Scalarized.

    // It is not a good idea to vectorize division. We have to scalarize it
    // and in the process we will often end up having to spill regular
    // registers. The overhead of division is going to dominate most kernels
    // anyway, so try hard to prevent vectorization of division - it is
    // generally a bad idea. Assume somewhat arbitrarily that we have to be
    // able to hide "20 cycles" for each lane.
    { ISD::SDIV, MVT::v16i8,  16*20 },
    { ISD::SDIV, MVT::v8i16,   8*20 },
    { ISD::SDIV, MVT::v4i32,   4*20 },
    { ISD::SDIV, MVT::v2i64,   2*20 },
    { ISD::UDIV, MVT::v16i8,  16*20 },
    { ISD::UDIV, MVT::v8i16,   8*20 },
    { ISD::UDIV, MVT::v4i32,   4*20 },
    { ISD::UDIV, MVT::v2i64,   2*20 },
  };

  if (ST->hasSSE2()) {
    int Idx = CostTableLookup(SSE2CostTable, ISD, LT.second);
    if (Idx != -1)
      return LT.first * SSE2CostTable[Idx].Cost;
  }

  static const CostTblEntry<MVT::SimpleValueType> AVX1CostTable[] = {
    // We don't have to scalarize unsupported ops. We can issue two half-sized
    // operations and we only need to extract the upper YMM half.
    // Two ops + 1 extract + 1 insert = 4.
    { ISD::MUL,  MVT::v16i16,  4 },
    { ISD::MUL,  MVT::v8i32,   4 },
    { ISD::SUB,  MVT::v8i32,   4 },
    { ISD::ADD,  MVT::v8i32,   4 },
    { ISD::SUB,  MVT::v4i64,   4 },
    { ISD::ADD,  MVT::v4i64,   4 },
    // A v4i64 multiply is custom lowered as two split v2i64 vectors that then
    // are lowered as a series of long multiplies(3), shifts(4) and adds(2).
    // Because we believe v4i64 to be a legal type, we must also include the
    // split factor of two in the cost table. Therefore, the cost here is 18
    // instead of 9.
    { ISD::MUL,  MVT::v4i64,  18 },
  };

  // Look for AVX1 lowering tricks.
  if (ST->hasAVX() && !ST->hasAVX2()) {
    EVT VT = LT.second;

    // v16i16 and v8i32 shifts by non-uniform constants are lowered into a
    // sequence of extract + two vector multiply + insert.
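    // Mapping the shift to ISD::MUL below then picks up the "two ops +
    // extract + insert" cost of 4 from the AVX1 table above.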
    if (ISD == ISD::SHL && (VT == MVT::v8i32 || VT == MVT::v16i16) &&
        Op2Info == TargetTransformInfo::OK_NonUniformConstantValue)
      ISD = ISD::MUL;

    int Idx = CostTableLookup(AVX1CostTable, ISD, VT);
    if (Idx != -1)
      return LT.first * AVX1CostTable[Idx].Cost;
  }

  // Custom lowering of vectors.
  static const CostTblEntry<MVT::SimpleValueType> CustomLowered[] = {
    // A v2i64/v4i64 and multiply is custom lowered as a series of long
    // multiplies(3), shifts(4) and adds(2).
    { ISD::MUL,  MVT::v2i64,  9 },
    { ISD::MUL,  MVT::v4i64,  9 },
  };
  int Idx = CostTableLookup(CustomLowered, ISD, LT.second);
  if (Idx != -1)
    return LT.first * CustomLowered[Idx].Cost;

  // Special lowering of v4i32 mul on sse2, sse3: Lower v4i32 mul as 2x
  // shuffle, 2x pmuludq, 2x shuffle.
  if (ISD == ISD::MUL && LT.second == MVT::v4i32 && ST->hasSSE2() &&
      !ST->hasSSE41())
    return LT.first * 6;

  // Fallback to the default implementation.
  return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Op1Info,
                                                     Op2Info);
}

unsigned X86TTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
                                Type *SubTp) const {
  // We only estimate the cost of reverse shuffles.
  if (Kind != SK_Reverse)
    return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);

  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
  unsigned Cost = 1;
  if (LT.second.getSizeInBits() > 128)
    Cost = 3; // Extract + insert + copy.

  // Multiply by the number of parts.
  return Cost * LT.first;
}

unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  std::pair<unsigned, MVT> LTSrc = TLI->getTypeLegalizationCost(Src);
  std::pair<unsigned, MVT> LTDest = TLI->getTypeLegalizationCost(Dst);

  static const TypeConversionCostTblEntry<MVT::SimpleValueType>
  SSE2ConvTbl[] = {
    // These are somewhat magic numbers justified by looking at the output of
    // Intel's IACA, running some kernels and making sure when we take
    // legalization into account the throughput will be overestimated.
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64,  2*10 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32,  4*10 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16,  8*10 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64,  2*10 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32,  4*10 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16,  8*10 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
    // There are faster sequences for float conversions.
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 15 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8,  8 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 15 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8,  8 },
  };

  if (ST->hasSSE2() && !ST->hasAVX()) {
    int Idx =
      ConvertCostTableLookup(SSE2ConvTbl, ISD, LTDest.second, LTSrc.second);
    if (Idx != -1)
      return LTSrc.first * SSE2ConvTbl[Idx].Cost;
  }

  EVT SrcTy = TLI->getValueType(Src);
  EVT DstTy = TLI->getValueType(Dst);

  // The function getSimpleVT only handles simple value types.
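  // If either type is not simple (for example an illegal, non-power-of-two
  // vector), fall back to the target-independent estimate.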
  if (!SrcTy.isSimple() || !DstTy.isSimple())
    return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);

  static const TypeConversionCostTblEntry<MVT::SimpleValueType>
  AVX2ConversionTbl[] = {
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8,  1 },
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8,  1 },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   3 },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   3 },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i8,   3 },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i8,   3 },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  1 },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  1 },
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   3 },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   3 },
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i8,   3 },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i8,   3 },
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i16,  3 },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i16,  3 },
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32,  1 },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32,  1 },

    { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i64,  2 },
    { ISD::TRUNCATE,    MVT::v4i16,  MVT::v4i64,  2 },
    { ISD::TRUNCATE,    MVT::v4i32,  MVT::v4i64,  2 },
    { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i32,  2 },
    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  2 },
    { ISD::TRUNCATE,    MVT::v8i32,  MVT::v8i64,  4 },
  };

  static const TypeConversionCostTblEntry<MVT::SimpleValueType>
  AVXConversionTbl[] = {
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8,  4 },
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8,  4 },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   7 },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   4 },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i8,   7 },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i8,   4 },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  4 },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  4 },
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   6 },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   4 },
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i8,   6 },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i8,   4 },
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i16,  6 },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i16,  3 },
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32,  4 },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32,  4 },

    { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i64,  4 },
    { ISD::TRUNCATE,    MVT::v4i16,  MVT::v4i64,  4 },
    { ISD::TRUNCATE,    MVT::v4i32,  MVT::v4i64,  4 },
    { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i32,  4 },
    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  5 },
    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i16, 4 },
    { ISD::TRUNCATE,    MVT::v8i32,  MVT::v8i64,  9 },

    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i1,   8 },
    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i8,   8 },
    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  5 },
    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  1 },
    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i1,   3 },
    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i8,   3 },
    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i16,  3 },
    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  1 },
    { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i1,   3 },
    { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i8,   3 },
    { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i16,  3 },
    { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  1 },

    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i1,   6 },
    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i8,   5 },
    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  5 },
    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  9 },
    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i1,   7 },
    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i8,   2 },
    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i16,  2 },
    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  6 },
    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i1,   7 },
    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i8,   2 },
    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i16,  2 },
    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  6 },
    // The generic code to compute the scalar overhead is currently broken.
    // Workaround this limitation by estimating the scalarization overhead
    // here. We have roughly 10 instructions per scalar element.
    // Multiply that by the vector width.
    // FIXME: remove that when PR19268 is fixed.
    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  2*10 },
    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i64,  4*10 },

    { ISD::FP_TO_SINT,  MVT::v8i8,   MVT::v8f32,  7 },
    { ISD::FP_TO_SINT,  MVT::v4i8,   MVT::v4f32,  1 },
    // This node is expanded into scalarized operations but BasicTTI is
    // overly optimistic estimating its cost. It computes 3 per element (one
    // vector-extract, one scalar conversion and one vector-insert). The
    // problem is that the inserts form a read-modify-write chain so latency
    // should be factored in too. Inflating the cost per element by 1.
    { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f32,  8*4 },
    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f64,  4*4 },
  };

  if (ST->hasAVX2()) {
    int Idx = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT());
    if (Idx != -1)
      return AVX2ConversionTbl[Idx].Cost;
  }

  if (ST->hasAVX()) {
    int Idx = ConvertCostTableLookup(AVXConversionTbl, ISD, DstTy.getSimpleVT(),
                                     SrcTy.getSimpleVT());
    if (Idx != -1)
      return AVXConversionTbl[Idx].Cost;
  }

  return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
}

unsigned X86TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                    Type *CondTy) const {
  // Legalize the type.
  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy);

  MVT MTy = LT.second;

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  static const CostTblEntry<MVT::SimpleValueType> SSE42CostTbl[] = {
    { ISD::SETCC,  MVT::v2f64,  1 },
    { ISD::SETCC,  MVT::v4f32,  1 },
    { ISD::SETCC,  MVT::v2i64,  1 },
    { ISD::SETCC,  MVT::v4i32,  1 },
    { ISD::SETCC,  MVT::v8i16,  1 },
    { ISD::SETCC,  MVT::v16i8,  1 },
  };

  static const CostTblEntry<MVT::SimpleValueType> AVX1CostTbl[] = {
    { ISD::SETCC,  MVT::v4f64,  1 },
    { ISD::SETCC,  MVT::v8f32,  1 },
    // AVX1 does not support 8-wide integer compare.
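    // The cost of 4 presumably reflects splitting the compare into two
    // 128-bit halves and recombining the results.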
    { ISD::SETCC,  MVT::v4i64,   4 },
    { ISD::SETCC,  MVT::v8i32,   4 },
    { ISD::SETCC,  MVT::v16i16,  4 },
    { ISD::SETCC,  MVT::v32i8,   4 },
  };

  static const CostTblEntry<MVT::SimpleValueType> AVX2CostTbl[] = {
    { ISD::SETCC,  MVT::v4i64,   1 },
    { ISD::SETCC,  MVT::v8i32,   1 },
    { ISD::SETCC,  MVT::v16i16,  1 },
    { ISD::SETCC,  MVT::v32i8,   1 },
  };

  if (ST->hasAVX2()) {
    int Idx = CostTableLookup(AVX2CostTbl, ISD, MTy);
    if (Idx != -1)
      return LT.first * AVX2CostTbl[Idx].Cost;
  }

  if (ST->hasAVX()) {
    int Idx = CostTableLookup(AVX1CostTbl, ISD, MTy);
    if (Idx != -1)
      return LT.first * AVX1CostTbl[Idx].Cost;
  }

  if (ST->hasSSE42()) {
    int Idx = CostTableLookup(SSE42CostTbl, ISD, MTy);
    if (Idx != -1)
      return LT.first * SSE42CostTbl[Idx].Cost;
  }

  return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
}

unsigned X86TTI::getVectorInstrCost(unsigned Opcode, Type *Val,
                                    unsigned Index) const {
  assert(Val->isVectorTy() && "This must be a vector type");

  if (Index != -1U) {
    // Legalize the type.
    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Val);

    // This type is legalized to a scalar type.
    if (!LT.second.isVector())
      return 0;

    // The type may be split. Normalize the index to the new type.
    unsigned Width = LT.second.getVectorNumElements();
    Index = Index % Width;

    // Floating point scalars are already located in index #0.
    if (Val->getScalarType()->isFloatingPointTy() && Index == 0)
      return 0;
  }

  return TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index);
}

unsigned X86TTI::getScalarizationOverhead(Type *Ty, bool Insert,
                                          bool Extract) const {
  assert (Ty->isVectorTy() && "Can only scalarize vectors");
  unsigned Cost = 0;

  for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
    if (Insert)
      Cost += TopTTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
    if (Extract)
      Cost += TopTTI->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
  }

  return Cost;
}

unsigned X86TTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                                 unsigned AddressSpace) const {
  // Handle non-power-of-two vectors such as <3 x float>.
  if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
    unsigned NumElem = VTy->getVectorNumElements();

    // Handle a few common cases:
    // <3 x float>
    if (NumElem == 3 && VTy->getScalarSizeInBits() == 32)
      // Cost = 64 bit store + extract + 32 bit store.
      return 3;

    // <3 x double>
    if (NumElem == 3 && VTy->getScalarSizeInBits() == 64)
      // Cost = 128 bit store + unpack + 64 bit store.
      return 3;

    // Assume that all other non-power-of-two numbers are scalarized.
    if (!isPowerOf2_32(NumElem)) {
      unsigned Cost = TargetTransformInfo::getMemoryOpCost(Opcode,
                                                           VTy->getScalarType(),
                                                           Alignment,
                                                           AddressSpace);
      unsigned SplitCost = getScalarizationOverhead(Src,
                                                    Opcode == Instruction::Load,
                                                    Opcode == Instruction::Store);
      return NumElem * Cost + SplitCost;
    }
  }

  // Legalize the type.
  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
         "Invalid Opcode");

  // Each load/store unit costs 1.
  unsigned Cost = LT.first * 1;

  // On Sandybridge 256-bit load/stores are double pumped
  // (but not on Haswell).
  if (LT.second.getSizeInBits() > 128 && !ST->hasAVX2())
    Cost *= 2;

  return Cost;
}

unsigned X86TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
  // Address computations in vectorized code with non-consecutive addresses
  // will likely result in more instructions compared to scalar code where
  // the computation can more often be merged into the index mode. The
  // resulting extra micro-ops can significantly decrease throughput.
  unsigned NumVectorInstToHideOverhead = 10;

  if (Ty->isVectorTy() && IsComplex)
    return NumVectorInstToHideOverhead;

  return TargetTransformInfo::getAddressComputationCost(Ty, IsComplex);
}

unsigned X86TTI::getReductionCost(unsigned Opcode, Type *ValTy,
                                  bool IsPairwise) const {

  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy);

  MVT MTy = LT.second;

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // We use the Intel Architecture Code Analyzer (IACA) to measure the
  // throughput and use that measurement as the cost.

  static const CostTblEntry<MVT::SimpleValueType> SSE42CostTblPairWise[] = {
    { ISD::FADD,  MVT::v2f64,  2 },
    { ISD::FADD,  MVT::v4f32,  4 },
    { ISD::ADD,   MVT::v2i64,  2 }, // The data reported by the IACA tool is "1.6".
    { ISD::ADD,   MVT::v4i32,  3 }, // The data reported by the IACA tool is "3.5".
    { ISD::ADD,   MVT::v8i16,  5 },
  };

  static const CostTblEntry<MVT::SimpleValueType> AVX1CostTblPairWise[] = {
    { ISD::FADD,  MVT::v4f32,  4 },
    { ISD::FADD,  MVT::v4f64,  5 },
    { ISD::FADD,  MVT::v8f32,  7 },
    { ISD::ADD,   MVT::v2i64,  1 }, // The data reported by the IACA tool is "1.5".
    { ISD::ADD,   MVT::v4i32,  3 }, // The data reported by the IACA tool is "3.5".
    { ISD::ADD,   MVT::v4i64,  5 }, // The data reported by the IACA tool is "4.8".
    { ISD::ADD,   MVT::v8i16,  5 },
    { ISD::ADD,   MVT::v8i32,  5 },
  };

  static const CostTblEntry<MVT::SimpleValueType> SSE42CostTblNoPairWise[] = {
    { ISD::FADD,  MVT::v2f64,  2 },
    { ISD::FADD,  MVT::v4f32,  4 },
    { ISD::ADD,   MVT::v2i64,  2 }, // The data reported by the IACA tool is "1.6".
    { ISD::ADD,   MVT::v4i32,  3 }, // The data reported by the IACA tool is "3.3".
    { ISD::ADD,   MVT::v8i16,  4 }, // The data reported by the IACA tool is "4.3".
  };

  static const CostTblEntry<MVT::SimpleValueType> AVX1CostTblNoPairWise[] = {
    { ISD::FADD,  MVT::v4f32,  3 },
    { ISD::FADD,  MVT::v4f64,  3 },
    { ISD::FADD,  MVT::v8f32,  4 },
    { ISD::ADD,   MVT::v2i64,  1 }, // The data reported by the IACA tool is "1.5".
    { ISD::ADD,   MVT::v4i32,  3 }, // The data reported by the IACA tool is "2.8".
    { ISD::ADD,   MVT::v4i64,  3 },
    { ISD::ADD,   MVT::v8i16,  4 },
    { ISD::ADD,   MVT::v8i32,  5 },
  };

  if (IsPairwise) {
    if (ST->hasAVX()) {
      int Idx = CostTableLookup(AVX1CostTblPairWise, ISD, MTy);
      if (Idx != -1)
        return LT.first * AVX1CostTblPairWise[Idx].Cost;
    }

    if (ST->hasSSE42()) {
      int Idx = CostTableLookup(SSE42CostTblPairWise, ISD, MTy);
      if (Idx != -1)
        return LT.first * SSE42CostTblPairWise[Idx].Cost;
    }
  } else {
    if (ST->hasAVX()) {
      int Idx = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy);
      if (Idx != -1)
        return LT.first * AVX1CostTblNoPairWise[Idx].Cost;
    }

    if (ST->hasSSE42()) {
      int Idx = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy);
      if (Idx != -1)
        return LT.first * SSE42CostTblNoPairWise[Idx].Cost;
    }
  }

  return TargetTransformInfo::getReductionCost(Opcode, ValTy, IsPairwise);
}

unsigned X86TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  if (Imm == 0)
    return TCC_Free;

  if (Imm.getBitWidth() <= 64 &&
      (isInt<32>(Imm.getSExtValue()) || isUInt<32>(Imm.getZExtValue())))
    return TCC_Basic;
  else
    return 2 * TCC_Basic;
}

unsigned X86TTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
                               Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  unsigned ImmIdx = ~0U;
  switch (Opcode) {
  default: return TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr. This prevents the
    // creation of new constants for every base constant that gets constant
    // folded with the offset.
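    // Returning a non-free cost for the base operand makes the constant
    // hoisting pass keep the base address in a register rather than fold it
    // into each use.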
    if (Idx == 0)
      return 2 * TCC_Basic;
    return TCC_Free;
  case Instruction::Store:
    ImmIdx = 0;
    break;
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::ICmp:
    ImmIdx = 1;
    break;
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    break;
  }

  if ((Idx == ImmIdx) &&
      Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
    return TCC_Free;

  return X86TTI::getIntImmCost(Imm, Ty);
}

unsigned X86TTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
                               const APInt &Imm, Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  switch (IID) {
  default: return TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
      return TCC_Free;
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TCC_Free;
    break;
  }
  return X86TTI::getIntImmCost(Imm, Ty);
}