//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//

#include "X86TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/CostTable.h"
#include "llvm/Target/TargetLowering.h"
using namespace llvm;

#define DEBUG_TYPE "x86tti"

//===----------------------------------------------------------------------===//
//
// X86 cost model.
//
//===----------------------------------------------------------------------===//

/// Report how well the subtarget supports a population-count of a scalar of
/// \p TyWidth bits: fast hardware if the POPCNT instruction is available,
/// otherwise a software expansion.
TargetTransformInfo::PopcntSupportKind
X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  // TODO: Currently the __builtin_popcount() implementation using SSE3
  //   instructions is inefficient. Once the problem is fixed, we should
  //   call ST->hasSSE3() instead of ST->hasPOPCNT().
  return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
}

/// Number of architectural registers usable by the vectorizer.
/// Vector registers require at least SSE1 (0 otherwise); 64-bit mode has 16
/// GPRs/XMM-YMM registers, or 32 vector registers with AVX-512; 32-bit mode
/// has 8.
unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) {
  if (Vector && !ST->hasSSE1())
    return 0;

  if (ST->is64Bit()) {
    if (Vector && ST->hasAVX512())
      return 32;
    return 16;
  }
  return 8;
}

/// Widest register in bits: 512 (AVX-512) / 256 (AVX) / 128 (SSE1) for
/// vectors, else the GPR width (64 or 32).
unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) {
  if (Vector) {
    if (ST->hasAVX512()) return 512;
    if (ST->hasAVX()) return 256;
    if (ST->hasSSE1()) return 128;
    return 0;
  }

  if (ST->is64Bit())
    return 64;
  return 32;

}

/// Maximum interleave (unroll-and-jam) factor the loop vectorizer should use
/// for vectorization factor \p VF.
unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // If the loop will not be vectorized, don't interleave the loop.
  // Let regular unroll to unroll the loop, which saves the overflow
  // check and memory check cost.
  if (VF == 1)
    return 1;

  if (ST->isAtom())
    return 1;

  // Sandybridge and Haswell have multiple execution ports and pipelined
  // vector units.
  if (ST->hasAVX())
    return 4;

  return 2;
}

/// Cost of a (possibly vector) arithmetic instruction after type
/// legalization. The per-ISA static tables below hold empirically chosen
/// instruction counts for legalized MVTs; the first matching table for the
/// best available feature level wins, scaled by the type-split factor
/// (LT.first). Falls back to BasicTTI when nothing matches.
unsigned X86TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
    TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo) {
  // Legalize the type.
  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ISD == ISD::SDIV &&
      Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
    // On X86, vector signed division by constants power-of-two are
    // normally expanded to the sequence SRA + SRL + ADD + SRA.
    // The OperandValue properties may not be the same as that of the previous
    // operation; conservatively assume OP_None.
    unsigned Cost =
        2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info, Op2Info,
                                   TargetTransformInfo::OP_None,
                                   TargetTransformInfo::OP_None);
    Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
                                   TargetTransformInfo::OP_None,
                                   TargetTransformInfo::OP_None);
    Cost += getArithmeticInstrCost(Instruction::Add, Ty, Op1Info, Op2Info,
                                   TargetTransformInfo::OP_None,
                                   TargetTransformInfo::OP_None);

    return Cost;
  }

  static const CostTblEntry<MVT::SimpleValueType>
  AVX2UniformConstCostTable[] = {
    { ISD::SDIV, MVT::v16i16,  6 }, // vpmulhw sequence
    { ISD::UDIV, MVT::v16i16,  6 }, // vpmulhuw sequence
    { ISD::SDIV, MVT::v8i32,  15 }, // vpmuldq sequence
    { ISD::UDIV, MVT::v8i32,  15 }, // vpmuludq sequence
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasAVX2()) {
    int Idx = CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second);
    if (Idx != -1)
      return LT.first * AVX2UniformConstCostTable[Idx].Cost;
  }

  static const CostTblEntry<MVT::SimpleValueType> AVX512CostTable[] = {
    { ISD::SHL,     MVT::v16i32,    1 },
    { ISD::SRL,     MVT::v16i32,    1 },
    { ISD::SRA,     MVT::v16i32,    1 },
    { ISD::SHL,     MVT::v8i64,    1 },
    { ISD::SRL,     MVT::v8i64,    1 },
    { ISD::SRA,     MVT::v8i64,    1 },
  };

  static const CostTblEntry<MVT::SimpleValueType> AVX2CostTable[] = {
    // Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to
    // customize them to detect the cases where shift amount is a scalar one.
    { ISD::SHL,     MVT::v4i32,    1 },
    { ISD::SRL,     MVT::v4i32,    1 },
    { ISD::SRA,     MVT::v4i32,    1 },
    { ISD::SHL,     MVT::v8i32,    1 },
    { ISD::SRL,     MVT::v8i32,    1 },
    { ISD::SRA,     MVT::v8i32,    1 },
    { ISD::SHL,     MVT::v2i64,    1 },
    { ISD::SRL,     MVT::v2i64,    1 },
    { ISD::SHL,     MVT::v4i64,    1 },
    { ISD::SRL,     MVT::v4i64,    1 },

    { ISD::SHL,  MVT::v32i8,      11 }, // vpblendvb sequence.
    { ISD::SHL,  MVT::v16i16,     10 }, // extend/vpsrlvd/pack sequence.

    { ISD::SRL,  MVT::v32i8,      11 }, // vpblendvb sequence.
    { ISD::SRL,  MVT::v16i16,     10 }, // extend/vpsrlvd/pack sequence.

    { ISD::SRA,  MVT::v32i8,      24 }, // vpblendvb sequence.
    { ISD::SRA,  MVT::v16i16,     10 }, // extend/vpsravd/pack sequence.
    { ISD::SRA,  MVT::v4i64,    4*10 }, // Scalarized.

    // Vectorizing division is a bad idea. See the SSE2 table for more comments.
    { ISD::SDIV,  MVT::v32i8,  32*20 },
    { ISD::SDIV,  MVT::v16i16, 16*20 },
    { ISD::SDIV,  MVT::v8i32,  8*20 },
    { ISD::SDIV,  MVT::v4i64,  4*20 },
    { ISD::UDIV,  MVT::v32i8,  32*20 },
    { ISD::UDIV,  MVT::v16i16, 16*20 },
    { ISD::UDIV,  MVT::v8i32,  8*20 },
    { ISD::UDIV,  MVT::v4i64,  4*20 },
  };

  if (ST->hasAVX512()) {
    int Idx = CostTableLookup(AVX512CostTable, ISD, LT.second);
    if (Idx != -1)
      return LT.first * AVX512CostTable[Idx].Cost;
  }
  // Look for AVX2 lowering tricks.
  if (ST->hasAVX2()) {
    if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
      // On AVX2, a packed v16i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return LT.first;

    int Idx = CostTableLookup(AVX2CostTable, ISD, LT.second);
    if (Idx != -1)
      return LT.first * AVX2CostTable[Idx].Cost;
  }

  static const CostTblEntry<MVT::SimpleValueType>
  SSE2UniformConstCostTable[] = {
    // We don't correctly identify costs of casts because they are marked as
    // custom.
    // Constant splats are cheaper for the following instructions.
    { ISD::SHL,  MVT::v16i8,  1 }, // psllw.
    { ISD::SHL,  MVT::v8i16,  1 }, // psllw.
    { ISD::SHL,  MVT::v4i32,  1 }, // pslld
    { ISD::SHL,  MVT::v2i64,  1 }, // psllq.

    { ISD::SRL,  MVT::v16i8,  1 }, // psrlw.
    { ISD::SRL,  MVT::v8i16,  1 }, // psrlw.
    { ISD::SRL,  MVT::v4i32,  1 }, // psrld.
    { ISD::SRL,  MVT::v2i64,  1 }, // psrlq.

    { ISD::SRA,  MVT::v16i8,  4 }, // psrlw, pand, pxor, psubb.
    { ISD::SRA,  MVT::v8i16,  1 }, // psraw.
    { ISD::SRA,  MVT::v4i32,  1 }, // psrad.

    { ISD::SDIV, MVT::v8i16,  6 }, // pmulhw sequence
    { ISD::UDIV, MVT::v8i16,  6 }, // pmulhuw sequence
    { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
    { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasSSE2()) {
    // pmuldq sequence.
    if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
      return LT.first * 15;

    int Idx = CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second);
    if (Idx != -1)
      return LT.first * SSE2UniformConstCostTable[Idx].Cost;
  }

  if (ISD == ISD::SHL &&
      Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
    EVT VT = LT.second;
    if ((VT == MVT::v8i16 && ST->hasSSE2()) ||
        (VT == MVT::v4i32 && ST->hasSSE41()))
      // Vector shift left by non uniform constant can be lowered
      // into vector multiply (pmullw/pmulld).
      return LT.first;
    if (VT == MVT::v4i32 && ST->hasSSE2())
      // A vector shift left by non uniform constant is converted
      // into a vector multiply; the new multiply is eventually
      // lowered into a sequence of shuffles and 2 x pmuludq.
      ISD = ISD::MUL;
  }

  static const CostTblEntry<MVT::SimpleValueType> SSE2CostTable[] = {
    // We don't correctly identify costs of casts because they are marked as
    // custom.
    // For some cases, where the shift amount is a scalar we would be able
    // to generate better code. Unfortunately, when this is the case the value
    // (the splat) will get hoisted out of the loop, thereby making it invisible
    // to ISel. The cost model must return worst case assumptions because it is
    // used for vectorization and we don't want to make vectorized code worse
    // than scalar code.
    { ISD::SHL,  MVT::v16i8,    26 }, // cmpgtb sequence.
    { ISD::SHL,  MVT::v8i16,    32 }, // cmpgtb sequence.
    { ISD::SHL,  MVT::v4i32,   2*5 }, // We optimized this using mul.
    { ISD::SHL,  MVT::v2i64,  2*10 }, // Scalarized.
    { ISD::SHL,  MVT::v4i64,  4*10 }, // Scalarized.

    { ISD::SRL,  MVT::v16i8,    26 }, // cmpgtb sequence.
    { ISD::SRL,  MVT::v8i16,    32 }, // cmpgtb sequence.
    { ISD::SRL,  MVT::v4i32,  4*10 }, // Scalarized.
    { ISD::SRL,  MVT::v2i64,  2*10 }, // Scalarized.

    { ISD::SRA,  MVT::v16i8,    54 }, // unpacked cmpgtb sequence.
    { ISD::SRA,  MVT::v8i16,    32 }, // cmpgtb sequence.
    { ISD::SRA,  MVT::v4i32,  4*10 }, // Scalarized.
    { ISD::SRA,  MVT::v2i64,  2*10 }, // Scalarized.

    // It is not a good idea to vectorize division. We have to scalarize it and
    // in the process we will often end up having to spilling regular
    // registers. The overhead of division is going to dominate most kernels
    // anyways so try hard to prevent vectorization of division - it is
    // generally a bad idea. Assume somewhat arbitrarily that we have to be able
    // to hide "20 cycles" for each lane.
    { ISD::SDIV,  MVT::v16i8,  16*20 },
    { ISD::SDIV,  MVT::v8i16,  8*20 },
    { ISD::SDIV,  MVT::v4i32,  4*20 },
    { ISD::SDIV,  MVT::v2i64,  2*20 },
    { ISD::UDIV,  MVT::v16i8,  16*20 },
    { ISD::UDIV,  MVT::v8i16,  8*20 },
    { ISD::UDIV,  MVT::v4i32,  4*20 },
    { ISD::UDIV,  MVT::v2i64,  2*20 },
  };

  if (ST->hasSSE2()) {
    int Idx = CostTableLookup(SSE2CostTable, ISD, LT.second);
    if (Idx != -1)
      return LT.first * SSE2CostTable[Idx].Cost;
  }

  static const CostTblEntry<MVT::SimpleValueType> AVX1CostTable[] = {
    // We don't have to scalarize unsupported ops. We can issue two half-sized
    // operations and we only need to extract the upper YMM half.
    // Two ops + 1 extract + 1 insert = 4.
    { ISD::MUL,     MVT::v16i16,   4 },
    { ISD::MUL,     MVT::v8i32,    4 },
    { ISD::SUB,     MVT::v8i32,    4 },
    { ISD::ADD,     MVT::v8i32,    4 },
    { ISD::SUB,     MVT::v4i64,    4 },
    { ISD::ADD,     MVT::v4i64,    4 },
    // A v4i64 multiply is custom lowered as two split v2i64 vectors that then
    // are lowered as a series of long multiplies(3), shifts(4) and adds(2)
    // Because we believe v4i64 to be a legal type, we must also include the
    // split factor of two in the cost table. Therefore, the cost here is 18
    // instead of 9.
    { ISD::MUL,     MVT::v4i64,    18 },
  };

  // Look for AVX1 lowering tricks.
  if (ST->hasAVX() && !ST->hasAVX2()) {
    EVT VT = LT.second;

    // v16i16 and v8i32 shifts by non-uniform constants are lowered into a
    // sequence of extract + two vector multiply + insert.
    if (ISD == ISD::SHL && (VT == MVT::v8i32 || VT == MVT::v16i16) &&
        Op2Info == TargetTransformInfo::OK_NonUniformConstantValue)
      ISD = ISD::MUL;

    int Idx = CostTableLookup(AVX1CostTable, ISD, VT);
    if (Idx != -1)
      return LT.first * AVX1CostTable[Idx].Cost;
  }

  // Custom lowering of vectors.
  static const CostTblEntry<MVT::SimpleValueType> CustomLowered[] = {
    // A v2i64/v4i64 and multiply is custom lowered as a series of long
    // multiplies(3), shifts(4) and adds(2).
    { ISD::MUL,     MVT::v2i64,    9 },
    { ISD::MUL,     MVT::v4i64,    9 },
  };
  int Idx = CostTableLookup(CustomLowered, ISD, LT.second);
  if (Idx != -1)
    return LT.first * CustomLowered[Idx].Cost;

  // Special lowering of v4i32 mul on sse2, sse3: Lower v4i32 mul as 2x shuffle,
  // 2x pmuludq, 2x shuffle.
  if (ISD == ISD::MUL && LT.second == MVT::v4i32 && ST->hasSSE2() &&
      !ST->hasSSE41())
    return LT.first * 6;

  // Fallback to the default implementation.
  return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info);
}

/// Cost of a vector shuffle. Only reverse and alternate (even/odd element
/// interleave) shuffles are modeled here; everything else defers to BasicTTI.
unsigned X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                                    Type *SubTp) {
  // We only estimate the cost of reverse and alternate shuffles.
  if (Kind != TTI::SK_Reverse && Kind != TTI::SK_Alternate)
    return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);

  if (Kind == TTI::SK_Reverse) {
    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
    unsigned Cost = 1;
    if (LT.second.getSizeInBits() > 128)
      Cost = 3; // Extract + insert + copy.

    // Multiply by the number of parts.
    return Cost * LT.first;
  }

  if (Kind == TTI::SK_Alternate) {
    // 64-bit packed float vectors (v2f32) are widened to type v4f32.
    // 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);

    // The backend knows how to generate a single VEX.256 version of
    // instruction VPBLENDW if the target supports AVX2.
    if (ST->hasAVX2() && LT.second == MVT::v16i16)
      return LT.first;

    static const CostTblEntry<MVT::SimpleValueType> AVXAltShuffleTbl[] = {
      {ISD::VECTOR_SHUFFLE, MVT::v4i64, 1},  // vblendpd
      {ISD::VECTOR_SHUFFLE, MVT::v4f64, 1},  // vblendpd

      {ISD::VECTOR_SHUFFLE, MVT::v8i32, 1},  // vblendps
      {ISD::VECTOR_SHUFFLE, MVT::v8f32, 1},  // vblendps

      // This shuffle is custom lowered into a sequence of:
      //  2x  vextractf128 , 2x vpblendw , 1x vinsertf128
      {ISD::VECTOR_SHUFFLE, MVT::v16i16, 5},

      // This shuffle is custom lowered into a long sequence of:
      //  2x vextractf128 , 4x vpshufb , 2x vpor ,  1x vinsertf128
      {ISD::VECTOR_SHUFFLE, MVT::v32i8, 9}
    };

    if (ST->hasAVX()) {
      int Idx = CostTableLookup(AVXAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
      if (Idx != -1)
        return LT.first * AVXAltShuffleTbl[Idx].Cost;
    }

    static const CostTblEntry<MVT::SimpleValueType> SSE41AltShuffleTbl[] = {
      // These are lowered into movsd.
      {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
      {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},

      // packed float vectors with four elements are lowered into BLENDI dag
      // nodes. A v4i32/v4f32 BLENDI generates a single 'blendps'/'blendpd'.
      {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
      {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},

      // This shuffle generates a single pshufw.
      {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},

      // There is no instruction that matches a v16i8 alternate shuffle.
      // The backend will expand it into the sequence 'pshufb + pshufb + or'.
      {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3}
    };

    if (ST->hasSSE41()) {
      int Idx = CostTableLookup(SSE41AltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
      if (Idx != -1)
        return LT.first * SSE41AltShuffleTbl[Idx].Cost;
    }

    static const CostTblEntry<MVT::SimpleValueType> SSSE3AltShuffleTbl[] = {
      {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},  // movsd
      {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},  // movsd

      // SSE3 doesn't have 'blendps'. The following shuffles are expanded into
      // the sequence 'shufps + pshufd'
      {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
      {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},

      {ISD::VECTOR_SHUFFLE, MVT::v8i16, 3},  // pshufb + pshufb + or
      {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3}   // pshufb + pshufb + or
    };

    if (ST->hasSSSE3()) {
      int Idx = CostTableLookup(SSSE3AltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
      if (Idx != -1)
        return LT.first * SSSE3AltShuffleTbl[Idx].Cost;
    }

    static const CostTblEntry<MVT::SimpleValueType> SSEAltShuffleTbl[] = {
      {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},  // movsd
      {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},  // movsd

      {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},  // shufps + pshufd
      {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},  // shufps + pshufd

      // This is expanded into a long sequence of four extract + four insert.
      {ISD::VECTOR_SHUFFLE, MVT::v8i16, 8},  // 4 x pextrw + 4 pinsrw.

      // 8 x (pinsrw + pextrw + and + movb + movzb + or)
      {ISD::VECTOR_SHUFFLE, MVT::v16i8, 48}
    };

    // Fall-back (SSE3 and SSE2).
    int Idx = CostTableLookup(SSEAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
    if (Idx != -1)
      return LT.first * SSEAltShuffleTbl[Idx].Cost;
    return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
  }

  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}

/// Cost of a cast/conversion instruction (\p Opcode) from \p Src to \p Dst.
/// Per-ISA conversion tables are consulted from the newest feature level
/// down; unmatched conversions defer to BasicTTI.
unsigned X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  std::pair<unsigned, MVT> LTSrc = TLI->getTypeLegalizationCost(Src);
  std::pair<unsigned, MVT> LTDest = TLI->getTypeLegalizationCost(Dst);

  static const TypeConversionCostTblEntry<MVT::SimpleValueType>
  SSE2ConvTbl[] = {
    // These are somewhat magic numbers justified by looking at the output of
    // Intel's IACA, running some kernels and making sure when we take
    // legalization into account the throughput will be overestimated.
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
    // There are faster sequences for float conversions.
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 8 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 15 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
  };

  if (ST->hasSSE2() && !ST->hasAVX()) {
    int Idx =
        ConvertCostTableLookup(SSE2ConvTbl, ISD, LTDest.second, LTSrc.second);
    if (Idx != -1)
      return LTSrc.first * SSE2ConvTbl[Idx].Cost;
  }

  static const TypeConversionCostTblEntry<MVT::SimpleValueType>
  AVX512ConversionTbl[] = {
    { ISD::FP_EXTEND, MVT::v8f64,   MVT::v8f32,  1 },
    { ISD::FP_EXTEND, MVT::v8f64,   MVT::v16f32, 3 },
    { ISD::FP_ROUND,  MVT::v8f32,   MVT::v8f64,  1 },
    { ISD::FP_ROUND,  MVT::v16f32,  MVT::v8f64,  3 },

    { ISD::TRUNCATE,  MVT::v16i8,   MVT::v16i32, 1 },
    { ISD::TRUNCATE,  MVT::v16i16,  MVT::v16i32, 1 },
    { ISD::TRUNCATE,  MVT::v8i16,   MVT::v8i64,  1 },
    { ISD::TRUNCATE,  MVT::v8i32,   MVT::v8i64,  1 },
    { ISD::TRUNCATE,  MVT::v16i32,  MVT::v8i64,  4 },

    // v16i1 -> v16i32 - load + broadcast
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 },

    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
    { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v16i32, 3 },
    { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v16i32, 3 },

    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i1,  3 },
    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i8,  2 },
    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i16, 2 },
    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i32, 1 },
    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i1,   4 },
    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i16,  2 },
    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  1 },
  };

  if (ST->hasAVX512()) {
    int Idx = ConvertCostTableLookup(AVX512ConversionTbl, ISD, LTDest.second,
                                     LTSrc.second);
    if (Idx != -1)
      return AVX512ConversionTbl[Idx].Cost;
  }
  EVT SrcTy = TLI->getValueType(Src);
  EVT DstTy = TLI->getValueType(Dst);

  // The function getSimpleVT only handles simple value types.
  if (!SrcTy.isSimple() || !DstTy.isSimple())
    return BaseT::getCastInstrCost(Opcode, Dst, Src);

  static const TypeConversionCostTblEntry<MVT::SimpleValueType>
  AVX2ConversionTbl[] = {
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8,  1 },
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8,  1 },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   3 },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   3 },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i8,   3 },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i8,   3 },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  1 },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  1 },
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   3 },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   3 },
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i8,   3 },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i8,   3 },
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i16,  3 },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i16,  3 },
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32,  1 },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32,  1 },

    { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i64,  2 },
    { ISD::TRUNCATE,    MVT::v4i16,  MVT::v4i64,  2 },
    { ISD::TRUNCATE,    MVT::v4i32,  MVT::v4i64,  2 },
    { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i32,  2 },
    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  2 },
    { ISD::TRUNCATE,    MVT::v8i32,  MVT::v8i64,  4 },

    { ISD::FP_EXTEND,   MVT::v8f64,  MVT::v8f32,  3 },
    { ISD::FP_ROUND,    MVT::v8f32,  MVT::v8f64,  3 },

    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  8 },
  };

  static const TypeConversionCostTblEntry<MVT::SimpleValueType>
  AVXConversionTbl[] = {
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8,  4 },
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8,  4 },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   7 },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   4 },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i8,   7 },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i8,   4 },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  4 },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  4 },
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   6 },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   4 },
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i8,   6 },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i8,   4 },
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i16,  6 },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i16,  3 },
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32,  4 },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32,  4 },

    { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i64,  4 },
    { ISD::TRUNCATE,    MVT::v4i16,  MVT::v4i64,  4 },
    { ISD::TRUNCATE,    MVT::v4i32,  MVT::v4i64,  4 },
    { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i32,  4 },
    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  5 },
    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i16, 4 },
    { ISD::TRUNCATE,    MVT::v8i32,  MVT::v8i64,  9 },

    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i1,   8 },
    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i8,   8 },
    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  5 },
    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  1 },
    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i1,   3 },
    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i8,   3 },
    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i16,  3 },
    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  1 },
    { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i1,   3 },
    { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i8,   3 },
    { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i16,  3 },
    { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  1 },

    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i1,   6 },
    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i8,   5 },
    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  5 },
    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  9 },
    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i1,   7 },
    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i8,   2 },
    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i16,  2 },
    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  6 },
    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i1,   7 },
    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i8,   2 },
    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i16,  2 },
    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  6 },
    // The generic code to compute the scalar overhead is currently broken.
    // Workaround this limitation by estimating the scalarization overhead
    // here. We have roughly 10 instructions per scalar element.
    // Multiply that by the vector width.
    // FIXME: remove that when PR19268 is fixed.
    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  2*10 },
    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i64,  4*10 },

    { ISD::FP_TO_SINT,  MVT::v8i8,   MVT::v8f32,  7 },
    { ISD::FP_TO_SINT,  MVT::v4i8,   MVT::v4f32,  1 },
    // This node is expanded into scalarized operations but BasicTTI is overly
    // optimistic estimating its cost.  It computes  3 per element (one
    // vector-extract, one scalar conversion and one vector-insert).  The
    // problem is that the inserts form a read-modify-write chain so latency
    // should be factored in too.  Inflating the cost per element by 1.
    { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f32,  8*4 },
    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f64,  4*4 },
  };

  if (ST->hasAVX2()) {
    int Idx = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT());
    if (Idx != -1)
      return AVX2ConversionTbl[Idx].Cost;
  }

  if (ST->hasAVX()) {
    int Idx = ConvertCostTableLookup(AVXConversionTbl, ISD, DstTy.getSimpleVT(),
                                     SrcTy.getSimpleVT());
    if (Idx != -1)
      return AVXConversionTbl[Idx].Cost;
  }

  return BaseT::getCastInstrCost(Opcode, Dst, Src);
}

/// Cost of a vector compare (SETCC) or select after legalization. The tables
/// record how many instructions the subtarget needs for a packed compare of
/// the legalized type; AVX1 lacks 256-bit integer compares, hence cost 4.
unsigned X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                        Type *CondTy) {
  // Legalize the type.
  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy);

  MVT MTy = LT.second;

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  static const CostTblEntry<MVT::SimpleValueType> SSE42CostTbl[] = {
    { ISD::SETCC,   MVT::v2f64,   1 },
    { ISD::SETCC,   MVT::v4f32,   1 },
    { ISD::SETCC,   MVT::v2i64,   1 },
    { ISD::SETCC,   MVT::v4i32,   1 },
    { ISD::SETCC,   MVT::v8i16,   1 },
    { ISD::SETCC,   MVT::v16i8,   1 },
  };

  static const CostTblEntry<MVT::SimpleValueType> AVX1CostTbl[] = {
    { ISD::SETCC,   MVT::v4f64,   1 },
    { ISD::SETCC,   MVT::v8f32,   1 },
    // AVX1 does not support 8-wide integer compare.
    { ISD::SETCC,   MVT::v4i64,   4 },
    { ISD::SETCC,   MVT::v8i32,   4 },
    { ISD::SETCC,   MVT::v16i16,  4 },
    { ISD::SETCC,   MVT::v32i8,   4 },
  };

  static const CostTblEntry<MVT::SimpleValueType> AVX2CostTbl[] = {
    { ISD::SETCC,   MVT::v4i64,   1 },
    { ISD::SETCC,   MVT::v8i32,   1 },
    { ISD::SETCC,   MVT::v16i16,  1 },
    { ISD::SETCC,   MVT::v32i8,   1 },
  };

  static const CostTblEntry<MVT::SimpleValueType> AVX512CostTbl[] = {
    { ISD::SETCC,   MVT::v8i64,   1 },
    { ISD::SETCC,   MVT::v16i32,  1 },
    { ISD::SETCC,   MVT::v8f64,   1 },
    { ISD::SETCC,   MVT::v16f32,  1 },
  };

  if (ST->hasAVX512()) {
    int Idx = CostTableLookup(AVX512CostTbl, ISD, MTy);
    if (Idx != -1)
      return LT.first * AVX512CostTbl[Idx].Cost;
  }

  if (ST->hasAVX2()) {
    int Idx = CostTableLookup(AVX2CostTbl, ISD, MTy);
    if (Idx != -1)
      return LT.first * AVX2CostTbl[Idx].Cost;
  }

  if (ST->hasAVX()) {
    int Idx = CostTableLookup(AVX1CostTbl, ISD, MTy);
    if (Idx != -1)
      return LT.first * AVX1CostTbl[Idx].Cost;
  }

  if (ST->hasSSE42()) {
    int Idx = CostTableLookup(SSE42CostTbl, ISD, MTy);
    if (Idx != -1)
      return LT.first * SSE42CostTbl[Idx].Cost;
  }

  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
}

/// Cost of extracting/inserting a single element of vector type \p Val at
/// \p Index (-1U means "unknown index"; such requests go straight to BasicTTI).
unsigned X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                        unsigned Index) {
  assert(Val->isVectorTy() && "This must be a vector type");

  if (Index != -1U) {
    // Legalize the type.
    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Val);

    // This type is legalized to a scalar type.
    if (!LT.second.isVector())
      return 0;

    // The type may be split. Normalize the index to the new type.
    unsigned Width = LT.second.getVectorNumElements();
    Index = Index % Width;

    // Floating point scalars are already located in index #0.
    if (Val->getScalarType()->isFloatingPointTy() && Index == 0)
      return 0;
  }

  return BaseT::getVectorInstrCost(Opcode, Val, Index);
}

/// Cost of fully scalarizing a vector: the sum of per-element insert and/or
/// extract costs for every lane of \p Ty.
unsigned X86TTIImpl::getScalarizationOverhead(Type *Ty, bool Insert,
                                              bool Extract) {
  assert (Ty->isVectorTy() && "Can only scalarize vectors");
  unsigned Cost = 0;

  for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
    if (Insert)
      Cost += getVectorInstrCost(Instruction::InsertElement, Ty, i);
    if (Extract)
      Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, i);
  }

  return Cost;
}

/// Cost of an (unmasked) load or store of type \p Src. Non-power-of-two
/// vectors are special-cased (<3 x float>/<3 x double> get fixed small
/// sequences, others are fully scalarized); everything else costs one unit
/// per legalized part, doubled for 256-bit accesses on pre-AVX2 parts.
unsigned X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                     unsigned Alignment,
                                     unsigned AddressSpace) {
  // Handle non-power-of-two vectors such as <3 x float>
  if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
    unsigned NumElem = VTy->getVectorNumElements();

    // Handle a few common cases:
    // <3 x float>
    if (NumElem == 3 && VTy->getScalarSizeInBits() == 32)
      // Cost = 64 bit store + extract + 32 bit store.
      return 3;

    // <3 x double>
    if (NumElem == 3 && VTy->getScalarSizeInBits() == 64)
      // Cost = 128 bit store + unpack + 64 bit store.
      return 3;

    // Assume that all other non-power-of-two numbers are scalarized.
    if (!isPowerOf2_32(NumElem)) {
      unsigned Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(),
                                             Alignment, AddressSpace);
      unsigned SplitCost = getScalarizationOverhead(Src,
                                                    Opcode == Instruction::Load,
                                                    Opcode==Instruction::Store);
      return NumElem * Cost + SplitCost;
    }
  }

  // Legalize the type.
  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
         "Invalid Opcode");

  // Each load/store unit costs 1.
  unsigned Cost = LT.first * 1;

  // On Sandybridge 256bit load/stores are double pumped
  // (but not on Haswell).
  if (LT.second.getSizeInBits() > 128 && !ST->hasAVX2())
    Cost*=2;

  return Cost;
}

/// Cost of a masked load/store of \p SrcTy. If the target cannot do the
/// masked operation natively (or the element count is not a power of two),
/// the cost of a fully scalarized compare-branch-and-access expansion is
/// returned; otherwise a per-part maskmov cost (4 pre-AVX-512, 1 with
/// AVX-512), plus any shuffle cost needed to promote/expand data and mask to
/// the legalized type.
unsigned X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
                                           unsigned Alignment,
                                           unsigned AddressSpace) {
  VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy);
  if (!SrcVTy)
    // To calculate scalar take the regular cost, without mask
    return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace);

  unsigned NumElem = SrcVTy->getVectorNumElements();
  VectorType *MaskTy =
    VectorType::get(Type::getInt8Ty(getGlobalContext()), NumElem);
  if ((Opcode == Instruction::Load && !isLegalMaskedLoad(SrcVTy, 1)) ||
      (Opcode == Instruction::Store && !isLegalMaskedStore(SrcVTy, 1)) ||
      !isPowerOf2_32(NumElem)) {
    // Scalarization
    unsigned MaskSplitCost = getScalarizationOverhead(MaskTy, false, true);
    unsigned ScalarCompareCost =
      getCmpSelInstrCost(Instruction::ICmp,
                         Type::getInt8Ty(getGlobalContext()), NULL);
    unsigned BranchCost = getCFInstrCost(Instruction::Br);
    unsigned MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);

    unsigned ValueSplitCost =
      getScalarizationOverhead(SrcVTy, Opcode == Instruction::Load,
                               Opcode == Instruction::Store);
    unsigned MemopCost =
        NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
                                         Alignment, AddressSpace);
    return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
  }

  // Legalize the type.
  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(SrcVTy);
  unsigned Cost = 0;
  if (LT.second != TLI->getValueType(SrcVTy).getSimpleVT() &&
      LT.second.getVectorNumElements() == NumElem)
    // Promotion requires expand/truncate for data and a shuffle for mask.
    Cost += getShuffleCost(TTI::SK_Alternate, SrcVTy, 0, 0) +
            getShuffleCost(TTI::SK_Alternate, MaskTy, 0, 0);

  else if (LT.second.getVectorNumElements() > NumElem) {
    VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(),
                                            LT.second.getVectorNumElements());
    // Expanding requires fill mask with zeroes
    Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy);
  }
  if (!ST->hasAVX512())
    return Cost + LT.first*4; // Each maskmov costs 4

  // AVX-512 masked load/store is cheaper
  return Cost+LT.first;
}

/// Cost of address computation. Complex vector addressing is penalized with a
/// flat cost of 10 to discourage vectorization of gather-like access
/// patterns; everything else defers to BasicTTI.
unsigned X86TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
  // Address computations in vectorized code with non-consecutive addresses will
  // likely result in more instructions compared to scalar code where the
  // computation can more often be merged into the index mode. The resulting
  // extra micro-ops can significantly decrease throughput.
  unsigned NumVectorInstToHideOverhead = 10;

  if (Ty->isVectorTy() && IsComplex)
    return NumVectorInstToHideOverhead;

  return BaseT::getAddressComputationCost(Ty, IsComplex);
}

/// Cost of a horizontal (pairwise or split) vector reduction of \p Opcode
/// over \p ValTy. Costs were measured with Intel's IACA throughput analyzer.
/// NOTE(review): this function is truncated at the end of the visible chunk
/// (its remaining cost tables and lookup logic lie past this excerpt).
unsigned X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy,
                                      bool IsPairwise) {

  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy);

  MVT MTy = LT.second;

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
  // and make it as the cost.

  static const CostTblEntry<MVT::SimpleValueType> SSE42CostTblPairWise[] = {
    { ISD::FADD,  MVT::v2f64,   2 },
    { ISD::FADD,  MVT::v4f32,   4 },
    { ISD::ADD,   MVT::v2i64,   2 },      // The data reported by the IACA tool is "1.6".
    { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "3.5".
    { ISD::ADD,   MVT::v8i16,   5 },
  };

  static const CostTblEntry<MVT::SimpleValueType> AVX1CostTblPairWise[] = {
    { ISD::FADD,  MVT::v4f32,   4 },
    { ISD::FADD,  MVT::v4f64,   5 },
    { ISD::FADD,  MVT::v8f32,   7 },
    { ISD::ADD,   MVT::v2i64,   1 },      // The data reported by the IACA tool is "1.5".
    { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "3.5".
    { ISD::ADD,   MVT::v4i64,   5 },      // The data reported by the IACA tool is "4.8".
    { ISD::ADD,   MVT::v8i16,   5 },
    { ISD::ADD,   MVT::v8i32,   5 },
  };

  static const CostTblEntry<MVT::SimpleValueType> SSE42CostTblNoPairWise[] = {
    { ISD::FADD,  MVT::v2f64,   2 },
    { ISD::FADD,  MVT::v4f32,   4 },
    { ISD::ADD,   MVT::v2i64,   2 },      // The data reported by the IACA tool is "1.6".
    { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "3.3".
    { ISD::ADD,   MVT::v8i16,   4 },      // The data reported by the IACA tool is "4.3".
  };

  static const CostTblEntry<MVT::SimpleValueType> AVX1CostTblNoPairWise[] = {
    { ISD::FADD,  MVT::v4f32,   3 },
    { ISD::FADD,  MVT::v4f64,   3 },
    { ISD::FADD,  MVT::v8f32,   4 },
    { ISD::ADD,   MVT::v2i64,   1 },      // The data reported by the IACA tool is "1.5".
    { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "2.8".
933 { ISD::ADD, MVT::v4i64, 3 }, 934 { ISD::ADD, MVT::v8i16, 4 }, 935 { ISD::ADD, MVT::v8i32, 5 }, 936 }; 937 938 if (IsPairwise) { 939 if (ST->hasAVX()) { 940 int Idx = CostTableLookup(AVX1CostTblPairWise, ISD, MTy); 941 if (Idx != -1) 942 return LT.first * AVX1CostTblPairWise[Idx].Cost; 943 } 944 945 if (ST->hasSSE42()) { 946 int Idx = CostTableLookup(SSE42CostTblPairWise, ISD, MTy); 947 if (Idx != -1) 948 return LT.first * SSE42CostTblPairWise[Idx].Cost; 949 } 950 } else { 951 if (ST->hasAVX()) { 952 int Idx = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy); 953 if (Idx != -1) 954 return LT.first * AVX1CostTblNoPairWise[Idx].Cost; 955 } 956 957 if (ST->hasSSE42()) { 958 int Idx = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy); 959 if (Idx != -1) 960 return LT.first * SSE42CostTblNoPairWise[Idx].Cost; 961 } 962 } 963 964 return BaseT::getReductionCost(Opcode, ValTy, IsPairwise); 965 } 966 967 /// \brief Calculate the cost of materializing a 64-bit value. This helper 968 /// method might only calculate a fraction of a larger immediate. Therefore it 969 /// is valid to return a cost of ZERO. 970 unsigned X86TTIImpl::getIntImmCost(int64_t Val) { 971 if (Val == 0) 972 return TTI::TCC_Free; 973 974 if (isInt<32>(Val)) 975 return TTI::TCC_Basic; 976 977 return 2 * TTI::TCC_Basic; 978 } 979 980 unsigned X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { 981 assert(Ty->isIntegerTy()); 982 983 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 984 if (BitSize == 0) 985 return ~0U; 986 987 // Never hoist constants larger than 128bit, because this might lead to 988 // incorrect code generation or assertions in codegen. 989 // Fixme: Create a cost model for types larger than i128 once the codegen 990 // issues have been fixed. 991 if (BitSize > 128) 992 return TTI::TCC_Free; 993 994 if (Imm == 0) 995 return TTI::TCC_Free; 996 997 // Sign-extend all constants to a multiple of 64-bit. 
998 APInt ImmVal = Imm; 999 if (BitSize & 0x3f) 1000 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU); 1001 1002 // Split the constant into 64-bit chunks and calculate the cost for each 1003 // chunk. 1004 unsigned Cost = 0; 1005 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { 1006 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64); 1007 int64_t Val = Tmp.getSExtValue(); 1008 Cost += getIntImmCost(Val); 1009 } 1010 // We need at least one instruction to materialze the constant. 1011 return std::max(1U, Cost); 1012 } 1013 1014 unsigned X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, 1015 const APInt &Imm, Type *Ty) { 1016 assert(Ty->isIntegerTy()); 1017 1018 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 1019 // There is no cost model for constants with a bit size of 0. Return TCC_Free 1020 // here, so that constant hoisting will ignore this constant. 1021 if (BitSize == 0) 1022 return TTI::TCC_Free; 1023 1024 unsigned ImmIdx = ~0U; 1025 switch (Opcode) { 1026 default: 1027 return TTI::TCC_Free; 1028 case Instruction::GetElementPtr: 1029 // Always hoist the base address of a GetElementPtr. This prevents the 1030 // creation of new constants for every base constant that gets constant 1031 // folded with the offset. 1032 if (Idx == 0) 1033 return 2 * TTI::TCC_Basic; 1034 return TTI::TCC_Free; 1035 case Instruction::Store: 1036 ImmIdx = 0; 1037 break; 1038 case Instruction::Add: 1039 case Instruction::Sub: 1040 case Instruction::Mul: 1041 case Instruction::UDiv: 1042 case Instruction::SDiv: 1043 case Instruction::URem: 1044 case Instruction::SRem: 1045 case Instruction::And: 1046 case Instruction::Or: 1047 case Instruction::Xor: 1048 case Instruction::ICmp: 1049 ImmIdx = 1; 1050 break; 1051 // Always return TCC_Free for the shift value of a shift instruction. 
1052 case Instruction::Shl: 1053 case Instruction::LShr: 1054 case Instruction::AShr: 1055 if (Idx == 1) 1056 return TTI::TCC_Free; 1057 break; 1058 case Instruction::Trunc: 1059 case Instruction::ZExt: 1060 case Instruction::SExt: 1061 case Instruction::IntToPtr: 1062 case Instruction::PtrToInt: 1063 case Instruction::BitCast: 1064 case Instruction::PHI: 1065 case Instruction::Call: 1066 case Instruction::Select: 1067 case Instruction::Ret: 1068 case Instruction::Load: 1069 break; 1070 } 1071 1072 if (Idx == ImmIdx) { 1073 unsigned NumConstants = (BitSize + 63) / 64; 1074 unsigned Cost = X86TTIImpl::getIntImmCost(Imm, Ty); 1075 return (Cost <= NumConstants * TTI::TCC_Basic) 1076 ? static_cast<unsigned>(TTI::TCC_Free) 1077 : Cost; 1078 } 1079 1080 return X86TTIImpl::getIntImmCost(Imm, Ty); 1081 } 1082 1083 unsigned X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, 1084 const APInt &Imm, Type *Ty) { 1085 assert(Ty->isIntegerTy()); 1086 1087 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 1088 // There is no cost model for constants with a bit size of 0. Return TCC_Free 1089 // here, so that constant hoisting will ignore this constant. 
1090 if (BitSize == 0) 1091 return TTI::TCC_Free; 1092 1093 switch (IID) { 1094 default: 1095 return TTI::TCC_Free; 1096 case Intrinsic::sadd_with_overflow: 1097 case Intrinsic::uadd_with_overflow: 1098 case Intrinsic::ssub_with_overflow: 1099 case Intrinsic::usub_with_overflow: 1100 case Intrinsic::smul_with_overflow: 1101 case Intrinsic::umul_with_overflow: 1102 if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue())) 1103 return TTI::TCC_Free; 1104 break; 1105 case Intrinsic::experimental_stackmap: 1106 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) 1107 return TTI::TCC_Free; 1108 break; 1109 case Intrinsic::experimental_patchpoint_void: 1110 case Intrinsic::experimental_patchpoint_i64: 1111 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) 1112 return TTI::TCC_Free; 1113 break; 1114 } 1115 return X86TTIImpl::getIntImmCost(Imm, Ty); 1116 } 1117 1118 bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, int Consecutive) { 1119 int DataWidth = DataTy->getPrimitiveSizeInBits(); 1120 1121 // Todo: AVX512 allows gather/scatter, works with strided and random as well 1122 if ((DataWidth < 32) || (Consecutive == 0)) 1123 return false; 1124 if (ST->hasAVX512() || ST->hasAVX2()) 1125 return true; 1126 return false; 1127 } 1128 1129 bool X86TTIImpl::isLegalMaskedStore(Type *DataType, int Consecutive) { 1130 return isLegalMaskedLoad(DataType, Consecutive); 1131 } 1132 1133 bool X86TTIImpl::hasCompatibleFunctionAttributes(const Function *Caller, 1134 const Function *Callee) const { 1135 const TargetMachine &TM = getTLI()->getTargetMachine(); 1136 1137 // Work this as a subsetting of subtarget features. 
1138 const FeatureBitset &CallerBits = 1139 TM.getSubtargetImpl(*Caller)->getFeatureBits(); 1140 const FeatureBitset &CalleeBits = 1141 TM.getSubtargetImpl(*Callee)->getFeatureBits(); 1142 1143 // FIXME: This is likely too limiting as it will include subtarget features 1144 // that we might not care about for inlining, but it is conservatively 1145 // correct. 1146 return (CallerBits & CalleeBits) == CalleeBits; 1147 } 1148