//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//
/// About the cost model numbers used below it's necessary to say the
/// following: the numbers correspond to some "generic" X86 CPU instead of a
/// concrete CPU model. Usually the numbers correspond to the CPU where the
/// feature first appeared. For example, if we do Subtarget.hasSSE42() in
/// the lookups below the cost is based on Nehalem as that was the first CPU
/// to support that feature level and thus has most likely the worst case cost.
/// Some examples of other technologies/CPUs:
///   SSE 3   - Pentium4 / Athlon64
///   SSE 4.1 - Penryn
///   SSE 4.2 - Nehalem
///   AVX     - Sandy Bridge
///   AVX2    - Haswell
///   AVX-512 - Xeon Phi / Skylake
/// And some examples of instruction target dependent costs (latency)
///                   divss     sqrtss          rsqrtss
///   AMD K7          11-16     19              3
///   Piledriver      9-24      13-15           5
///   Jaguar          14        16              2
///   Pentium II,III  18        30              2
///   Nehalem         7-14      7-18            3
///   Haswell         10-13     11              5
/// TODO: Develop and implement the target dependent cost model and
/// specialize cost numbers for different Cost Model Targets such as
/// throughput, code size, latency and uop count.
//===----------------------------------------------------------------------===//

#include "X86TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"

using namespace llvm;

#define DEBUG_TYPE "x86tti"

//===----------------------------------------------------------------------===//
//
// X86 cost model.
//
//===----------------------------------------------------------------------===//

// Report how well this target supports population count: fast hardware
// support when the POPCNT feature bit is present, software expansion
// otherwise.
TargetTransformInfo::PopcntSupportKind
X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  // TODO: Currently the __builtin_popcount() implementation using SSE3
  //   instructions is inefficient. Once the problem is fixed, we should
  //   call ST->hasSSE3() instead of ST->hasPOPCNT().
  return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
}

// Return the data-cache size in bytes for the given cache level. The values
// below are shared by the mainstream Intel cores listed in the comments and
// serve as the "generic" answer for this target.
llvm::Optional<unsigned> X86TTIImpl::getCacheSize(
  TargetTransformInfo::CacheLevel Level) const {
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 32 * 1024;  //  32 KByte
  case TargetTransformInfo::CacheLevel::L2D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 256 * 1024; // 256 KByte
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

// Return the data-cache associativity for the given cache level. Both L1D
// and L2D are 8-way on the CPUs listed below.
llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity(
  TargetTransformInfo::CacheLevel Level) const {
  //   - Penryn
  //   - Nehalem
  //   - Westmere
  //   - Sandy Bridge
  //   - Ivy Bridge
  //   - Haswell
  //   - Broadwell
  //   - Skylake
  //   - Kabylake
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    LLVM_FALLTHROUGH;
  case TargetTransformInfo::CacheLevel::L2D:
    return 8;
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

// Return how many (scalar or vector) registers are usable. Vector registers
// require at least SSE1; 64-bit mode with AVX-512 exposes 32 vector
// registers, 64-bit mode otherwise 16 registers, and 32-bit mode only 8.
unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) {
  if (Vector && !ST->hasSSE1())
    return 0;

  if (ST->is64Bit()) {
    if (Vector && ST->hasAVX512())
      return 32;
    return 16;
  }
  return 8;
}

// Return the widest register width in bits. The vector answer is the widest
// feature-supported width (512/256/128) clamped by the subtarget's
// prefer-vector-width setting; scalar width follows the pointer size.
unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) const {
  unsigned PreferVectorWidth = ST->getPreferVectorWidth();
  if (Vector) {
    if (ST->hasAVX512() && PreferVectorWidth >= 512)
      return 512;
    if (ST->hasAVX() && PreferVectorWidth >= 256)
      return 256;
    if (ST->hasSSE1() && PreferVectorWidth >= 128)
      return 128;
    return 0;
  }

  if (ST->is64Bit())
    return 64;

  return 32;
}

// Vector loads/stores can use the full vector register width. The unnamed
// parameter is presumably the address space (unused here) — the answer does
// not depend on it.
unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
  return getRegisterBitWidth(true);
}

// Return the maximum interleave (unroll-and-jam) factor the vectorizer
// should consider for the given vectorization factor VF.
unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // If the loop will not be vectorized, don't interleave the loop.
  // Let the regular unroller handle the loop, which saves the overflow
  // check and memory check cost.
  if (VF == 1)
    return 1;

  if (ST->isAtom())
    return 1;

  // Sandybridge and Haswell have multiple execution ports and pipelined
  // vector units.
  if (ST->hasAVX())
    return 4;

  return 2;
}

// Compute the cost of an arithmetic instruction after type legalization.
// The implementation is table driven: the tables are consulted from the most
// specific feature set (SLM, AVX-512, AVX2, XOP, ...) down to SSE1, and the
// first match wins. LT.first is the legalization factor — how many legal
// registers the IR type splits into — and scales every per-operation cost.
int X86TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty,
    TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
    TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo,
    ArrayRef<const Value *> Args) {
  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // Silvermont-specific costs (in-order core; division is very expensive).
  static const CostTblEntry SLMCostTable[] = {
    { ISD::MUL,  MVT::v4i32, 11 }, // pmulld
    { ISD::MUL,  MVT::v8i16, 2  }, // pmullw
    { ISD::MUL,  MVT::v16i8, 14 }, // extend/pmullw/trunc sequence.
    { ISD::FMUL, MVT::f64,   2  }, // mulsd
    { ISD::FMUL, MVT::v2f64, 4  }, // mulpd
    { ISD::FMUL, MVT::v4f32, 2  }, // mulps
    { ISD::FDIV, MVT::f32,   17 }, // divss
    { ISD::FDIV, MVT::v4f32, 39 }, // divps
    { ISD::FDIV, MVT::f64,   32 }, // divsd
    { ISD::FDIV, MVT::v2f64, 69 }, // divpd
    { ISD::FADD, MVT::v2f64, 2  }, // addpd
    { ISD::FSUB, MVT::v2f64, 2  }, // subpd
    // v2i64/v4i64 mul is custom lowered as a series of long:
    // multiplies(3), shifts(3) and adds(2)
    // slm muldq version throughput is 2 and addq throughput 4
    // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
    // 3X4 (addq throughput) = 17
    { ISD::MUL,  MVT::v2i64, 17 },
    // slm addq\subq throughput is 4
    { ISD::ADD,  MVT::v2i64, 4  },
    { ISD::SUB,  MVT::v2i64, 4  },
  };

  if (ST->isSLM()) {
    if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
      // Check if the operands can be shrunk into a smaller datatype.
      bool Op1Signed = false;
      unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
      bool Op2Signed = false;
      unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);

      bool signedMode = Op1Signed | Op2Signed;
      unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);

      // Narrow enough operands can be multiplied with the 16-bit multiply
      // units instead of the slow pmulld.
      if (OpMinSize <= 7)
        return LT.first * 3; // pmullw/sext
      if (!signedMode && OpMinSize <= 8)
        return LT.first * 3; // pmullw/zext
      if (OpMinSize <= 15)
        return LT.first * 5; // pmullw/pmulhw/pshuf
      if (!signedMode && OpMinSize <= 16)
        return LT.first * 5; // pmullw/pmulhw/pshuf
    }
    if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
                                            LT.second)) {
      return LT.first * Entry->Cost;
    }
  }

  if (ISD == ISD::SDIV &&
      Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
    // On X86, vector signed division by constants power-of-two are
    // normally expanded to the sequence SRA + SRL + ADD + SRA.
    // The OperandValue properties may not be the same as that of the
    // previous operation; conservatively assume OP_None.
    int Cost = 2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info,
                                          Op2Info, TargetTransformInfo::OP_None,
                                          TargetTransformInfo::OP_None);
    Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
                                   TargetTransformInfo::OP_None,
                                   TargetTransformInfo::OP_None);
    Cost += getArithmeticInstrCost(Instruction::Add, Ty, Op1Info, Op2Info,
                                   TargetTransformInfo::OP_None,
                                   TargetTransformInfo::OP_None);

    return Cost;
  }

  static const CostTblEntry AVX512BWUniformConstCostTable[] = {
    { ISD::SHL,  MVT::v64i8,   2 }, // psllw + pand.
    { ISD::SRL,  MVT::v64i8,   2 }, // psrlw + pand.
    { ISD::SRA,  MVT::v64i8,   4 }, // psrlw, pand, pxor, psubb.

    { ISD::SDIV, MVT::v32i16,  6 }, // vpmulhw sequence
    { ISD::UDIV, MVT::v32i16,  6 }, // vpmulhuw sequence
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasBWI()) {
    if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512UniformConstCostTable[] = {
    { ISD::SRA,  MVT::v2i64,   1 },
    { ISD::SRA,  MVT::v4i64,   1 },
    { ISD::SRA,  MVT::v8i64,   1 },

    { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
    { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasAVX512()) {
    if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX2UniformConstCostTable[] = {
    { ISD::SHL,  MVT::v32i8,   2 }, // psllw + pand.
    { ISD::SRL,  MVT::v32i8,   2 }, // psrlw + pand.
    { ISD::SRA,  MVT::v32i8,   4 }, // psrlw, pand, pxor, psubb.

    { ISD::SRA,  MVT::v4i64,   4 }, // 2 x psrad + shuffle.

    { ISD::SDIV, MVT::v16i16,  6 }, // vpmulhw sequence
    { ISD::UDIV, MVT::v16i16,  6 }, // vpmulhuw sequence
    { ISD::SDIV, MVT::v8i32,  15 }, // vpmuldq sequence
    { ISD::UDIV, MVT::v8i32,  15 }, // vpmuludq sequence
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasAVX2()) {
    if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2UniformConstCostTable[] = {
    { ISD::SHL,  MVT::v16i8,     2 }, // psllw + pand.
    { ISD::SRL,  MVT::v16i8,     2 }, // psrlw + pand.
    { ISD::SRA,  MVT::v16i8,     4 }, // psrlw, pand, pxor, psubb.

    { ISD::SHL,  MVT::v32i8,   4+2 }, // 2*(psllw + pand) + split.
    { ISD::SRL,  MVT::v32i8,   4+2 }, // 2*(psrlw + pand) + split.
    { ISD::SRA,  MVT::v32i8,   8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.

    { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
    { ISD::SDIV, MVT::v8i16,     6 }, // pmulhw sequence
    { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split.
    { ISD::UDIV, MVT::v8i16,     6 }, // pmulhuw sequence
    { ISD::SDIV, MVT::v8i32,  38+2 }, // 2*pmuludq sequence + split.
    { ISD::SDIV, MVT::v4i32,    19 }, // pmuludq sequence
    { ISD::UDIV, MVT::v8i32,  30+2 }, // 2*pmuludq sequence + split.
    { ISD::UDIV, MVT::v4i32,    15 }, // pmuludq sequence
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasSSE2()) {
    // pmuldq sequence.
    if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
      return LT.first * 32;
    if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
      return LT.first * 15;

    // XOP has faster vXi8 shifts.
    if ((ISD != ISD::SHL && ISD != ISD::SRL && ISD != ISD::SRA) ||
        !ST->hasXOP())
      if (const auto *Entry =
              CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
        return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL,  MVT::v16i16, 1 }, // psllw.
    { ISD::SRL,  MVT::v16i16, 1 }, // psrlw.
    { ISD::SRA,  MVT::v16i16, 1 }, // psraw.
  };

  if (ST->hasAVX2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {
    if (const auto *Entry =
            CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL,  MVT::v8i16,  1 }, // psllw.
    { ISD::SHL,  MVT::v4i32,  1 }, // pslld
    { ISD::SHL,  MVT::v2i64,  1 }, // psllq.

    { ISD::SRL,  MVT::v8i16,  1 }, // psrlw.
    { ISD::SRL,  MVT::v4i32,  1 }, // psrld.
    { ISD::SRL,  MVT::v2i64,  1 }, // psrlq.

    { ISD::SRA,  MVT::v8i16,  1 }, // psraw.
    { ISD::SRA,  MVT::v4i32,  1 }, // psrad.
  };

  if (ST->hasSSE2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {
    if (const auto *Entry =
            CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512DQCostTable[] = {
    { ISD::MUL,  MVT::v2i64, 1 },
    { ISD::MUL,  MVT::v4i64, 1 },
    { ISD::MUL,  MVT::v8i64, 1 }
  };

  // Look for AVX512DQ lowering tricks for custom cases.
  if (ST->hasDQI())
    if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512BWCostTable[] = {
    { ISD::SHL,   MVT::v8i16,      1 }, // vpsllvw
    { ISD::SRL,   MVT::v8i16,      1 }, // vpsrlvw
    { ISD::SRA,   MVT::v8i16,      1 }, // vpsravw

    { ISD::SHL,   MVT::v16i16,     1 }, // vpsllvw
    { ISD::SRL,   MVT::v16i16,     1 }, // vpsrlvw
    { ISD::SRA,   MVT::v16i16,     1 }, // vpsravw

    { ISD::SHL,   MVT::v32i16,     1 }, // vpsllvw
    { ISD::SRL,   MVT::v32i16,     1 }, // vpsrlvw
    { ISD::SRA,   MVT::v32i16,     1 }, // vpsravw

    { ISD::SHL,   MVT::v64i8,     11 }, // vpblendvb sequence.
    { ISD::SRL,   MVT::v64i8,     11 }, // vpblendvb sequence.
    { ISD::SRA,   MVT::v64i8,     24 }, // vpblendvb sequence.

    { ISD::MUL,   MVT::v64i8,     11 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,   MVT::v32i8,      4 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,   MVT::v16i8,      4 }, // extend/pmullw/trunc sequence.

    // Vectorizing division is a bad idea. See the SSE2 table for more comments.
    { ISD::SDIV,  MVT::v64i8,  64*20 },
    { ISD::SDIV,  MVT::v32i16, 32*20 },
    { ISD::UDIV,  MVT::v64i8,  64*20 },
    { ISD::UDIV,  MVT::v32i16, 32*20 }
  };

  // Look for AVX512BW lowering tricks for custom cases.
  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512CostTable[] = {
    { ISD::SHL,  MVT::v16i32,     1 },
    { ISD::SRL,  MVT::v16i32,     1 },
    { ISD::SRA,  MVT::v16i32,     1 },

    { ISD::SHL,  MVT::v8i64,      1 },
    { ISD::SRL,  MVT::v8i64,      1 },

    { ISD::SRA,  MVT::v2i64,      1 },
    { ISD::SRA,  MVT::v4i64,      1 },
    { ISD::SRA,  MVT::v8i64,      1 },

    { ISD::MUL,  MVT::v32i8,     13 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,  MVT::v16i8,      5 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,  MVT::v16i32,     1 }, // pmulld
    { ISD::MUL,  MVT::v8i64,      8 }, // 3*pmuludq/3*shift/2*add

    // Vectorizing division is a bad idea. See the SSE2 table for more comments.
    { ISD::SDIV, MVT::v16i32, 16*20 },
    { ISD::SDIV, MVT::v8i64,   8*20 },
    { ISD::UDIV, MVT::v16i32, 16*20 },
    { ISD::UDIV, MVT::v8i64,   8*20 }
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX2ShiftCostTable[] = {
    // Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to
    // customize them to detect the cases where shift amount is a scalar one.
    { ISD::SHL,  MVT::v4i32, 1 },
    { ISD::SRL,  MVT::v4i32, 1 },
    { ISD::SRA,  MVT::v4i32, 1 },
    { ISD::SHL,  MVT::v8i32, 1 },
    { ISD::SRL,  MVT::v8i32, 1 },
    { ISD::SRA,  MVT::v8i32, 1 },
    { ISD::SHL,  MVT::v2i64, 1 },
    { ISD::SRL,  MVT::v2i64, 1 },
    { ISD::SHL,  MVT::v4i64, 1 },
    { ISD::SRL,  MVT::v4i64, 1 },
  };

  // Look for AVX2 lowering tricks.
  if (ST->hasAVX2()) {
    if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
      // On AVX2, a packed v16i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return LT.first;

    if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry XOPShiftCostTable[] = {
    // 128bit shifts take 1cy, but right shifts require negation beforehand.
    { ISD::SHL,  MVT::v16i8,    1 },
    { ISD::SRL,  MVT::v16i8,    2 },
    { ISD::SRA,  MVT::v16i8,    2 },
    { ISD::SHL,  MVT::v8i16,    1 },
    { ISD::SRL,  MVT::v8i16,    2 },
    { ISD::SRA,  MVT::v8i16,    2 },
    { ISD::SHL,  MVT::v4i32,    1 },
    { ISD::SRL,  MVT::v4i32,    2 },
    { ISD::SRA,  MVT::v4i32,    2 },
    { ISD::SHL,  MVT::v2i64,    1 },
    { ISD::SRL,  MVT::v2i64,    2 },
    { ISD::SRA,  MVT::v2i64,    2 },
    // 256bit shifts require splitting if AVX2 didn't catch them above.
    { ISD::SHL,  MVT::v32i8,  2+2 },
    { ISD::SRL,  MVT::v32i8,  4+2 },
    { ISD::SRA,  MVT::v32i8,  4+2 },
    { ISD::SHL,  MVT::v16i16, 2+2 },
    { ISD::SRL,  MVT::v16i16, 4+2 },
    { ISD::SRA,  MVT::v16i16, 4+2 },
    { ISD::SHL,  MVT::v8i32,  2+2 },
    { ISD::SRL,  MVT::v8i32,  4+2 },
    { ISD::SRA,  MVT::v8i32,  4+2 },
    { ISD::SHL,  MVT::v4i64,  2+2 },
    { ISD::SRL,  MVT::v4i64,  4+2 },
    { ISD::SRA,  MVT::v4i64,  4+2 },
  };

  // Look for XOP lowering tricks.
  if (ST->hasXOP())
    if (const auto *Entry = CostTableLookup(XOPShiftCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE2UniformShiftCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL,  MVT::v16i16, 2+2 }, // 2*psllw + split.
    { ISD::SHL,  MVT::v8i32,  2+2 }, // 2*pslld + split.
    { ISD::SHL,  MVT::v4i64,  2+2 }, // 2*psllq + split.

    { ISD::SRL,  MVT::v16i16, 2+2 }, // 2*psrlw + split.
    { ISD::SRL,  MVT::v8i32,  2+2 }, // 2*psrld + split.
    { ISD::SRL,  MVT::v4i64,  2+2 }, // 2*psrlq + split.

    { ISD::SRA,  MVT::v16i16, 2+2 }, // 2*psraw + split.
    { ISD::SRA,  MVT::v8i32,  2+2 }, // 2*psrad + split.
    { ISD::SRA,  MVT::v2i64,    4 }, // 2*psrad + shuffle.
    { ISD::SRA,  MVT::v4i64,  8+2 }, // 2*(2*psrad + shuffle) + split.
  };

  if (ST->hasSSE2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {

    // Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table.
    if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2())
      return LT.first * 4; // 2*psrad + shuffle.

    if (const auto *Entry =
            CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  if (ISD == ISD::SHL &&
      Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
    MVT VT = LT.second;
    // Vector shift left by non uniform constant can be lowered
    // into vector multiply.
    if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
        ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
      ISD = ISD::MUL;
  }

  static const CostTblEntry AVX2CostTable[] = {
    { ISD::SHL,  MVT::v32i8,     11 }, // vpblendvb sequence.
    { ISD::SHL,  MVT::v16i16,    10 }, // extend/vpsrlvd/pack sequence.

    { ISD::SRL,  MVT::v32i8,     11 }, // vpblendvb sequence.
    { ISD::SRL,  MVT::v16i16,    10 }, // extend/vpsrlvd/pack sequence.

    { ISD::SRA,  MVT::v32i8,     24 }, // vpblendvb sequence.
    { ISD::SRA,  MVT::v16i16,    10 }, // extend/vpsravd/pack sequence.
    { ISD::SRA,  MVT::v2i64,      4 }, // srl/xor/sub sequence.
    { ISD::SRA,  MVT::v4i64,      4 }, // srl/xor/sub sequence.

    { ISD::SUB,  MVT::v32i8,      1 }, // psubb
    { ISD::ADD,  MVT::v32i8,      1 }, // paddb
    { ISD::SUB,  MVT::v16i16,     1 }, // psubw
    { ISD::ADD,  MVT::v16i16,     1 }, // paddw
    { ISD::SUB,  MVT::v8i32,      1 }, // psubd
    { ISD::ADD,  MVT::v8i32,      1 }, // paddd
    { ISD::SUB,  MVT::v4i64,      1 }, // psubq
    { ISD::ADD,  MVT::v4i64,      1 }, // paddq

    { ISD::MUL,  MVT::v32i8,     17 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,  MVT::v16i8,      7 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,  MVT::v16i16,     1 }, // pmullw
    { ISD::MUL,  MVT::v8i32,      1 }, // pmulld
    { ISD::MUL,  MVT::v4i64,      8 }, // 3*pmuludq/3*shift/2*add

    { ISD::FDIV, MVT::f32,        7 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,      7 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32,     14 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::f64,       14 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,     14 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64,     28 }, // Haswell from http://www.agner.org/
  };

  // Look for AVX2 lowering tricks for custom cases.
  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX1CostTable[] = {
    // We don't have to scalarize unsupported ops. We can issue two half-sized
    // operations and we only need to extract the upper YMM half.
    // Two ops + 1 extract + 1 insert = 4.
    { ISD::MUL,  MVT::v16i16,     4 },
    { ISD::MUL,  MVT::v8i32,      4 },
    { ISD::SUB,  MVT::v32i8,      4 },
    { ISD::ADD,  MVT::v32i8,      4 },
    { ISD::SUB,  MVT::v16i16,     4 },
    { ISD::ADD,  MVT::v16i16,     4 },
    { ISD::SUB,  MVT::v8i32,      4 },
    { ISD::ADD,  MVT::v8i32,      4 },
    { ISD::SUB,  MVT::v4i64,      4 },
    { ISD::ADD,  MVT::v4i64,      4 },

    // A v4i64 multiply is custom lowered as two split v2i64 vectors that then
    // are lowered as a series of long multiplies(3), shifts(3) and adds(2)
    // Because we believe v4i64 to be a legal type, we must also include the
    // extract+insert in the cost table. Therefore, the cost here is 18
    // instead of 8.
    { ISD::MUL,  MVT::v4i64,     18 },

    { ISD::MUL,  MVT::v32i8,     26 }, // extend/pmullw/trunc sequence.

    { ISD::FDIV, MVT::f32,       14 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,     14 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32,     28 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::f64,       22 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,     22 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64,     44 }, // SNB from http://www.agner.org/

    // Vectorizing division is a bad idea. See the SSE2 table for more comments.
    { ISD::SDIV, MVT::v32i8,  32*20 },
    { ISD::SDIV, MVT::v16i16, 16*20 },
    { ISD::SDIV, MVT::v8i32,   8*20 },
    { ISD::SDIV, MVT::v4i64,   4*20 },
    { ISD::UDIV, MVT::v32i8,  32*20 },
    { ISD::UDIV, MVT::v16i16, 16*20 },
    { ISD::UDIV, MVT::v8i32,   8*20 },
    { ISD::UDIV, MVT::v4i64,   4*20 },
  };

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE42CostTable[] = {
    { ISD::FDIV,  MVT::f32,   14 }, // Nehalem from http://www.agner.org/
    { ISD::FDIV,  MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
    { ISD::FDIV,  MVT::f64,   22 }, // Nehalem from http://www.agner.org/
    { ISD::FDIV,  MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/
  };

  if (ST->hasSSE42())
    if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE41CostTable[] = {
    { ISD::SHL,  MVT::v16i8,    11 }, // pblendvb sequence.
    { ISD::SHL,  MVT::v32i8, 2*11+2 }, // pblendvb sequence + split.
    { ISD::SHL,  MVT::v8i16,    14 }, // pblendvb sequence.
    { ISD::SHL,  MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
    { ISD::SHL,  MVT::v4i32,     4 }, // pslld/paddd/cvttps2dq/pmulld
    { ISD::SHL,  MVT::v8i32,  2*4+2 }, // pslld/paddd/cvttps2dq/pmulld + split

    { ISD::SRL,  MVT::v16i8,    12 }, // pblendvb sequence.
    { ISD::SRL,  MVT::v32i8, 2*12+2 }, // pblendvb sequence + split.
    { ISD::SRL,  MVT::v8i16,    14 }, // pblendvb sequence.
    { ISD::SRL,  MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
    { ISD::SRL,  MVT::v4i32,    11 }, // Shift each lane + blend.
    { ISD::SRL,  MVT::v8i32, 2*11+2 }, // Shift each lane + blend + split.

    { ISD::SRA,  MVT::v16i8,    24 }, // pblendvb sequence.
    { ISD::SRA,  MVT::v32i8, 2*24+2 }, // pblendvb sequence + split.
    { ISD::SRA,  MVT::v8i16,    14 }, // pblendvb sequence.
    { ISD::SRA,  MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
    { ISD::SRA,  MVT::v4i32,    12 }, // Shift each lane + blend.
    { ISD::SRA,  MVT::v8i32, 2*12+2 }, // Shift each lane + blend + split.

    { ISD::MUL,  MVT::v4i32,     1 }  // pmulld
  };

  if (ST->hasSSE41())
    if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE2CostTable[] = {
    // We don't correctly identify costs of casts because they are marked as
    // custom.
    { ISD::SHL,  MVT::v16i8,    26 }, // cmpgtb sequence.
    { ISD::SHL,  MVT::v8i16,    32 }, // cmpgtb sequence.
    { ISD::SHL,  MVT::v4i32,   2*5 }, // We optimized this using mul.
    { ISD::SHL,  MVT::v2i64,     4 }, // splat+shuffle sequence.
    { ISD::SHL,  MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split.

    { ISD::SRL,  MVT::v16i8,    26 }, // cmpgtb sequence.
    { ISD::SRL,  MVT::v8i16,    32 }, // cmpgtb sequence.
    { ISD::SRL,  MVT::v4i32,    16 }, // Shift each lane + blend.
    { ISD::SRL,  MVT::v2i64,     4 }, // splat+shuffle sequence.
    { ISD::SRL,  MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split.

    { ISD::SRA,  MVT::v16i8,    54 }, // unpacked cmpgtb sequence.
    { ISD::SRA,  MVT::v8i16,    32 }, // cmpgtb sequence.
    { ISD::SRA,  MVT::v4i32,    16 }, // Shift each lane + blend.
    { ISD::SRA,  MVT::v2i64,    12 }, // srl/xor/sub sequence.
    { ISD::SRA,  MVT::v4i64, 2*12+2 }, // srl/xor/sub sequence+split.

    { ISD::MUL,  MVT::v16i8,    12 }, // extend/pmullw/trunc sequence.
    { ISD::MUL,  MVT::v8i16,     1 }, // pmullw
    { ISD::MUL,  MVT::v4i32,     6 }, // 3*pmuludq/4*shuffle
    { ISD::MUL,  MVT::v2i64,     8 }, // 3*pmuludq/3*shift/2*add

    { ISD::FDIV, MVT::f32,      23 }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,    39 }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::f64,      38 }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,    69 }, // Pentium IV from http://www.agner.org/

    // It is not a good idea to vectorize division. We have to scalarize it and
    // in the process we will often end up having to spill regular
    // registers. The overhead of division is going to dominate most kernels
    // anyways so try hard to prevent vectorization of division - it is
    // generally a bad idea. Assume somewhat arbitrarily that we have to be able
    // to hide "20 cycles" for each lane.
    { ISD::SDIV, MVT::v16i8, 16*20 },
    { ISD::SDIV, MVT::v8i16,  8*20 },
    { ISD::SDIV, MVT::v4i32,  4*20 },
    { ISD::SDIV, MVT::v2i64,  2*20 },
    { ISD::UDIV, MVT::v16i8, 16*20 },
    { ISD::UDIV, MVT::v8i16,  8*20 },
    { ISD::UDIV, MVT::v4i32,  4*20 },
    { ISD::UDIV, MVT::v2i64,  2*20 },
  };

  if (ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE1CostTable[] = {
    { ISD::FDIV, MVT::f32,   17 }, // Pentium III from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/
  };

  if (ST->hasSSE1())
    if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  // Fallback to the default implementation.
  return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info);
}

// Compute the cost of a vector shuffle of kind Kind on type Tp. Like the
// arithmetic costs above, this is table driven from the most specific
// feature set downwards after legalizing the type.
int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                               Type *SubTp) {
  // 64-bit packed float vectors (v2f32) are widened to type v4f32.
  // 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);

  // For Broadcasts we are splatting the first element from the first input
  // register, so only need to reference that input and all the output
  // registers are the same.
  if (Kind == TTI::SK_Broadcast)
    LT.first = 1;

  // We are going to permute multiple sources and the result will be in
  // multiple destinations. Providing an accurate cost only for splits where
  // the element type remains the same.
  if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
    MVT LegalVT = LT.second;
    if (LegalVT.isVector() &&
        LegalVT.getVectorElementType().getSizeInBits() ==
            Tp->getVectorElementType()->getPrimitiveSizeInBits() &&
        LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) {

      unsigned VecTySize = DL.getTypeStoreSize(Tp);
      unsigned LegalVTSize = LegalVT.getStoreSize();
      // Number of source vectors after legalization:
      unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
      // Number of destination vectors after legalization:
      unsigned NumOfDests = LT.first;

      Type *SingleOpTy = VectorType::get(Tp->getVectorElementType(),
                                         LegalVT.getVectorNumElements());

      unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
      return NumOfShuffles *
             getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr);
    }

    return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
  }

  // For 2-input shuffles, we must account for splitting the 2 inputs into
  // many.
  if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
    // We assume that source and destination have the same vector type.
    int NumOfDests = LT.first;
    int NumOfShufflesPerDest = LT.first * 2 - 1;
    LT.first = NumOfDests * NumOfShufflesPerDest;
  }

  static const CostTblEntry AVX512VBMIShuffleTbl[] = {
    { TTI::SK_Reverse,          MVT::v64i8,  1 }, // vpermb
    { TTI::SK_Reverse,          MVT::v32i8,  1 }, // vpermb

    { TTI::SK_PermuteSingleSrc, MVT::v64i8,  1 }, // vpermb
    { TTI::SK_PermuteSingleSrc, MVT::v32i8,  1 }, // vpermb

    { TTI::SK_PermuteTwoSrc,    MVT::v64i8,  1 }, // vpermt2b
    { TTI::SK_PermuteTwoSrc,    MVT::v32i8,  1 }, // vpermt2b
    { TTI::SK_PermuteTwoSrc,    MVT::v16i8,  1 }  // vpermt2b
  };

  if (ST->hasVBMI())
    if (const auto *Entry =
            CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512BWShuffleTbl[] = {
    { TTI::SK_Broadcast,        MVT::v32i16, 1 }, // vpbroadcastw
    { TTI::SK_Broadcast,        MVT::v64i8,  1 }, // vpbroadcastb

    { TTI::SK_Reverse,          MVT::v32i16, 1 }, // vpermw
    { TTI::SK_Reverse,          MVT::v16i16, 1 }, // vpermw
    { TTI::SK_Reverse,          MVT::v64i8,  2 }, // pshufb + vshufi64x2

    { TTI::SK_PermuteSingleSrc, MVT::v32i16, 1 }, // vpermw
    { TTI::SK_PermuteSingleSrc, MVT::v16i16, 1 }, // vpermw
    { TTI::SK_PermuteSingleSrc, MVT::v8i16,  1 }, // vpermw
    { TTI::SK_PermuteSingleSrc, MVT::v64i8,  8 }, // extend to v32i16
    { TTI::SK_PermuteSingleSrc, MVT::v32i8,  3 }, // vpermw + zext/trunc

    { TTI::SK_PermuteTwoSrc,    MVT::v32i16, 1 }, // vpermt2w
    { TTI::SK_PermuteTwoSrc,    MVT::v16i16, 1 }, // vpermt2w
    { TTI::SK_PermuteTwoSrc,    MVT::v8i16,  1 }, // vpermt2w
    { TTI::SK_PermuteTwoSrc,    MVT::v32i8,  3 }, // zext + vpermt2w + trunc
    { TTI::SK_PermuteTwoSrc,    MVT::v64i8, 19 }, // 6 * v32i8 + 1
    { TTI::SK_PermuteTwoSrc,    MVT::v16i8,  3 }  // zext + vpermt2w + trunc
  };

  if (ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512ShuffleTbl[] = {
    { TTI::SK_Broadcast,        MVT::v8f64,  1 }, // vbroadcastpd
    { TTI::SK_Broadcast,        MVT::v16f32, 1 }, // vbroadcastps
    { TTI::SK_Broadcast,        MVT::v8i64,  1 }, // vpbroadcastq
    { TTI::SK_Broadcast,        MVT::v16i32, 1 }, // vpbroadcastd

    { TTI::SK_Reverse,          MVT::v8f64,  1 }, // vpermpd
    { TTI::SK_Reverse,          MVT::v16f32, 1 }, // vpermps
    { TTI::SK_Reverse,          MVT::v8i64,  1 }, // vpermq
    { TTI::SK_Reverse,          MVT::v16i32, 1 }, // vpermd

    { TTI::SK_PermuteSingleSrc, MVT::v8f64,  1 }, // vpermpd
    { TTI::SK_PermuteSingleSrc, MVT::v4f64,  1 }, // vpermpd
    { TTI::SK_PermuteSingleSrc, MVT::v2f64,  1 }, // vpermpd
    { TTI::SK_PermuteSingleSrc, MVT::v16f32, 1 }, // vpermps
    { TTI::SK_PermuteSingleSrc, MVT::v8f32,  1 }, // vpermps
    { TTI::SK_PermuteSingleSrc, MVT::v4f32,  1 }, // vpermps
    { TTI::SK_PermuteSingleSrc, MVT::v8i64,  1 }, // vpermq
    { TTI::SK_PermuteSingleSrc, MVT::v4i64,  1 }, // vpermq
    { TTI::SK_PermuteSingleSrc, MVT::v2i64,  1 }, // vpermq
    { TTI::SK_PermuteSingleSrc, MVT::v16i32, 1 }, // vpermd
    { TTI::SK_PermuteSingleSrc, MVT::v8i32,  1 }, // vpermd
    { TTI::SK_PermuteSingleSrc, MVT::v4i32,  1 }, // vpermd
    { TTI::SK_PermuteSingleSrc, MVT::v16i8,  1 }, // pshufb

    { TTI::SK_PermuteTwoSrc,    MVT::v8f64,  1 }, // vpermt2pd
    { TTI::SK_PermuteTwoSrc,    MVT::v16f32, 1 }, // vpermt2ps
    { TTI::SK_PermuteTwoSrc,    MVT::v8i64,  1 }, // vpermt2q
    { TTI::SK_PermuteTwoSrc,    MVT::v16i32, 1 }, // vpermt2d
    { TTI::SK_PermuteTwoSrc,    MVT::v4f64,  1 }, // vpermt2pd
    { TTI::SK_PermuteTwoSrc,    MVT::v8f32,  1 }, // vpermt2ps
    { TTI::SK_PermuteTwoSrc,    MVT::v4i64,  1 }, // vpermt2q
    { TTI::SK_PermuteTwoSrc,    MVT::v8i32,  1 }, // vpermt2d
    { TTI::SK_PermuteTwoSrc,    MVT::v2f64,  1 }, // vpermt2pd
    { TTI::SK_PermuteTwoSrc,    MVT::v4f32,  1 }, // vpermt2ps
    { TTI::SK_PermuteTwoSrc,    MVT::v2i64,  1 }, // vpermt2q
    { TTI::SK_PermuteTwoSrc,    MVT::v4i32,  1 }  // vpermt2d
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX2ShuffleTbl[] = {
    { TTI::SK_Broadcast, MVT::v4f64,  1 }, // vbroadcastpd
    { TTI::SK_Broadcast, MVT::v8f32,  1 }, // vbroadcastps
    { TTI::SK_Broadcast, MVT::v4i64,  1 }, // vpbroadcastq
    { TTI::SK_Broadcast, MVT::v8i32,  1 }, // vpbroadcastd
    { TTI::SK_Broadcast, MVT::v16i16, 1 }, // vpbroadcastw
    { TTI::SK_Broadcast, MVT::v32i8,  1 }, // vpbroadcastb

    { TTI::SK_Reverse, MVT::v4f64,  1 }, // vpermpd
    { TTI::SK_Reverse, MVT::v8f32,  1 }, // vpermps
    { TTI::SK_Reverse, MVT::v4i64,  1 }, // vpermq
    { TTI::SK_Reverse, MVT::v8i32,  1 }, // vpermd
    { TTI::SK_Reverse, MVT::v16i16, 2 }, // vperm2i128 + pshufb
    { TTI::SK_Reverse, MVT::v32i8,  2 }, // vperm2i128 + pshufb

    { TTI::SK_Alternate, MVT::v16i16, 1 }, // vpblendw
    { TTI::SK_Alternate, MVT::v32i8,  1 }, // vpblendvb

    { TTI::SK_PermuteSingleSrc, MVT::v4f64,  1 }, // vpermpd
    { TTI::SK_PermuteSingleSrc, MVT::v8f32,  1 }, // vpermps
    { TTI::SK_PermuteSingleSrc, MVT::v4i64,  1 }, // vpermq
    { TTI::SK_PermuteSingleSrc, MVT::v8i32,  1 }, // vpermd
    { TTI::SK_PermuteSingleSrc, MVT::v16i16, 4 }, // vperm2i128 + 2*vpshufb
                                                  // + vpblendvb
    { TTI::SK_PermuteSingleSrc, MVT::v32i8,  4 }, // vperm2i128 + 2*vpshufb
                                                  // + vpblendvb

    { TTI::SK_PermuteTwoSrc, MVT::v4f64,  3 }, // 2*vpermpd + vblendpd
    { TTI::SK_PermuteTwoSrc, MVT::v8f32,  3 }, // 2*vpermps + vblendps
    { TTI::SK_PermuteTwoSrc, MVT::v4i64,  3 }, // 2*vpermq + vpblendd
    { TTI::SK_PermuteTwoSrc, MVT::v8i32,  3 }, // 2*vpermd + vpblendd
    { TTI::SK_PermuteTwoSrc, MVT::v16i16, 7 }, // 2*vperm2i128 + 4*vpshufb
                                               // + vpblendvb
    { TTI::SK_PermuteTwoSrc, MVT::v32i8,  7 }, // 2*vperm2i128 + 4*vpshufb
                                               // + vpblendvb
  };
913 if (ST->hasAVX2()) 914 if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second)) 915 return LT.first * Entry->Cost; 916 917 static const CostTblEntry XOPShuffleTbl[] = { 918 { TTI::SK_PermuteSingleSrc, MVT::v4f64, 2 }, // vperm2f128 + vpermil2pd 919 { TTI::SK_PermuteSingleSrc, MVT::v8f32, 2 }, // vperm2f128 + vpermil2ps 920 { TTI::SK_PermuteSingleSrc, MVT::v4i64, 2 }, // vperm2f128 + vpermil2pd 921 { TTI::SK_PermuteSingleSrc, MVT::v8i32, 2 }, // vperm2f128 + vpermil2ps 922 { TTI::SK_PermuteSingleSrc, MVT::v16i16, 4 }, // vextractf128 + 2*vpperm 923 // + vinsertf128 924 { TTI::SK_PermuteSingleSrc, MVT::v32i8, 4 }, // vextractf128 + 2*vpperm 925 // + vinsertf128 926 927 { TTI::SK_PermuteTwoSrc, MVT::v16i16, 9 }, // 2*vextractf128 + 6*vpperm 928 // + vinsertf128 929 { TTI::SK_PermuteTwoSrc, MVT::v8i16, 1 }, // vpperm 930 { TTI::SK_PermuteTwoSrc, MVT::v32i8, 9 }, // 2*vextractf128 + 6*vpperm 931 // + vinsertf128 932 { TTI::SK_PermuteTwoSrc, MVT::v16i8, 1 }, // vpperm 933 }; 934 935 if (ST->hasXOP()) 936 if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second)) 937 return LT.first * Entry->Cost; 938 939 static const CostTblEntry AVX1ShuffleTbl[] = { 940 { TTI::SK_Broadcast, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd 941 { TTI::SK_Broadcast, MVT::v8f32, 2 }, // vperm2f128 + vpermilps 942 { TTI::SK_Broadcast, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd 943 { TTI::SK_Broadcast, MVT::v8i32, 2 }, // vperm2f128 + vpermilps 944 { TTI::SK_Broadcast, MVT::v16i16, 3 }, // vpshuflw + vpshufd + vinsertf128 945 { TTI::SK_Broadcast, MVT::v32i8, 2 }, // vpshufb + vinsertf128 946 947 { TTI::SK_Reverse, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd 948 { TTI::SK_Reverse, MVT::v8f32, 2 }, // vperm2f128 + vpermilps 949 { TTI::SK_Reverse, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd 950 { TTI::SK_Reverse, MVT::v8i32, 2 }, // vperm2f128 + vpermilps 951 { TTI::SK_Reverse, MVT::v16i16, 4 }, // vextractf128 + 2*pshufb 952 // + vinsertf128 953 { TTI::SK_Reverse, 
MVT::v32i8, 4 }, // vextractf128 + 2*pshufb 954 // + vinsertf128 955 956 { TTI::SK_Alternate, MVT::v4i64, 1 }, // vblendpd 957 { TTI::SK_Alternate, MVT::v4f64, 1 }, // vblendpd 958 { TTI::SK_Alternate, MVT::v8i32, 1 }, // vblendps 959 { TTI::SK_Alternate, MVT::v8f32, 1 }, // vblendps 960 { TTI::SK_Alternate, MVT::v16i16, 3 }, // vpand + vpandn + vpor 961 { TTI::SK_Alternate, MVT::v32i8, 3 }, // vpand + vpandn + vpor 962 963 { TTI::SK_PermuteSingleSrc, MVT::v4f64, 3 }, // 2*vperm2f128 + vshufpd 964 { TTI::SK_PermuteSingleSrc, MVT::v4i64, 3 }, // 2*vperm2f128 + vshufpd 965 { TTI::SK_PermuteSingleSrc, MVT::v8f32, 4 }, // 2*vperm2f128 + 2*vshufps 966 { TTI::SK_PermuteSingleSrc, MVT::v8i32, 4 }, // 2*vperm2f128 + 2*vshufps 967 { TTI::SK_PermuteSingleSrc, MVT::v16i16, 8 }, // vextractf128 + 4*pshufb 968 // + 2*por + vinsertf128 969 { TTI::SK_PermuteSingleSrc, MVT::v32i8, 8 }, // vextractf128 + 4*pshufb 970 // + 2*por + vinsertf128 971 972 { TTI::SK_PermuteTwoSrc, MVT::v4f64, 4 }, // 2*vperm2f128 + 2*vshufpd 973 { TTI::SK_PermuteTwoSrc, MVT::v8f32, 4 }, // 2*vperm2f128 + 2*vshufps 974 { TTI::SK_PermuteTwoSrc, MVT::v4i64, 4 }, // 2*vperm2f128 + 2*vshufpd 975 { TTI::SK_PermuteTwoSrc, MVT::v8i32, 4 }, // 2*vperm2f128 + 2*vshufps 976 { TTI::SK_PermuteTwoSrc, MVT::v16i16, 15 }, // 2*vextractf128 + 8*pshufb 977 // + 4*por + vinsertf128 978 { TTI::SK_PermuteTwoSrc, MVT::v32i8, 15 }, // 2*vextractf128 + 8*pshufb 979 // + 4*por + vinsertf128 980 }; 981 982 if (ST->hasAVX()) 983 if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second)) 984 return LT.first * Entry->Cost; 985 986 static const CostTblEntry SSE41ShuffleTbl[] = { 987 { TTI::SK_Alternate, MVT::v2i64, 1 }, // pblendw 988 { TTI::SK_Alternate, MVT::v2f64, 1 }, // movsd 989 { TTI::SK_Alternate, MVT::v4i32, 1 }, // pblendw 990 { TTI::SK_Alternate, MVT::v4f32, 1 }, // blendps 991 { TTI::SK_Alternate, MVT::v8i16, 1 }, // pblendw 992 { TTI::SK_Alternate, MVT::v16i8, 1 } // pblendvb 993 }; 994 995 if 
(ST->hasSSE41()) 996 if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second)) 997 return LT.first * Entry->Cost; 998 999 static const CostTblEntry SSSE3ShuffleTbl[] = { 1000 { TTI::SK_Broadcast, MVT::v8i16, 1 }, // pshufb 1001 { TTI::SK_Broadcast, MVT::v16i8, 1 }, // pshufb 1002 1003 { TTI::SK_Reverse, MVT::v8i16, 1 }, // pshufb 1004 { TTI::SK_Reverse, MVT::v16i8, 1 }, // pshufb 1005 1006 { TTI::SK_Alternate, MVT::v8i16, 3 }, // 2*pshufb + por 1007 { TTI::SK_Alternate, MVT::v16i8, 3 }, // 2*pshufb + por 1008 1009 { TTI::SK_PermuteSingleSrc, MVT::v8i16, 1 }, // pshufb 1010 { TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 }, // pshufb 1011 1012 { TTI::SK_PermuteTwoSrc, MVT::v8i16, 3 }, // 2*pshufb + por 1013 { TTI::SK_PermuteTwoSrc, MVT::v16i8, 3 }, // 2*pshufb + por 1014 }; 1015 1016 if (ST->hasSSSE3()) 1017 if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second)) 1018 return LT.first * Entry->Cost; 1019 1020 static const CostTblEntry SSE2ShuffleTbl[] = { 1021 { TTI::SK_Broadcast, MVT::v2f64, 1 }, // shufpd 1022 { TTI::SK_Broadcast, MVT::v2i64, 1 }, // pshufd 1023 { TTI::SK_Broadcast, MVT::v4i32, 1 }, // pshufd 1024 { TTI::SK_Broadcast, MVT::v8i16, 2 }, // pshuflw + pshufd 1025 { TTI::SK_Broadcast, MVT::v16i8, 3 }, // unpck + pshuflw + pshufd 1026 1027 { TTI::SK_Reverse, MVT::v2f64, 1 }, // shufpd 1028 { TTI::SK_Reverse, MVT::v2i64, 1 }, // pshufd 1029 { TTI::SK_Reverse, MVT::v4i32, 1 }, // pshufd 1030 { TTI::SK_Reverse, MVT::v8i16, 3 }, // pshuflw + pshufhw + pshufd 1031 { TTI::SK_Reverse, MVT::v16i8, 9 }, // 2*pshuflw + 2*pshufhw 1032 // + 2*pshufd + 2*unpck + packus 1033 1034 { TTI::SK_Alternate, MVT::v2i64, 1 }, // movsd 1035 { TTI::SK_Alternate, MVT::v2f64, 1 }, // movsd 1036 { TTI::SK_Alternate, MVT::v4i32, 2 }, // 2*shufps 1037 { TTI::SK_Alternate, MVT::v8i16, 3 }, // pand + pandn + por 1038 { TTI::SK_Alternate, MVT::v16i8, 3 }, // pand + pandn + por 1039 1040 { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // shufpd 1041 { 
TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // pshufd 1042 { TTI::SK_PermuteSingleSrc, MVT::v4i32, 1 }, // pshufd 1043 { TTI::SK_PermuteSingleSrc, MVT::v8i16, 5 }, // 2*pshuflw + 2*pshufhw 1044 // + pshufd/unpck 1045 { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw 1046 // + 2*pshufd + 2*unpck + 2*packus 1047 1048 { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd 1049 { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd 1050 { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd} 1051 { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute 1052 { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute 1053 }; 1054 1055 if (ST->hasSSE2()) 1056 if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second)) 1057 return LT.first * Entry->Cost; 1058 1059 static const CostTblEntry SSE1ShuffleTbl[] = { 1060 { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps 1061 { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps 1062 { TTI::SK_Alternate, MVT::v4f32, 2 }, // 2*shufps 1063 { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps 1064 { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps 1065 }; 1066 1067 if (ST->hasSSE1()) 1068 if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second)) 1069 return LT.first * Entry->Cost; 1070 1071 return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); 1072 } 1073 1074 int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, 1075 const Instruction *I) { 1076 int ISD = TLI->InstructionOpcodeToISD(Opcode); 1077 assert(ISD && "Invalid opcode"); 1078 1079 // FIXME: Need a better design of the cost table to handle non-simple types of 1080 // potential massive combinations (elem_num x src_type x dst_type). 
1081 1082 static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = { 1083 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 }, 1084 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, 1085 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 }, 1086 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 }, 1087 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 }, 1088 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 }, 1089 1090 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 }, 1091 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, 1092 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 }, 1093 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 }, 1094 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 }, 1095 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 }, 1096 1097 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 1 }, 1098 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, 1 }, 1099 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, 1 }, 1100 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 }, 1101 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, 1 }, 1102 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, 1 }, 1103 1104 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 1 }, 1105 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 }, 1106 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 }, 1107 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, 1108 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 }, 1109 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 }, 1110 }; 1111 1112 // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and 1113 // 256-bit wide vectors. 
1114 1115 static const TypeConversionCostTblEntry AVX512FConversionTbl[] = { 1116 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 }, 1117 { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 }, 1118 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 }, 1119 1120 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 1 }, 1121 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 1 }, 1122 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 1 }, 1123 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, 1124 1125 // v16i1 -> v16i32 - load + broadcast 1126 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, 1127 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, 1128 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 1 }, 1129 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 }, 1130 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, 1131 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, 1132 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, 1133 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, 1134 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, 1135 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, 1136 1137 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 }, 1138 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 }, 1139 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 }, 1140 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 }, 1141 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 }, 1142 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 }, 1143 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 }, 1144 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 }, 1145 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 }, 1146 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 26 }, 1147 1148 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 }, 1149 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 }, 1150 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 2 }, 1151 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 }, 1152 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 2 }, 1153 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 }, 1154 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 }, 1155 { ISD::UINT_TO_FP, MVT::v2f64, 
MVT::v2i16, 5 }, 1156 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 }, 1157 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 }, 1158 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 }, 1159 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 }, 1160 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 }, 1161 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 }, 1162 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, 1163 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 }, 1164 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 }, 1165 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 }, 1166 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 }, 1167 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 5 }, 1168 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 }, 1169 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 12 }, 1170 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 26 }, 1171 1172 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 }, 1173 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, 1174 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 }, 1175 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 2 }, 1176 { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, 2 }, 1177 { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 }, 1178 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 2 }, 1179 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, 2 }, 1180 }; 1181 1182 static const TypeConversionCostTblEntry AVX2ConversionTbl[] = { 1183 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 3 }, 1184 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 }, 1185 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 }, 1186 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 }, 1187 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 3 }, 1188 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 3 }, 1189 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, 1190 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, 1191 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, 1192 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, 1193 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, 1194 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, 1195 { ISD::SIGN_EXTEND, 
MVT::v8i32, MVT::v8i16, 1 }, 1196 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, 1197 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, 1198 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, 1199 1200 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 }, 1201 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 }, 1202 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 }, 1203 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 }, 1204 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2 }, 1205 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 4 }, 1206 1207 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 3 }, 1208 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 3 }, 1209 1210 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 }, 1211 }; 1212 1213 static const TypeConversionCostTblEntry AVXConversionTbl[] = { 1214 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 }, 1215 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 }, 1216 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 }, 1217 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 }, 1218 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 6 }, 1219 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 }, 1220 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 7 }, 1221 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 4 }, 1222 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, 1223 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, 1224 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 6 }, 1225 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, 1226 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 }, 1227 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 4 }, 1228 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 4 }, 1229 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 4 }, 1230 1231 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 4 }, 1232 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 }, 1233 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 }, 1234 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 4 }, 1235 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 4 }, 1236 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 4 }, 1237 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 9 }, 1238 1239 { ISD::SINT_TO_FP, 
MVT::v4f32, MVT::v4i1, 3 }, 1240 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 }, 1241 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 }, 1242 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, 1243 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i8, 3 }, 1244 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 8 }, 1245 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 3 }, 1246 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i16, 3 }, 1247 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 }, 1248 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, 1249 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 }, 1250 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 }, 1251 1252 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 }, 1253 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 }, 1254 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 }, 1255 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 2 }, 1256 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 }, 1257 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 5 }, 1258 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, 1259 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 }, 1260 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 }, 1261 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 6 }, 1262 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 6 }, 1263 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 }, 1264 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 9 }, 1265 // The generic code to compute the scalar overhead is currently broken. 1266 // Workaround this limitation by estimating the scalarization overhead 1267 // here. We have roughly 10 instructions per scalar element. 1268 // Multiply that by the vector width. 1269 // FIXME: remove that when PR19268 is fixed. 
1270 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 10 }, 1271 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 20 }, 1272 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 13 }, 1273 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 13 }, 1274 1275 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 1 }, 1276 { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 7 }, 1277 // This node is expanded into scalarized operations but BasicTTI is overly 1278 // optimistic estimating its cost. It computes 3 per element (one 1279 // vector-extract, one scalar conversion and one vector-insert). The 1280 // problem is that the inserts form a read-modify-write chain so latency 1281 // should be factored in too. Inflating the cost per element by 1. 1282 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 8*4 }, 1283 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4*4 }, 1284 1285 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 1 }, 1286 { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 1 }, 1287 }; 1288 1289 static const TypeConversionCostTblEntry SSE41ConversionTbl[] = { 1290 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 2 }, 1291 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 2 }, 1292 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 2 }, 1293 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 2 }, 1294 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, 1295 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, 1296 1297 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 }, 1298 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 2 }, 1299 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 1 }, 1300 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 1 }, 1301 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, 1302 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, 1303 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 2 }, 1304 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 2 }, 1305 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, 1306 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, 1307 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 4 }, 1308 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 4 }, 1309 { ISD::ZERO_EXTEND, 
MVT::v4i32, MVT::v4i16, 1 }, 1310 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, 1311 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, 1312 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, 1313 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4 }, 1314 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4 }, 1315 1316 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, 1317 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1 }, 1318 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1 }, 1319 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 }, 1320 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 }, 1321 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 3 }, 1322 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 }, 1323 1324 }; 1325 1326 static const TypeConversionCostTblEntry SSE2ConversionTbl[] = { 1327 // These are somewhat magic numbers justified by looking at the output of 1328 // Intel's IACA, running some kernels and making sure when we take 1329 // legalization into account the throughput will be overestimated. 1330 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 }, 1331 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 }, 1332 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 }, 1333 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 }, 1334 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 }, 1335 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 }, 1336 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 }, 1337 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 }, 1338 1339 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 }, 1340 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 }, 1341 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 }, 1342 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 }, 1343 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 }, 1344 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 8 }, 1345 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 }, 1346 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 }, 1347 1348 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 3 }, 1349 1350 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 }, 1351 { ISD::SIGN_EXTEND, 
MVT::v4i16, MVT::v4i8, 6 }, 1352 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 }, 1353 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 3 }, 1354 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 }, 1355 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 8 }, 1356 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, 1357 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 2 }, 1358 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 6 }, 1359 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 6 }, 1360 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 }, 1361 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, 1362 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 9 }, 1363 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 12 }, 1364 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, 1365 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 2 }, 1366 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, 1367 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 10 }, 1368 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 }, 1369 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 }, 1370 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 6 }, 1371 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 8 }, 1372 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 }, 1373 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 5 }, 1374 1375 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 4 }, 1376 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, 1377 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, 1378 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 3 }, 1379 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 3 }, 1380 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 }, 1381 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 }, 1382 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 }, 1383 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 10 }, 1384 }; 1385 1386 std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src); 1387 std::pair<int, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst); 1388 1389 if (ST->hasSSE2() && !ST->hasAVX()) { 1390 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, 1391 LTDest.second, LTSrc.second)) 
1392 return LTSrc.first * Entry->Cost; 1393 } 1394 1395 EVT SrcTy = TLI->getValueType(DL, Src); 1396 EVT DstTy = TLI->getValueType(DL, Dst); 1397 1398 // The function getSimpleVT only handles simple value types. 1399 if (!SrcTy.isSimple() || !DstTy.isSimple()) 1400 return BaseT::getCastInstrCost(Opcode, Dst, Src); 1401 1402 if (ST->hasDQI()) 1403 if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD, 1404 DstTy.getSimpleVT(), 1405 SrcTy.getSimpleVT())) 1406 return Entry->Cost; 1407 1408 if (ST->hasAVX512()) 1409 if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD, 1410 DstTy.getSimpleVT(), 1411 SrcTy.getSimpleVT())) 1412 return Entry->Cost; 1413 1414 if (ST->hasAVX2()) { 1415 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD, 1416 DstTy.getSimpleVT(), 1417 SrcTy.getSimpleVT())) 1418 return Entry->Cost; 1419 } 1420 1421 if (ST->hasAVX()) { 1422 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD, 1423 DstTy.getSimpleVT(), 1424 SrcTy.getSimpleVT())) 1425 return Entry->Cost; 1426 } 1427 1428 if (ST->hasSSE41()) { 1429 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD, 1430 DstTy.getSimpleVT(), 1431 SrcTy.getSimpleVT())) 1432 return Entry->Cost; 1433 } 1434 1435 if (ST->hasSSE2()) { 1436 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, 1437 DstTy.getSimpleVT(), 1438 SrcTy.getSimpleVT())) 1439 return Entry->Cost; 1440 } 1441 1442 return BaseT::getCastInstrCost(Opcode, Dst, Src, I); 1443 } 1444 1445 int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, 1446 const Instruction *I) { 1447 // Legalize the type. 
1448 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); 1449 1450 MVT MTy = LT.second; 1451 1452 int ISD = TLI->InstructionOpcodeToISD(Opcode); 1453 assert(ISD && "Invalid opcode"); 1454 1455 static const CostTblEntry SSE2CostTbl[] = { 1456 { ISD::SETCC, MVT::v2i64, 8 }, 1457 { ISD::SETCC, MVT::v4i32, 1 }, 1458 { ISD::SETCC, MVT::v8i16, 1 }, 1459 { ISD::SETCC, MVT::v16i8, 1 }, 1460 }; 1461 1462 static const CostTblEntry SSE42CostTbl[] = { 1463 { ISD::SETCC, MVT::v2f64, 1 }, 1464 { ISD::SETCC, MVT::v4f32, 1 }, 1465 { ISD::SETCC, MVT::v2i64, 1 }, 1466 }; 1467 1468 static const CostTblEntry AVX1CostTbl[] = { 1469 { ISD::SETCC, MVT::v4f64, 1 }, 1470 { ISD::SETCC, MVT::v8f32, 1 }, 1471 // AVX1 does not support 8-wide integer compare. 1472 { ISD::SETCC, MVT::v4i64, 4 }, 1473 { ISD::SETCC, MVT::v8i32, 4 }, 1474 { ISD::SETCC, MVT::v16i16, 4 }, 1475 { ISD::SETCC, MVT::v32i8, 4 }, 1476 }; 1477 1478 static const CostTblEntry AVX2CostTbl[] = { 1479 { ISD::SETCC, MVT::v4i64, 1 }, 1480 { ISD::SETCC, MVT::v8i32, 1 }, 1481 { ISD::SETCC, MVT::v16i16, 1 }, 1482 { ISD::SETCC, MVT::v32i8, 1 }, 1483 }; 1484 1485 static const CostTblEntry AVX512CostTbl[] = { 1486 { ISD::SETCC, MVT::v8i64, 1 }, 1487 { ISD::SETCC, MVT::v16i32, 1 }, 1488 { ISD::SETCC, MVT::v8f64, 1 }, 1489 { ISD::SETCC, MVT::v16f32, 1 }, 1490 }; 1491 1492 if (ST->hasAVX512()) 1493 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) 1494 return LT.first * Entry->Cost; 1495 1496 if (ST->hasAVX2()) 1497 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy)) 1498 return LT.first * Entry->Cost; 1499 1500 if (ST->hasAVX()) 1501 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) 1502 return LT.first * Entry->Cost; 1503 1504 if (ST->hasSSE42()) 1505 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy)) 1506 return LT.first * Entry->Cost; 1507 1508 if (ST->hasSSE2()) 1509 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy)) 1510 return LT.first * Entry->Cost; 
1511 1512 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I); 1513 } 1514 1515 unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; } 1516 1517 int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, 1518 ArrayRef<Type *> Tys, FastMathFlags FMF, 1519 unsigned ScalarizationCostPassed) { 1520 // Costs should match the codegen from: 1521 // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll 1522 // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll 1523 // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll 1524 // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll 1525 // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll 1526 static const CostTblEntry AVX512CDCostTbl[] = { 1527 { ISD::CTLZ, MVT::v8i64, 1 }, 1528 { ISD::CTLZ, MVT::v16i32, 1 }, 1529 { ISD::CTLZ, MVT::v32i16, 8 }, 1530 { ISD::CTLZ, MVT::v64i8, 20 }, 1531 { ISD::CTLZ, MVT::v4i64, 1 }, 1532 { ISD::CTLZ, MVT::v8i32, 1 }, 1533 { ISD::CTLZ, MVT::v16i16, 4 }, 1534 { ISD::CTLZ, MVT::v32i8, 10 }, 1535 { ISD::CTLZ, MVT::v2i64, 1 }, 1536 { ISD::CTLZ, MVT::v4i32, 1 }, 1537 { ISD::CTLZ, MVT::v8i16, 4 }, 1538 { ISD::CTLZ, MVT::v16i8, 4 }, 1539 }; 1540 static const CostTblEntry AVX512BWCostTbl[] = { 1541 { ISD::BITREVERSE, MVT::v8i64, 5 }, 1542 { ISD::BITREVERSE, MVT::v16i32, 5 }, 1543 { ISD::BITREVERSE, MVT::v32i16, 5 }, 1544 { ISD::BITREVERSE, MVT::v64i8, 5 }, 1545 { ISD::CTLZ, MVT::v8i64, 23 }, 1546 { ISD::CTLZ, MVT::v16i32, 22 }, 1547 { ISD::CTLZ, MVT::v32i16, 18 }, 1548 { ISD::CTLZ, MVT::v64i8, 17 }, 1549 { ISD::CTPOP, MVT::v8i64, 7 }, 1550 { ISD::CTPOP, MVT::v16i32, 11 }, 1551 { ISD::CTPOP, MVT::v32i16, 9 }, 1552 { ISD::CTPOP, MVT::v64i8, 6 }, 1553 { ISD::CTTZ, MVT::v8i64, 10 }, 1554 { ISD::CTTZ, MVT::v16i32, 14 }, 1555 { ISD::CTTZ, MVT::v32i16, 12 }, 1556 { ISD::CTTZ, MVT::v64i8, 9 }, 1557 }; 1558 static const CostTblEntry AVX512CostTbl[] = { 1559 { ISD::BITREVERSE, MVT::v8i64, 36 }, 1560 { ISD::BITREVERSE, MVT::v16i32, 24 }, 1561 { ISD::CTLZ, MVT::v8i64, 29 }, 1562 { ISD::CTLZ, 
MVT::v16i32, 35 }, 1563 { ISD::CTPOP, MVT::v8i64, 16 }, 1564 { ISD::CTPOP, MVT::v16i32, 24 }, 1565 { ISD::CTTZ, MVT::v8i64, 20 }, 1566 { ISD::CTTZ, MVT::v16i32, 28 }, 1567 }; 1568 static const CostTblEntry XOPCostTbl[] = { 1569 { ISD::BITREVERSE, MVT::v4i64, 4 }, 1570 { ISD::BITREVERSE, MVT::v8i32, 4 }, 1571 { ISD::BITREVERSE, MVT::v16i16, 4 }, 1572 { ISD::BITREVERSE, MVT::v32i8, 4 }, 1573 { ISD::BITREVERSE, MVT::v2i64, 1 }, 1574 { ISD::BITREVERSE, MVT::v4i32, 1 }, 1575 { ISD::BITREVERSE, MVT::v8i16, 1 }, 1576 { ISD::BITREVERSE, MVT::v16i8, 1 }, 1577 { ISD::BITREVERSE, MVT::i64, 3 }, 1578 { ISD::BITREVERSE, MVT::i32, 3 }, 1579 { ISD::BITREVERSE, MVT::i16, 3 }, 1580 { ISD::BITREVERSE, MVT::i8, 3 } 1581 }; 1582 static const CostTblEntry AVX2CostTbl[] = { 1583 { ISD::BITREVERSE, MVT::v4i64, 5 }, 1584 { ISD::BITREVERSE, MVT::v8i32, 5 }, 1585 { ISD::BITREVERSE, MVT::v16i16, 5 }, 1586 { ISD::BITREVERSE, MVT::v32i8, 5 }, 1587 { ISD::BSWAP, MVT::v4i64, 1 }, 1588 { ISD::BSWAP, MVT::v8i32, 1 }, 1589 { ISD::BSWAP, MVT::v16i16, 1 }, 1590 { ISD::CTLZ, MVT::v4i64, 23 }, 1591 { ISD::CTLZ, MVT::v8i32, 18 }, 1592 { ISD::CTLZ, MVT::v16i16, 14 }, 1593 { ISD::CTLZ, MVT::v32i8, 9 }, 1594 { ISD::CTPOP, MVT::v4i64, 7 }, 1595 { ISD::CTPOP, MVT::v8i32, 11 }, 1596 { ISD::CTPOP, MVT::v16i16, 9 }, 1597 { ISD::CTPOP, MVT::v32i8, 6 }, 1598 { ISD::CTTZ, MVT::v4i64, 10 }, 1599 { ISD::CTTZ, MVT::v8i32, 14 }, 1600 { ISD::CTTZ, MVT::v16i16, 12 }, 1601 { ISD::CTTZ, MVT::v32i8, 9 }, 1602 { ISD::FSQRT, MVT::f32, 7 }, // Haswell from http://www.agner.org/ 1603 { ISD::FSQRT, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/ 1604 { ISD::FSQRT, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/ 1605 { ISD::FSQRT, MVT::f64, 14 }, // Haswell from http://www.agner.org/ 1606 { ISD::FSQRT, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/ 1607 { ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/ 1608 }; 1609 static const CostTblEntry AVX1CostTbl[] = { 1610 { ISD::BITREVERSE, 
MVT::v4i64, 12 }, // 2 x 128-bit Op + extract/insert 1611 { ISD::BITREVERSE, MVT::v8i32, 12 }, // 2 x 128-bit Op + extract/insert 1612 { ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert 1613 { ISD::BITREVERSE, MVT::v32i8, 12 }, // 2 x 128-bit Op + extract/insert 1614 { ISD::BSWAP, MVT::v4i64, 4 }, 1615 { ISD::BSWAP, MVT::v8i32, 4 }, 1616 { ISD::BSWAP, MVT::v16i16, 4 }, 1617 { ISD::CTLZ, MVT::v4i64, 48 }, // 2 x 128-bit Op + extract/insert 1618 { ISD::CTLZ, MVT::v8i32, 38 }, // 2 x 128-bit Op + extract/insert 1619 { ISD::CTLZ, MVT::v16i16, 30 }, // 2 x 128-bit Op + extract/insert 1620 { ISD::CTLZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert 1621 { ISD::CTPOP, MVT::v4i64, 16 }, // 2 x 128-bit Op + extract/insert 1622 { ISD::CTPOP, MVT::v8i32, 24 }, // 2 x 128-bit Op + extract/insert 1623 { ISD::CTPOP, MVT::v16i16, 20 }, // 2 x 128-bit Op + extract/insert 1624 { ISD::CTPOP, MVT::v32i8, 14 }, // 2 x 128-bit Op + extract/insert 1625 { ISD::CTTZ, MVT::v4i64, 22 }, // 2 x 128-bit Op + extract/insert 1626 { ISD::CTTZ, MVT::v8i32, 30 }, // 2 x 128-bit Op + extract/insert 1627 { ISD::CTTZ, MVT::v16i16, 26 }, // 2 x 128-bit Op + extract/insert 1628 { ISD::CTTZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert 1629 { ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/ 1630 { ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/ 1631 { ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/ 1632 { ISD::FSQRT, MVT::f64, 21 }, // SNB from http://www.agner.org/ 1633 { ISD::FSQRT, MVT::v2f64, 21 }, // SNB from http://www.agner.org/ 1634 { ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/ 1635 }; 1636 static const CostTblEntry SSE42CostTbl[] = { 1637 { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/ 1638 { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/ 1639 }; 1640 static const CostTblEntry SSSE3CostTbl[] = { 1641 { ISD::BITREVERSE, MVT::v2i64, 5 }, 1642 { 
ISD::BITREVERSE, MVT::v4i32, 5 }, 1643 { ISD::BITREVERSE, MVT::v8i16, 5 }, 1644 { ISD::BITREVERSE, MVT::v16i8, 5 }, 1645 { ISD::BSWAP, MVT::v2i64, 1 }, 1646 { ISD::BSWAP, MVT::v4i32, 1 }, 1647 { ISD::BSWAP, MVT::v8i16, 1 }, 1648 { ISD::CTLZ, MVT::v2i64, 23 }, 1649 { ISD::CTLZ, MVT::v4i32, 18 }, 1650 { ISD::CTLZ, MVT::v8i16, 14 }, 1651 { ISD::CTLZ, MVT::v16i8, 9 }, 1652 { ISD::CTPOP, MVT::v2i64, 7 }, 1653 { ISD::CTPOP, MVT::v4i32, 11 }, 1654 { ISD::CTPOP, MVT::v8i16, 9 }, 1655 { ISD::CTPOP, MVT::v16i8, 6 }, 1656 { ISD::CTTZ, MVT::v2i64, 10 }, 1657 { ISD::CTTZ, MVT::v4i32, 14 }, 1658 { ISD::CTTZ, MVT::v8i16, 12 }, 1659 { ISD::CTTZ, MVT::v16i8, 9 } 1660 }; 1661 static const CostTblEntry SSE2CostTbl[] = { 1662 { ISD::BITREVERSE, MVT::v2i64, 29 }, 1663 { ISD::BITREVERSE, MVT::v4i32, 27 }, 1664 { ISD::BITREVERSE, MVT::v8i16, 27 }, 1665 { ISD::BITREVERSE, MVT::v16i8, 20 }, 1666 { ISD::BSWAP, MVT::v2i64, 7 }, 1667 { ISD::BSWAP, MVT::v4i32, 7 }, 1668 { ISD::BSWAP, MVT::v8i16, 7 }, 1669 { ISD::CTLZ, MVT::v2i64, 25 }, 1670 { ISD::CTLZ, MVT::v4i32, 26 }, 1671 { ISD::CTLZ, MVT::v8i16, 20 }, 1672 { ISD::CTLZ, MVT::v16i8, 17 }, 1673 { ISD::CTPOP, MVT::v2i64, 12 }, 1674 { ISD::CTPOP, MVT::v4i32, 15 }, 1675 { ISD::CTPOP, MVT::v8i16, 13 }, 1676 { ISD::CTPOP, MVT::v16i8, 10 }, 1677 { ISD::CTTZ, MVT::v2i64, 14 }, 1678 { ISD::CTTZ, MVT::v4i32, 18 }, 1679 { ISD::CTTZ, MVT::v8i16, 16 }, 1680 { ISD::CTTZ, MVT::v16i8, 13 }, 1681 { ISD::FSQRT, MVT::f64, 32 }, // Nehalem from http://www.agner.org/ 1682 { ISD::FSQRT, MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/ 1683 }; 1684 static const CostTblEntry SSE1CostTbl[] = { 1685 { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/ 1686 { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/ 1687 }; 1688 static const CostTblEntry X64CostTbl[] = { // 64-bit targets 1689 { ISD::BITREVERSE, MVT::i64, 14 } 1690 }; 1691 static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets 1692 { 
ISD::BITREVERSE, MVT::i32, 14 }, 1693 { ISD::BITREVERSE, MVT::i16, 14 }, 1694 { ISD::BITREVERSE, MVT::i8, 11 } 1695 }; 1696 1697 unsigned ISD = ISD::DELETED_NODE; 1698 switch (IID) { 1699 default: 1700 break; 1701 case Intrinsic::bitreverse: 1702 ISD = ISD::BITREVERSE; 1703 break; 1704 case Intrinsic::bswap: 1705 ISD = ISD::BSWAP; 1706 break; 1707 case Intrinsic::ctlz: 1708 ISD = ISD::CTLZ; 1709 break; 1710 case Intrinsic::ctpop: 1711 ISD = ISD::CTPOP; 1712 break; 1713 case Intrinsic::cttz: 1714 ISD = ISD::CTTZ; 1715 break; 1716 case Intrinsic::sqrt: 1717 ISD = ISD::FSQRT; 1718 break; 1719 } 1720 1721 // Legalize the type. 1722 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy); 1723 MVT MTy = LT.second; 1724 1725 // Attempt to lookup cost. 1726 if (ST->hasCDI()) 1727 if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy)) 1728 return LT.first * Entry->Cost; 1729 1730 if (ST->hasBWI()) 1731 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) 1732 return LT.first * Entry->Cost; 1733 1734 if (ST->hasAVX512()) 1735 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) 1736 return LT.first * Entry->Cost; 1737 1738 if (ST->hasXOP()) 1739 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy)) 1740 return LT.first * Entry->Cost; 1741 1742 if (ST->hasAVX2()) 1743 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy)) 1744 return LT.first * Entry->Cost; 1745 1746 if (ST->hasAVX()) 1747 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) 1748 return LT.first * Entry->Cost; 1749 1750 if (ST->hasSSE42()) 1751 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy)) 1752 return LT.first * Entry->Cost; 1753 1754 if (ST->hasSSSE3()) 1755 if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy)) 1756 return LT.first * Entry->Cost; 1757 1758 if (ST->hasSSE2()) 1759 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy)) 1760 return LT.first * Entry->Cost; 1761 1762 if 
(ST->hasSSE1()) 1763 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy)) 1764 return LT.first * Entry->Cost; 1765 1766 if (ST->is64Bit()) 1767 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy)) 1768 return LT.first * Entry->Cost; 1769 1770 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy)) 1771 return LT.first * Entry->Cost; 1772 1773 return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF, ScalarizationCostPassed); 1774 } 1775 1776 int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, 1777 ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) { 1778 return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF, VF); 1779 } 1780 1781 int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { 1782 assert(Val->isVectorTy() && "This must be a vector type"); 1783 1784 Type *ScalarType = Val->getScalarType(); 1785 1786 if (Index != -1U) { 1787 // Legalize the type. 1788 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val); 1789 1790 // This type is legalized to a scalar type. 1791 if (!LT.second.isVector()) 1792 return 0; 1793 1794 // The type may be split. Normalize the index to the new type. 1795 unsigned Width = LT.second.getVectorNumElements(); 1796 Index = Index % Width; 1797 1798 // Floating point scalars are already located in index #0. 1799 if (ScalarType->isFloatingPointTy() && Index == 0) 1800 return 0; 1801 } 1802 1803 // Add to the base cost if we know that the extracted element of a vector is 1804 // destined to be moved to and used in the integer register file. 
1805 int RegisterFileMoveCost = 0; 1806 if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy()) 1807 RegisterFileMoveCost = 1; 1808 1809 return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost; 1810 } 1811 1812 int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, 1813 unsigned AddressSpace, const Instruction *I) { 1814 // Handle non-power-of-two vectors such as <3 x float> 1815 if (VectorType *VTy = dyn_cast<VectorType>(Src)) { 1816 unsigned NumElem = VTy->getVectorNumElements(); 1817 1818 // Handle a few common cases: 1819 // <3 x float> 1820 if (NumElem == 3 && VTy->getScalarSizeInBits() == 32) 1821 // Cost = 64 bit store + extract + 32 bit store. 1822 return 3; 1823 1824 // <3 x double> 1825 if (NumElem == 3 && VTy->getScalarSizeInBits() == 64) 1826 // Cost = 128 bit store + unpack + 64 bit store. 1827 return 3; 1828 1829 // Assume that all other non-power-of-two numbers are scalarized. 1830 if (!isPowerOf2_32(NumElem)) { 1831 int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment, 1832 AddressSpace); 1833 int SplitCost = getScalarizationOverhead(Src, Opcode == Instruction::Load, 1834 Opcode == Instruction::Store); 1835 return NumElem * Cost + SplitCost; 1836 } 1837 } 1838 1839 // Legalize the type. 1840 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); 1841 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && 1842 "Invalid Opcode"); 1843 1844 // Each load/store unit costs 1. 1845 int Cost = LT.first * 1; 1846 1847 // This isn't exactly right. We're using slow unaligned 32-byte accesses as a 1848 // proxy for a double-pumped AVX memory interface such as on Sandybridge. 
1849 if (LT.second.getStoreSize() == 32 && ST->isUnalignedMem32Slow()) 1850 Cost *= 2; 1851 1852 return Cost; 1853 } 1854 1855 int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, 1856 unsigned Alignment, 1857 unsigned AddressSpace) { 1858 VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy); 1859 if (!SrcVTy) 1860 // To calculate scalar take the regular cost, without mask 1861 return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace); 1862 1863 unsigned NumElem = SrcVTy->getVectorNumElements(); 1864 VectorType *MaskTy = 1865 VectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem); 1866 if ((Opcode == Instruction::Load && !isLegalMaskedLoad(SrcVTy)) || 1867 (Opcode == Instruction::Store && !isLegalMaskedStore(SrcVTy)) || 1868 !isPowerOf2_32(NumElem)) { 1869 // Scalarization 1870 int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true); 1871 int ScalarCompareCost = getCmpSelInstrCost( 1872 Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr); 1873 int BranchCost = getCFInstrCost(Instruction::Br); 1874 int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost); 1875 1876 int ValueSplitCost = getScalarizationOverhead( 1877 SrcVTy, Opcode == Instruction::Load, Opcode == Instruction::Store); 1878 int MemopCost = 1879 NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(), 1880 Alignment, AddressSpace); 1881 return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost; 1882 } 1883 1884 // Legalize the type. 1885 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy); 1886 auto VT = TLI->getValueType(DL, SrcVTy); 1887 int Cost = 0; 1888 if (VT.isSimple() && LT.second != VT.getSimpleVT() && 1889 LT.second.getVectorNumElements() == NumElem) 1890 // Promotion requires expand/truncate for data and a shuffle for mask. 
1891 Cost += getShuffleCost(TTI::SK_Alternate, SrcVTy, 0, nullptr) + 1892 getShuffleCost(TTI::SK_Alternate, MaskTy, 0, nullptr); 1893 1894 else if (LT.second.getVectorNumElements() > NumElem) { 1895 VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(), 1896 LT.second.getVectorNumElements()); 1897 // Expanding requires fill mask with zeroes 1898 Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy); 1899 } 1900 if (!ST->hasAVX512()) 1901 return Cost + LT.first*4; // Each maskmov costs 4 1902 1903 // AVX-512 masked load/store is cheapper 1904 return Cost+LT.first; 1905 } 1906 1907 int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, 1908 const SCEV *Ptr) { 1909 // Address computations in vectorized code with non-consecutive addresses will 1910 // likely result in more instructions compared to scalar code where the 1911 // computation can more often be merged into the index mode. The resulting 1912 // extra micro-ops can significantly decrease throughput. 1913 unsigned NumVectorInstToHideOverhead = 10; 1914 1915 // Cost modeling of Strided Access Computation is hidden by the indexing 1916 // modes of X86 regardless of the stride value. We dont believe that there 1917 // is a difference between constant strided access in gerenal and constant 1918 // strided value which is less than or equal to 64. 1919 // Even in the case of (loop invariant) stride whose value is not known at 1920 // compile time, the address computation will not incur more than one extra 1921 // ADD instruction. 
1922 if (Ty->isVectorTy() && SE) { 1923 if (!BaseT::isStridedAccess(Ptr)) 1924 return NumVectorInstToHideOverhead; 1925 if (!BaseT::getConstantStrideStep(SE, Ptr)) 1926 return 1; 1927 } 1928 1929 return BaseT::getAddressComputationCost(Ty, SE, Ptr); 1930 } 1931 1932 int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy, 1933 bool IsPairwise) { 1934 1935 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); 1936 1937 MVT MTy = LT.second; 1938 1939 int ISD = TLI->InstructionOpcodeToISD(Opcode); 1940 assert(ISD && "Invalid opcode"); 1941 1942 // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput 1943 // and make it as the cost. 1944 1945 static const CostTblEntry SSE42CostTblPairWise[] = { 1946 { ISD::FADD, MVT::v2f64, 2 }, 1947 { ISD::FADD, MVT::v4f32, 4 }, 1948 { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6". 1949 { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5". 1950 { ISD::ADD, MVT::v8i16, 5 }, 1951 }; 1952 1953 static const CostTblEntry AVX1CostTblPairWise[] = { 1954 { ISD::FADD, MVT::v4f32, 4 }, 1955 { ISD::FADD, MVT::v4f64, 5 }, 1956 { ISD::FADD, MVT::v8f32, 7 }, 1957 { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5". 1958 { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5". 1959 { ISD::ADD, MVT::v4i64, 5 }, // The data reported by the IACA tool is "4.8". 1960 { ISD::ADD, MVT::v8i16, 5 }, 1961 { ISD::ADD, MVT::v8i32, 5 }, 1962 }; 1963 1964 static const CostTblEntry SSE42CostTblNoPairWise[] = { 1965 { ISD::FADD, MVT::v2f64, 2 }, 1966 { ISD::FADD, MVT::v4f32, 4 }, 1967 { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6". 1968 { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3". 1969 { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3". 
1970 }; 1971 1972 static const CostTblEntry AVX1CostTblNoPairWise[] = { 1973 { ISD::FADD, MVT::v4f32, 3 }, 1974 { ISD::FADD, MVT::v4f64, 3 }, 1975 { ISD::FADD, MVT::v8f32, 4 }, 1976 { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5". 1977 { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "2.8". 1978 { ISD::ADD, MVT::v4i64, 3 }, 1979 { ISD::ADD, MVT::v8i16, 4 }, 1980 { ISD::ADD, MVT::v8i32, 5 }, 1981 }; 1982 1983 if (IsPairwise) { 1984 if (ST->hasAVX()) 1985 if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy)) 1986 return LT.first * Entry->Cost; 1987 1988 if (ST->hasSSE42()) 1989 if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy)) 1990 return LT.first * Entry->Cost; 1991 } else { 1992 if (ST->hasAVX()) 1993 if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) 1994 return LT.first * Entry->Cost; 1995 1996 if (ST->hasSSE42()) 1997 if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy)) 1998 return LT.first * Entry->Cost; 1999 } 2000 2001 return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise); 2002 } 2003 2004 int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy, 2005 bool IsPairwise, bool IsUnsigned) { 2006 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); 2007 2008 MVT MTy = LT.second; 2009 2010 int ISD; 2011 if (ValTy->isIntOrIntVectorTy()) { 2012 ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN; 2013 } else { 2014 assert(ValTy->isFPOrFPVectorTy() && 2015 "Expected float point or integer vector type."); 2016 ISD = ISD::FMINNUM; 2017 } 2018 2019 // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput 2020 // and make it as the cost. 
2021 2022 static const CostTblEntry SSE42CostTblPairWise[] = { 2023 {ISD::FMINNUM, MVT::v2f64, 3}, 2024 {ISD::FMINNUM, MVT::v4f32, 2}, 2025 {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8" 2026 {ISD::UMIN, MVT::v2i64, 8}, // The data reported by the IACA is "8.6" 2027 {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5" 2028 {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8" 2029 {ISD::SMIN, MVT::v8i16, 2}, 2030 {ISD::UMIN, MVT::v8i16, 2}, 2031 }; 2032 2033 static const CostTblEntry AVX1CostTblPairWise[] = { 2034 {ISD::FMINNUM, MVT::v4f32, 1}, 2035 {ISD::FMINNUM, MVT::v4f64, 1}, 2036 {ISD::FMINNUM, MVT::v8f32, 2}, 2037 {ISD::SMIN, MVT::v2i64, 3}, 2038 {ISD::UMIN, MVT::v2i64, 3}, 2039 {ISD::SMIN, MVT::v4i32, 1}, 2040 {ISD::UMIN, MVT::v4i32, 1}, 2041 {ISD::SMIN, MVT::v8i16, 1}, 2042 {ISD::UMIN, MVT::v8i16, 1}, 2043 {ISD::SMIN, MVT::v8i32, 3}, 2044 {ISD::UMIN, MVT::v8i32, 3}, 2045 }; 2046 2047 static const CostTblEntry AVX2CostTblPairWise[] = { 2048 {ISD::SMIN, MVT::v4i64, 2}, 2049 {ISD::UMIN, MVT::v4i64, 2}, 2050 {ISD::SMIN, MVT::v8i32, 1}, 2051 {ISD::UMIN, MVT::v8i32, 1}, 2052 {ISD::SMIN, MVT::v16i16, 1}, 2053 {ISD::UMIN, MVT::v16i16, 1}, 2054 {ISD::SMIN, MVT::v32i8, 2}, 2055 {ISD::UMIN, MVT::v32i8, 2}, 2056 }; 2057 2058 static const CostTblEntry AVX512CostTblPairWise[] = { 2059 {ISD::FMINNUM, MVT::v8f64, 1}, 2060 {ISD::FMINNUM, MVT::v16f32, 2}, 2061 {ISD::SMIN, MVT::v8i64, 2}, 2062 {ISD::UMIN, MVT::v8i64, 2}, 2063 {ISD::SMIN, MVT::v16i32, 1}, 2064 {ISD::UMIN, MVT::v16i32, 1}, 2065 }; 2066 2067 static const CostTblEntry SSE42CostTblNoPairWise[] = { 2068 {ISD::FMINNUM, MVT::v2f64, 3}, 2069 {ISD::FMINNUM, MVT::v4f32, 3}, 2070 {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8" 2071 {ISD::UMIN, MVT::v2i64, 9}, // The data reported by the IACA is "8.6" 2072 {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5" 2073 {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is 
"1.8" 2074 {ISD::SMIN, MVT::v8i16, 1}, // The data reported by the IACA is "1.5" 2075 {ISD::UMIN, MVT::v8i16, 2}, // The data reported by the IACA is "1.8" 2076 }; 2077 2078 static const CostTblEntry AVX1CostTblNoPairWise[] = { 2079 {ISD::FMINNUM, MVT::v4f32, 1}, 2080 {ISD::FMINNUM, MVT::v4f64, 1}, 2081 {ISD::FMINNUM, MVT::v8f32, 1}, 2082 {ISD::SMIN, MVT::v2i64, 3}, 2083 {ISD::UMIN, MVT::v2i64, 3}, 2084 {ISD::SMIN, MVT::v4i32, 1}, 2085 {ISD::UMIN, MVT::v4i32, 1}, 2086 {ISD::SMIN, MVT::v8i16, 1}, 2087 {ISD::UMIN, MVT::v8i16, 1}, 2088 {ISD::SMIN, MVT::v8i32, 2}, 2089 {ISD::UMIN, MVT::v8i32, 2}, 2090 }; 2091 2092 static const CostTblEntry AVX2CostTblNoPairWise[] = { 2093 {ISD::SMIN, MVT::v4i64, 1}, 2094 {ISD::UMIN, MVT::v4i64, 1}, 2095 {ISD::SMIN, MVT::v8i32, 1}, 2096 {ISD::UMIN, MVT::v8i32, 1}, 2097 {ISD::SMIN, MVT::v16i16, 1}, 2098 {ISD::UMIN, MVT::v16i16, 1}, 2099 {ISD::SMIN, MVT::v32i8, 1}, 2100 {ISD::UMIN, MVT::v32i8, 1}, 2101 }; 2102 2103 static const CostTblEntry AVX512CostTblNoPairWise[] = { 2104 {ISD::FMINNUM, MVT::v8f64, 1}, 2105 {ISD::FMINNUM, MVT::v16f32, 2}, 2106 {ISD::SMIN, MVT::v8i64, 1}, 2107 {ISD::UMIN, MVT::v8i64, 1}, 2108 {ISD::SMIN, MVT::v16i32, 1}, 2109 {ISD::UMIN, MVT::v16i32, 1}, 2110 }; 2111 2112 if (IsPairwise) { 2113 if (ST->hasAVX512()) 2114 if (const auto *Entry = CostTableLookup(AVX512CostTblPairWise, ISD, MTy)) 2115 return LT.first * Entry->Cost; 2116 2117 if (ST->hasAVX2()) 2118 if (const auto *Entry = CostTableLookup(AVX2CostTblPairWise, ISD, MTy)) 2119 return LT.first * Entry->Cost; 2120 2121 if (ST->hasAVX()) 2122 if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy)) 2123 return LT.first * Entry->Cost; 2124 2125 if (ST->hasSSE42()) 2126 if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy)) 2127 return LT.first * Entry->Cost; 2128 } else { 2129 if (ST->hasAVX512()) 2130 if (const auto *Entry = 2131 CostTableLookup(AVX512CostTblNoPairWise, ISD, MTy)) 2132 return LT.first * Entry->Cost; 2133 2134 if 
(ST->hasAVX2()) 2135 if (const auto *Entry = CostTableLookup(AVX2CostTblNoPairWise, ISD, MTy)) 2136 return LT.first * Entry->Cost; 2137 2138 if (ST->hasAVX()) 2139 if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) 2140 return LT.first * Entry->Cost; 2141 2142 if (ST->hasSSE42()) 2143 if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy)) 2144 return LT.first * Entry->Cost; 2145 } 2146 2147 return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned); 2148 } 2149 2150 /// \brief Calculate the cost of materializing a 64-bit value. This helper 2151 /// method might only calculate a fraction of a larger immediate. Therefore it 2152 /// is valid to return a cost of ZERO. 2153 int X86TTIImpl::getIntImmCost(int64_t Val) { 2154 if (Val == 0) 2155 return TTI::TCC_Free; 2156 2157 if (isInt<32>(Val)) 2158 return TTI::TCC_Basic; 2159 2160 return 2 * TTI::TCC_Basic; 2161 } 2162 2163 int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { 2164 assert(Ty->isIntegerTy()); 2165 2166 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 2167 if (BitSize == 0) 2168 return ~0U; 2169 2170 // Never hoist constants larger than 128bit, because this might lead to 2171 // incorrect code generation or assertions in codegen. 2172 // Fixme: Create a cost model for types larger than i128 once the codegen 2173 // issues have been fixed. 2174 if (BitSize > 128) 2175 return TTI::TCC_Free; 2176 2177 if (Imm == 0) 2178 return TTI::TCC_Free; 2179 2180 // Sign-extend all constants to a multiple of 64-bit. 2181 APInt ImmVal = Imm; 2182 if (BitSize & 0x3f) 2183 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU); 2184 2185 // Split the constant into 64-bit chunks and calculate the cost for each 2186 // chunk. 
2187 int Cost = 0; 2188 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { 2189 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64); 2190 int64_t Val = Tmp.getSExtValue(); 2191 Cost += getIntImmCost(Val); 2192 } 2193 // We need at least one instruction to materialize the constant. 2194 return std::max(1, Cost); 2195 } 2196 2197 int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, 2198 Type *Ty) { 2199 assert(Ty->isIntegerTy()); 2200 2201 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 2202 // There is no cost model for constants with a bit size of 0. Return TCC_Free 2203 // here, so that constant hoisting will ignore this constant. 2204 if (BitSize == 0) 2205 return TTI::TCC_Free; 2206 2207 unsigned ImmIdx = ~0U; 2208 switch (Opcode) { 2209 default: 2210 return TTI::TCC_Free; 2211 case Instruction::GetElementPtr: 2212 // Always hoist the base address of a GetElementPtr. This prevents the 2213 // creation of new constants for every base constant that gets constant 2214 // folded with the offset. 2215 if (Idx == 0) 2216 return 2 * TTI::TCC_Basic; 2217 return TTI::TCC_Free; 2218 case Instruction::Store: 2219 ImmIdx = 0; 2220 break; 2221 case Instruction::ICmp: 2222 // This is an imperfect hack to prevent constant hoisting of 2223 // compares that might be trying to check if a 64-bit value fits in 2224 // 32-bits. The backend can optimize these cases using a right shift by 32. 2225 // Ideally we would check the compare predicate here. There also other 2226 // similar immediates the backend can use shifts for. 2227 if (Idx == 1 && Imm.getBitWidth() == 64) { 2228 uint64_t ImmVal = Imm.getZExtValue(); 2229 if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff) 2230 return TTI::TCC_Free; 2231 } 2232 ImmIdx = 1; 2233 break; 2234 case Instruction::And: 2235 // We support 64-bit ANDs with immediates with 32-bits of leading zeroes 2236 // by using a 32-bit operation with implicit zero extension. 
Detect such 2237 // immediates here as the normal path expects bit 31 to be sign extended. 2238 if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue())) 2239 return TTI::TCC_Free; 2240 LLVM_FALLTHROUGH; 2241 case Instruction::Add: 2242 case Instruction::Sub: 2243 case Instruction::Mul: 2244 case Instruction::UDiv: 2245 case Instruction::SDiv: 2246 case Instruction::URem: 2247 case Instruction::SRem: 2248 case Instruction::Or: 2249 case Instruction::Xor: 2250 ImmIdx = 1; 2251 break; 2252 // Always return TCC_Free for the shift value of a shift instruction. 2253 case Instruction::Shl: 2254 case Instruction::LShr: 2255 case Instruction::AShr: 2256 if (Idx == 1) 2257 return TTI::TCC_Free; 2258 break; 2259 case Instruction::Trunc: 2260 case Instruction::ZExt: 2261 case Instruction::SExt: 2262 case Instruction::IntToPtr: 2263 case Instruction::PtrToInt: 2264 case Instruction::BitCast: 2265 case Instruction::PHI: 2266 case Instruction::Call: 2267 case Instruction::Select: 2268 case Instruction::Ret: 2269 case Instruction::Load: 2270 break; 2271 } 2272 2273 if (Idx == ImmIdx) { 2274 int NumConstants = (BitSize + 63) / 64; 2275 int Cost = X86TTIImpl::getIntImmCost(Imm, Ty); 2276 return (Cost <= NumConstants * TTI::TCC_Basic) 2277 ? static_cast<int>(TTI::TCC_Free) 2278 : Cost; 2279 } 2280 2281 return X86TTIImpl::getIntImmCost(Imm, Ty); 2282 } 2283 2284 int X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, 2285 Type *Ty) { 2286 assert(Ty->isIntegerTy()); 2287 2288 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 2289 // There is no cost model for constants with a bit size of 0. Return TCC_Free 2290 // here, so that constant hoisting will ignore this constant. 
2291 if (BitSize == 0) 2292 return TTI::TCC_Free; 2293 2294 switch (IID) { 2295 default: 2296 return TTI::TCC_Free; 2297 case Intrinsic::sadd_with_overflow: 2298 case Intrinsic::uadd_with_overflow: 2299 case Intrinsic::ssub_with_overflow: 2300 case Intrinsic::usub_with_overflow: 2301 case Intrinsic::smul_with_overflow: 2302 case Intrinsic::umul_with_overflow: 2303 if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue())) 2304 return TTI::TCC_Free; 2305 break; 2306 case Intrinsic::experimental_stackmap: 2307 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) 2308 return TTI::TCC_Free; 2309 break; 2310 case Intrinsic::experimental_patchpoint_void: 2311 case Intrinsic::experimental_patchpoint_i64: 2312 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) 2313 return TTI::TCC_Free; 2314 break; 2315 } 2316 return X86TTIImpl::getIntImmCost(Imm, Ty); 2317 } 2318 2319 unsigned X86TTIImpl::getUserCost(const User *U, 2320 ArrayRef<const Value *> Operands) { 2321 if (isa<StoreInst>(U)) { 2322 Value *Ptr = U->getOperand(1); 2323 // Store instruction with index and scale costs 2 Uops. 2324 // Check the preceding GEP to identify non-const indices. 2325 if (auto GEP = dyn_cast<GetElementPtrInst>(Ptr)) { 2326 if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); })) 2327 return TTI::TCC_Basic * 2; 2328 } 2329 return TTI::TCC_Basic; 2330 } 2331 return BaseT::getUserCost(U, Operands); 2332 } 2333 2334 // Return an average cost of Gather / Scatter instruction, maybe improved later 2335 int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr, 2336 unsigned Alignment, unsigned AddressSpace) { 2337 2338 assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost"); 2339 unsigned VF = SrcVTy->getVectorNumElements(); 2340 2341 // Try to reduce index size from 64 bit (default for GEP) 2342 // to 32. It is essential for VF 16. 
If the index can't be reduced to 32, the 2343 // operation will use 16 x 64 indices which do not fit in a zmm and needs 2344 // to split. Also check that the base pointer is the same for all lanes, 2345 // and that there's at most one variable index. 2346 auto getIndexSizeInBits = [](Value *Ptr, const DataLayout& DL) { 2347 unsigned IndexSize = DL.getPointerSizeInBits(); 2348 GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr); 2349 if (IndexSize < 64 || !GEP) 2350 return IndexSize; 2351 2352 unsigned NumOfVarIndices = 0; 2353 Value *Ptrs = GEP->getPointerOperand(); 2354 if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs)) 2355 return IndexSize; 2356 for (unsigned i = 1; i < GEP->getNumOperands(); ++i) { 2357 if (isa<Constant>(GEP->getOperand(i))) 2358 continue; 2359 Type *IndxTy = GEP->getOperand(i)->getType(); 2360 if (IndxTy->isVectorTy()) 2361 IndxTy = IndxTy->getVectorElementType(); 2362 if ((IndxTy->getPrimitiveSizeInBits() == 64 && 2363 !isa<SExtInst>(GEP->getOperand(i))) || 2364 ++NumOfVarIndices > 1) 2365 return IndexSize; // 64 2366 } 2367 return (unsigned)32; 2368 }; 2369 2370 2371 // Trying to reduce IndexSize to 32 bits for vector 16. 2372 // By default the IndexSize is equal to pointer size. 2373 unsigned IndexSize = (ST->hasAVX512() && VF >= 16) 2374 ? 
getIndexSizeInBits(Ptr, DL) 2375 : DL.getPointerSizeInBits(); 2376 2377 Type *IndexVTy = VectorType::get(IntegerType::get(SrcVTy->getContext(), 2378 IndexSize), VF); 2379 std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy); 2380 std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy); 2381 int SplitFactor = std::max(IdxsLT.first, SrcLT.first); 2382 if (SplitFactor > 1) { 2383 // Handle splitting of vector of pointers 2384 Type *SplitSrcTy = VectorType::get(SrcVTy->getScalarType(), VF / SplitFactor); 2385 return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment, 2386 AddressSpace); 2387 } 2388 2389 // The gather / scatter cost is given by Intel architects. It is a rough 2390 // number since we are looking at one instruction in a time. 2391 const int GSOverhead = (Opcode == Instruction::Load) 2392 ? ST->getGatherOverhead() 2393 : ST->getScatterOverhead(); 2394 return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(), 2395 Alignment, AddressSpace); 2396 } 2397 2398 /// Return the cost of full scalarization of gather / scatter operation. 2399 /// 2400 /// Opcode - Load or Store instruction. 2401 /// SrcVTy - The type of the data vector that should be gathered or scattered. 2402 /// VariableMask - The mask is non-constant at compile time. 2403 /// Alignment - Alignment for one element. 2404 /// AddressSpace - pointer[s] address space. 
2405 /// 2406 int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy, 2407 bool VariableMask, unsigned Alignment, 2408 unsigned AddressSpace) { 2409 unsigned VF = SrcVTy->getVectorNumElements(); 2410 2411 int MaskUnpackCost = 0; 2412 if (VariableMask) { 2413 VectorType *MaskTy = 2414 VectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF); 2415 MaskUnpackCost = getScalarizationOverhead(MaskTy, false, true); 2416 int ScalarCompareCost = 2417 getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()), 2418 nullptr); 2419 int BranchCost = getCFInstrCost(Instruction::Br); 2420 MaskUnpackCost += VF * (BranchCost + ScalarCompareCost); 2421 } 2422 2423 // The cost of the scalar loads/stores. 2424 int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(), 2425 Alignment, AddressSpace); 2426 2427 int InsertExtractCost = 0; 2428 if (Opcode == Instruction::Load) 2429 for (unsigned i = 0; i < VF; ++i) 2430 // Add the cost of inserting each scalar load into the vector 2431 InsertExtractCost += 2432 getVectorInstrCost(Instruction::InsertElement, SrcVTy, i); 2433 else 2434 for (unsigned i = 0; i < VF; ++i) 2435 // Add the cost of extracting each element out of the data vector 2436 InsertExtractCost += 2437 getVectorInstrCost(Instruction::ExtractElement, SrcVTy, i); 2438 2439 return MemoryOpCost + MaskUnpackCost + InsertExtractCost; 2440 } 2441 2442 /// Calculate the cost of Gather / Scatter operation 2443 int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy, 2444 Value *Ptr, bool VariableMask, 2445 unsigned Alignment) { 2446 assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter"); 2447 unsigned VF = SrcVTy->getVectorNumElements(); 2448 PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType()); 2449 if (!PtrTy && Ptr->getType()->isVectorTy()) 2450 PtrTy = dyn_cast<PointerType>(Ptr->getType()->getVectorElementType()); 2451 assert(PtrTy && "Unexpected type for Ptr argument"); 2452 unsigned AddressSpace = 
PtrTy->getAddressSpace();

  // Fall back to full scalarization when the target has no legal
  // gather/scatter support for this type.
  bool Scalarize = false;
  if ((Opcode == Instruction::Load && !isLegalMaskedGather(SrcVTy)) ||
      (Opcode == Instruction::Store && !isLegalMaskedScatter(SrcVTy)))
    Scalarize = true;
  // Gather / Scatter for vector 2 is not profitable on KNL / SKX
  // Vector-4 of gather/scatter instruction does not exist on KNL.
  // We can extend it to 8 elements, but zeroing upper bits of
  // the mask vector will add more instructions. Right now we give the scalar
  // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction
  // is better in the VariableMask case.
  if (ST->hasAVX512() && (VF == 2 || (VF == 4 && !ST->hasVLX())))
    Scalarize = true;

  if (Scalarize)
    return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
                           AddressSpace);

  return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
}

// Lexicographic comparison of LSR costs; the instruction count (Insns) is
// compared first.
bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
                               TargetTransformInfo::LSRCost &C2) {
  // X86 specific here are "instruction number 1st priority".
  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
                  C1.NumIVMuls, C1.NumBaseAdds,
                  C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
         std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
                  C2.NumIVMuls, C2.NumBaseAdds,
                  C2.ScaleCost, C2.ImmCost, C2.SetupCost);
}

bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
  // The backend can't handle a single element vector.
  if (isa<VectorType>(DataTy) && DataTy->getVectorNumElements() == 1)
    return false;
  Type *ScalarTy = DataTy->getScalarType();
  // Pointer elements are measured with the data layout's pointer width.
  int DataWidth = isa<PointerType>(ScalarTy) ?
DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();

  // 32/64-bit elements are legal with AVX; 8/16-bit elements additionally
  // require BWI.
  return ((DataWidth == 32 || DataWidth == 64) && ST->hasAVX()) ||
         ((DataWidth == 8 || DataWidth == 16) && ST->hasBWI());
}

bool X86TTIImpl::isLegalMaskedStore(Type *DataType) {
  // Masked-store legality matches masked-load legality.
  return isLegalMaskedLoad(DataType);
}

bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
  // This function is called now in two cases: from the Loop Vectorizer
  // and from the Scalarizer.
  // When the Loop Vectorizer asks about legality of the feature,
  // the vectorization factor is not calculated yet. The Loop Vectorizer
  // sends a scalar type and the decision is based on the width of the
  // scalar element.
  // Later on, the cost model will estimate usage of this intrinsic based on
  // the vector type.
  // The Scalarizer asks again about legality. It sends a vector type.
  // In this case we can reject non-power-of-2 vectors.
  // We also reject single element vectors as the type legalizer can't
  // scalarize it.
  if (isa<VectorType>(DataTy)) {
    unsigned NumElts = DataTy->getVectorNumElements();
    if (NumElts == 1 || !isPowerOf2_32(NumElts))
      return false;
  }
  Type *ScalarTy = DataTy->getScalarType();
  int DataWidth = isa<PointerType>(ScalarTy) ?
    DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();

  // Some CPUs have better gather performance than others.
  // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
  // enable gather with a -march.
return (DataWidth == 32 || DataWidth == 64) &&
         (ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2()));
}

bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) {
  // AVX2 doesn't support scatter
  if (!ST->hasAVX512())
    return false;
  // Beyond the AVX-512 requirement, the element-type rules are the same as
  // for gather.
  return isLegalMaskedGather(DataType);
}

// Return true when a single legal operation yields both quotient and
// remainder (SDIVREM/UDIVREM) for this type.
bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
  EVT VT = TLI->getValueType(DL, DataType);
  return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
}

// An ordered FP compare is never reported as cheaper than a compare against
// zero on X86.
bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
  return false;
}

bool X86TTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();

  // Work this as a subsetting of subtarget features.
  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();

  // FIXME: This is likely too limiting as it will include subtarget features
  // that we might not care about for inlining, but it is conservatively
  // correct.
  return (CallerBits & CalleeBits) == CalleeBits;
}

const X86TTIImpl::TTI::MemCmpExpansionOptions *
X86TTIImpl::enableMemCmpExpansion(bool IsZeroCmp) const {
  // Only enable vector loads for equality comparison.
  // Right now the vector version is not as fast, see #33329.
  // NOTE(review): these function-local statics capture `this` (and thus ST)
  // at the first call only; a later X86TTIImpl with a different subtarget
  // would observe the first caller's options — confirm this is intended.
  static const auto ThreeWayOptions = [this]() {
    TTI::MemCmpExpansionOptions Options;
    if (ST->is64Bit()) {
      Options.LoadSizes.push_back(8);
    }
    Options.LoadSizes.push_back(4);
    Options.LoadSizes.push_back(2);
    Options.LoadSizes.push_back(1);
    return Options;
  }();
  static const auto EqZeroOptions = [this]() {
    TTI::MemCmpExpansionOptions Options;
    // TODO: enable AVX512 when the DAG is ready.
    // if (ST->hasAVX512()) Options.LoadSizes.push_back(64);
    if (ST->hasAVX2()) Options.LoadSizes.push_back(32);
    if (ST->hasSSE2()) Options.LoadSizes.push_back(16);
    if (ST->is64Bit()) {
      Options.LoadSizes.push_back(8);
    }
    Options.LoadSizes.push_back(4);
    Options.LoadSizes.push_back(2);
    Options.LoadSizes.push_back(1);
    return Options;
  }();
  return IsZeroCmp ? &EqZeroOptions : &ThreeWayOptions;
}

bool X86TTIImpl::enableInterleavedAccessVectorization() {
  // TODO: We expect this to be beneficial regardless of arch,
  // but there are currently some unexplained performance artifacts on Atom.
  // As a temporary solution, disable on Atom.
  return !(ST->isAtom());
}

// Get estimation for interleaved load/store operations for AVX2.
// \p Factor is the interleaved-access factor (stride) - number of
// (interleaved) elements in the group.
// \p Indices contains the indices for a strided load: when the
// interleaved load has gaps they indicate which elements are used.
// If Indices is empty (or if the number of indices is equal to the size
// of the interleaved-access as given in \p Factor) the access has no gaps.
//
// As opposed to AVX-512, AVX2 does not have generic shuffles that allow
// computing the cost using a generic formula as a function of generic
// shuffles. We therefore use a lookup table instead, filled according to
// the instruction sequences that codegen currently generates.
int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
                                               unsigned Factor,
                                               ArrayRef<unsigned> Indices,
                                               unsigned Alignment,
                                               unsigned AddressSpace) {

  // We currently support only fully-interleaved groups, with no gaps.
  // TODO: Support also strided loads (interleaved-groups with gaps).
  if (Indices.size() && Indices.size() != Factor)
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace);

  // VecTy for interleave memop is <VF*Factor x Elt>.
  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
  // VecTy = <12 x i32>.
  MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;

  // This function can be called with VecTy=<6xi128>, Factor=3, in which case
  // the VF=2, while v2i128 is an unsupported MVT vector type
  // (see MachineValueType.h::getVectorVT()).
  if (!LegalVT.isVector())
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace);

  unsigned VF = VecTy->getVectorNumElements() / Factor;
  Type *ScalarTy = VecTy->getVectorElementType();

  // Calculate the number of memory operations (NumOfMemOps), required
  // for load/store the VecTy.
  unsigned VecTySize = DL.getTypeStoreSize(VecTy);
  unsigned LegalVTSize = LegalVT.getStoreSize();
  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;

  // Get the cost of one memory operation.
  Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
                                        LegalVT.getVectorNumElements());
  unsigned MemOpCost =
      getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);

  // The cost tables are keyed by the per-member vector type <VF x Elt>;
  // bail out to the base implementation if it has no simple MVT.
  VectorType *VT = VectorType::get(ScalarTy, VF);
  EVT ETy = TLI->getValueType(DL, VT);
  if (!ETy.isSimple())
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace);

  // TODO: Complete for other data-types and strides.
  // Each combination of Stride, ElementTy and VF results in a different
  // sequence; The cost tables are therefore accessed with:
  // Factor (stride) and VectorType=VFxElemType.
  // The Cost accounts only for the shuffle sequence;
  // The cost of the loads/stores is accounted for separately.
  //
  static const CostTblEntry AVX2InterleavedLoadTbl[] = {
    { 2, MVT::v4i64, 6 },  //(load 8i64 and)  deinterleave into 2 x 4i64
    { 2, MVT::v4f64, 6 },  //(load 8f64 and)  deinterleave into 2 x 4f64

    { 3, MVT::v2i8,  10 }, //(load 6i8 and)   deinterleave into 3 x 2i8
    { 3, MVT::v4i8,  4 },  //(load 12i8 and)  deinterleave into 3 x 4i8
    { 3, MVT::v8i8,  9 },  //(load 24i8 and)  deinterleave into 3 x 8i8
    { 3, MVT::v16i8, 11},  //(load 48i8 and)  deinterleave into 3 x 16i8
    { 3, MVT::v32i8, 13},  //(load 96i8 and)  deinterleave into 3 x 32i8
    { 3, MVT::v8f32, 17 }, //(load 24f32 and) deinterleave into 3 x 8f32

    { 4, MVT::v2i8,  12 }, //(load 8i8 and)   deinterleave into 4 x 2i8
    { 4, MVT::v4i8,  4 },  //(load 16i8 and)  deinterleave into 4 x 4i8
    { 4, MVT::v8i8,  20 }, //(load 32i8 and)  deinterleave into 4 x 8i8
    { 4, MVT::v16i8, 39 }, //(load 64i8 and)  deinterleave into 4 x 16i8
    { 4, MVT::v32i8, 80 }, //(load 128i8 and) deinterleave into 4 x 32i8

    { 8, MVT::v8f32, 40 }  //(load 64f32 and) deinterleave into 8 x 8f32
  };

  static const CostTblEntry AVX2InterleavedStoreTbl[] = {
    { 2, MVT::v4i64, 6 },  //interleave 2 x 4i64 into 8i64 (and store)
    { 2, MVT::v4f64, 6 },  //interleave 2 x 4f64 into 8f64 (and store)

    { 3, MVT::v2i8,  7 },  //interleave 3 x 2i8  into 6i8 (and store)
    { 3, MVT::v4i8,  8 },  //interleave 3 x 4i8  into 12i8 (and store)
    { 3, MVT::v8i8,  11 }, //interleave 3 x 8i8  into 24i8 (and store)
    { 3, MVT::v16i8, 11 }, //interleave 3 x 16i8 into 48i8 (and store)
    { 3, MVT::v32i8, 13 }, //interleave 3 x 32i8 into 96i8 (and store)

    { 4, MVT::v2i8,  12 }, //interleave 4 x 2i8  into 8i8 (and store)
    { 4, MVT::v4i8,  9 },  //interleave 4 x 4i8  into 16i8 (and store)
    { 4, MVT::v8i8,  10 }, //interleave 4 x 8i8  into 32i8 (and store)
    { 4, MVT::v16i8, 10 }, //interleave 4 x 16i8 into 64i8 (and store)
    { 4, MVT::v32i8, 12 }  //interleave 4 x 32i8 into 128i8 (and store)
  };

  // Total cost = the raw wide memory ops plus the table's shuffle-sequence
  // cost for this (Factor, member type) combination.
  if (Opcode == Instruction::Load) {
    if (const auto *Entry =
            CostTableLookup(AVX2InterleavedLoadTbl, Factor, ETy.getSimpleVT()))
      return NumOfMemOps * MemOpCost + Entry->Cost;
  } else {
    assert(Opcode == Instruction::Store &&
           "Expected Store Instruction at this point");
    if (const auto *Entry =
            CostTableLookup(AVX2InterleavedStoreTbl, Factor, ETy.getSimpleVT()))
      return NumOfMemOps * MemOpCost + Entry->Cost;
  }

  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                           Alignment, AddressSpace);
}

// Get estimation for interleaved load/store operations and strided load.
// \p Indices contains indices for strided load.
// \p Factor - the factor of interleaving.
// AVX-512 provides 3-src shuffles that significantly reduces the cost.
int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
                                                 unsigned Factor,
                                                 ArrayRef<unsigned> Indices,
                                                 unsigned Alignment,
                                                 unsigned AddressSpace) {

  // VecTy for interleave memop is <VF*Factor x Elt>.
  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
  // VecTy = <12 x i32>.

  // Calculate the number of memory operations (NumOfMemOps), required
  // for load/store the VecTy.
  MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
  unsigned VecTySize = DL.getTypeStoreSize(VecTy);
  unsigned LegalVTSize = LegalVT.getStoreSize();
  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;

  // Get the cost of one memory operation.
  Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
                                        LegalVT.getVectorNumElements());
  unsigned MemOpCost =
      getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);

  // Cost tables are keyed by the per-member vector type <VF x Elt>.
  unsigned VF = VecTy->getVectorNumElements() / Factor;
  MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);

  if (Opcode == Instruction::Load) {
    // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
    // contain the cost of the optimized shuffle sequence that the
    // X86InterleavedAccess pass will generate.
    // The cost of loads and stores are computed separately from the table.

    // X86InterleavedAccess support only the following interleaved-access group.
    static const CostTblEntry AVX512InterleavedLoadTbl[] = {
        {3, MVT::v16i8, 12}, //(load 48i8 and)  deinterleave into 3 x 16i8
        {3, MVT::v32i8, 14}, //(load 96i8 and)  deinterleave into 3 x 32i8
        {3, MVT::v64i8, 22}, //(load 192i8 and) deinterleave into 3 x 64i8
    };

    if (const auto *Entry =
            CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
      return NumOfMemOps * MemOpCost + Entry->Cost;
    // If an entry does not exist, fallback to the default implementation.

    // Kind of shuffle depends on number of loaded values.
    // If we load the entire data in one register, we can use a 1-src shuffle.
    // Otherwise, we'll merge 2 sources in each operation.
    TTI::ShuffleKind ShuffleKind =
        (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;

    unsigned ShuffleCost =
        getShuffleCost(ShuffleKind, SingleMemOpTy, 0, nullptr);

    unsigned NumOfLoadsInInterleaveGrp =
        Indices.size() ? Indices.size() : Factor;
    Type *ResultTy = VectorType::get(VecTy->getVectorElementType(),
                                     VecTy->getVectorNumElements() / Factor);
    unsigned NumOfResults =
        getTLI()->getTypeLegalizationCost(DL, ResultTy).first *
        NumOfLoadsInInterleaveGrp;

    // About a half of the loads may be folded in shuffles when we have only
    // one result. If we have more than one result, we do not fold loads at all.
    unsigned NumOfUnfoldedLoads =
        NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;

    // Get a number of shuffle operations per result.
    unsigned NumOfShufflesPerResult =
        std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));

    // The SK_PermuteTwoSrc shuffle clobbers one of src operands.
    // When we have more than one destination, we need additional instructions
    // to keep sources.
    unsigned NumOfMoves = 0;
    if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
      NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;

    int Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
               NumOfUnfoldedLoads * MemOpCost + NumOfMoves;

    return Cost;
  }

  // Store.
  assert(Opcode == Instruction::Store &&
         "Expected Store Instruction at this point");
  // X86InterleavedAccess support only the following interleaved-access group.
  static const CostTblEntry AVX512InterleavedStoreTbl[] = {
      {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8  (and store)
      {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8  (and store)
      {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 192i8 (and store)

      {4, MVT::v8i8, 10},  // interleave 4 x 8i8  into 32i8  (and store)
      {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8  (and store)
      {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
      {4, MVT::v64i8, 24}  // interleave 4 x 64i8 into 256i8 (and store)
  };

  if (const auto *Entry =
          CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
    return NumOfMemOps * MemOpCost + Entry->Cost;
  // If an entry does not exist, fallback to the default implementation.

  // There is no strided stores meanwhile. And store can't be folded in
  // shuffle.
  unsigned NumOfSources = Factor; // The number of values to be merged.
  unsigned ShuffleCost =
      getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, 0, nullptr);
  unsigned NumOfShufflesPerStore = NumOfSources - 1;

  // The SK_PermuteTwoSrc shuffle clobbers one of src operands.
  // We need additional instructions to keep sources.
  unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
  int Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
             NumOfMoves;
  return Cost;
}

// Dispatch interleaved-access costing to the AVX-512 or AVX2 specialized
// estimators when the subtarget and element type allow it; otherwise use
// the target-independent base implementation.
int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                           unsigned Factor,
                                           ArrayRef<unsigned> Indices,
                                           unsigned Alignment,
                                           unsigned AddressSpace) {
  // f32/f64, i32/i64 and pointers are handled with plain AVX-512;
  // i8/i16 additionally require BWI.
  auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) {
    Type *EltTy = VecTy->getVectorElementType();
    if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
        EltTy->isIntegerTy(32) || EltTy->isPointerTy())
      return true;
    if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8))
      return HasBW;
    return false;
  };
  if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
    return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices,
                                            Alignment, AddressSpace);
  if (ST->hasAVX2())
    return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices,
                                          Alignment, AddressSpace);

  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                           Alignment, AddressSpace);
}