//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//
/// About the Cost Model numbers used below it's necessary to say the
/// following: the numbers correspond to some "generic" X86 CPU instead of a
/// concrete CPU model. Usually the numbers correspond to the CPU where the
/// feature appeared for the first time. For example, if we do
/// Subtarget.hasSSE42() in the lookups below the cost is based on Nehalem as
/// that was the first CPU to support that feature level and thus has most
/// likely the worst case cost.
/// Some examples of other technologies/CPUs:
///   SSE 3   - Pentium4 / Athlon64
///   SSE 4.1 - Penryn
///   SSE 4.2 - Nehalem
///   AVX     - Sandy Bridge
///   AVX2    - Haswell
///   AVX-512 - Xeon Phi / Skylake
/// And some examples of instruction target dependent costs (latency)
///                   divss     sqrtss          rsqrtss
///   AMD K7          11-16     19              3
///   Piledriver      9-24      13-15           5
///   Jaguar          14        16              2
///   Pentium II,III  18        30              2
///   Nehalem         7-14      7-18            3
///   Haswell         10-13     11              5
/// TODO: Develop and implement the target dependent cost model and
/// specialize cost numbers for different Cost Model Targets such as throughput,
/// code size, latency and uop count.
//===----------------------------------------------------------------------===//

#include "X86TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"

using namespace llvm;

#define DEBUG_TYPE "x86tti"

//===----------------------------------------------------------------------===//
//
// X86 cost model.
//
//===----------------------------------------------------------------------===//

/// Report how well this subtarget supports scalar popcount of a \p TyWidth
/// bit integer: fast hardware support when the POPCNT instruction is
/// available, otherwise a software expansion is assumed.
TargetTransformInfo::PopcntSupportKind
X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  // TODO: Currently the __builtin_popcount() implementation using SSE3
  // instructions is inefficient. Once the problem is fixed, we should
  // call ST->hasSSE3() instead of ST->hasPOPCNT().
  return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
}

/// Return the data cache size in bytes for the given cache level.
/// The numbers model the Intel cores listed in the per-case comments, which
/// all share these sizes; no per-CPU refinement is done.
llvm::Optional<unsigned> X86TTIImpl::getCacheSize(
    TargetTransformInfo::CacheLevel Level) const {
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 32 * 1024;  //  32 KByte
  case TargetTransformInfo::CacheLevel::L2D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 256 * 1024; // 256 KByte
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

/// Return the cache associativity for the given cache level. Both the L1
/// and L2 data caches are reported as 8-way associative, matching the Intel
/// cores listed below.
llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity(
    TargetTransformInfo::CacheLevel Level) const {
  //   - Penryn
  //   - Nehalem
  //   - Westmere
  //   - Sandy Bridge
  //   - Ivy Bridge
  //   - Haswell
  //   - Broadwell
  //   - Skylake
  //   - Kabylake
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    LLVM_FALLTHROUGH;
  case TargetTransformInfo::CacheLevel::L2D:
    return 8;
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

/// Number of allocatable registers in the requested class.
/// Requesting vector registers without SSE1 yields 0. In 64-bit mode,
/// AVX-512 exposes 32 vector registers, otherwise there are 16; 32-bit mode
/// has 8.
unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) {
  if (Vector && !ST->hasSSE1())
    return 0;

  if (ST->is64Bit()) {
    if (Vector && ST->hasAVX512())
      return 32;
    return 16;
  }
  return 8;
}

/// Widest register width in bits for the requested class. The vector width
/// is additionally capped by the subtarget's preferred vector width, so an
/// AVX-512 target that prefers narrower vectors reports the narrower width.
/// Returns 0 when vectors were requested but SSE1 is unavailable.
unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) const {
  unsigned PreferVectorWidth = ST->getPreferVectorWidth();
  if (Vector) {
    if (ST->hasAVX512() && PreferVectorWidth >= 512)
      return 512;
    if (ST->hasAVX() && PreferVectorWidth >= 256)
      return 256;
    if (ST->hasSSE1() && PreferVectorWidth >= 128)
      return 128;
    return 0;
  }

  if (ST->is64Bit())
    return 64;

  return 32;
}

/// Vector load/store register width simply mirrors the vector register
/// width. The unnamed unsigned parameter (presumably an address space —
/// TODO confirm against the TTI interface) is ignored on X86.
unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
  return getRegisterBitWidth(true);
}

/// Maximum interleave (unrolling) factor for the vectorizer at
/// vectorization factor \p VF.
unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // If the loop will not be vectorized, don't interleave the loop.
  // Let regular unroll to unroll the loop, which saves the overflow
  // check and memory check cost.
  if (VF == 1)
    return 1;

  // Never interleave on Atom.
  if (ST->isAtom())
    return 1;

  // Sandybridge and Haswell have multiple execution ports and pipelined
  // vector units.
  if (ST->hasAVX())
    return 4;

  return 2;
}

/// Cost of an arithmetic instruction with opcode \p Opcode on type \p Ty.
/// The type is first legalized; costs are then looked up in per-subtarget
/// tables keyed on the ISD opcode and the legalized MVT, with LT.first
/// scaling for types that legalization splits into multiple registers.
/// Operand kind/property hints allow cheaper costs for constant or uniform
/// (splat) second operands (e.g. shifts/divisions by constants).
int X86TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty,
    TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
    TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo,
    ArrayRef<const Value *> Args) {
  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // Overrides for subtargets where ST->isGLM() (Goldmont-class) holds.
  static const CostTblEntry GLMCostTable[] = {
    { ISD::FDIV,  MVT::f32,   18 }, // divss
    { ISD::FDIV,  MVT::v4f32, 35 }, // divps
    { ISD::FDIV,  MVT::f64,   33 }, // divsd
    { ISD::FDIV,  MVT::v2f64, 65 }, // divpd
  };

  if (ST->isGLM())
    if (const auto *Entry = CostTableLookup(GLMCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;

  // Overrides for subtargets where ST->isSLM() (Silvermont-class) holds.
  static const CostTblEntry SLMCostTable[] = {
    { ISD::MUL,   MVT::v4i32, 11 }, // pmulld
    { ISD::MUL,   MVT::v8i16, 2  }, // pmullw
    { ISD::MUL,   MVT::v16i8, 14 }, // extend/pmullw/trunc sequence.
200 { ISD::FMUL, MVT::f64, 2 }, // mulsd 201 { ISD::FMUL, MVT::v2f64, 4 }, // mulpd 202 { ISD::FMUL, MVT::v4f32, 2 }, // mulps 203 { ISD::FDIV, MVT::f32, 17 }, // divss 204 { ISD::FDIV, MVT::v4f32, 39 }, // divps 205 { ISD::FDIV, MVT::f64, 32 }, // divsd 206 { ISD::FDIV, MVT::v2f64, 69 }, // divpd 207 { ISD::FADD, MVT::v2f64, 2 }, // addpd 208 { ISD::FSUB, MVT::v2f64, 2 }, // subpd 209 // v2i64/v4i64 mul is custom lowered as a series of long: 210 // multiplies(3), shifts(3) and adds(2) 211 // slm muldq version throughput is 2 and addq throughput 4 212 // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) + 213 // 3X4 (addq throughput) = 17 214 { ISD::MUL, MVT::v2i64, 17 }, 215 // slm addq\subq throughput is 4 216 { ISD::ADD, MVT::v2i64, 4 }, 217 { ISD::SUB, MVT::v2i64, 4 }, 218 }; 219 220 if (ST->isSLM()) { 221 if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) { 222 // Check if the operands can be shrinked into a smaller datatype. 223 bool Op1Signed = false; 224 unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed); 225 bool Op2Signed = false; 226 unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed); 227 228 bool signedMode = Op1Signed | Op2Signed; 229 unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize); 230 231 if (OpMinSize <= 7) 232 return LT.first * 3; // pmullw/sext 233 if (!signedMode && OpMinSize <= 8) 234 return LT.first * 3; // pmullw/zext 235 if (OpMinSize <= 15) 236 return LT.first * 5; // pmullw/pmulhw/pshuf 237 if (!signedMode && OpMinSize <= 16) 238 return LT.first * 5; // pmullw/pmulhw/pshuf 239 } 240 241 if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, 242 LT.second)) { 243 return LT.first * Entry->Cost; 244 } 245 } 246 247 if ((ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV || 248 ISD == ISD::UREM) && 249 (Op2Info == TargetTransformInfo::OK_UniformConstantValue || 250 Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) && 251 Opd2PropInfo == 
TargetTransformInfo::OP_PowerOf2) { 252 if (ISD == ISD::SDIV || ISD == ISD::SREM) { 253 // On X86, vector signed division by constants power-of-two are 254 // normally expanded to the sequence SRA + SRL + ADD + SRA. 255 // The OperandValue properties may not be the same as that of the previous 256 // operation; conservatively assume OP_None. 257 int Cost = 258 2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info, Op2Info, 259 TargetTransformInfo::OP_None, 260 TargetTransformInfo::OP_None); 261 Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info, 262 TargetTransformInfo::OP_None, 263 TargetTransformInfo::OP_None); 264 Cost += getArithmeticInstrCost(Instruction::Add, Ty, Op1Info, Op2Info, 265 TargetTransformInfo::OP_None, 266 TargetTransformInfo::OP_None); 267 268 if (ISD == ISD::SREM) { 269 // For SREM: (X % C) is the equivalent of (X - (X/C)*C) 270 Cost += getArithmeticInstrCost(Instruction::Mul, Ty, Op1Info, Op2Info); 271 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Op1Info, Op2Info); 272 } 273 274 return Cost; 275 } 276 277 // Vector unsigned division/remainder will be simplified to shifts/masks. 278 if (ISD == ISD::UDIV) 279 return getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info, 280 TargetTransformInfo::OP_None, 281 TargetTransformInfo::OP_None); 282 283 if (ISD == ISD::UREM) 284 return getArithmeticInstrCost(Instruction::And, Ty, Op1Info, Op2Info, 285 TargetTransformInfo::OP_None, 286 TargetTransformInfo::OP_None); 287 } 288 289 static const CostTblEntry AVX512BWUniformConstCostTable[] = { 290 { ISD::SHL, MVT::v64i8, 2 }, // psllw + pand. 291 { ISD::SRL, MVT::v64i8, 2 }, // psrlw + pand. 292 { ISD::SRA, MVT::v64i8, 4 }, // psrlw, pand, pxor, psubb. 
293 294 { ISD::SDIV, MVT::v32i16, 6 }, // vpmulhw sequence 295 { ISD::SREM, MVT::v32i16, 8 }, // vpmulhw+mul+sub sequence 296 { ISD::UDIV, MVT::v32i16, 6 }, // vpmulhuw sequence 297 { ISD::UREM, MVT::v32i16, 8 }, // vpmulhuw+mul+sub sequence 298 }; 299 300 if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && 301 ST->hasBWI()) { 302 if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD, 303 LT.second)) 304 return LT.first * Entry->Cost; 305 } 306 307 static const CostTblEntry AVX512UniformConstCostTable[] = { 308 { ISD::SRA, MVT::v2i64, 1 }, 309 { ISD::SRA, MVT::v4i64, 1 }, 310 { ISD::SRA, MVT::v8i64, 1 }, 311 312 { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence 313 { ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence 314 { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence 315 { ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence 316 }; 317 318 if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && 319 ST->hasAVX512()) { 320 if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD, 321 LT.second)) 322 return LT.first * Entry->Cost; 323 } 324 325 static const CostTblEntry AVX2UniformConstCostTable[] = { 326 { ISD::SHL, MVT::v32i8, 2 }, // psllw + pand. 327 { ISD::SRL, MVT::v32i8, 2 }, // psrlw + pand. 328 { ISD::SRA, MVT::v32i8, 4 }, // psrlw, pand, pxor, psubb. 329 330 { ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle. 
331 332 { ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence 333 { ISD::SREM, MVT::v16i16, 8 }, // vpmulhw+mul+sub sequence 334 { ISD::UDIV, MVT::v16i16, 6 }, // vpmulhuw sequence 335 { ISD::UREM, MVT::v16i16, 8 }, // vpmulhuw+mul+sub sequence 336 { ISD::SDIV, MVT::v8i32, 15 }, // vpmuldq sequence 337 { ISD::SREM, MVT::v8i32, 19 }, // vpmuldq+mul+sub sequence 338 { ISD::UDIV, MVT::v8i32, 15 }, // vpmuludq sequence 339 { ISD::UREM, MVT::v8i32, 19 }, // vpmuludq+mul+sub sequence 340 }; 341 342 if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && 343 ST->hasAVX2()) { 344 if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD, 345 LT.second)) 346 return LT.first * Entry->Cost; 347 } 348 349 static const CostTblEntry SSE2UniformConstCostTable[] = { 350 { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand. 351 { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand. 352 { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb. 353 354 { ISD::SHL, MVT::v32i8, 4+2 }, // 2*(psllw + pand) + split. 355 { ISD::SRL, MVT::v32i8, 4+2 }, // 2*(psrlw + pand) + split. 356 { ISD::SRA, MVT::v32i8, 8+2 }, // 2*(psrlw, pand, pxor, psubb) + split. 357 358 { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split. 359 { ISD::SREM, MVT::v16i16, 16+2 }, // 2*pmulhw+mul+sub sequence + split. 360 { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence 361 { ISD::SREM, MVT::v8i16, 8 }, // pmulhw+mul+sub sequence 362 { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split. 363 { ISD::UREM, MVT::v16i16, 16+2 }, // 2*pmulhuw+mul+sub sequence + split. 364 { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence 365 { ISD::UREM, MVT::v8i16, 8 }, // pmulhuw+mul+sub sequence 366 { ISD::SDIV, MVT::v8i32, 38+2 }, // 2*pmuludq sequence + split. 367 { ISD::SREM, MVT::v8i32, 48+2 }, // 2*pmuludq+mul+sub sequence + split. 
368 { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence 369 { ISD::SREM, MVT::v4i32, 24 }, // pmuludq+mul+sub sequence 370 { ISD::UDIV, MVT::v8i32, 30+2 }, // 2*pmuludq sequence + split. 371 { ISD::UREM, MVT::v8i32, 40+2 }, // 2*pmuludq+mul+sub sequence + split. 372 { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence 373 { ISD::UREM, MVT::v4i32, 20 }, // pmuludq+mul+sub sequence 374 }; 375 376 if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && 377 ST->hasSSE2()) { 378 // pmuldq sequence. 379 if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX()) 380 return LT.first * 32; 381 if (ISD == ISD::SREM && LT.second == MVT::v8i32 && ST->hasAVX()) 382 return LT.first * 38; 383 if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41()) 384 return LT.first * 15; 385 if (ISD == ISD::SREM && LT.second == MVT::v4i32 && ST->hasSSE41()) 386 return LT.first * 20; 387 388 // XOP has faster vXi8 shifts. 389 if ((ISD != ISD::SHL && ISD != ISD::SRL && ISD != ISD::SRA) || 390 !ST->hasXOP()) 391 if (const auto *Entry = 392 CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second)) 393 return LT.first * Entry->Cost; 394 } 395 396 static const CostTblEntry AVX2UniformCostTable[] = { 397 // Uniform splats are cheaper for the following instructions. 398 { ISD::SHL, MVT::v16i16, 1 }, // psllw. 399 { ISD::SRL, MVT::v16i16, 1 }, // psrlw. 400 { ISD::SRA, MVT::v16i16, 1 }, // psraw. 401 }; 402 403 if (ST->hasAVX2() && 404 ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) || 405 (Op2Info == TargetTransformInfo::OK_UniformValue))) { 406 if (const auto *Entry = 407 CostTableLookup(AVX2UniformCostTable, ISD, LT.second)) 408 return LT.first * Entry->Cost; 409 } 410 411 static const CostTblEntry SSE2UniformCostTable[] = { 412 // Uniform splats are cheaper for the following instructions. 413 { ISD::SHL, MVT::v8i16, 1 }, // psllw. 414 { ISD::SHL, MVT::v4i32, 1 }, // pslld 415 { ISD::SHL, MVT::v2i64, 1 }, // psllq. 416 417 { ISD::SRL, MVT::v8i16, 1 }, // psrlw. 
418 { ISD::SRL, MVT::v4i32, 1 }, // psrld. 419 { ISD::SRL, MVT::v2i64, 1 }, // psrlq. 420 421 { ISD::SRA, MVT::v8i16, 1 }, // psraw. 422 { ISD::SRA, MVT::v4i32, 1 }, // psrad. 423 }; 424 425 if (ST->hasSSE2() && 426 ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) || 427 (Op2Info == TargetTransformInfo::OK_UniformValue))) { 428 if (const auto *Entry = 429 CostTableLookup(SSE2UniformCostTable, ISD, LT.second)) 430 return LT.first * Entry->Cost; 431 } 432 433 static const CostTblEntry AVX512DQCostTable[] = { 434 { ISD::MUL, MVT::v2i64, 1 }, 435 { ISD::MUL, MVT::v4i64, 1 }, 436 { ISD::MUL, MVT::v8i64, 1 } 437 }; 438 439 // Look for AVX512DQ lowering tricks for custom cases. 440 if (ST->hasDQI()) 441 if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second)) 442 return LT.first * Entry->Cost; 443 444 static const CostTblEntry AVX512BWCostTable[] = { 445 { ISD::SHL, MVT::v8i16, 1 }, // vpsllvw 446 { ISD::SRL, MVT::v8i16, 1 }, // vpsrlvw 447 { ISD::SRA, MVT::v8i16, 1 }, // vpsravw 448 449 { ISD::SHL, MVT::v16i16, 1 }, // vpsllvw 450 { ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw 451 { ISD::SRA, MVT::v16i16, 1 }, // vpsravw 452 453 { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw 454 { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw 455 { ISD::SRA, MVT::v32i16, 1 }, // vpsravw 456 457 { ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence. 458 { ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence. 459 { ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence. 460 461 { ISD::MUL, MVT::v64i8, 11 }, // extend/pmullw/trunc sequence. 462 { ISD::MUL, MVT::v32i8, 4 }, // extend/pmullw/trunc sequence. 463 { ISD::MUL, MVT::v16i8, 4 }, // extend/pmullw/trunc sequence. 464 }; 465 466 // Look for AVX512BW lowering tricks for custom cases. 
467 if (ST->hasBWI()) 468 if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second)) 469 return LT.first * Entry->Cost; 470 471 static const CostTblEntry AVX512CostTable[] = { 472 { ISD::SHL, MVT::v16i32, 1 }, 473 { ISD::SRL, MVT::v16i32, 1 }, 474 { ISD::SRA, MVT::v16i32, 1 }, 475 476 { ISD::SHL, MVT::v8i64, 1 }, 477 { ISD::SRL, MVT::v8i64, 1 }, 478 479 { ISD::SRA, MVT::v2i64, 1 }, 480 { ISD::SRA, MVT::v4i64, 1 }, 481 { ISD::SRA, MVT::v8i64, 1 }, 482 483 { ISD::MUL, MVT::v32i8, 13 }, // extend/pmullw/trunc sequence. 484 { ISD::MUL, MVT::v16i8, 5 }, // extend/pmullw/trunc sequence. 485 { ISD::MUL, MVT::v16i32, 1 }, // pmulld (Skylake from agner.org) 486 { ISD::MUL, MVT::v8i32, 1 }, // pmulld (Skylake from agner.org) 487 { ISD::MUL, MVT::v4i32, 1 }, // pmulld (Skylake from agner.org) 488 { ISD::MUL, MVT::v8i64, 8 }, // 3*pmuludq/3*shift/2*add 489 490 { ISD::FADD, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/ 491 { ISD::FSUB, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/ 492 { ISD::FMUL, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/ 493 494 { ISD::FADD, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/ 495 { ISD::FSUB, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/ 496 { ISD::FMUL, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/ 497 }; 498 499 if (ST->hasAVX512()) 500 if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second)) 501 return LT.first * Entry->Cost; 502 503 static const CostTblEntry AVX2ShiftCostTable[] = { 504 // Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to 505 // customize them to detect the cases where shift amount is a scalar one. 
506 { ISD::SHL, MVT::v4i32, 1 }, 507 { ISD::SRL, MVT::v4i32, 1 }, 508 { ISD::SRA, MVT::v4i32, 1 }, 509 { ISD::SHL, MVT::v8i32, 1 }, 510 { ISD::SRL, MVT::v8i32, 1 }, 511 { ISD::SRA, MVT::v8i32, 1 }, 512 { ISD::SHL, MVT::v2i64, 1 }, 513 { ISD::SRL, MVT::v2i64, 1 }, 514 { ISD::SHL, MVT::v4i64, 1 }, 515 { ISD::SRL, MVT::v4i64, 1 }, 516 }; 517 518 // Look for AVX2 lowering tricks. 519 if (ST->hasAVX2()) { 520 if (ISD == ISD::SHL && LT.second == MVT::v16i16 && 521 (Op2Info == TargetTransformInfo::OK_UniformConstantValue || 522 Op2Info == TargetTransformInfo::OK_NonUniformConstantValue)) 523 // On AVX2, a packed v16i16 shift left by a constant build_vector 524 // is lowered into a vector multiply (vpmullw). 525 return getArithmeticInstrCost(Instruction::Mul, Ty, Op1Info, Op2Info, 526 TargetTransformInfo::OP_None, 527 TargetTransformInfo::OP_None); 528 529 if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second)) 530 return LT.first * Entry->Cost; 531 } 532 533 static const CostTblEntry XOPShiftCostTable[] = { 534 // 128bit shifts take 1cy, but right shifts require negation beforehand. 535 { ISD::SHL, MVT::v16i8, 1 }, 536 { ISD::SRL, MVT::v16i8, 2 }, 537 { ISD::SRA, MVT::v16i8, 2 }, 538 { ISD::SHL, MVT::v8i16, 1 }, 539 { ISD::SRL, MVT::v8i16, 2 }, 540 { ISD::SRA, MVT::v8i16, 2 }, 541 { ISD::SHL, MVT::v4i32, 1 }, 542 { ISD::SRL, MVT::v4i32, 2 }, 543 { ISD::SRA, MVT::v4i32, 2 }, 544 { ISD::SHL, MVT::v2i64, 1 }, 545 { ISD::SRL, MVT::v2i64, 2 }, 546 { ISD::SRA, MVT::v2i64, 2 }, 547 // 256bit shifts require splitting if AVX2 didn't catch them above. 
548 { ISD::SHL, MVT::v32i8, 2+2 }, 549 { ISD::SRL, MVT::v32i8, 4+2 }, 550 { ISD::SRA, MVT::v32i8, 4+2 }, 551 { ISD::SHL, MVT::v16i16, 2+2 }, 552 { ISD::SRL, MVT::v16i16, 4+2 }, 553 { ISD::SRA, MVT::v16i16, 4+2 }, 554 { ISD::SHL, MVT::v8i32, 2+2 }, 555 { ISD::SRL, MVT::v8i32, 4+2 }, 556 { ISD::SRA, MVT::v8i32, 4+2 }, 557 { ISD::SHL, MVT::v4i64, 2+2 }, 558 { ISD::SRL, MVT::v4i64, 4+2 }, 559 { ISD::SRA, MVT::v4i64, 4+2 }, 560 }; 561 562 // Look for XOP lowering tricks. 563 if (ST->hasXOP()) 564 if (const auto *Entry = CostTableLookup(XOPShiftCostTable, ISD, LT.second)) 565 return LT.first * Entry->Cost; 566 567 static const CostTblEntry SSE2UniformShiftCostTable[] = { 568 // Uniform splats are cheaper for the following instructions. 569 { ISD::SHL, MVT::v16i16, 2+2 }, // 2*psllw + split. 570 { ISD::SHL, MVT::v8i32, 2+2 }, // 2*pslld + split. 571 { ISD::SHL, MVT::v4i64, 2+2 }, // 2*psllq + split. 572 573 { ISD::SRL, MVT::v16i16, 2+2 }, // 2*psrlw + split. 574 { ISD::SRL, MVT::v8i32, 2+2 }, // 2*psrld + split. 575 { ISD::SRL, MVT::v4i64, 2+2 }, // 2*psrlq + split. 576 577 { ISD::SRA, MVT::v16i16, 2+2 }, // 2*psraw + split. 578 { ISD::SRA, MVT::v8i32, 2+2 }, // 2*psrad + split. 579 { ISD::SRA, MVT::v2i64, 4 }, // 2*psrad + shuffle. 580 { ISD::SRA, MVT::v4i64, 8+2 }, // 2*(2*psrad + shuffle) + split. 581 }; 582 583 if (ST->hasSSE2() && 584 ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) || 585 (Op2Info == TargetTransformInfo::OK_UniformValue))) { 586 587 // Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table. 588 if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2()) 589 return LT.first * 4; // 2*psrad + shuffle. 
590 591 if (const auto *Entry = 592 CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second)) 593 return LT.first * Entry->Cost; 594 } 595 596 if (ISD == ISD::SHL && 597 Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) { 598 MVT VT = LT.second; 599 // Vector shift left by non uniform constant can be lowered 600 // into vector multiply. 601 if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) || 602 ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX())) 603 ISD = ISD::MUL; 604 } 605 606 static const CostTblEntry AVX2CostTable[] = { 607 { ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence. 608 { ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence. 609 610 { ISD::SRL, MVT::v32i8, 11 }, // vpblendvb sequence. 611 { ISD::SRL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence. 612 613 { ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence. 614 { ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence. 615 { ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence. 616 { ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence. 617 618 { ISD::SUB, MVT::v32i8, 1 }, // psubb 619 { ISD::ADD, MVT::v32i8, 1 }, // paddb 620 { ISD::SUB, MVT::v16i16, 1 }, // psubw 621 { ISD::ADD, MVT::v16i16, 1 }, // paddw 622 { ISD::SUB, MVT::v8i32, 1 }, // psubd 623 { ISD::ADD, MVT::v8i32, 1 }, // paddd 624 { ISD::SUB, MVT::v4i64, 1 }, // psubq 625 { ISD::ADD, MVT::v4i64, 1 }, // paddq 626 627 { ISD::MUL, MVT::v32i8, 17 }, // extend/pmullw/trunc sequence. 628 { ISD::MUL, MVT::v16i8, 7 }, // extend/pmullw/trunc sequence. 
629 { ISD::MUL, MVT::v16i16, 1 }, // pmullw 630 { ISD::MUL, MVT::v8i32, 2 }, // pmulld (Haswell from agner.org) 631 { ISD::MUL, MVT::v4i64, 8 }, // 3*pmuludq/3*shift/2*add 632 633 { ISD::FADD, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/ 634 { ISD::FADD, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/ 635 { ISD::FSUB, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/ 636 { ISD::FSUB, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/ 637 { ISD::FMUL, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/ 638 { ISD::FMUL, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/ 639 640 { ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/ 641 { ISD::FDIV, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/ 642 { ISD::FDIV, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/ 643 { ISD::FDIV, MVT::f64, 14 }, // Haswell from http://www.agner.org/ 644 { ISD::FDIV, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/ 645 { ISD::FDIV, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/ 646 }; 647 648 // Look for AVX2 lowering tricks for custom cases. 649 if (ST->hasAVX2()) 650 if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second)) 651 return LT.first * Entry->Cost; 652 653 static const CostTblEntry AVX1CostTable[] = { 654 // We don't have to scalarize unsupported ops. We can issue two half-sized 655 // operations and we only need to extract the upper YMM half. 656 // Two ops + 1 extract + 1 insert = 4. 
657 { ISD::MUL, MVT::v16i16, 4 }, 658 { ISD::MUL, MVT::v8i32, 4 }, 659 { ISD::SUB, MVT::v32i8, 4 }, 660 { ISD::ADD, MVT::v32i8, 4 }, 661 { ISD::SUB, MVT::v16i16, 4 }, 662 { ISD::ADD, MVT::v16i16, 4 }, 663 { ISD::SUB, MVT::v8i32, 4 }, 664 { ISD::ADD, MVT::v8i32, 4 }, 665 { ISD::SUB, MVT::v4i64, 4 }, 666 { ISD::ADD, MVT::v4i64, 4 }, 667 668 // A v4i64 multiply is custom lowered as two split v2i64 vectors that then 669 // are lowered as a series of long multiplies(3), shifts(3) and adds(2) 670 // Because we believe v4i64 to be a legal type, we must also include the 671 // extract+insert in the cost table. Therefore, the cost here is 18 672 // instead of 8. 673 { ISD::MUL, MVT::v4i64, 18 }, 674 675 { ISD::MUL, MVT::v32i8, 26 }, // extend/pmullw/trunc sequence. 676 677 { ISD::FDIV, MVT::f32, 14 }, // SNB from http://www.agner.org/ 678 { ISD::FDIV, MVT::v4f32, 14 }, // SNB from http://www.agner.org/ 679 { ISD::FDIV, MVT::v8f32, 28 }, // SNB from http://www.agner.org/ 680 { ISD::FDIV, MVT::f64, 22 }, // SNB from http://www.agner.org/ 681 { ISD::FDIV, MVT::v2f64, 22 }, // SNB from http://www.agner.org/ 682 { ISD::FDIV, MVT::v4f64, 44 }, // SNB from http://www.agner.org/ 683 }; 684 685 if (ST->hasAVX()) 686 if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second)) 687 return LT.first * Entry->Cost; 688 689 static const CostTblEntry SSE42CostTable[] = { 690 { ISD::FADD, MVT::f64, 1 }, // Nehalem from http://www.agner.org/ 691 { ISD::FADD, MVT::f32, 1 }, // Nehalem from http://www.agner.org/ 692 { ISD::FADD, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/ 693 { ISD::FADD, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/ 694 695 { ISD::FSUB, MVT::f64, 1 }, // Nehalem from http://www.agner.org/ 696 { ISD::FSUB, MVT::f32 , 1 }, // Nehalem from http://www.agner.org/ 697 { ISD::FSUB, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/ 698 { ISD::FSUB, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/ 699 700 { ISD::FMUL, MVT::f64, 1 }, // Nehalem 
from http://www.agner.org/ 701 { ISD::FMUL, MVT::f32, 1 }, // Nehalem from http://www.agner.org/ 702 { ISD::FMUL, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/ 703 { ISD::FMUL, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/ 704 705 { ISD::FDIV, MVT::f32, 14 }, // Nehalem from http://www.agner.org/ 706 { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/ 707 { ISD::FDIV, MVT::f64, 22 }, // Nehalem from http://www.agner.org/ 708 { ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/ 709 }; 710 711 if (ST->hasSSE42()) 712 if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second)) 713 return LT.first * Entry->Cost; 714 715 static const CostTblEntry SSE41CostTable[] = { 716 { ISD::SHL, MVT::v16i8, 11 }, // pblendvb sequence. 717 { ISD::SHL, MVT::v32i8, 2*11+2 }, // pblendvb sequence + split. 718 { ISD::SHL, MVT::v8i16, 14 }, // pblendvb sequence. 719 { ISD::SHL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split. 720 { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld 721 { ISD::SHL, MVT::v8i32, 2*4+2 }, // pslld/paddd/cvttps2dq/pmulld + split 722 723 { ISD::SRL, MVT::v16i8, 12 }, // pblendvb sequence. 724 { ISD::SRL, MVT::v32i8, 2*12+2 }, // pblendvb sequence + split. 725 { ISD::SRL, MVT::v8i16, 14 }, // pblendvb sequence. 726 { ISD::SRL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split. 727 { ISD::SRL, MVT::v4i32, 11 }, // Shift each lane + blend. 728 { ISD::SRL, MVT::v8i32, 2*11+2 }, // Shift each lane + blend + split. 729 730 { ISD::SRA, MVT::v16i8, 24 }, // pblendvb sequence. 731 { ISD::SRA, MVT::v32i8, 2*24+2 }, // pblendvb sequence + split. 732 { ISD::SRA, MVT::v8i16, 14 }, // pblendvb sequence. 733 { ISD::SRA, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split. 734 { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend. 735 { ISD::SRA, MVT::v8i32, 2*12+2 }, // Shift each lane + blend + split. 
736 737 { ISD::MUL, MVT::v4i32, 2 } // pmulld (Nehalem from agner.org) 738 }; 739 740 if (ST->hasSSE41()) 741 if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second)) 742 return LT.first * Entry->Cost; 743 744 static const CostTblEntry SSE2CostTable[] = { 745 // We don't correctly identify costs of casts because they are marked as 746 // custom. 747 { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence. 748 { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence. 749 { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul. 750 { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence. 751 { ISD::SHL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split. 752 753 { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence. 754 { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence. 755 { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend. 756 { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence. 757 { ISD::SRL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split. 758 759 { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence. 760 { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence. 761 { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend. 762 { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence. 763 { ISD::SRA, MVT::v4i64, 2*12+2 }, // srl/xor/sub sequence+split. 764 765 { ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence. 
766 { ISD::MUL, MVT::v8i16, 1 }, // pmullw 767 { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle 768 { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add 769 770 { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/ 771 { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/ 772 { ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/ 773 { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/ 774 }; 775 776 if (ST->hasSSE2()) 777 if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second)) 778 return LT.first * Entry->Cost; 779 780 static const CostTblEntry SSE1CostTable[] = { 781 { ISD::FDIV, MVT::f32, 17 }, // Pentium III from http://www.agner.org/ 782 { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/ 783 }; 784 785 if (ST->hasSSE1()) 786 if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second)) 787 return LT.first * Entry->Cost; 788 789 // It is not a good idea to vectorize division. We have to scalarize it and 790 // in the process we will often end up having to spilling regular 791 // registers. The overhead of division is going to dominate most kernels 792 // anyways so try hard to prevent vectorization of division - it is 793 // generally a bad idea. Assume somewhat arbitrarily that we have to be able 794 // to hide "20 cycles" for each lane. 795 if (LT.second.isVector() && (ISD == ISD::SDIV || ISD == ISD::SREM || 796 ISD == ISD::UDIV || ISD == ISD::UREM)) { 797 int ScalarCost = getArithmeticInstrCost( 798 Opcode, Ty->getScalarType(), Op1Info, Op2Info, 799 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); 800 return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost; 801 } 802 803 // Fallback to the default implementation. 
  return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info);
}

/// Estimate the cost of a vector shuffle of kind \p Kind on type \p Tp.
///
/// The type is first legalized (LT.second is the legal MVT, LT.first the
/// number of registers it splits into). The legalized type is then looked up
/// in a cascade of per-ISA cost tables, ordered from the most capable feature
/// set (AVX-512 VBMI) down to SSE1; the first matching entry wins and its
/// cost is scaled by LT.first. If no table matches, we defer to the
/// target-independent BaseT implementation.
int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                               Type *SubTp) {
  // 64-bit packed float vectors (v2f32) are widened to type v4f32.
  // 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);

  // For Broadcasts we are splatting the first element from the first input
  // register, so only need to reference that input and all the output
  // registers are the same.
  if (Kind == TTI::SK_Broadcast)
    LT.first = 1;

  // We are going to permute multiple sources and the result will be in multiple
  // destinations. Providing an accurate cost only for splits where the element
  // type remains the same.
  if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
    MVT LegalVT = LT.second;
    if (LegalVT.isVector() &&
        LegalVT.getVectorElementType().getSizeInBits() ==
            Tp->getVectorElementType()->getPrimitiveSizeInBits() &&
        LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) {

      unsigned VecTySize = DL.getTypeStoreSize(Tp);
      unsigned LegalVTSize = LegalVT.getStoreSize();
      // Number of source vectors after legalization:
      unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
      // Number of destination vectors after legalization:
      unsigned NumOfDests = LT.first;

      Type *SingleOpTy = VectorType::get(Tp->getVectorElementType(),
                                         LegalVT.getVectorNumElements());

      // Each destination may draw elements from any of the other source
      // registers, so model the split as (NumOfSrcs - 1) two-source shuffles
      // per destination register.
      unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
      return NumOfShuffles *
             getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr);
    }

    return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
  }

  // For 2-input shuffles, we must account for splitting the 2 inputs into many.
  if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
    // We assume that source and destination have the same vector type.
    int NumOfDests = LT.first;
    int NumOfShufflesPerDest = LT.first * 2 - 1;
    LT.first = NumOfDests * NumOfShufflesPerDest;
  }

  // Per-ISA cost tables below. Each entry's comment names the instruction
  // sequence the cost was derived from; tables are consulted most-capable
  // feature set first, and the first match is returned.
  static const CostTblEntry AVX512VBMIShuffleTbl[] = {
    { TTI::SK_Reverse, MVT::v64i8, 1 }, // vpermb
    { TTI::SK_Reverse, MVT::v32i8, 1 }, // vpermb

    { TTI::SK_PermuteSingleSrc, MVT::v64i8, 1 }, // vpermb
    { TTI::SK_PermuteSingleSrc, MVT::v32i8, 1 }, // vpermb

    { TTI::SK_PermuteTwoSrc, MVT::v64i8, 1 }, // vpermt2b
    { TTI::SK_PermuteTwoSrc, MVT::v32i8, 1 }, // vpermt2b
    { TTI::SK_PermuteTwoSrc, MVT::v16i8, 1 } // vpermt2b
  };

  if (ST->hasVBMI())
    if (const auto *Entry =
            CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512BWShuffleTbl[] = {
    { TTI::SK_Broadcast, MVT::v32i16, 1 }, // vpbroadcastw
    { TTI::SK_Broadcast, MVT::v64i8, 1 }, // vpbroadcastb

    { TTI::SK_Reverse, MVT::v32i16, 1 }, // vpermw
    { TTI::SK_Reverse, MVT::v16i16, 1 }, // vpermw
    { TTI::SK_Reverse, MVT::v64i8, 2 }, // pshufb + vshufi64x2

    { TTI::SK_PermuteSingleSrc, MVT::v32i16, 1 }, // vpermw
    { TTI::SK_PermuteSingleSrc, MVT::v16i16, 1 }, // vpermw
    { TTI::SK_PermuteSingleSrc, MVT::v8i16, 1 }, // vpermw
    { TTI::SK_PermuteSingleSrc, MVT::v64i8, 8 }, // extend to v32i16
    { TTI::SK_PermuteSingleSrc, MVT::v32i8, 3 }, // vpermw + zext/trunc

    { TTI::SK_PermuteTwoSrc, MVT::v32i16, 1 }, // vpermt2w
    { TTI::SK_PermuteTwoSrc, MVT::v16i16, 1 }, // vpermt2w
    { TTI::SK_PermuteTwoSrc, MVT::v8i16, 1 }, // vpermt2w
    { TTI::SK_PermuteTwoSrc, MVT::v32i8, 3 }, // zext + vpermt2w + trunc
    { TTI::SK_PermuteTwoSrc, MVT::v64i8, 19 }, // 6 * v32i8 + 1
    { TTI::SK_PermuteTwoSrc, MVT::v16i8, 3 } // zext + vpermt2w + trunc
  };

  if (ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512ShuffleTbl[] = {
    { TTI::SK_Broadcast, MVT::v8f64, 1 }, // vbroadcastpd
    { TTI::SK_Broadcast, MVT::v16f32, 1 }, // vbroadcastps
    { TTI::SK_Broadcast, MVT::v8i64, 1 }, // vpbroadcastq
    { TTI::SK_Broadcast, MVT::v16i32, 1 }, // vpbroadcastd

    { TTI::SK_Reverse, MVT::v8f64, 1 }, // vpermpd
    { TTI::SK_Reverse, MVT::v16f32, 1 }, // vpermps
    { TTI::SK_Reverse, MVT::v8i64, 1 }, // vpermq
    { TTI::SK_Reverse, MVT::v16i32, 1 }, // vpermd

    { TTI::SK_PermuteSingleSrc, MVT::v8f64, 1 }, // vpermpd
    { TTI::SK_PermuteSingleSrc, MVT::v4f64, 1 }, // vpermpd
    { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // vpermpd
    { TTI::SK_PermuteSingleSrc, MVT::v16f32, 1 }, // vpermps
    { TTI::SK_PermuteSingleSrc, MVT::v8f32, 1 }, // vpermps
    { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // vpermps
    { TTI::SK_PermuteSingleSrc, MVT::v8i64, 1 }, // vpermq
    { TTI::SK_PermuteSingleSrc, MVT::v4i64, 1 }, // vpermq
    { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // vpermq
    { TTI::SK_PermuteSingleSrc, MVT::v16i32, 1 }, // vpermd
    { TTI::SK_PermuteSingleSrc, MVT::v8i32, 1 }, // vpermd
    { TTI::SK_PermuteSingleSrc, MVT::v4i32, 1 }, // vpermd
    { TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 }, // pshufb

    { TTI::SK_PermuteTwoSrc, MVT::v8f64, 1 }, // vpermt2pd
    { TTI::SK_PermuteTwoSrc, MVT::v16f32, 1 }, // vpermt2ps
    { TTI::SK_PermuteTwoSrc, MVT::v8i64, 1 }, // vpermt2q
    { TTI::SK_PermuteTwoSrc, MVT::v16i32, 1 }, // vpermt2d
    { TTI::SK_PermuteTwoSrc, MVT::v4f64, 1 }, // vpermt2pd
    { TTI::SK_PermuteTwoSrc, MVT::v8f32, 1 }, // vpermt2ps
    { TTI::SK_PermuteTwoSrc, MVT::v4i64, 1 }, // vpermt2q
    { TTI::SK_PermuteTwoSrc, MVT::v8i32, 1 }, // vpermt2d
    { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // vpermt2pd
    { TTI::SK_PermuteTwoSrc, MVT::v4f32, 1 }, // vpermt2ps
    { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // vpermt2q
    { TTI::SK_PermuteTwoSrc, MVT::v4i32, 1 } // vpermt2d
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX2ShuffleTbl[] = {
    { TTI::SK_Broadcast, MVT::v4f64, 1 }, // vbroadcastpd
    { TTI::SK_Broadcast, MVT::v8f32, 1 }, // vbroadcastps
    { TTI::SK_Broadcast, MVT::v4i64, 1 }, // vpbroadcastq
    { TTI::SK_Broadcast, MVT::v8i32, 1 }, // vpbroadcastd
    { TTI::SK_Broadcast, MVT::v16i16, 1 }, // vpbroadcastw
    { TTI::SK_Broadcast, MVT::v32i8, 1 }, // vpbroadcastb

    { TTI::SK_Reverse, MVT::v4f64, 1 }, // vpermpd
    { TTI::SK_Reverse, MVT::v8f32, 1 }, // vpermps
    { TTI::SK_Reverse, MVT::v4i64, 1 }, // vpermq
    { TTI::SK_Reverse, MVT::v8i32, 1 }, // vpermd
    { TTI::SK_Reverse, MVT::v16i16, 2 }, // vperm2i128 + pshufb
    { TTI::SK_Reverse, MVT::v32i8, 2 }, // vperm2i128 + pshufb

    { TTI::SK_Select, MVT::v16i16, 1 }, // vpblendvb
    { TTI::SK_Select, MVT::v32i8, 1 }, // vpblendvb

    { TTI::SK_PermuteSingleSrc, MVT::v4f64, 1 }, // vpermpd
    { TTI::SK_PermuteSingleSrc, MVT::v8f32, 1 }, // vpermps
    { TTI::SK_PermuteSingleSrc, MVT::v4i64, 1 }, // vpermq
    { TTI::SK_PermuteSingleSrc, MVT::v8i32, 1 }, // vpermd
    { TTI::SK_PermuteSingleSrc, MVT::v16i16, 4 }, // vperm2i128 + 2*vpshufb
                                                  // + vpblendvb
    { TTI::SK_PermuteSingleSrc, MVT::v32i8, 4 }, // vperm2i128 + 2*vpshufb
                                                 // + vpblendvb

    { TTI::SK_PermuteTwoSrc, MVT::v4f64, 3 }, // 2*vpermpd + vblendpd
    { TTI::SK_PermuteTwoSrc, MVT::v8f32, 3 }, // 2*vpermps + vblendps
    { TTI::SK_PermuteTwoSrc, MVT::v4i64, 3 }, // 2*vpermq + vpblendd
    { TTI::SK_PermuteTwoSrc, MVT::v8i32, 3 }, // 2*vpermd + vpblendd
    { TTI::SK_PermuteTwoSrc, MVT::v16i16, 7 }, // 2*vperm2i128 + 4*vpshufb
                                               // + vpblendvb
    { TTI::SK_PermuteTwoSrc, MVT::v32i8, 7 }, // 2*vperm2i128 + 4*vpshufb
                                              // + vpblendvb
  };

  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry XOPShuffleTbl[] = {
    { TTI::SK_PermuteSingleSrc, MVT::v4f64, 2 }, // vperm2f128 + vpermil2pd
    { TTI::SK_PermuteSingleSrc, MVT::v8f32, 2 }, // vperm2f128 + vpermil2ps
    { TTI::SK_PermuteSingleSrc, MVT::v4i64, 2 }, // vperm2f128 + vpermil2pd
    { TTI::SK_PermuteSingleSrc, MVT::v8i32, 2 }, // vperm2f128 + vpermil2ps
    { TTI::SK_PermuteSingleSrc, MVT::v16i16, 4 }, // vextractf128 + 2*vpperm
                                                  // + vinsertf128
    { TTI::SK_PermuteSingleSrc, MVT::v32i8, 4 }, // vextractf128 + 2*vpperm
                                                 // + vinsertf128

    { TTI::SK_PermuteTwoSrc, MVT::v16i16, 9 }, // 2*vextractf128 + 6*vpperm
                                               // + vinsertf128
    { TTI::SK_PermuteTwoSrc, MVT::v8i16, 1 }, // vpperm
    { TTI::SK_PermuteTwoSrc, MVT::v32i8, 9 }, // 2*vextractf128 + 6*vpperm
                                              // + vinsertf128
    { TTI::SK_PermuteTwoSrc, MVT::v16i8, 1 }, // vpperm
  };

  if (ST->hasXOP())
    if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX1ShuffleTbl[] = {
    { TTI::SK_Broadcast, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd
    { TTI::SK_Broadcast, MVT::v8f32, 2 }, // vperm2f128 + vpermilps
    { TTI::SK_Broadcast, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd
    { TTI::SK_Broadcast, MVT::v8i32, 2 }, // vperm2f128 + vpermilps
    { TTI::SK_Broadcast, MVT::v16i16, 3 }, // vpshuflw + vpshufd + vinsertf128
    { TTI::SK_Broadcast, MVT::v32i8, 2 }, // vpshufb + vinsertf128

    { TTI::SK_Reverse, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd
    { TTI::SK_Reverse, MVT::v8f32, 2 }, // vperm2f128 + vpermilps
    { TTI::SK_Reverse, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd
    { TTI::SK_Reverse, MVT::v8i32, 2 }, // vperm2f128 + vpermilps
    { TTI::SK_Reverse, MVT::v16i16, 4 }, // vextractf128 + 2*pshufb
                                         // + vinsertf128
    { TTI::SK_Reverse, MVT::v32i8, 4 }, // vextractf128 + 2*pshufb
                                        // + vinsertf128

    { TTI::SK_Select, MVT::v4i64, 1 }, // vblendpd
    { TTI::SK_Select, MVT::v4f64, 1 }, // vblendpd
    { TTI::SK_Select, MVT::v8i32, 1 }, // vblendps
    { TTI::SK_Select, MVT::v8f32, 1 }, // vblendps
    { TTI::SK_Select, MVT::v16i16, 3 }, // vpand + vpandn + vpor
    { TTI::SK_Select, MVT::v32i8, 3 }, // vpand + vpandn + vpor

    { TTI::SK_PermuteSingleSrc, MVT::v4f64, 2 }, // vperm2f128 + vshufpd
    { TTI::SK_PermuteSingleSrc, MVT::v4i64, 2 }, // vperm2f128 + vshufpd
    { TTI::SK_PermuteSingleSrc, MVT::v8f32, 4 }, // 2*vperm2f128 + 2*vshufps
    { TTI::SK_PermuteSingleSrc, MVT::v8i32, 4 }, // 2*vperm2f128 + 2*vshufps
    { TTI::SK_PermuteSingleSrc, MVT::v16i16, 8 }, // vextractf128 + 4*pshufb
                                                  // + 2*por + vinsertf128
    { TTI::SK_PermuteSingleSrc, MVT::v32i8, 8 }, // vextractf128 + 4*pshufb
                                                 // + 2*por + vinsertf128

    { TTI::SK_PermuteTwoSrc, MVT::v4f64, 3 }, // 2*vperm2f128 + vshufpd
    { TTI::SK_PermuteTwoSrc, MVT::v4i64, 3 }, // 2*vperm2f128 + vshufpd
    { TTI::SK_PermuteTwoSrc, MVT::v8f32, 4 }, // 2*vperm2f128 + 2*vshufps
    { TTI::SK_PermuteTwoSrc, MVT::v8i32, 4 }, // 2*vperm2f128 + 2*vshufps
    { TTI::SK_PermuteTwoSrc, MVT::v16i16, 15 }, // 2*vextractf128 + 8*pshufb
                                                // + 4*por + vinsertf128
    { TTI::SK_PermuteTwoSrc, MVT::v32i8, 15 }, // 2*vextractf128 + 8*pshufb
                                               // + 4*por + vinsertf128
  };

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE41ShuffleTbl[] = {
    { TTI::SK_Select, MVT::v2i64, 1 }, // pblendw
    { TTI::SK_Select, MVT::v2f64, 1 }, // movsd
    { TTI::SK_Select, MVT::v4i32, 1 }, // pblendw
    { TTI::SK_Select, MVT::v4f32, 1 }, // blendps
    { TTI::SK_Select, MVT::v8i16, 1 }, // pblendw
    { TTI::SK_Select, MVT::v16i8, 1 } // pblendvb
  };

  if (ST->hasSSE41())
    if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSSE3ShuffleTbl[] = {
    { TTI::SK_Broadcast, MVT::v8i16, 1 }, // pshufb
    { TTI::SK_Broadcast, MVT::v16i8, 1 }, // pshufb

    { TTI::SK_Reverse, MVT::v8i16, 1 }, // pshufb
    { TTI::SK_Reverse, MVT::v16i8, 1 }, // pshufb

    { TTI::SK_Select, MVT::v8i16, 3 }, // 2*pshufb + por
    { TTI::SK_Select, MVT::v16i8, 3 }, // 2*pshufb + por

    { TTI::SK_PermuteSingleSrc, MVT::v8i16, 1 }, // pshufb
    { TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 }, // pshufb

    { TTI::SK_PermuteTwoSrc, MVT::v8i16, 3 }, // 2*pshufb + por
    { TTI::SK_PermuteTwoSrc, MVT::v16i8, 3 }, // 2*pshufb + por
  };

  if (ST->hasSSSE3())
    if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE2ShuffleTbl[] = {
    { TTI::SK_Broadcast, MVT::v2f64, 1 }, // shufpd
    { TTI::SK_Broadcast, MVT::v2i64, 1 }, // pshufd
    { TTI::SK_Broadcast, MVT::v4i32, 1 }, // pshufd
    { TTI::SK_Broadcast, MVT::v8i16, 2 }, // pshuflw + pshufd
    { TTI::SK_Broadcast, MVT::v16i8, 3 }, // unpck + pshuflw + pshufd

    { TTI::SK_Reverse, MVT::v2f64, 1 }, // shufpd
    { TTI::SK_Reverse, MVT::v2i64, 1 }, // pshufd
    { TTI::SK_Reverse, MVT::v4i32, 1 }, // pshufd
    { TTI::SK_Reverse, MVT::v8i16, 3 }, // pshuflw + pshufhw + pshufd
    { TTI::SK_Reverse, MVT::v16i8, 9 }, // 2*pshuflw + 2*pshufhw
                                        // + 2*pshufd + 2*unpck + packus

    { TTI::SK_Select, MVT::v2i64, 1 }, // movsd
    { TTI::SK_Select, MVT::v2f64, 1 }, // movsd
    { TTI::SK_Select, MVT::v4i32, 2 }, // 2*shufps
    { TTI::SK_Select, MVT::v8i16, 3 }, // pand + pandn + por
    { TTI::SK_Select, MVT::v16i8, 3 }, // pand + pandn + por

    { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // shufpd
    { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // pshufd
    { TTI::SK_PermuteSingleSrc, MVT::v4i32, 1 }, // pshufd
    { TTI::SK_PermuteSingleSrc, MVT::v8i16, 5 }, // 2*pshuflw + 2*pshufhw
                                                 // + pshufd/unpck
    { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
                                                  // + 2*pshufd + 2*unpck + 2*packus

    { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd
    { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd
    { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd}
    { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute
    { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
  };

  if (ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE1ShuffleTbl[] = {
    { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
    { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
    { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps
    { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
    { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
  };

  if (ST->hasSSE1())
    if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  // No table entry matched; fall back to the target-independent estimate.
  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}

/// Estimate the cost of a conversion instruction \p Opcode from type \p Src
/// to type \p Dst (the table cascade continues below this chunk).
int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                 const Instruction *I) {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // FIXME: Need a better design of the cost table to handle non-simple types of
  // potential massive combinations (elem_num x src_type x dst_type).
1147 1148 static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = { 1149 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 }, 1150 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, 1151 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 }, 1152 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 }, 1153 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 }, 1154 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 }, 1155 1156 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 }, 1157 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, 1158 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 }, 1159 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 }, 1160 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 }, 1161 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 }, 1162 1163 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 1 }, 1164 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, 1 }, 1165 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, 1 }, 1166 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 }, 1167 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, 1 }, 1168 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, 1 }, 1169 1170 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 1 }, 1171 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 }, 1172 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 }, 1173 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, 1174 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 }, 1175 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 }, 1176 }; 1177 1178 // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and 1179 // 256-bit wide vectors. 
1180 1181 static const TypeConversionCostTblEntry AVX512FConversionTbl[] = { 1182 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 }, 1183 { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 }, 1184 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 }, 1185 1186 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 1 }, 1187 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 1 }, 1188 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 1 }, 1189 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, 1190 1191 // v16i1 -> v16i32 - load + broadcast 1192 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, 1193 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, 1194 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 1 }, 1195 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 }, 1196 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, 1197 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, 1198 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, 1199 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, 1200 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, 1201 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, 1202 1203 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 }, 1204 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 }, 1205 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 }, 1206 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 }, 1207 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 }, 1208 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 }, 1209 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 }, 1210 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 }, 1211 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 }, 1212 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 26 }, 1213 1214 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 }, 1215 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 }, 1216 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 2 }, 1217 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 }, 1218 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 2 }, 1219 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 }, 1220 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 }, 1221 { ISD::UINT_TO_FP, MVT::v2f64, 
MVT::v2i16, 5 }, 1222 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 }, 1223 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 }, 1224 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 }, 1225 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 }, 1226 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 }, 1227 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 }, 1228 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, 1229 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 }, 1230 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 }, 1231 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 }, 1232 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 }, 1233 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 5 }, 1234 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 }, 1235 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 12 }, 1236 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 26 }, 1237 1238 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 }, 1239 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, 1240 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 1 }, 1241 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 }, 1242 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 2 }, 1243 { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, 2 }, 1244 { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 }, 1245 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 2 }, 1246 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, 2 }, 1247 }; 1248 1249 static const TypeConversionCostTblEntry AVX2ConversionTbl[] = { 1250 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 3 }, 1251 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 }, 1252 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 }, 1253 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 }, 1254 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 3 }, 1255 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 3 }, 1256 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, 1257 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, 1258 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, 1259 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, 1260 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, 1261 { ISD::ZERO_EXTEND, 
MVT::v4i64, MVT::v4i16, 3 }, 1262 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, 1263 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, 1264 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, 1265 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, 1266 1267 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 }, 1268 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 }, 1269 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 }, 1270 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 }, 1271 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2 }, 1272 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 4 }, 1273 1274 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 3 }, 1275 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 3 }, 1276 1277 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 }, 1278 }; 1279 1280 static const TypeConversionCostTblEntry AVXConversionTbl[] = { 1281 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 }, 1282 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 }, 1283 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 }, 1284 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 }, 1285 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 6 }, 1286 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 }, 1287 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 7 }, 1288 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 4 }, 1289 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, 1290 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, 1291 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 6 }, 1292 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, 1293 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 }, 1294 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 4 }, 1295 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 4 }, 1296 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 4 }, 1297 1298 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 4 }, 1299 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 }, 1300 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 }, 1301 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 4 }, 1302 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 4 }, 1303 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 4 }, 1304 { ISD::TRUNCATE, 
MVT::v8i32, MVT::v8i64, 9 }, 1305 1306 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 }, 1307 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 }, 1308 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 }, 1309 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, 1310 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i8, 3 }, 1311 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 8 }, 1312 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 3 }, 1313 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i16, 3 }, 1314 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 }, 1315 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, 1316 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 }, 1317 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 }, 1318 1319 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 }, 1320 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 }, 1321 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 }, 1322 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 2 }, 1323 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 }, 1324 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 5 }, 1325 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, 1326 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 }, 1327 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 }, 1328 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 6 }, 1329 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 6 }, 1330 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 }, 1331 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 9 }, 1332 // The generic code to compute the scalar overhead is currently broken. 1333 // Workaround this limitation by estimating the scalarization overhead 1334 // here. We have roughly 10 instructions per scalar element. 1335 // Multiply that by the vector width. 1336 // FIXME: remove that when PR19268 is fixed. 
1337 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 10 }, 1338 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 20 }, 1339 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 13 }, 1340 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 13 }, 1341 1342 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 1 }, 1343 { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 7 }, 1344 // This node is expanded into scalarized operations but BasicTTI is overly 1345 // optimistic estimating its cost. It computes 3 per element (one 1346 // vector-extract, one scalar conversion and one vector-insert). The 1347 // problem is that the inserts form a read-modify-write chain so latency 1348 // should be factored in too. Inflating the cost per element by 1. 1349 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 8*4 }, 1350 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4*4 }, 1351 1352 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 1 }, 1353 { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 1 }, 1354 }; 1355 1356 static const TypeConversionCostTblEntry SSE41ConversionTbl[] = { 1357 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 2 }, 1358 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 2 }, 1359 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 2 }, 1360 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 2 }, 1361 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, 1362 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, 1363 1364 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 }, 1365 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 2 }, 1366 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 1 }, 1367 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 1 }, 1368 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, 1369 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, 1370 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 2 }, 1371 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 2 }, 1372 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, 1373 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, 1374 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 4 }, 1375 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 4 }, 1376 { ISD::ZERO_EXTEND, 
                        MVT::v4i32,  MVT::v4i16,  1 },
    { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i16,  1 },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  2 },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  2 },
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4 },
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4 },

    { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i16,  2 },
    { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i16,  1 },
    { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i32,  1 },
    { ISD::TRUNCATE,    MVT::v4i16,  MVT::v4i32,  1 },
    { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i32,  3 },
    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  3 },
    { ISD::TRUNCATE,    MVT::v16i16, MVT::v16i32, 6 },

  };

  // Baseline conversion costs for any subtarget with SSE2.
  static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
    // These are somewhat magic numbers justified by looking at the output of
    // Intel's IACA, running some kernels and making sure when we take
    // legalization into account the throughput will be overestimated.
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },

    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 8 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },

    { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 3 },

    { ISD::ZERO_EXTEND, MVT::v4i16,  MVT::v4i8,   1 },
    { ISD::SIGN_EXTEND, MVT::v4i16,  MVT::v4i8,   6 },
    { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i8,   2 },
    { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i8,   3 },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i8,   4 },
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i8,   8 },
    { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v8i8,   1 },
    { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v8i8,   2 },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i8,   6 },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i8,   6 },
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8,  3 },
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8,  4 },
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8,  9 },
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8,  12 },
    { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i16,  1 },
    { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i16,  2 },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i16,  3 },
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i16,  10 },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  3 },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  4 },
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 6 },
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 8 },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32,  3 },
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32,  5 },

    { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i16,  4 },
    { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i16,  2 },
    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i16, 3 },
    { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i32,  3 },
    { ISD::TRUNCATE,    MVT::v4i16,  MVT::v4i32,  3 },
    { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i32,  4 },
    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i32, 7 },
    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  5 },
    { ISD::TRUNCATE,    MVT::v16i16, MVT::v16i32, 10 },
  };

  std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
  std::pair<int, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst);

  // Without AVX, the SSE2 table is matched against the *legalized* types and
  // the hit is scaled by the legalization split count (LTSrc.first); the
  // feature tables consulted later match the original IR types instead.
  if (ST->hasSSE2() && !ST->hasAVX()) {
    if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
                                                   LTDest.second, LTSrc.second))
1459 return LTSrc.first * Entry->Cost; 1460 } 1461 1462 EVT SrcTy = TLI->getValueType(DL, Src); 1463 EVT DstTy = TLI->getValueType(DL, Dst); 1464 1465 // The function getSimpleVT only handles simple value types. 1466 if (!SrcTy.isSimple() || !DstTy.isSimple()) 1467 return BaseT::getCastInstrCost(Opcode, Dst, Src); 1468 1469 if (ST->hasDQI()) 1470 if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD, 1471 DstTy.getSimpleVT(), 1472 SrcTy.getSimpleVT())) 1473 return Entry->Cost; 1474 1475 if (ST->hasAVX512()) 1476 if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD, 1477 DstTy.getSimpleVT(), 1478 SrcTy.getSimpleVT())) 1479 return Entry->Cost; 1480 1481 if (ST->hasAVX2()) { 1482 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD, 1483 DstTy.getSimpleVT(), 1484 SrcTy.getSimpleVT())) 1485 return Entry->Cost; 1486 } 1487 1488 if (ST->hasAVX()) { 1489 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD, 1490 DstTy.getSimpleVT(), 1491 SrcTy.getSimpleVT())) 1492 return Entry->Cost; 1493 } 1494 1495 if (ST->hasSSE41()) { 1496 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD, 1497 DstTy.getSimpleVT(), 1498 SrcTy.getSimpleVT())) 1499 return Entry->Cost; 1500 } 1501 1502 if (ST->hasSSE2()) { 1503 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, 1504 DstTy.getSimpleVT(), 1505 SrcTy.getSimpleVT())) 1506 return Entry->Cost; 1507 } 1508 1509 return BaseT::getCastInstrCost(Opcode, Dst, Src, I); 1510 } 1511 1512 int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, 1513 const Instruction *I) { 1514 // Legalize the type. 
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);

  MVT MTy = LT.second;

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // Per-feature SETCC cost tables, keyed on the legalized type. A hit is
  // scaled by LT.first (the number of legalization splits).
  static const CostTblEntry SSE2CostTbl[] = {
    { ISD::SETCC,   MVT::v2i64,   8 },
    { ISD::SETCC,   MVT::v4i32,   1 },
    { ISD::SETCC,   MVT::v8i16,   1 },
    { ISD::SETCC,   MVT::v16i8,   1 },
  };

  static const CostTblEntry SSE42CostTbl[] = {
    { ISD::SETCC,   MVT::v2f64,   1 },
    { ISD::SETCC,   MVT::v4f32,   1 },
    { ISD::SETCC,   MVT::v2i64,   1 },
  };

  static const CostTblEntry AVX1CostTbl[] = {
    { ISD::SETCC,   MVT::v4f64,   1 },
    { ISD::SETCC,   MVT::v8f32,   1 },
    // AVX1 does not support 8-wide integer compare.
    { ISD::SETCC,   MVT::v4i64,   4 },
    { ISD::SETCC,   MVT::v8i32,   4 },
    { ISD::SETCC,   MVT::v16i16,  4 },
    { ISD::SETCC,   MVT::v32i8,   4 },
  };

  static const CostTblEntry AVX2CostTbl[] = {
    { ISD::SETCC,   MVT::v4i64,   1 },
    { ISD::SETCC,   MVT::v8i32,   1 },
    { ISD::SETCC,   MVT::v16i16,  1 },
    { ISD::SETCC,   MVT::v32i8,   1 },
  };

  static const CostTblEntry AVX512CostTbl[] = {
    { ISD::SETCC,   MVT::v8i64,   1 },
    { ISD::SETCC,   MVT::v16i32,  1 },
    { ISD::SETCC,   MVT::v8f64,   1 },
    { ISD::SETCC,   MVT::v16f32,  1 },
  };

  static const CostTblEntry AVX512BWCostTbl[] = {
    { ISD::SETCC,   MVT::v32i16,  1 },
    { ISD::SETCC,   MVT::v64i8,   1 },
  };

  // Most capable feature set first; first table hit wins.
  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasSSE42())
    if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
}

// Maximum element size handled by the element-wise atomic memory intrinsics;
// 16 presumably corresponds to one 128-bit vector register -- TODO confirm.
unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; }

int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
                                      ArrayRef<Type *> Tys, FastMathFlags FMF,
                                      unsigned ScalarizationCostPassed) {
  // Costs should match the codegen from:
  // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
  // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
  // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
  // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
  // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
  static const CostTblEntry AVX512CDCostTbl[] = {
    { ISD::CTLZ,       MVT::v8i64,   1 },
    { ISD::CTLZ,       MVT::v16i32,  1 },
    { ISD::CTLZ,       MVT::v32i16,  8 },
    { ISD::CTLZ,       MVT::v64i8,  20 },
    { ISD::CTLZ,       MVT::v4i64,   1 },
    { ISD::CTLZ,       MVT::v8i32,   1 },
    { ISD::CTLZ,       MVT::v16i16,  4 },
    { ISD::CTLZ,       MVT::v32i8,  10 },
    { ISD::CTLZ,       MVT::v2i64,   1 },
    { ISD::CTLZ,       MVT::v4i32,   1 },
    { ISD::CTLZ,       MVT::v8i16,   4 },
    { ISD::CTLZ,       MVT::v16i8,   4 },
  };
  static const CostTblEntry AVX512BWCostTbl[] = {
    { ISD::BITREVERSE, MVT::v8i64,   5 },
    { ISD::BITREVERSE, MVT::v16i32,  5 },
    { ISD::BITREVERSE, MVT::v32i16,  5 },
    { ISD::BITREVERSE, MVT::v64i8,   5 },
    { ISD::CTLZ,       MVT::v8i64,  23 },
    { ISD::CTLZ,       MVT::v16i32, 22 },
    { ISD::CTLZ,       MVT::v32i16, 18 },
    { ISD::CTLZ,       MVT::v64i8,  17 },
    { ISD::CTPOP,      MVT::v8i64,   7 },
    { ISD::CTPOP,      MVT::v16i32, 11 },
    { ISD::CTPOP,      MVT::v32i16,  9 },
    { ISD::CTPOP,      MVT::v64i8,   6 },
    { ISD::CTTZ,       MVT::v8i64,  10 },
    { ISD::CTTZ,
                       MVT::v16i32, 14 },
    { ISD::CTTZ,       MVT::v32i16, 12 },
    { ISD::CTTZ,       MVT::v64i8,   9 },
  };
  static const CostTblEntry AVX512CostTbl[] = {
    { ISD::BITREVERSE, MVT::v8i64,  36 },
    { ISD::BITREVERSE, MVT::v16i32, 24 },
    { ISD::CTLZ,       MVT::v8i64,  29 },
    { ISD::CTLZ,       MVT::v16i32, 35 },
    { ISD::CTPOP,      MVT::v8i64,  16 },
    { ISD::CTPOP,      MVT::v16i32, 24 },
    { ISD::CTTZ,       MVT::v8i64,  20 },
    { ISD::CTTZ,       MVT::v16i32, 28 },
  };
  static const CostTblEntry XOPCostTbl[] = {
    { ISD::BITREVERSE, MVT::v4i64,   4 },
    { ISD::BITREVERSE, MVT::v8i32,   4 },
    { ISD::BITREVERSE, MVT::v16i16,  4 },
    { ISD::BITREVERSE, MVT::v32i8,   4 },
    { ISD::BITREVERSE, MVT::v2i64,   1 },
    { ISD::BITREVERSE, MVT::v4i32,   1 },
    { ISD::BITREVERSE, MVT::v8i16,   1 },
    { ISD::BITREVERSE, MVT::v16i8,   1 },
    { ISD::BITREVERSE, MVT::i64,     3 },
    { ISD::BITREVERSE, MVT::i32,     3 },
    { ISD::BITREVERSE, MVT::i16,     3 },
    { ISD::BITREVERSE, MVT::i8,      3 }
  };
  static const CostTblEntry AVX2CostTbl[] = {
    { ISD::BITREVERSE, MVT::v4i64,   5 },
    { ISD::BITREVERSE, MVT::v8i32,   5 },
    { ISD::BITREVERSE, MVT::v16i16,  5 },
    { ISD::BITREVERSE, MVT::v32i8,   5 },
    { ISD::BSWAP,      MVT::v4i64,   1 },
    { ISD::BSWAP,      MVT::v8i32,   1 },
    { ISD::BSWAP,      MVT::v16i16,  1 },
    { ISD::CTLZ,       MVT::v4i64,  23 },
    { ISD::CTLZ,       MVT::v8i32,  18 },
    { ISD::CTLZ,       MVT::v16i16, 14 },
    { ISD::CTLZ,       MVT::v32i8,   9 },
    { ISD::CTPOP,      MVT::v4i64,   7 },
    { ISD::CTPOP,      MVT::v8i32,  11 },
    { ISD::CTPOP,      MVT::v16i16,  9 },
    { ISD::CTPOP,      MVT::v32i8,   6 },
    { ISD::CTTZ,       MVT::v4i64,  10 },
    { ISD::CTTZ,       MVT::v8i32,  14 },
    { ISD::CTTZ,       MVT::v16i16, 12 },
    { ISD::CTTZ,       MVT::v32i8,   9 },
    { ISD::FSQRT,      MVT::f32,     7 }, // Haswell from http://www.agner.org/
    { ISD::FSQRT,      MVT::v4f32,   7 }, // Haswell from http://www.agner.org/
    { ISD::FSQRT,      MVT::v8f32,  14 }, // Haswell from http://www.agner.org/
    { ISD::FSQRT,      MVT::f64,    14 }, // Haswell from http://www.agner.org/
    { ISD::FSQRT,      MVT::v2f64,  14 }, // Haswell from http://www.agner.org/
    { ISD::FSQRT,      MVT::v4f64,  28 }, // Haswell from http://www.agner.org/
  };
  static const CostTblEntry AVX1CostTbl[] = {
    { ISD::BITREVERSE, MVT::v4i64,  12 }, // 2 x 128-bit Op + extract/insert
    { ISD::BITREVERSE, MVT::v8i32,  12 }, // 2 x 128-bit Op + extract/insert
    { ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert
    { ISD::BITREVERSE, MVT::v32i8,  12 }, // 2 x 128-bit Op + extract/insert
    { ISD::BSWAP,      MVT::v4i64,   4 },
    { ISD::BSWAP,      MVT::v8i32,   4 },
    { ISD::BSWAP,      MVT::v16i16,  4 },
    { ISD::CTLZ,       MVT::v4i64,  48 }, // 2 x 128-bit Op + extract/insert
    { ISD::CTLZ,       MVT::v8i32,  38 }, // 2 x 128-bit Op + extract/insert
    { ISD::CTLZ,       MVT::v16i16, 30 }, // 2 x 128-bit Op + extract/insert
    { ISD::CTLZ,       MVT::v32i8,  20 }, // 2 x 128-bit Op + extract/insert
    { ISD::CTPOP,      MVT::v4i64,  16 }, // 2 x 128-bit Op + extract/insert
    { ISD::CTPOP,      MVT::v8i32,  24 }, // 2 x 128-bit Op + extract/insert
    { ISD::CTPOP,      MVT::v16i16, 20 }, // 2 x 128-bit Op + extract/insert
    { ISD::CTPOP,      MVT::v32i8,  14 }, // 2 x 128-bit Op + extract/insert
    { ISD::CTTZ,       MVT::v4i64,  22 }, // 2 x 128-bit Op + extract/insert
    { ISD::CTTZ,       MVT::v8i32,  30 }, // 2 x 128-bit Op + extract/insert
    { ISD::CTTZ,       MVT::v16i16, 26 }, // 2 x 128-bit Op + extract/insert
    { ISD::CTTZ,       MVT::v32i8,  20 }, // 2 x 128-bit Op + extract/insert
    { ISD::FSQRT,      MVT::f32,    14 }, // SNB from http://www.agner.org/
    { ISD::FSQRT,      MVT::v4f32,  14 }, // SNB from http://www.agner.org/
    { ISD::FSQRT,      MVT::v8f32,  28 }, // SNB from http://www.agner.org/
    { ISD::FSQRT,      MVT::f64,    21 }, // SNB from http://www.agner.org/
    { ISD::FSQRT,      MVT::v2f64,  21 }, // SNB from http://www.agner.org/
    { ISD::FSQRT,      MVT::v4f64,  43 }, // SNB from http://www.agner.org/
  };
  static const CostTblEntry GLMCostTbl[] = {
    { ISD::FSQRT, MVT::f32,   19 }, // sqrtss
    { ISD::FSQRT, MVT::v4f32, 37 }, // sqrtps
    { ISD::FSQRT, MVT::f64,   34 }, // sqrtsd
    { ISD::FSQRT, MVT::v2f64, 67 }, // sqrtpd
  };
  static const CostTblEntry SLMCostTbl[] = {
    { ISD::FSQRT, MVT::f32,   20 }, // sqrtss
    { ISD::FSQRT, MVT::v4f32, 40 }, // sqrtps
    { ISD::FSQRT, MVT::f64,   35 }, // sqrtsd
    { ISD::FSQRT, MVT::v2f64, 70 }, // sqrtpd
  };
  static const CostTblEntry SSE42CostTbl[] = {
    { ISD::FSQRT, MVT::f32,   18 }, // Nehalem from http://www.agner.org/
    { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
  };
  static const CostTblEntry SSSE3CostTbl[] = {
    { ISD::BITREVERSE, MVT::v2i64,   5 },
    { ISD::BITREVERSE, MVT::v4i32,   5 },
    { ISD::BITREVERSE, MVT::v8i16,   5 },
    { ISD::BITREVERSE, MVT::v16i8,   5 },
    { ISD::BSWAP,      MVT::v2i64,   1 },
    { ISD::BSWAP,      MVT::v4i32,   1 },
    { ISD::BSWAP,      MVT::v8i16,   1 },
    { ISD::CTLZ,       MVT::v2i64,  23 },
    { ISD::CTLZ,       MVT::v4i32,  18 },
    { ISD::CTLZ,       MVT::v8i16,  14 },
    { ISD::CTLZ,       MVT::v16i8,   9 },
    { ISD::CTPOP,      MVT::v2i64,   7 },
    { ISD::CTPOP,      MVT::v4i32,  11 },
    { ISD::CTPOP,      MVT::v8i16,   9 },
    { ISD::CTPOP,      MVT::v16i8,   6 },
    { ISD::CTTZ,       MVT::v2i64,  10 },
    { ISD::CTTZ,       MVT::v4i32,  14 },
    { ISD::CTTZ,       MVT::v8i16,  12 },
    { ISD::CTTZ,       MVT::v16i8,   9 }
  };
  static const CostTblEntry SSE2CostTbl[] = {
    { ISD::BITREVERSE, MVT::v2i64,  29 },
    { ISD::BITREVERSE, MVT::v4i32,  27 },
    { ISD::BITREVERSE, MVT::v8i16,  27 },
    { ISD::BITREVERSE, MVT::v16i8,  20 },
    { ISD::BSWAP,      MVT::v2i64,   7 },
    { ISD::BSWAP,      MVT::v4i32,   7 },
    { ISD::BSWAP,      MVT::v8i16,   7 },
    { ISD::CTLZ,       MVT::v2i64,  25 },
    { ISD::CTLZ,       MVT::v4i32,  26 },
    { ISD::CTLZ,       MVT::v8i16,  20 },
    { ISD::CTLZ,       MVT::v16i8,  17 },
    { ISD::CTPOP,      MVT::v2i64,  12 },
    { ISD::CTPOP,      MVT::v4i32,  15 },
    { ISD::CTPOP,      MVT::v8i16,  13 },
    { ISD::CTPOP,      MVT::v16i8,  10 },
    { ISD::CTTZ,       MVT::v2i64,  14 },
    { ISD::CTTZ,       MVT::v4i32,  18 },
    { ISD::CTTZ,       MVT::v8i16,  16 },
    { ISD::CTTZ,       MVT::v16i8,  13 },
    { ISD::FSQRT,      MVT::f64,    32 }, // Nehalem from http://www.agner.org/
    { ISD::FSQRT,      MVT::v2f64,  32 }, // Nehalem from http://www.agner.org/
  };
  static const CostTblEntry SSE1CostTbl[] = {
    { ISD::FSQRT, MVT::f32,   28 }, // Pentium III from http://www.agner.org/
    { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
  };
  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
    { ISD::BITREVERSE, MVT::i64, 14 }
  };
  static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
    { ISD::BITREVERSE, MVT::i32, 14 },
    { ISD::BITREVERSE, MVT::i16, 14 },
    { ISD::BITREVERSE, MVT::i8,  11 }
  };

  // Map the intrinsic onto its ISD node. Intrinsics not listed below keep
  // ISD::DELETED_NODE, which matches no table entry, so they fall through to
  // the BaseT implementation at the bottom.
  unsigned ISD = ISD::DELETED_NODE;
  switch (IID) {
  default:
    break;
  case Intrinsic::bitreverse:
    ISD = ISD::BITREVERSE;
    break;
  case Intrinsic::bswap:
    ISD = ISD::BSWAP;
    break;
  case Intrinsic::ctlz:
    ISD = ISD::CTLZ;
    break;
  case Intrinsic::ctpop:
    ISD = ISD::CTPOP;
    break;
  case Intrinsic::cttz:
    ISD = ISD::CTTZ;
    break;
  case Intrinsic::sqrt:
    ISD = ISD::FSQRT;
    break;
  }

  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
  MVT MTy = LT.second;

  // Attempt to lookup cost.
  // CPU-model-specific tables (Goldmont, Silvermont) take precedence over the
  // generic feature-based tables below.
  if (ST->isGLM())
    if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->isSLM())
    if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasCDI())
    if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasXOP())
    if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasSSE42())
    if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasSSSE3())
    if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasSSE1())
    if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->is64Bit())
    if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
    return LT.first * Entry->Cost;

  return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF, ScalarizationCostPassed);
}

int
// Vectorized-intrinsic overload: no X86-specific modeling here, simply defer
// to the target-independent implementation.
X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
                                  ArrayRef<Value *> Args, FastMathFlags FMF,
                                  unsigned VF) {
  return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF, VF);
}

// Cost of extracting/inserting a single element of vector type \p Val at
// position \p Index (-1U means the index is unknown).
int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
  assert(Val->isVectorTy() && "This must be a vector type");

  Type *ScalarType = Val->getScalarType();

  if (Index != -1U) {
    // Legalize the type.
    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);

    // This type is legalized to a scalar type.
    if (!LT.second.isVector())
      return 0;

    // The type may be split. Normalize the index to the new type.
    unsigned Width = LT.second.getVectorNumElements();
    Index = Index % Width;

    // Floating point scalars are already located in index #0.
    if (ScalarType->isFloatingPointTy() && Index == 0)
      return 0;
  }

  // Add to the base cost if we know that the extracted element of a vector is
  // destined to be moved to and used in the integer register file.
  int RegisterFileMoveCost = 0;
  if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
    RegisterFileMoveCost = 1;

  return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
}

int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                                unsigned AddressSpace, const Instruction *I) {
  // Handle non-power-of-two vectors such as <3 x float>
  if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
    unsigned NumElem = VTy->getVectorNumElements();

    // Handle a few common cases:
    // <3 x float>
    if (NumElem == 3 && VTy->getScalarSizeInBits() == 32)
      // Cost = 64 bit store + extract + 32 bit store.
      return 3;

    // <3 x double>
    if (NumElem == 3 && VTy->getScalarSizeInBits() == 64)
      // Cost = 128 bit store + unpack + 64 bit store.
      return 3;

    // Assume that all other non-power-of-two numbers are scalarized.
    if (!isPowerOf2_32(NumElem)) {
      int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment,
                                        AddressSpace);
      int SplitCost = getScalarizationOverhead(Src, Opcode == Instruction::Load,
                                               Opcode == Instruction::Store);
      return NumElem * Cost + SplitCost;
    }
  }

  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
         "Invalid Opcode");

  // Each load/store unit costs 1.
  int Cost = LT.first * 1;

  // This isn't exactly right. We're using slow unaligned 32-byte accesses as a
  // proxy for a double-pumped AVX memory interface such as on Sandybridge.
  if (LT.second.getStoreSize() == 32 && ST->isUnalignedMem32Slow())
    Cost *= 2;

  return Cost;
}

int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
                                      unsigned Alignment,
                                      unsigned AddressSpace) {
  VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy);
  if (!SrcVTy)
    // To calculate scalar take the regular cost, without mask
    return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace);

  unsigned NumElem = SrcVTy->getVectorNumElements();
  VectorType *MaskTy =
      VectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
  // If the masked operation is not legal for this type, model it as a fully
  // scalarized sequence: per-element compare + branch + scalar load/store,
  // plus the overhead of splitting the value and mask vectors.
  if ((Opcode == Instruction::Load && !isLegalMaskedLoad(SrcVTy)) ||
      (Opcode == Instruction::Store && !isLegalMaskedStore(SrcVTy)) ||
      !isPowerOf2_32(NumElem)) {
    // Scalarization
    int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true);
    int ScalarCompareCost = getCmpSelInstrCost(
        Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr);
    int BranchCost = getCFInstrCost(Instruction::Br);
    int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);

    int ValueSplitCost = getScalarizationOverhead(
        SrcVTy, Opcode == Instruction::Load, Opcode == Instruction::Store);
    int MemopCost =
        NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
                                         Alignment, AddressSpace);
    return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
  }

  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy);
  auto VT = TLI->getValueType(DL, SrcVTy);
  int Cost = 0;
  if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
      LT.second.getVectorNumElements() == NumElem)
    // Promotion requires expand/truncate for data and a shuffle for mask.
    Cost += getShuffleCost(TTI::SK_Select, SrcVTy, 0, nullptr) +
            getShuffleCost(TTI::SK_Select, MaskTy, 0, nullptr);

  else if (LT.second.getVectorNumElements() > NumElem) {
    VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(),
                                            LT.second.getVectorNumElements());
    // Expanding requires fill mask with zeroes
    Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy);
  }
  if (!ST->hasAVX512())
    return Cost + LT.first*4; // Each maskmov costs 4

  // AVX-512 masked load/store is cheaper
  return Cost+LT.first;
}

int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
                                          const SCEV *Ptr) {
  // Address computations in vectorized code with non-consecutive addresses will
  // likely result in more instructions compared to scalar code where the
  // computation can more often be merged into the index mode. The resulting
  // extra micro-ops can significantly decrease throughput.
  unsigned NumVectorInstToHideOverhead = 10;

  // Cost modeling of Strided Access Computation is hidden by the indexing
  // modes of X86 regardless of the stride value.
  // We don't believe that there
  // is a difference between constant strided access in general and constant
  // strided value which is less than or equal to 64.
  // Even in the case of (loop invariant) stride whose value is not known at
  // compile time, the address computation will not incur more than one extra
  // ADD instruction.
  if (Ty->isVectorTy() && SE) {
    if (!BaseT::isStridedAccess(Ptr))
      return NumVectorInstToHideOverhead;
    if (!BaseT::getConstantStrideStep(SE, Ptr))
      return 1;
  }

  return BaseT::getAddressComputationCost(Ty, SE, Ptr);
}

int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
                                           bool IsPairwise) {

  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);

  MVT MTy = LT.second;

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
  // and make it as the cost.

  static const CostTblEntry SSE42CostTblPairWise[] = {
    { ISD::FADD,  MVT::v2f64,   2 },
    { ISD::FADD,  MVT::v4f32,   4 },
    { ISD::ADD,   MVT::v2i64,   2 },      // The data reported by the IACA tool is "1.6".
    { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "3.5".
    { ISD::ADD,   MVT::v8i16,   5 },
  };

  static const CostTblEntry AVX1CostTblPairWise[] = {
    { ISD::FADD,  MVT::v4f32,   4 },
    { ISD::FADD,  MVT::v4f64,   5 },
    { ISD::FADD,  MVT::v8f32,   7 },
    { ISD::ADD,   MVT::v2i64,   1 },      // The data reported by the IACA tool is "1.5".
    { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "3.5".
    { ISD::ADD,   MVT::v4i64,   5 },      // The data reported by the IACA tool is "4.8".
    { ISD::ADD,   MVT::v8i16,   5 },
    { ISD::ADD,   MVT::v8i32,   5 },
  };

  static const CostTblEntry SSE42CostTblNoPairWise[] = {
    { ISD::FADD,  MVT::v2f64,   2 },
    { ISD::FADD,  MVT::v4f32,   4 },
    { ISD::ADD,   MVT::v2i64,   2 },      // The data reported by the IACA tool is "1.6".
    { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "3.3".
    { ISD::ADD,   MVT::v8i16,   4 },      // The data reported by the IACA tool is "4.3".
  };

  static const CostTblEntry AVX1CostTblNoPairWise[] = {
    { ISD::FADD,  MVT::v4f32,   3 },
    { ISD::FADD,  MVT::v4f64,   3 },
    { ISD::FADD,  MVT::v8f32,   4 },
    { ISD::ADD,   MVT::v2i64,   1 },      // The data reported by the IACA tool is "1.5".
    { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "2.8".
    { ISD::ADD,   MVT::v4i64,   3 },
    { ISD::ADD,   MVT::v8i16,   4 },
    { ISD::ADD,   MVT::v8i32,   5 },
  };

  // Separate tables per reduction shape; AVX tables are preferred over SSE4.2.
  if (IsPairwise) {
    if (ST->hasAVX())
      if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
        return LT.first * Entry->Cost;

    if (ST->hasSSE42())
      if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
        return LT.first * Entry->Cost;
  } else {
    if (ST->hasAVX())
      if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
        return LT.first * Entry->Cost;

    if (ST->hasSSE42())
      if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
        return LT.first * Entry->Cost;
  }

  return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise);
}

int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy,
                                       bool IsPairwise, bool IsUnsigned) {
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);

  MVT MTy = LT.second;

  // Only the MIN forms are used as table keys; the tables below carry both
  // signed and unsigned integer entries.
  int ISD;
  if (ValTy->isIntOrIntVectorTy()) {
    ISD = IsUnsigned ?
                     ISD::UMIN : ISD::SMIN;
  } else {
    assert(ValTy->isFPOrFPVectorTy() &&
           "Expected float point or integer vector type.");
    ISD = ISD::FMINNUM;
  }

  // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
  // and make it as the cost.

  static const CostTblEntry SSE42CostTblPairWise[] = {
      {ISD::FMINNUM, MVT::v2f64, 3},
      {ISD::FMINNUM, MVT::v4f32, 2},
      {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
      {ISD::UMIN, MVT::v2i64, 8}, // The data reported by the IACA is "8.6"
      {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5"
      {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8"
      {ISD::SMIN, MVT::v8i16, 2},
      {ISD::UMIN, MVT::v8i16, 2},
  };

  static const CostTblEntry AVX1CostTblPairWise[] = {
      {ISD::FMINNUM, MVT::v4f32, 1},
      {ISD::FMINNUM, MVT::v4f64, 1},
      {ISD::FMINNUM, MVT::v8f32, 2},
      {ISD::SMIN, MVT::v2i64, 3},
      {ISD::UMIN, MVT::v2i64, 3},
      {ISD::SMIN, MVT::v4i32, 1},
      {ISD::UMIN, MVT::v4i32, 1},
      {ISD::SMIN, MVT::v8i16, 1},
      {ISD::UMIN, MVT::v8i16, 1},
      {ISD::SMIN, MVT::v8i32, 3},
      {ISD::UMIN, MVT::v8i32, 3},
  };

  static const CostTblEntry AVX2CostTblPairWise[] = {
      {ISD::SMIN, MVT::v4i64, 2},
      {ISD::UMIN, MVT::v4i64, 2},
      {ISD::SMIN, MVT::v8i32, 1},
      {ISD::UMIN, MVT::v8i32, 1},
      {ISD::SMIN, MVT::v16i16, 1},
      {ISD::UMIN, MVT::v16i16, 1},
      {ISD::SMIN, MVT::v32i8, 2},
      {ISD::UMIN, MVT::v32i8, 2},
  };

  static const CostTblEntry AVX512CostTblPairWise[] = {
      {ISD::FMINNUM, MVT::v8f64, 1},
      {ISD::FMINNUM, MVT::v16f32, 2},
      {ISD::SMIN, MVT::v8i64, 2},
      {ISD::UMIN, MVT::v8i64, 2},
      {ISD::SMIN, MVT::v16i32, 1},
      {ISD::UMIN, MVT::v16i32, 1},
  };

  static const CostTblEntry SSE42CostTblNoPairWise[] = {
      {ISD::FMINNUM, MVT::v2f64, 3},
      {ISD::FMINNUM, MVT::v4f32, 3},
      {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
      {ISD::UMIN, MVT::v2i64, 9}, // The data reported by the IACA is "8.6"
      {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5"
      {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8"
      {ISD::SMIN, MVT::v8i16, 1}, // The data reported by the IACA is "1.5"
      {ISD::UMIN, MVT::v8i16, 2}, // The data reported by the IACA is "1.8"
  };

  static const CostTblEntry AVX1CostTblNoPairWise[] = {
      {ISD::FMINNUM, MVT::v4f32, 1},
      {ISD::FMINNUM, MVT::v4f64, 1},
      {ISD::FMINNUM, MVT::v8f32, 1},
      {ISD::SMIN, MVT::v2i64, 3},
      {ISD::UMIN, MVT::v2i64, 3},
      {ISD::SMIN, MVT::v4i32, 1},
      {ISD::UMIN, MVT::v4i32, 1},
      {ISD::SMIN, MVT::v8i16, 1},
      {ISD::UMIN, MVT::v8i16, 1},
      {ISD::SMIN, MVT::v8i32, 2},
      {ISD::UMIN, MVT::v8i32, 2},
  };

  static const CostTblEntry AVX2CostTblNoPairWise[] = {
      {ISD::SMIN, MVT::v4i64, 1},
      {ISD::UMIN, MVT::v4i64, 1},
      {ISD::SMIN, MVT::v8i32, 1},
      {ISD::UMIN, MVT::v8i32, 1},
      {ISD::SMIN, MVT::v16i16, 1},
      {ISD::UMIN, MVT::v16i16, 1},
      {ISD::SMIN, MVT::v32i8, 1},
      {ISD::UMIN, MVT::v32i8, 1},
  };

  static const CostTblEntry AVX512CostTblNoPairWise[] = {
      {ISD::FMINNUM, MVT::v8f64, 1},
      {ISD::FMINNUM, MVT::v16f32, 2},
      {ISD::SMIN, MVT::v8i64, 1},
      {ISD::UMIN, MVT::v8i64, 1},
      {ISD::SMIN, MVT::v16i32, 1},
      {ISD::UMIN, MVT::v16i32, 1},
  };

  // Most capable feature set first within each reduction shape.
  if (IsPairwise) {
    if (ST->hasAVX512())
      if (const auto *Entry = CostTableLookup(AVX512CostTblPairWise, ISD, MTy))
        return LT.first * Entry->Cost;

    if (ST->hasAVX2())
      if (const auto *Entry = CostTableLookup(AVX2CostTblPairWise, ISD, MTy))
        return LT.first * Entry->Cost;

    if (ST->hasAVX())
      if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
        return LT.first * Entry->Cost;

    if (ST->hasSSE42())
      if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
        return LT.first * Entry->Cost;
  } else {
    if (ST->hasAVX512())
      if (const auto *Entry =
              CostTableLookup(AVX512CostTblNoPairWise, ISD, MTy))
        return LT.first * Entry->Cost;

    if (ST->hasAVX2())
      if (const auto *Entry = CostTableLookup(AVX2CostTblNoPairWise, ISD, MTy))
        return LT.first * Entry->Cost;

    if (ST->hasAVX())
      if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
        return LT.first * Entry->Cost;

    if (ST->hasSSE42())
      if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
        return LT.first * Entry->Cost;
  }

  return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned);
}

/// Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
int X86TTIImpl::getIntImmCost(int64_t Val) {
  if (Val == 0)
    return TTI::TCC_Free;

  if (isInt<32>(Val))
    return TTI::TCC_Basic;

  return 2 * TTI::TCC_Basic;
}

int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  // Never hoist constants larger than 128bit, because this might lead to
  // incorrect code generation or assertions in codegen.
  // Fixme: Create a cost model for types larger than i128 once the codegen
  // issues have been fixed.
  if (BitSize > 128)
    return TTI::TCC_Free;

  if (Imm == 0)
    return TTI::TCC_Free;

  // Sign-extend all constants to a multiple of 64-bit.
2277 APInt ImmVal = Imm; 2278 if (BitSize % 64 != 0) 2279 ImmVal = Imm.sext(alignTo(BitSize, 64)); 2280 2281 // Split the constant into 64-bit chunks and calculate the cost for each 2282 // chunk. 2283 int Cost = 0; 2284 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { 2285 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64); 2286 int64_t Val = Tmp.getSExtValue(); 2287 Cost += getIntImmCost(Val); 2288 } 2289 // We need at least one instruction to materialize the constant. 2290 return std::max(1, Cost); 2291 } 2292 2293 int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, 2294 Type *Ty) { 2295 assert(Ty->isIntegerTy()); 2296 2297 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 2298 // There is no cost model for constants with a bit size of 0. Return TCC_Free 2299 // here, so that constant hoisting will ignore this constant. 2300 if (BitSize == 0) 2301 return TTI::TCC_Free; 2302 2303 unsigned ImmIdx = ~0U; 2304 switch (Opcode) { 2305 default: 2306 return TTI::TCC_Free; 2307 case Instruction::GetElementPtr: 2308 // Always hoist the base address of a GetElementPtr. This prevents the 2309 // creation of new constants for every base constant that gets constant 2310 // folded with the offset. 2311 if (Idx == 0) 2312 return 2 * TTI::TCC_Basic; 2313 return TTI::TCC_Free; 2314 case Instruction::Store: 2315 ImmIdx = 0; 2316 break; 2317 case Instruction::ICmp: 2318 // This is an imperfect hack to prevent constant hoisting of 2319 // compares that might be trying to check if a 64-bit value fits in 2320 // 32-bits. The backend can optimize these cases using a right shift by 32. 2321 // Ideally we would check the compare predicate here. There also other 2322 // similar immediates the backend can use shifts for. 
2323 if (Idx == 1 && Imm.getBitWidth() == 64) { 2324 uint64_t ImmVal = Imm.getZExtValue(); 2325 if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff) 2326 return TTI::TCC_Free; 2327 } 2328 ImmIdx = 1; 2329 break; 2330 case Instruction::And: 2331 // We support 64-bit ANDs with immediates with 32-bits of leading zeroes 2332 // by using a 32-bit operation with implicit zero extension. Detect such 2333 // immediates here as the normal path expects bit 31 to be sign extended. 2334 if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue())) 2335 return TTI::TCC_Free; 2336 ImmIdx = 1; 2337 break; 2338 case Instruction::Add: 2339 case Instruction::Sub: 2340 // For add/sub, we can use the opposite instruction for INT32_MIN. 2341 if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.getZExtValue() == 0x80000000) 2342 return TTI::TCC_Free; 2343 ImmIdx = 1; 2344 break; 2345 case Instruction::Mul: 2346 case Instruction::UDiv: 2347 case Instruction::SDiv: 2348 case Instruction::URem: 2349 case Instruction::SRem: 2350 case Instruction::Or: 2351 case Instruction::Xor: 2352 ImmIdx = 1; 2353 break; 2354 // Always return TCC_Free for the shift value of a shift instruction. 2355 case Instruction::Shl: 2356 case Instruction::LShr: 2357 case Instruction::AShr: 2358 if (Idx == 1) 2359 return TTI::TCC_Free; 2360 break; 2361 case Instruction::Trunc: 2362 case Instruction::ZExt: 2363 case Instruction::SExt: 2364 case Instruction::IntToPtr: 2365 case Instruction::PtrToInt: 2366 case Instruction::BitCast: 2367 case Instruction::PHI: 2368 case Instruction::Call: 2369 case Instruction::Select: 2370 case Instruction::Ret: 2371 case Instruction::Load: 2372 break; 2373 } 2374 2375 if (Idx == ImmIdx) { 2376 int NumConstants = divideCeil(BitSize, 64); 2377 int Cost = X86TTIImpl::getIntImmCost(Imm, Ty); 2378 return (Cost <= NumConstants * TTI::TCC_Basic) 2379 ? 
static_cast<int>(TTI::TCC_Free) 2380 : Cost; 2381 } 2382 2383 return X86TTIImpl::getIntImmCost(Imm, Ty); 2384 } 2385 2386 int X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, 2387 Type *Ty) { 2388 assert(Ty->isIntegerTy()); 2389 2390 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 2391 // There is no cost model for constants with a bit size of 0. Return TCC_Free 2392 // here, so that constant hoisting will ignore this constant. 2393 if (BitSize == 0) 2394 return TTI::TCC_Free; 2395 2396 switch (IID) { 2397 default: 2398 return TTI::TCC_Free; 2399 case Intrinsic::sadd_with_overflow: 2400 case Intrinsic::uadd_with_overflow: 2401 case Intrinsic::ssub_with_overflow: 2402 case Intrinsic::usub_with_overflow: 2403 case Intrinsic::smul_with_overflow: 2404 case Intrinsic::umul_with_overflow: 2405 if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue())) 2406 return TTI::TCC_Free; 2407 break; 2408 case Intrinsic::experimental_stackmap: 2409 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) 2410 return TTI::TCC_Free; 2411 break; 2412 case Intrinsic::experimental_patchpoint_void: 2413 case Intrinsic::experimental_patchpoint_i64: 2414 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) 2415 return TTI::TCC_Free; 2416 break; 2417 } 2418 return X86TTIImpl::getIntImmCost(Imm, Ty); 2419 } 2420 2421 unsigned X86TTIImpl::getUserCost(const User *U, 2422 ArrayRef<const Value *> Operands) { 2423 if (isa<StoreInst>(U)) { 2424 Value *Ptr = U->getOperand(1); 2425 // Store instruction with index and scale costs 2 Uops. 2426 // Check the preceding GEP to identify non-const indices. 
    // A GEP with any non-constant index implies an index+scale addressing
    // mode for the store, which is modeled as two uops.
    if (auto GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
      if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
        return TTI::TCC_Basic * 2;
    }
    return TTI::TCC_Basic;
  }
  return BaseT::getUserCost(U, Operands);
}

// Return an average cost of Gather / Scatter instruction, maybe improved later
int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
                                unsigned Alignment, unsigned AddressSpace) {

  assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
  unsigned VF = SrcVTy->getVectorNumElements();

  // Try to reduce index size from 64 bit (default for GEP)
  // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
  // operation will use 16 x 64 indices which do not fit in a zmm and needs
  // to split. Also check that the base pointer is the same for all lanes,
  // and that there's at most one variable index.
  auto getIndexSizeInBits = [](Value *Ptr, const DataLayout& DL) {
    unsigned IndexSize = DL.getPointerSizeInBits();
    GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
    if (IndexSize < 64 || !GEP)
      return IndexSize;

    unsigned NumOfVarIndices = 0;
    Value *Ptrs = GEP->getPointerOperand();
    // A vector of distinct base pointers prevents index narrowing.
    if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
      return IndexSize;
    for (unsigned i = 1; i < GEP->getNumOperands(); ++i) {
      if (isa<Constant>(GEP->getOperand(i)))
        continue;
      Type *IndxTy = GEP->getOperand(i)->getType();
      if (IndxTy->isVectorTy())
        IndxTy = IndxTy->getVectorElementType();
      // A genuinely 64-bit index (not a sign-extended narrower value), or a
      // second variable index, forces the full pointer-sized index.
      if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
           !isa<SExtInst>(GEP->getOperand(i))) ||
          ++NumOfVarIndices > 1)
        return IndexSize; // 64
    }
    return (unsigned)32;
  };


  // Trying to reduce IndexSize to 32 bits for vector 16.
  // By default the IndexSize is equal to pointer size.
  unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
                           ? getIndexSizeInBits(Ptr, DL)
                           : DL.getPointerSizeInBits();

  Type *IndexVTy = VectorType::get(IntegerType::get(SrcVTy->getContext(),
                                                    IndexSize), VF);
  std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy);
  std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy);
  int SplitFactor = std::max(IdxsLT.first, SrcLT.first);
  if (SplitFactor > 1) {
    // Handle splitting of vector of pointers
    Type *SplitSrcTy = VectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
    return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment,
                                         AddressSpace);
  }

  // The gather / scatter cost is given by Intel architects. It is a rough
  // number since we are looking at one instruction in a time.
  const int GSOverhead = (Opcode == Instruction::Load)
                             ? ST->getGatherOverhead()
                             : ST->getScatterOverhead();
  return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
                                           Alignment, AddressSpace);
}

/// Return the cost of full scalarization of gather / scatter operation.
///
/// Opcode - Load or Store instruction.
/// SrcVTy - The type of the data vector that should be gathered or scattered.
/// VariableMask - The mask is non-constant at compile time.
/// Alignment - Alignment for one element.
/// AddressSpace - pointer[s] address space.
///
int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
                                bool VariableMask, unsigned Alignment,
                                unsigned AddressSpace) {
  unsigned VF = SrcVTy->getVectorNumElements();

  // A variable mask is modeled as: extract all mask bits, then per lane a
  // scalar compare plus a conditional branch around the memory access.
  int MaskUnpackCost = 0;
  if (VariableMask) {
    VectorType *MaskTy =
        VectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
    MaskUnpackCost = getScalarizationOverhead(MaskTy, false, true);
    int ScalarCompareCost =
        getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()),
                           nullptr);
    int BranchCost = getCFInstrCost(Instruction::Br);
    MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
  }

  // The cost of the scalar loads/stores.
  int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
                                          Alignment, AddressSpace);

  int InsertExtractCost = 0;
  if (Opcode == Instruction::Load)
    for (unsigned i = 0; i < VF; ++i)
      // Add the cost of inserting each scalar load into the vector
      InsertExtractCost +=
          getVectorInstrCost(Instruction::InsertElement, SrcVTy, i);
  else
    for (unsigned i = 0; i < VF; ++i)
      // Add the cost of extracting each element out of the data vector
      InsertExtractCost +=
          getVectorInstrCost(Instruction::ExtractElement, SrcVTy, i);

  return MemoryOpCost + MaskUnpackCost + InsertExtractCost;
}

/// Calculate the cost of Gather / Scatter operation
int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
                                       Value *Ptr, bool VariableMask,
                                       unsigned Alignment) {
  assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
  unsigned VF = SrcVTy->getVectorNumElements();
  // Ptr may be a scalar pointer or a vector of pointers; derive the address
  // space from the (element) pointer type either way.
  PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
  if (!PtrTy && Ptr->getType()->isVectorTy())
    PtrTy = dyn_cast<PointerType>(Ptr->getType()->getVectorElementType());
  assert(PtrTy && "Unexpected type for Ptr argument");
  unsigned AddressSpace = PtrTy->getAddressSpace();

  bool Scalarize = false;
  if ((Opcode == Instruction::Load && !isLegalMaskedGather(SrcVTy)) ||
      (Opcode == Instruction::Store && !isLegalMaskedScatter(SrcVTy)))
    Scalarize = true;
  // Gather / Scatter for vector 2 is not profitable on KNL / SKX
  // Vector-4 of gather/scatter instruction does not exist on KNL.
  // We can extend it to 8 elements, but zeroing upper bits of
  // the mask vector will add more instructions. Right now we give the scalar
  // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction
  // is better in the VariableMask case.
  if (ST->hasAVX512() && (VF == 2 || (VF == 4 && !ST->hasVLX())))
    Scalarize = true;

  if (Scalarize)
    return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
                           AddressSpace);

  return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
}

bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
                               TargetTransformInfo::LSRCost &C2) {
  // X86 specific here are "instruction number 1st priority".
  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
                  C1.NumIVMuls, C1.NumBaseAdds,
                  C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
         std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
                  C2.NumIVMuls, C2.NumBaseAdds,
                  C2.ScaleCost, C2.ImmCost, C2.SetupCost);
}

bool X86TTIImpl::canMacroFuseCmp() {
  return ST->hasMacroFusion();
}

bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
  // The backend can't handle a single element vector.
  if (isa<VectorType>(DataTy) && DataTy->getVectorNumElements() == 1)
    return false;
  Type *ScalarTy = DataTy->getScalarType();
  // Pointers are counted at pointer width; everything else at its own width.
  int DataWidth = isa<PointerType>(ScalarTy) ?
    DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();

  // 32/64-bit elements need AVX; 8/16-bit elements additionally need BWI.
  return ((DataWidth == 32 || DataWidth == 64) && ST->hasAVX()) ||
         ((DataWidth == 8 || DataWidth == 16) && ST->hasBWI());
}

bool X86TTIImpl::isLegalMaskedStore(Type *DataType) {
  // Masked stores have the same legality constraints as masked loads.
  return isLegalMaskedLoad(DataType);
}

bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
  // This function is called now in two cases: from the Loop Vectorizer
  // and from the Scalarizer.
  // When the Loop Vectorizer asks about legality of the feature,
  // the vectorization factor is not calculated yet. The Loop Vectorizer
  // sends a scalar type and the decision is based on the width of the
  // scalar element.
  // Later on, the cost model will estimate usage of this intrinsic based on
  // the vector type.
  // The Scalarizer asks again about legality. It sends a vector type.
  // In this case we can reject non-power-of-2 vectors.
  // We also reject single element vectors as the type legalizer can't
  // scalarize it.
  if (isa<VectorType>(DataTy)) {
    unsigned NumElts = DataTy->getVectorNumElements();
    if (NumElts == 1 || !isPowerOf2_32(NumElts))
      return false;
  }
  Type *ScalarTy = DataTy->getScalarType();
  int DataWidth = isa<PointerType>(ScalarTy) ?
    DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();

  // Some CPUs have better gather performance than others.
  // TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only
  // enable gather with a -march.
  return (DataWidth == 32 || DataWidth == 64) &&
         (ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2()));
}

bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) {
  // AVX2 doesn't support scatter
  if (!ST->hasAVX512())
    return false;
  // Element-type constraints are shared with gather.
  return isLegalMaskedGather(DataType);
}

bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
  // Legal iff the target has a combined div/rem operation for this type.
  EVT VT = TLI->getValueType(DL, DataType);
  return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
}

bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
  // On x86 an ordered-check compare is never reported as cheaper.
  return false;
}

bool X86TTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();

  // Work this as a subsetting of subtarget features.
  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();

  // FIXME: This is likely too limiting as it will include subtarget features
  // that we might not care about for inlining, but it is conservatively
  // correct.
  return (CallerBits & CalleeBits) == CalleeBits;
}

const X86TTIImpl::TTI::MemCmpExpansionOptions *
X86TTIImpl::enableMemCmpExpansion(bool IsZeroCmp) const {
  // Only enable vector loads for equality comparison.
  // Right now the vector version is not as fast, see #33329.
  // NOTE(review): these function-local statics capture `this`/ST, so the
  // option lists are computed from whichever X86TTIImpl instance reaches here
  // first and are then reused for every later subtarget — confirm that all
  // callers share one subtarget configuration, or compute the options per
  // call (which would require changing the pointer-returning interface).
  static const auto ThreeWayOptions = [this]() {
    TTI::MemCmpExpansionOptions Options;
    // 8-byte loads are only usable on 64-bit targets.
    if (ST->is64Bit()) {
      Options.LoadSizes.push_back(8);
    }
    Options.LoadSizes.push_back(4);
    Options.LoadSizes.push_back(2);
    Options.LoadSizes.push_back(1);
    return Options;
  }();
  static const auto EqZeroOptions = [this]() {
    TTI::MemCmpExpansionOptions Options;
    // TODO: enable AVX512 when the DAG is ready.
    // if (ST->hasAVX512()) Options.LoadSizes.push_back(64);
    if (ST->hasAVX2()) Options.LoadSizes.push_back(32);
    if (ST->hasSSE2()) Options.LoadSizes.push_back(16);
    if (ST->is64Bit()) {
      Options.LoadSizes.push_back(8);
    }
    Options.LoadSizes.push_back(4);
    Options.LoadSizes.push_back(2);
    Options.LoadSizes.push_back(1);
    return Options;
  }();
  return IsZeroCmp ? &EqZeroOptions : &ThreeWayOptions;
}

bool X86TTIImpl::enableInterleavedAccessVectorization() {
  // TODO: We expect this to be beneficial regardless of arch,
  // but there are currently some unexplained performance artifacts on Atom.
  // As a temporary solution, disable on Atom.
  return !(ST->isAtom());
}

// Get estimation for interleaved load/store operations for AVX2.
// \p Factor is the interleaved-access factor (stride) - number of
// (interleaved) elements in the group.
// \p Indices contains the indices for a strided load: when the
// interleaved load has gaps they indicate which elements are used.
// If Indices is empty (or if the number of indices is equal to the size
// of the interleaved-access as given in \p Factor) the access has no gaps.
//
// As opposed to AVX-512, AVX2 does not have generic shuffles that allow
// computing the cost using a generic formula as a function of generic
// shuffles. We therefore use a lookup table instead, filled according to
// the instruction sequences that codegen currently generates.
int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
                                               unsigned Factor,
                                               ArrayRef<unsigned> Indices,
                                               unsigned Alignment,
                                               unsigned AddressSpace) {

  // We currently support only fully-interleaved groups, with no gaps.
  // TODO: Support also strided loads (interleaved-groups with gaps).
  if (Indices.size() && Indices.size() != Factor)
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace);

  // VecTy for interleave memop is <VF*Factor x Elt>.
  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
  // VecTy = <12 x i32>.
  MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;

  // This function can be called with VecTy=<6xi128>, Factor=3, in which case
  // the VF=2, while v2i128 is an unsupported MVT vector type
  // (see MachineValueType.h::getVectorVT()).
  if (!LegalVT.isVector())
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace);

  unsigned VF = VecTy->getVectorNumElements() / Factor;
  Type *ScalarTy = VecTy->getVectorElementType();

  // Calculate the number of memory operations (NumOfMemOps), required
  // for load/store the VecTy.
  unsigned VecTySize = DL.getTypeStoreSize(VecTy);
  unsigned LegalVTSize = LegalVT.getStoreSize();
  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;

  // Get the cost of one memory operation.
  Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
                                        LegalVT.getVectorNumElements());
  unsigned MemOpCost =
      getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);

  VectorType *VT = VectorType::get(ScalarTy, VF);
  EVT ETy = TLI->getValueType(DL, VT);
  if (!ETy.isSimple())
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace);

  // TODO: Complete for other data-types and strides.
  // Each combination of Stride, ElementTy and VF results in a different
  // sequence; The cost tables are therefore accessed with:
  // Factor (stride) and VectorType=VFxElemType.
  // The Cost accounts only for the shuffle sequence;
  // The cost of the loads/stores is accounted for separately.
  //
  static const CostTblEntry AVX2InterleavedLoadTbl[] = {
    { 2, MVT::v4i64, 6 }, //(load 8i64 and) deinterleave into 2 x 4i64
    { 2, MVT::v4f64, 6 }, //(load 8f64 and) deinterleave into 2 x 4f64

    { 3, MVT::v2i8, 10 }, //(load 6i8 and) deinterleave into 3 x 2i8
    { 3, MVT::v4i8, 4 },  //(load 12i8 and) deinterleave into 3 x 4i8
    { 3, MVT::v8i8, 9 },  //(load 24i8 and) deinterleave into 3 x 8i8
    { 3, MVT::v16i8, 11}, //(load 48i8 and) deinterleave into 3 x 16i8
    { 3, MVT::v32i8, 13}, //(load 96i8 and) deinterleave into 3 x 32i8
    { 3, MVT::v8f32, 17 }, //(load 24f32 and) deinterleave into 3 x 8f32

    { 4, MVT::v2i8, 12 }, //(load 8i8 and) deinterleave into 4 x 2i8
    { 4, MVT::v4i8, 4 },  //(load 16i8 and) deinterleave into 4 x 4i8
    { 4, MVT::v8i8, 20 }, //(load 32i8 and) deinterleave into 4 x 8i8
    { 4, MVT::v16i8, 39 }, //(load 64i8 and) deinterleave into 4 x 16i8
    { 4, MVT::v32i8, 80 }, //(load 128i8 and) deinterleave into 4 x 32i8

    { 8, MVT::v8f32, 40 } //(load 64f32 and) deinterleave into 8 x 8f32
  };

  static const CostTblEntry AVX2InterleavedStoreTbl[] = {
    { 2, MVT::v4i64, 6 }, //interleave 2 x 4i64 into 8i64 (and store)
    { 2, MVT::v4f64, 6 }, //interleave 2 x 4f64 into 8f64 (and store)

    { 3, MVT::v2i8, 7 },  //interleave 3 x 2i8 into 6i8 (and store)
    { 3, MVT::v4i8, 8 },  //interleave 3 x 4i8 into 12i8 (and store)
    { 3, MVT::v8i8, 11 }, //interleave 3 x 8i8 into 24i8 (and store)
    { 3, MVT::v16i8, 11 }, //interleave 3 x 16i8 into 48i8 (and store)
    { 3, MVT::v32i8, 13 }, //interleave 3 x 32i8 into 96i8 (and store)

    { 4, MVT::v2i8, 12 }, //interleave 4 x 2i8 into 8i8 (and store)
    { 4, MVT::v4i8, 9 },  //interleave 4 x 4i8 into 16i8 (and store)
    { 4, MVT::v8i8, 10 }, //interleave 4 x 8i8 into 32i8 (and store)
    { 4, MVT::v16i8, 10 }, //interleave 4 x 16i8 into 64i8 (and store)
    { 4, MVT::v32i8, 12 } //interleave 4 x 32i8 into 128i8 (and store)
  };

  if (Opcode == Instruction::Load) {
    if (const auto *Entry =
            CostTableLookup(AVX2InterleavedLoadTbl, Factor, ETy.getSimpleVT()))
      return NumOfMemOps * MemOpCost + Entry->Cost;
  } else {
    assert(Opcode == Instruction::Store &&
           "Expected Store Instruction at this point");
    if (const auto *Entry =
            CostTableLookup(AVX2InterleavedStoreTbl, Factor, ETy.getSimpleVT()))
      return NumOfMemOps * MemOpCost + Entry->Cost;
  }

  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                           Alignment, AddressSpace);
}

// Get estimation for interleaved load/store operations and strided load.
// \p Indices contains indices for strided load.
// \p Factor - the factor of interleaving.
// AVX-512 provides 3-src shuffles that significantly reduces the cost.
int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
                                                 unsigned Factor,
                                                 ArrayRef<unsigned> Indices,
                                                 unsigned Alignment,
                                                 unsigned AddressSpace) {

  // VecTy for interleave memop is <VF*Factor x Elt>.
  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
  // VecTy = <12 x i32>.

  // Calculate the number of memory operations (NumOfMemOps), required
  // for load/store the VecTy.
  MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
  unsigned VecTySize = DL.getTypeStoreSize(VecTy);
  unsigned LegalVTSize = LegalVT.getStoreSize();
  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;

  // Get the cost of one memory operation.
  Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
                                        LegalVT.getVectorNumElements());
  unsigned MemOpCost =
      getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);

  unsigned VF = VecTy->getVectorNumElements() / Factor;
  MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);

  if (Opcode == Instruction::Load) {
    // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
    // contain the cost of the optimized shuffle sequence that the
    // X86InterleavedAccess pass will generate.
    // The cost of loads and stores are computed separately from the table.

    // X86InterleavedAccess support only the following interleaved-access group.
    static const CostTblEntry AVX512InterleavedLoadTbl[] = {
        {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
        {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
        {3, MVT::v64i8, 22}, //(load 192i8 and) deinterleave into 3 x 64i8
    };

    if (const auto *Entry =
            CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
      return NumOfMemOps * MemOpCost + Entry->Cost;
    // If an entry does not exist, fallback to the default implementation.

    // Kind of shuffle depends on number of loaded values.
    // If we load the entire data in one register, we can use a 1-src shuffle.
    // Otherwise, we'll merge 2 sources in each operation.
    TTI::ShuffleKind ShuffleKind =
        (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;

    unsigned ShuffleCost =
        getShuffleCost(ShuffleKind, SingleMemOpTy, 0, nullptr);

    // With gaps (non-empty Indices) only the used members are loaded.
    unsigned NumOfLoadsInInterleaveGrp =
        Indices.size() ? Indices.size() : Factor;
    Type *ResultTy = VectorType::get(VecTy->getVectorElementType(),
                                     VecTy->getVectorNumElements() / Factor);
    unsigned NumOfResults =
        getTLI()->getTypeLegalizationCost(DL, ResultTy).first *
        NumOfLoadsInInterleaveGrp;

    // About a half of the loads may be folded in shuffles when we have only
    // one result. If we have more than one result, we do not fold loads at all.
    unsigned NumOfUnfoldedLoads =
        NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;

    // Get a number of shuffle operations per result.
    unsigned NumOfShufflesPerResult =
        std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));

    // The SK_MergeTwoSrc shuffle clobbers one of src operands.
    // When we have more than one destination, we need additional instructions
    // to keep sources.
    unsigned NumOfMoves = 0;
    if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
      NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;

    int Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
               NumOfUnfoldedLoads * MemOpCost + NumOfMoves;

    return Cost;
  }

  // Store.
  assert(Opcode == Instruction::Store &&
         "Expected Store Instruction at this point");
  // X86InterleavedAccess support only the following interleaved-access group.
  static const CostTblEntry AVX512InterleavedStoreTbl[] = {
      {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
      {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
      {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 192i8 (and store)

      {4, MVT::v8i8, 10},  // interleave 4 x 8i8 into 32i8 (and store)
      {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
      {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
      {4, MVT::v64i8, 24}  // interleave 4 x 64i8 into 256i8 (and store)
  };

  if (const auto *Entry =
          CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
    return NumOfMemOps * MemOpCost + Entry->Cost;
  // If an entry does not exist, fallback to the default implementation.

  // There is no strided stores meanwhile. And store can't be folded in
  // shuffle.
  unsigned NumOfSources = Factor; // The number of values to be merged.
  unsigned ShuffleCost =
      getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, 0, nullptr);
  unsigned NumOfShufflesPerStore = NumOfSources - 1;

  // The SK_MergeTwoSrc shuffle clobbers one of src operands.
  // We need additional instructions to keep sources.
  unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
  int Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
             NumOfMoves;
  return Cost;
}

int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                           unsigned Factor,
                                           ArrayRef<unsigned> Indices,
                                           unsigned Alignment,
                                           unsigned AddressSpace) {
  // f32/f64/i64/i32/pointer elements are shuffleable on any AVX-512 target;
  // i16/i8 elements additionally require BWI.
  auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) {
    Type *EltTy = VecTy->getVectorElementType();
    if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
        EltTy->isIntegerTy(32) || EltTy->isPointerTy())
      return true;
    if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8))
      return HasBW;
    return false;
  };
  if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
    return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices,
                                            Alignment, AddressSpace);
  if (ST->hasAVX2())
    return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices,
                                          Alignment, AddressSpace);

  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                           Alignment, AddressSpace);
}