//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//
/// A note about the cost model numbers used below: the numbers correspond to
/// some "generic" X86 CPU rather than a concrete CPU model. Usually the
/// numbers correspond to the CPU where the feature first appeared. For
/// example, if we do Subtarget.hasSSE42() in the lookups below, the cost is
/// based on Nehalem as that was the first CPU to support that feature level
/// and thus most likely has the worst-case cost.
/// Some examples of other technologies/CPUs:
///   SSE 3   - Pentium4 / Athlon64
///   SSE 4.1 - Penryn
///   SSE 4.2 - Nehalem
///   AVX     - Sandy Bridge
///   AVX2    - Haswell
///   AVX-512 - Xeon Phi / Skylake
/// And some examples of instruction target dependent costs (latency)
///                   divss     sqrtss     rsqrtss
///   AMD K7          11-16     19         3
///   Piledriver      9-24      13-15      5
///   Jaguar          14        16         2
///   Pentium II,III  18        30         2
///   Nehalem         7-14      7-18       3
///   Haswell         10-13     11         5
/// TODO: Develop and implement the target dependent cost model and
/// specialize cost numbers for different Cost Model Targets such as throughput,
/// code size, latency and uop count.
//===----------------------------------------------------------------------===//

#include "X86TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"

using namespace llvm;

#define DEBUG_TYPE "x86tti"

//===----------------------------------------------------------------------===//
//
// X86 cost model.
//
//===----------------------------------------------------------------------===//

TargetTransformInfo::PopcntSupportKind
X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  // TODO: Currently the __builtin_popcount() implementation using SSE3
  //   instructions is inefficient. Once the problem is fixed, we should
  //   call ST->hasSSE3() instead of ST->hasPOPCNT().
  return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
}

llvm::Optional<unsigned> X86TTIImpl::getCacheSize(
  TargetTransformInfo::CacheLevel Level) const {
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 32 * 1024; // 32 KByte
  case TargetTransformInfo::CacheLevel::L2D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 256 * 1024; // 256 KByte
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity(
  TargetTransformInfo::CacheLevel Level) const {
  //   - Penryn
  //   - Nehalem
  //   - Westmere
  //   - Sandy Bridge
  //   - Ivy Bridge
  //   - Haswell
  //   - Broadwell
  //   - Skylake
  //   - Kabylake
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    LLVM_FALLTHROUGH;
  case TargetTransformInfo::CacheLevel::L2D:
    return 8;
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  bool Vector = (ClassID == 1);
  if (Vector && !ST->hasSSE1())
    return 0;

  if (ST->is64Bit()) {
    if (Vector && ST->hasAVX512())
      return 32;
    return 16;
  }
  return 8;
}

TypeSize
X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  unsigned PreferVectorWidth = ST->getPreferVectorWidth();
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
  case TargetTransformInfo::RGK_FixedWidthVector:
    if (ST->hasAVX512() && PreferVectorWidth >= 512)
      return TypeSize::getFixed(512);
    if (ST->hasAVX() && PreferVectorWidth >= 256)
      return TypeSize::getFixed(256);
    if (ST->hasSSE1() && PreferVectorWidth >= 128)
      return TypeSize::getFixed(128);
    return TypeSize::getFixed(0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}

unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
  return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
      .getFixedSize();
}

unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // If the loop will not be vectorized, don't interleave the loop.
  // Let the regular unroller handle the loop instead, which saves the
  // overflow check and memory check cost.
  if (VF == 1)
    return 1;

  if (ST->isAtom())
    return 1;

  // Sandybridge and Haswell have multiple execution ports and pipelined
  // vector units.
  if (ST->hasAVX())
    return 4;

  return 2;
}

InstructionCost X86TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
    TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
    const Instruction *CxtI) {
  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info, Opd1PropInfo,
                                         Opd2PropInfo, Args, CxtI);

  // vXi8 multiplications are always promoted to vXi16.
  if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
      Ty->getScalarSizeInBits() == 8) {
    Type *WideVecTy =
        VectorType::getExtendedElementVectorType(cast<VectorType>(Ty));
    return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
                            TargetTransformInfo::CastContextHint::None,
                            CostKind) +
           getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
                            TargetTransformInfo::CastContextHint::None,
                            CostKind) +
           getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info,
                                  Opd1PropInfo, Opd2PropInfo);
  }

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
      LT.second.getScalarType() == MVT::i32) {
    // Check if the operands can be represented as a smaller datatype.
    bool Op1Signed = false, Op2Signed = false;
    unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
    unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
    unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);

    // If both are representable as i15 and at least one is constant,
    // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we
    // can treat this as PMADDWD which has the same costs as a vXi16 multiply.
    if (OpMinSize <= 15 && !ST->isPMADDWDSlow()) {
      bool Op1Constant =
          isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]);
      bool Op2Constant =
          isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]);
      bool Op1Sext = isa<SExtInst>(Args[0]) &&
                     (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41()));
      bool Op2Sext = isa<SExtInst>(Args[1]) &&
                     (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41()));

      bool IsZeroExtended = !Op1Signed || !Op2Signed;
      bool IsConstant = Op1Constant || Op2Constant;
      bool IsSext = Op1Sext || Op2Sext;
      if (IsConstant || IsZeroExtended || IsSext)
        LT.second =
            MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
    }
  }

  // Vector multiply by pow2 will be simplified to shifts.
  if (ISD == ISD::MUL &&
      (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      Opd2PropInfo == TargetTransformInfo::OP_PowerOf2)
    return getArithmeticInstrCost(Instruction::Shl, Ty, CostKind, Op1Info,
                                  Op2Info, TargetTransformInfo::OP_None,
                                  TargetTransformInfo::OP_None);

  // On X86, vector signed division by a power-of-two constant is normally
  // expanded to the sequence SRA + SRL + ADD + SRA.
  // The OperandValue properties may not be the same as that of the previous
  // operation; conservatively assume OP_None.
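  // For example (an illustrative sketch, not the exact DAG produced): dividing
  // an i32 lane X by the power-of-two constant 4 becomes roughly
  //   Sgn  = X >>s 31      (SRA)  -- broadcast the sign bit
  //   Bias = Sgn >>u 30    (SRL)  -- 0 for non-negative X, 3 otherwise
  //   Tmp  = X + Bias      (ADD)
  //   Quo  = Tmp >>s 2     (SRA)
  // which is why the cost below is modelled as 2*AShr + LShr + Add.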
  if ((ISD == ISD::SDIV || ISD == ISD::SREM) &&
      (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
    InstructionCost Cost =
        2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, Op1Info,
                                   Op2Info, TargetTransformInfo::OP_None,
                                   TargetTransformInfo::OP_None);
    Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info,
                                   Op2Info, TargetTransformInfo::OP_None,
                                   TargetTransformInfo::OP_None);
    Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind, Op1Info,
                                   Op2Info, TargetTransformInfo::OP_None,
                                   TargetTransformInfo::OP_None);

    if (ISD == ISD::SREM) {
      // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
      Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info,
                                     Op2Info);
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info,
                                     Op2Info);
    }

    return Cost;
  }

  // Vector unsigned division/remainder will be simplified to shifts/masks.
  if ((ISD == ISD::UDIV || ISD == ISD::UREM) &&
      (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
    if (ISD == ISD::UDIV)
      return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info,
                                    Op2Info, TargetTransformInfo::OP_None,
                                    TargetTransformInfo::OP_None);
    // UREM
    return getArithmeticInstrCost(Instruction::And, Ty, CostKind, Op1Info,
                                  Op2Info, TargetTransformInfo::OP_None,
                                  TargetTransformInfo::OP_None);
  }

  static const CostTblEntry GLMCostTable[] = {
    { ISD::FDIV, MVT::f32,   18 }, // divss
    { ISD::FDIV, MVT::v4f32, 35 }, // divps
    { ISD::FDIV, MVT::f64,   33 }, // divsd
    { ISD::FDIV, MVT::v2f64, 65 }, // divpd
  };

  if (ST->useGLMDivSqrtCosts())
    if (const auto *Entry = CostTableLookup(GLMCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SLMCostTable[] = {
    { ISD::MUL,  MVT::v4i32, 11 }, // pmulld
    { ISD::MUL,  MVT::v8i16, 2  }, // pmullw
    { ISD::FMUL, MVT::f64,   2  }, // mulsd
    { ISD::FMUL, MVT::v2f64, 4  }, // mulpd
    { ISD::FMUL, MVT::v4f32, 2  }, // mulps
    { ISD::FDIV, MVT::f32,   17 }, // divss
    { ISD::FDIV, MVT::v4f32, 39 }, // divps
    { ISD::FDIV, MVT::f64,   32 }, // divsd
    { ISD::FDIV, MVT::v2f64, 69 }, // divpd
    { ISD::FADD, MVT::v2f64, 2  }, // addpd
    { ISD::FSUB, MVT::v2f64, 2  }, // subpd
    // v2i64/v4i64 mul is custom lowered as a series of long:
    // multiplies(3), shifts(3) and adds(2)
    // slm muldq version throughput is 2 and addq throughput 4
    // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
    // 2X4 (addq throughput) = 17
    { ISD::MUL,  MVT::v2i64, 17 },
    // slm addq\subq throughput is 4
    { ISD::ADD,  MVT::v2i64, 4  },
    { ISD::SUB,  MVT::v2i64, 4  },
  };

  if (ST->useSLMArithCosts()) {
    if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
      // Check if the operands can be shrunk into a smaller datatype.
      // TODO: Merge this into the generic vXi32 MUL patterns above.
      bool Op1Signed = false;
      unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
      bool Op2Signed = false;
      unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);

      bool SignedMode = Op1Signed || Op2Signed;
      unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);

      if (OpMinSize <= 7)
        return LT.first * 3; // pmullw/sext
      if (!SignedMode && OpMinSize <= 8)
        return LT.first * 3; // pmullw/zext
      if (OpMinSize <= 15)
        return LT.first * 5; // pmullw/pmulhw/pshuf
      if (!SignedMode && OpMinSize <= 16)
        return LT.first * 5; // pmullw/pmulhw/pshuf
    }

    if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
                                            LT.second)) {
      return LT.first * Entry->Cost;
    }
  }

  static const CostTblEntry AVX512BWUniformConstCostTable[] = {
    { ISD::SHL,  MVT::v64i8,   2 }, // psllw + pand.
    { ISD::SRL,  MVT::v64i8,   2 }, // psrlw + pand.
    { ISD::SRA,  MVT::v64i8,   4 }, // psrlw, pand, pxor, psubb.
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasBWI()) {
    if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512UniformConstCostTable[] = {
    { ISD::SRA,  MVT::v2i64,   1 },
    { ISD::SRA,  MVT::v4i64,   1 },
    { ISD::SRA,  MVT::v8i64,   1 },

    { ISD::SHL,  MVT::v64i8,   4 }, // psllw + pand.
    { ISD::SRL,  MVT::v64i8,   4 }, // psrlw + pand.
    { ISD::SRA,  MVT::v64i8,   8 }, // psrlw, pand, pxor, psubb.

    { ISD::SDIV, MVT::v16i32,  6 }, // pmuludq sequence
    { ISD::SREM, MVT::v16i32,  8 }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32,  5 }, // pmuludq sequence
    { ISD::UREM, MVT::v16i32,  7 }, // pmuludq+mul+sub sequence
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasAVX512()) {
    if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX2UniformConstCostTable[] = {
    { ISD::SHL,  MVT::v32i8,   2 }, // psllw + pand.
    { ISD::SRL,  MVT::v32i8,   2 }, // psrlw + pand.
    { ISD::SRA,  MVT::v32i8,   4 }, // psrlw, pand, pxor, psubb.

    { ISD::SRA,  MVT::v4i64,   4 }, // 2 x psrad + shuffle.

    { ISD::SDIV, MVT::v8i32,   6 }, // pmuludq sequence
    { ISD::SREM, MVT::v8i32,   8 }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,   5 }, // pmuludq sequence
    { ISD::UREM, MVT::v8i32,   7 }, // pmuludq+mul+sub sequence
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasAVX2()) {
    if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2UniformConstCostTable[] = {
    { ISD::SHL,  MVT::v16i8,    2 }, // psllw + pand.
    { ISD::SRL,  MVT::v16i8,    2 }, // psrlw + pand.
    { ISD::SRA,  MVT::v16i8,    4 }, // psrlw, pand, pxor, psubb.

    { ISD::SHL,  MVT::v32i8,  4+2 }, // 2*(psllw + pand) + split.
    { ISD::SRL,  MVT::v32i8,  4+2 }, // 2*(psrlw + pand) + split.
    { ISD::SRA,  MVT::v32i8,  8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.

    { ISD::SDIV, MVT::v8i32, 12+2 }, // 2*pmuludq sequence + split.
    { ISD::SREM, MVT::v8i32, 16+2 }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::SDIV, MVT::v4i32,    6 }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32,    8 }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, 10+2 }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32, 14+2 }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::UDIV, MVT::v4i32,    5 }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32,    7 }, // pmuludq+mul+sub sequence
  };

  // XOP has faster vXi8 shifts.
  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasSSE2() && !ST->hasXOP()) {
    if (const auto *Entry =
            CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512BWConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8,  14 }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v64i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8,  14 }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v64i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v32i16,  6 }, // vpmulhw sequence
    { ISD::SREM, MVT::v32i16,  8 }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16,  6 }, // vpmulhuw sequence
    { ISD::UREM, MVT::v32i16,  8 }, // vpmulhuw+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasBWI()) {
    if (const auto *Entry =
            CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512ConstCostTable[] = {
    { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
    { ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
    { ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence
    { ISD::SDIV, MVT::v64i8,  28 }, // 4*ext+4*pmulhw sequence
    { ISD::SREM, MVT::v64i8,  32 }, // 4*ext+4*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8,  28 }, // 4*ext+4*pmulhw sequence
    { ISD::UREM, MVT::v64i8,  32 }, // 4*ext+4*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v32i16, 12 }, // 2*vpmulhw sequence
    { ISD::SREM, MVT::v32i16, 16 }, // 2*vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, 12 }, // 2*vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, 16 }, // 2*vpmulhuw+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasAVX512()) {
    if (const auto *Entry =
            CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX2ConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8,  14 }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v32i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i8,  14 }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v32i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v16i16,  6 }, // vpmulhw sequence
    { ISD::SREM, MVT::v16i16,  8 }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i16,  6 }, // vpmulhuw sequence
    { ISD::UREM, MVT::v16i16,  8 }, // vpmulhuw+mul+sub sequence
    { ISD::SDIV, MVT::v8i32,  15 }, // vpmuldq sequence
    { ISD::SREM, MVT::v8i32,  19 }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  15 }, // vpmuludq sequence
    { ISD::UREM, MVT::v8i32,  19 }, // vpmuludq+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasAVX2()) {
    if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2ConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8,  28+2 }, // 4*ext+4*pmulhw sequence + split.
    { ISD::SREM, MVT::v32i8,  32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
    { ISD::SDIV, MVT::v16i8,    14 }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v16i8,    16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i8,  28+2 }, // 4*ext+4*pmulhw sequence + split.
    { ISD::UREM, MVT::v32i8,  32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v16i8,    14 }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v16i8,    16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
    { ISD::SREM, MVT::v16i16, 16+2 }, // 2*pmulhw+mul+sub sequence + split.
    { ISD::SDIV, MVT::v8i16,     6 }, // pmulhw sequence
    { ISD::SREM, MVT::v8i16,     8 }, // pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split.
    { ISD::UREM, MVT::v16i16, 16+2 }, // 2*pmulhuw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v8i16,     6 }, // pmulhuw sequence
    { ISD::UREM, MVT::v8i16,     8 }, // pmulhuw+mul+sub sequence
    { ISD::SDIV, MVT::v8i32,  38+2 }, // 2*pmuludq sequence + split.
    { ISD::SREM, MVT::v8i32,  48+2 }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::SDIV, MVT::v4i32,    19 }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32,    24 }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  30+2 }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32,  40+2 }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::UDIV, MVT::v4i32,    15 }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32,    20 }, // pmuludq+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasSSE2()) {
    // pmuldq sequence.
    if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
      return LT.first * 32;
    if (ISD == ISD::SREM && LT.second == MVT::v8i32 && ST->hasAVX())
      return LT.first * 38;
    if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
      return LT.first * 15;
    if (ISD == ISD::SREM && LT.second == MVT::v4i32 && ST->hasSSE41())
      return LT.first * 20;

    if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512BWShiftCostTable[] = {
    { ISD::SHL,  MVT::v16i8,  4 }, // extend/vpsllvw/pack sequence.
    { ISD::SRL,  MVT::v16i8,  4 }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA,  MVT::v16i8,  4 }, // extend/vpsravw/pack sequence.
    { ISD::SHL,  MVT::v32i8,  4 }, // extend/vpsllvw/pack sequence.
    { ISD::SRL,  MVT::v32i8,  4 }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA,  MVT::v32i8,  6 }, // extend/vpsravw/pack sequence.
    { ISD::SHL,  MVT::v64i8,  6 }, // extend/vpsllvw/pack sequence.
    { ISD::SRL,  MVT::v64i8,  7 }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA,  MVT::v64i8, 15 }, // extend/vpsravw/pack sequence.

    { ISD::SHL,  MVT::v8i16,  1 }, // vpsllvw
    { ISD::SRL,  MVT::v8i16,  1 }, // vpsrlvw
    { ISD::SRA,  MVT::v8i16,  1 }, // vpsravw
    { ISD::SHL,  MVT::v16i16, 1 }, // vpsllvw
    { ISD::SRL,  MVT::v16i16, 1 }, // vpsrlvw
    { ISD::SRA,  MVT::v16i16, 1 }, // vpsravw
    { ISD::SHL,  MVT::v32i16, 1 }, // vpsllvw
    { ISD::SRL,  MVT::v32i16, 1 }, // vpsrlvw
    { ISD::SRA,  MVT::v32i16, 1 }, // vpsravw
  };

  if (ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWShiftCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL,  MVT::v16i16, 1 }, // psllw.
    { ISD::SRL,  MVT::v16i16, 1 }, // psrlw.
    { ISD::SRA,  MVT::v16i16, 1 }, // psraw.
    { ISD::SHL,  MVT::v32i16, 2 }, // 2*psllw.
    { ISD::SRL,  MVT::v32i16, 2 }, // 2*psrlw.
    { ISD::SRA,  MVT::v32i16, 2 }, // 2*psraw.

    { ISD::SHL,  MVT::v8i32,  1 }, // pslld
    { ISD::SRL,  MVT::v8i32,  1 }, // psrld
    { ISD::SRA,  MVT::v8i32,  1 }, // psrad
    { ISD::SHL,  MVT::v4i64,  1 }, // psllq
    { ISD::SRL,  MVT::v4i64,  1 }, // psrlq
  };

  if (ST->hasAVX2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {
    if (const auto *Entry =
            CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL,  MVT::v8i16,  1 }, // psllw.
    { ISD::SHL,  MVT::v4i32,  1 }, // pslld
    { ISD::SHL,  MVT::v2i64,  1 }, // psllq.

    { ISD::SRL,  MVT::v8i16,  1 }, // psrlw.
    { ISD::SRL,  MVT::v4i32,  1 }, // psrld.
    { ISD::SRL,  MVT::v2i64,  1 }, // psrlq.

    { ISD::SRA,  MVT::v8i16,  1 }, // psraw.
    { ISD::SRA,  MVT::v4i32,  1 }, // psrad.
  };

  if (ST->hasSSE2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {
    if (const auto *Entry =
            CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512DQCostTable[] = {
    { ISD::MUL,  MVT::v2i64, 2 }, // pmullq
    { ISD::MUL,  MVT::v4i64, 2 }, // pmullq
    { ISD::MUL,  MVT::v8i64, 2 }  // pmullq
  };

  // Look for AVX512DQ lowering tricks for custom cases.
  if (ST->hasDQI())
    if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512BWCostTable[] = {
    { ISD::SHL,  MVT::v64i8, 11 }, // vpblendvb sequence.
    { ISD::SRL,  MVT::v64i8, 11 }, // vpblendvb sequence.
    { ISD::SRA,  MVT::v64i8, 24 }, // vpblendvb sequence.
  };

  // Look for AVX512BW lowering tricks for custom cases.
  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512CostTable[] = {
    { ISD::SHL,  MVT::v4i32,   1 },
    { ISD::SRL,  MVT::v4i32,   1 },
    { ISD::SRA,  MVT::v4i32,   1 },
    { ISD::SHL,  MVT::v8i32,   1 },
    { ISD::SRL,  MVT::v8i32,   1 },
    { ISD::SRA,  MVT::v8i32,   1 },
    { ISD::SHL,  MVT::v16i32,  1 },
    { ISD::SRL,  MVT::v16i32,  1 },
    { ISD::SRA,  MVT::v16i32,  1 },

    { ISD::SHL,  MVT::v2i64,   1 },
    { ISD::SRL,  MVT::v2i64,   1 },
    { ISD::SHL,  MVT::v4i64,   1 },
    { ISD::SRL,  MVT::v4i64,   1 },
    { ISD::SHL,  MVT::v8i64,   1 },
    { ISD::SRL,  MVT::v8i64,   1 },

    { ISD::SRA,  MVT::v2i64,   1 },
    { ISD::SRA,  MVT::v4i64,   1 },
    { ISD::SRA,  MVT::v8i64,   1 },

    { ISD::MUL,  MVT::v16i32,  1 }, // pmulld (Skylake from agner.org)
    { ISD::MUL,  MVT::v8i32,   1 }, // pmulld (Skylake from agner.org)
    { ISD::MUL,  MVT::v4i32,   1 }, // pmulld (Skylake from agner.org)
    { ISD::MUL,  MVT::v8i64,   6 }, // 3*pmuludq/3*shift/2*add
    { ISD::MUL,  MVT::i64,     1 }, // Skylake from http://www.agner.org/

    { ISD::FNEG, MVT::v8f64,   1 }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v8f64,   1 }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v8f64,   1 }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v8f64,   1 }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::f64,     4 }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,   4 }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64,   8 }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v8f64,  16 }, // Skylake from http://www.agner.org/

    { ISD::FNEG, MVT::v16f32,  1 }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v16f32,  1 }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v16f32,  1 }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v16f32,  1 }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::f32,     3 }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,   3 }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32,   5 }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v16f32, 10 }, // Skylake from http://www.agner.org/
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX2ShiftCostTable[] = {
    // Shifts on vXi64/vXi32 on AVX2 are legal even though we declare them
    // custom, in order to detect the cases where the shift amount is a scalar.
    { ISD::SHL,  MVT::v4i32,   2 }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL,  MVT::v4i32,   2 }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA,  MVT::v4i32,   2 }, // vpsravd (Haswell from agner.org)
    { ISD::SHL,  MVT::v8i32,   2 }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL,  MVT::v8i32,   2 }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA,  MVT::v8i32,   2 }, // vpsravd (Haswell from agner.org)
    { ISD::SHL,  MVT::v2i64,   1 }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL,  MVT::v2i64,   1 }, // vpsrlvq (Haswell from agner.org)
    { ISD::SHL,  MVT::v4i64,   1 }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL,  MVT::v4i64,   1 }, // vpsrlvq (Haswell from agner.org)
  };

  if (ST->hasAVX512()) {
    if (ISD == ISD::SHL && LT.second == MVT::v32i16 &&
        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
      // On AVX512, a packed v32i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info, Op2Info,
                                    TargetTransformInfo::OP_None,
                                    TargetTransformInfo::OP_None);
  }

  // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
  if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
    if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
      // On AVX2, a packed v16i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info, Op2Info,
                                    TargetTransformInfo::OP_None,
                                    TargetTransformInfo::OP_None);

    if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry XOPShiftCostTable[] = {
    // 128bit shifts take 1cy, but right shifts require negation beforehand.
    { ISD::SHL,  MVT::v16i8,    1 },
    { ISD::SRL,  MVT::v16i8,    2 },
    { ISD::SRA,  MVT::v16i8,    2 },
    { ISD::SHL,  MVT::v8i16,    1 },
    { ISD::SRL,  MVT::v8i16,    2 },
    { ISD::SRA,  MVT::v8i16,    2 },
    { ISD::SHL,  MVT::v4i32,    1 },
    { ISD::SRL,  MVT::v4i32,    2 },
    { ISD::SRA,  MVT::v4i32,    2 },
    { ISD::SHL,  MVT::v2i64,    1 },
    { ISD::SRL,  MVT::v2i64,    2 },
    { ISD::SRA,  MVT::v2i64,    2 },
    // 256bit shifts require splitting if AVX2 didn't catch them above.
    { ISD::SHL,  MVT::v32i8,  2+2 },
    { ISD::SRL,  MVT::v32i8,  4+2 },
    { ISD::SRA,  MVT::v32i8,  4+2 },
    { ISD::SHL,  MVT::v16i16, 2+2 },
    { ISD::SRL,  MVT::v16i16, 4+2 },
    { ISD::SRA,  MVT::v16i16, 4+2 },
    { ISD::SHL,  MVT::v8i32,  2+2 },
    { ISD::SRL,  MVT::v8i32,  4+2 },
    { ISD::SRA,  MVT::v8i32,  4+2 },
    { ISD::SHL,  MVT::v4i64,  2+2 },
    { ISD::SRL,  MVT::v4i64,  4+2 },
    { ISD::SRA,  MVT::v4i64,  4+2 },
  };

  // Look for XOP lowering tricks.
  if (ST->hasXOP()) {
    // If the right shift is constant then we'll fold the negation so
    // it's as cheap as a left shift.
    int ShiftISD = ISD;
    if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) &&
        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
      ShiftISD = ISD::SHL;
    if (const auto *Entry =
            CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2UniformShiftCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL,  MVT::v16i16, 2+2 }, // 2*psllw + split.
    { ISD::SHL,  MVT::v8i32,  2+2 }, // 2*pslld + split.
    { ISD::SHL,  MVT::v4i64,  2+2 }, // 2*psllq + split.

    { ISD::SRL,  MVT::v16i16, 2+2 }, // 2*psrlw + split.
    { ISD::SRL,  MVT::v8i32,  2+2 }, // 2*psrld + split.
    { ISD::SRL,  MVT::v4i64,  2+2 }, // 2*psrlq + split.

    { ISD::SRA,  MVT::v16i16, 2+2 }, // 2*psraw + split.
    { ISD::SRA,  MVT::v8i32,  2+2 }, // 2*psrad + split.
    { ISD::SRA,  MVT::v2i64,    4 }, // 2*psrad + shuffle.
    { ISD::SRA,  MVT::v4i64,  8+2 }, // 2*(2*psrad + shuffle) + split.
  };

  if (ST->hasSSE2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {

    // Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table.
    if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2())
      return LT.first * 4; // 2*psrad + shuffle.

    if (const auto *Entry =
            CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  if (ISD == ISD::SHL &&
      Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
    MVT VT = LT.second;
    // Vector shift left by non uniform constant can be lowered
    // into vector multiply.
    if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
        ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
      ISD = ISD::MUL;
  }

  static const CostTblEntry AVX2CostTable[] = {
    { ISD::SHL,  MVT::v16i8,   6 }, // vpblendvb sequence.
    { ISD::SHL,  MVT::v32i8,   6 }, // vpblendvb sequence.
    { ISD::SHL,  MVT::v64i8,  12 }, // 2*vpblendvb sequence.
    { ISD::SHL,  MVT::v8i16,   5 }, // extend/vpsrlvd/pack sequence.
    { ISD::SHL,  MVT::v16i16,  7 }, // extend/vpsrlvd/pack sequence.
    { ISD::SHL,  MVT::v32i16, 14 }, // 2*extend/vpsrlvd/pack sequence.

    { ISD::SRL,  MVT::v16i8,   6 }, // vpblendvb sequence.
    { ISD::SRL,  MVT::v32i8,   6 }, // vpblendvb sequence.
    { ISD::SRL,  MVT::v64i8,  12 }, // 2*vpblendvb sequence.
    { ISD::SRL,  MVT::v8i16,   5 }, // extend/vpsrlvd/pack sequence.
    { ISD::SRL,  MVT::v16i16,  7 }, // extend/vpsrlvd/pack sequence.
    { ISD::SRL,  MVT::v32i16, 14 }, // 2*extend/vpsrlvd/pack sequence.

    { ISD::SRA,  MVT::v16i8,  17 }, // vpblendvb sequence.
    { ISD::SRA,  MVT::v32i8,  17 }, // vpblendvb sequence.
    { ISD::SRA,  MVT::v64i8,  34 }, // 2*vpblendvb sequence.
    { ISD::SRA,  MVT::v8i16,   5 }, // extend/vpsravd/pack sequence.
    { ISD::SRA,  MVT::v16i16,  7 }, // extend/vpsravd/pack sequence.
    { ISD::SRA,  MVT::v32i16, 14 }, // 2*extend/vpsravd/pack sequence.
    { ISD::SRA,  MVT::v2i64,   2 }, // srl/xor/sub sequence.
    { ISD::SRA,  MVT::v4i64,   2 }, // srl/xor/sub sequence.

    { ISD::SUB,  MVT::v32i8,   1 }, // psubb
    { ISD::ADD,  MVT::v32i8,   1 }, // paddb
    { ISD::SUB,  MVT::v16i16,  1 }, // psubw
    { ISD::ADD,  MVT::v16i16,  1 }, // paddw
    { ISD::SUB,  MVT::v8i32,   1 }, // psubd
    { ISD::ADD,  MVT::v8i32,   1 }, // paddd
    { ISD::SUB,  MVT::v4i64,   1 }, // psubq
    { ISD::ADD,  MVT::v4i64,   1 }, // paddq

    { ISD::MUL,  MVT::v16i16,  1 }, // pmullw
    { ISD::MUL,  MVT::v8i32,   2 }, // pmulld (Haswell from agner.org)
    { ISD::MUL,  MVT::v4i64,   6 }, // 3*pmuludq/3*shift/2*add

    { ISD::FNEG, MVT::v4f64,   1 }, // Haswell from http://www.agner.org/
    { ISD::FNEG, MVT::v8f32,   1 }, // Haswell from http://www.agner.org/
    { ISD::FADD, MVT::v4f64,   1 }, // Haswell from http://www.agner.org/
    { ISD::FADD, MVT::v8f32,   1 }, // Haswell from http://www.agner.org/
    { ISD::FSUB, MVT::v4f64,   1 }, // Haswell from http://www.agner.org/
    { ISD::FSUB, MVT::v8f32,   1 }, // Haswell from http://www.agner.org/
    { ISD::FMUL, MVT::f64,     1 }, // Haswell from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64,   1 }, // Haswell from http://www.agner.org/
    { ISD::FMUL, MVT::v4f64,   1 }, // Haswell from http://www.agner.org/
    { ISD::FMUL, MVT::v8f32,   1 }, // Haswell from http://www.agner.org/

    { ISD::FDIV, MVT::f32,     7 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,   7 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32,  14 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::f64,    14 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,  14 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64,  28 }, // Haswell from http://www.agner.org/
  };

  // Look for AVX2 lowering tricks for custom cases.
  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX1CostTable[] = {
    // We don't have to scalarize unsupported ops. We can issue two half-sized
    // operations and we only need to extract the upper YMM half.
    // Two ops + 1 extract + 1 insert = 4.
    { ISD::MUL,  MVT::v16i16,  4 },
    { ISD::MUL,  MVT::v8i32,   5 }, // BTVER2 from http://www.agner.org/
    { ISD::MUL,  MVT::v4i64,  12 },

    { ISD::SUB,  MVT::v32i8,   4 },
    { ISD::ADD,  MVT::v32i8,   4 },
    { ISD::SUB,  MVT::v16i16,  4 },
    { ISD::ADD,  MVT::v16i16,  4 },
    { ISD::SUB,  MVT::v8i32,   4 },
    { ISD::ADD,  MVT::v8i32,   4 },
    { ISD::SUB,  MVT::v4i64,   4 },
    { ISD::ADD,  MVT::v4i64,   4 },

    { ISD::SHL,  MVT::v32i8,  22 }, // pblendvb sequence + split.
    { ISD::SHL,  MVT::v8i16,   6 }, // pblendvb sequence.
    { ISD::SHL,  MVT::v16i16, 13 }, // pblendvb sequence + split.
    { ISD::SHL,  MVT::v4i32,   3 }, // pslld/paddd/cvttps2dq/pmulld
    { ISD::SHL,  MVT::v8i32,   9 }, // pslld/paddd/cvttps2dq/pmulld + split
    { ISD::SHL,  MVT::v2i64,   2 }, // Shift each lane + blend.
    { ISD::SHL,  MVT::v4i64,   6 }, // Shift each lane + blend + split.

    { ISD::SRL,  MVT::v32i8,  23 }, // pblendvb sequence + split.
    { ISD::SRL,  MVT::v16i16, 28 }, // pblendvb sequence + split.
    { ISD::SRL,  MVT::v4i32,   6 }, // Shift each lane + blend.
    { ISD::SRL,  MVT::v8i32,  14 }, // Shift each lane + blend + split.
    { ISD::SRL,  MVT::v2i64,   2 }, // Shift each lane + blend.
    { ISD::SRL,  MVT::v4i64,   6 }, // Shift each lane + blend + split.

    { ISD::SRA,  MVT::v32i8,  44 }, // pblendvb sequence + split.
    { ISD::SRA,  MVT::v16i16, 28 }, // pblendvb sequence + split.
    { ISD::SRA,  MVT::v4i32,   6 }, // Shift each lane + blend.
    { ISD::SRA,  MVT::v8i32,  14 }, // Shift each lane + blend + split.
    { ISD::SRA,  MVT::v2i64,   5 }, // Shift each lane + blend.
    { ISD::SRA,  MVT::v4i64,  12 }, // Shift each lane + blend + split.

    { ISD::FNEG, MVT::v4f64,   2 }, // BTVER2 from http://www.agner.org/
    { ISD::FNEG, MVT::v8f32,   2 }, // BTVER2 from http://www.agner.org/

    { ISD::FMUL, MVT::f64,     2 }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64,   2 }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v4f64,   4 }, // BTVER2 from http://www.agner.org/

    { ISD::FDIV, MVT::f32,    14 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,  14 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32,  28 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::f64,    22 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,  22 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64,  44 }, // SNB from http://www.agner.org/
  };

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE42CostTable[] = {
    { ISD::FADD, MVT::f64,     1 }, // Nehalem from http://www.agner.org/
    { ISD::FADD, MVT::f32,     1 }, // Nehalem from http://www.agner.org/
    { ISD::FADD, MVT::v2f64,   1 }, // Nehalem from http://www.agner.org/
    { ISD::FADD, MVT::v4f32,   1 }, // Nehalem from http://www.agner.org/

    { ISD::FSUB, MVT::f64,     1 }, // Nehalem from http://www.agner.org/
    { ISD::FSUB, MVT::f32,     1 }, // Nehalem from http://www.agner.org/
    { ISD::FSUB, MVT::v2f64,   1 }, // Nehalem from http://www.agner.org/
    { ISD::FSUB, MVT::v4f32,   1 }, // Nehalem from http://www.agner.org/

    { ISD::FMUL, MVT::f64,     1 }, // Nehalem from http://www.agner.org/
    { ISD::FMUL, MVT::f32,     1 }, // Nehalem from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64,   1 }, // Nehalem from http://www.agner.org/
    { ISD::FMUL, MVT::v4f32,   1 }, // Nehalem from http://www.agner.org/

    { ISD::FDIV, MVT::f32,    14 }, // Nehalem from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,  14 }, // Nehalem from http://www.agner.org/
    { ISD::FDIV, MVT::f64,    22 }, // Nehalem from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,  22 }, // Nehalem from http://www.agner.org/

    { ISD::MUL,  MVT::v2i64,   6 }  // 3*pmuludq/3*shift/2*add
  };

  if (ST->hasSSE42())
    if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE41CostTable[] = {
    { ISD::SHL,  MVT::v16i8,  10 }, // pblendvb sequence.
    { ISD::SHL,  MVT::v8i16,  11 }, // pblendvb sequence.
    { ISD::SHL,  MVT::v4i32,   4 }, // pslld/paddd/cvttps2dq/pmulld

    { ISD::SRL,  MVT::v16i8,  11 }, // pblendvb sequence.
    { ISD::SRL,  MVT::v8i16,  13 }, // pblendvb sequence.
    { ISD::SRL,  MVT::v4i32,  16 }, // Shift each lane + blend.

    { ISD::SRA,  MVT::v16i8,  21 }, // pblendvb sequence.
    { ISD::SRA,  MVT::v8i16,  13 }, // pblendvb sequence.

    { ISD::MUL,  MVT::v4i32,   2 }  // pmulld (Nehalem from agner.org)
  };

  if (ST->hasSSE41())
    if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE2CostTable[] = {
    // We don't correctly identify costs of casts because they are marked as
    // custom.
    { ISD::SHL,  MVT::v16i8,  13 }, // cmpgtb sequence.
    { ISD::SHL,  MVT::v8i16,  25 }, // cmpgtw sequence.
    { ISD::SHL,  MVT::v4i32,  16 }, // pslld/paddd/cvttps2dq/pmuludq.
    { ISD::SHL,  MVT::v2i64,   4 }, // splat+shuffle sequence.

    { ISD::SRL,  MVT::v16i8,  14 }, // cmpgtb sequence.
    { ISD::SRL,  MVT::v8i16,  16 }, // cmpgtw sequence.
    { ISD::SRL,  MVT::v4i32,  12 }, // Shift each lane + blend.
    { ISD::SRL,  MVT::v2i64,   4 }, // splat+shuffle sequence.

    { ISD::SRA,  MVT::v16i8,  27 }, // unpacked cmpgtb sequence.
    { ISD::SRA,  MVT::v8i16,  16 }, // cmpgtw sequence.
    { ISD::SRA,  MVT::v4i32,  12 }, // Shift each lane + blend.
    { ISD::SRA,  MVT::v2i64,   8 }, // srl/xor/sub splat+shuffle sequence.

    { ISD::MUL,  MVT::v8i16,   1 }, // pmullw
    { ISD::MUL,  MVT::v4i32,   6 }, // 3*pmuludq/4*shuffle
    { ISD::MUL,  MVT::v2i64,   8 }, // 3*pmuludq/3*shift/2*add

    { ISD::FDIV, MVT::f32,    23 }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,  39 }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::f64,    38 }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,  69 }, // Pentium IV from http://www.agner.org/

    { ISD::FNEG, MVT::f32,     1 }, // Pentium IV from http://www.agner.org/
    { ISD::FNEG, MVT::f64,     1 }, // Pentium IV from http://www.agner.org/
    { ISD::FNEG, MVT::v4f32,   1 }, // Pentium IV from http://www.agner.org/
    { ISD::FNEG, MVT::v2f64,   1 }, // Pentium IV from http://www.agner.org/

    { ISD::FADD, MVT::f32,     2 }, // Pentium IV from http://www.agner.org/
    { ISD::FADD, MVT::f64,     2 }, // Pentium IV from http://www.agner.org/

    { ISD::FSUB, MVT::f32,     2 }, // Pentium IV from http://www.agner.org/
    { ISD::FSUB, MVT::f64,     2 }, // Pentium IV from http://www.agner.org/
  };

  if (ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE1CostTable[] = {
    { ISD::FDIV, MVT::f32,    17 }, // Pentium III from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,  34 }, // Pentium III from http://www.agner.org/

    { ISD::FNEG, MVT::f32,     2 }, // Pentium III from http://www.agner.org/
    { ISD::FNEG, MVT::v4f32,   2 }, // Pentium III from http://www.agner.org/

    { ISD::FADD, MVT::f32,     1 }, // Pentium III from http://www.agner.org/
    { ISD::FADD, MVT::v4f32,   2 }, // Pentium III from http://www.agner.org/

    { ISD::FSUB, MVT::f32,     1 }, // Pentium III from http://www.agner.org/
    { ISD::FSUB, MVT::v4f32,   2 }, // Pentium III from http://www.agner.org/
  };

  if (ST->hasSSE1())
    if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
    { ISD::ADD,  MVT::i64,  1 }, // Core (Merom) from http://www.agner.org/
    { ISD::SUB,  MVT::i64,  1 }, // Core (Merom) from http://www.agner.org/
    { ISD::MUL,  MVT::i64,  2 }, // Nehalem from http://www.agner.org/
  };

  if (ST->is64Bit())
    if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
    { ISD::ADD,  MVT::i8,   1 }, // Pentium III from http://www.agner.org/
    { ISD::ADD,  MVT::i16,  1 }, // Pentium III from http://www.agner.org/
    { ISD::ADD,  MVT::i32,  1 }, // Pentium III from http://www.agner.org/

    { ISD::SUB,  MVT::i8,   1 }, // Pentium III from http://www.agner.org/
    { ISD::SUB,  MVT::i16,  1 }, // Pentium III from http://www.agner.org/
    { ISD::SUB,  MVT::i32,  1 }, // Pentium III from http://www.agner.org/
  };

  if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
    return LT.first * Entry->Cost;

  // It is not a good idea to vectorize division. We have to scalarize it and
  // in the process we will often end up having to spill regular
  // registers. The overhead of division is going to dominate most kernels
  // anyways so try hard to prevent vectorization of division - it is
  // generally a bad idea. Assume somewhat arbitrarily that we have to be able
  // to hide "20 cycles" for each lane.
  if (LT.second.isVector() && (ISD == ISD::SDIV || ISD == ISD::SREM ||
                               ISD == ISD::UDIV || ISD == ISD::UREM)) {
    InstructionCost ScalarCost = getArithmeticInstrCost(
        Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info,
        TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
    return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
  }

  // Fallback to the default implementation.
  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info);
}

InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                           VectorType *BaseTp,
                                           ArrayRef<int> Mask, int Index,
                                           VectorType *SubTp) {
  // 64-bit packed float vectors (v2f32) are widened to type v4f32.
  // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, BaseTp);

  Kind = improveShuffleKindFromMask(Kind, Mask);
  // Treat Transpose as 2-op shuffles - there's no difference in lowering.
  if (Kind == TTI::SK_Transpose)
    Kind = TTI::SK_PermuteTwoSrc;

  // For Broadcasts we are splatting the first element from the first input
  // register, so only need to reference that input and all the output
  // registers are the same.
  if (Kind == TTI::SK_Broadcast)
    LT.first = 1;

  // Subvector extractions are free if they start at the beginning of a
  // vector and cheap if the subvectors are aligned.
  if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
    int NumElts = LT.second.getVectorNumElements();
    if ((Index % NumElts) == 0)
      return 0;
    std::pair<InstructionCost, MVT> SubLT =
        TLI->getTypeLegalizationCost(DL, SubTp);
    if (SubLT.second.isVector()) {
      int NumSubElts = SubLT.second.getVectorNumElements();
      if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
        return SubLT.first;
      // Handle some cases for widening legalization. For now we only handle
      // cases where the original subvector was naturally aligned and evenly
      // fit in its legalized subvector type.
      // FIXME: Remove some of the alignment restrictions.
      // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
      // vectors.
      int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
      if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
          (NumSubElts % OrigSubElts) == 0 &&
          LT.second.getVectorElementType() ==
              SubLT.second.getVectorElementType() &&
          LT.second.getVectorElementType().getSizeInBits() ==
              BaseTp->getElementType()->getPrimitiveSizeInBits()) {
        assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
               "Unexpected number of elements!");
        auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
                                           LT.second.getVectorNumElements());
        auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
                                           SubLT.second.getVectorNumElements());
        int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
        InstructionCost ExtractCost = getShuffleCost(
            TTI::SK_ExtractSubvector, VecTy, None, ExtractIndex, SubTy);

        // If the original size is 32-bits or more, we can use pshufd. Otherwise
        // if we have SSSE3 we can use pshufb.
        if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
          return ExtractCost + 1; // pshufd or pshufb

        assert(SubTp->getPrimitiveSizeInBits() == 16 &&
               "Unexpected vector size");

        return ExtractCost + 2; // worst case pshufhw + pshufd
      }
    }
  }

  // Subvector insertions are cheap if the subvectors are aligned.
  // Note that in general, the insertion starting at the beginning of a vector
  // isn't free, because we need to preserve the rest of the wide vector.
  if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
    int NumElts = LT.second.getVectorNumElements();
    std::pair<InstructionCost, MVT> SubLT =
        TLI->getTypeLegalizationCost(DL, SubTp);
    if (SubLT.second.isVector()) {
      int NumSubElts = SubLT.second.getVectorNumElements();
      if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
        return SubLT.first;
    }

    // If the insertion isn't aligned, treat it like a 2-op shuffle.
    Kind = TTI::SK_PermuteTwoSrc;
  }

  // Handle some common (illegal) sub-vector types as they are often very cheap
  // to shuffle even on targets without PSHUFB.
  EVT VT = TLI->getValueType(DL, BaseTp);
  if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
      !ST->hasSSSE3()) {
    static const CostTblEntry SSE2SubVectorShuffleTbl[] = {
      {TTI::SK_Broadcast,        MVT::v4i16, 1}, // pshuflw
      {TTI::SK_Broadcast,        MVT::v2i16, 1}, // pshuflw
      {TTI::SK_Broadcast,        MVT::v8i8,  2}, // punpck/pshuflw
      {TTI::SK_Broadcast,        MVT::v4i8,  2}, // punpck/pshuflw
      {TTI::SK_Broadcast,        MVT::v2i8,  1}, // punpck

      {TTI::SK_Reverse,          MVT::v4i16, 1}, // pshuflw
      {TTI::SK_Reverse,          MVT::v2i16, 1}, // pshuflw
      {TTI::SK_Reverse,          MVT::v4i8,  3}, // punpck/pshuflw/packus
      {TTI::SK_Reverse,          MVT::v2i8,  1}, // punpck

      {TTI::SK_PermuteTwoSrc,    MVT::v4i16, 2}, // punpck/pshuflw
      {TTI::SK_PermuteTwoSrc,    MVT::v2i16, 2}, // punpck/pshuflw
      {TTI::SK_PermuteTwoSrc,    MVT::v8i8,  7}, // punpck/pshuflw
      {TTI::SK_PermuteTwoSrc,    MVT::v4i8,  4}, // punpck/pshuflw
      {TTI::SK_PermuteTwoSrc,    MVT::v2i8,  2}, // punpck

      {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw
      {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw
      {TTI::SK_PermuteSingleSrc, MVT::v8i8,  5}, // punpck/pshuflw
      {TTI::SK_PermuteSingleSrc, MVT::v4i8,  3}, // punpck/pshuflw
      {TTI::SK_PermuteSingleSrc, MVT::v2i8,  1}, // punpck
    };

    if (ST->hasSSE2())
      if (const auto *Entry =
              CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
        return Entry->Cost;
  }

  // We are going to permute multiple sources and the result will be in
  // multiple destinations. Provide an accurate cost only for splits where
  // the element type remains the same.
  if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
    MVT LegalVT = LT.second;
    if (LegalVT.isVector() &&
        LegalVT.getVectorElementType().getSizeInBits() ==
            BaseTp->getElementType()->getPrimitiveSizeInBits() &&
        LegalVT.getVectorNumElements() <
            cast<FixedVectorType>(BaseTp)->getNumElements()) {

      unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
      unsigned LegalVTSize = LegalVT.getStoreSize();
      // Number of source vectors after legalization:
      unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
      // Number of destination vectors after legalization:
      InstructionCost NumOfDests = LT.first;

      auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
                                              LegalVT.getVectorNumElements());

      InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
      return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
                                            None, 0, nullptr);
    }

    return BaseT::getShuffleCost(Kind, BaseTp, Mask, Index, SubTp);
  }

  // For 2-input shuffles, we must account for splitting the 2 inputs into many.
  if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
    // We assume that source and destination have the same vector type.
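    // Worked example (illustrative): if legalization splits the type once
    // (LT.first == 2), each of the 2 destination registers is built from
    // 2 * 2 - 1 = 3 two-input shuffles of the legal type, so the table cost
    // below is scaled by 2 * 3 = 6.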
    InstructionCost NumOfDests = LT.first;
    InstructionCost NumOfShufflesPerDest = LT.first * 2 - 1;
    LT.first = NumOfDests * NumOfShufflesPerDest;
  }

  static const CostTblEntry AVX512FP16ShuffleTbl[] = {
      {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
      {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw
      {TTI::SK_Broadcast, MVT::v8f16, 1},  // vpbroadcastw

      {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw
      {TTI::SK_Reverse, MVT::v16f16, 2}, // vpermw
      {TTI::SK_Reverse, MVT::v8f16, 1},  // vpshufb

      {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw
      {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw
      {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1},  // vpshufb

      {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w
      {TTI::SK_PermuteTwoSrc, MVT::v16f16, 2}, // vpermt2w
      {TTI::SK_PermuteTwoSrc, MVT::v8f16, 2}   // vpermt2w
  };

  if (!ST->useSoftFloat() && ST->hasFP16())
    if (const auto *Entry =
            CostTableLookup(AVX512FP16ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512VBMIShuffleTbl[] = {
      {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
      {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb

      {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
      {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb

      {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b
      {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b
      {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2}  // vpermt2b
  };

  if (ST->hasVBMI())
    if (const auto *Entry =
            CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512BWShuffleTbl[] = {
      {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
      {TTI::SK_Broadcast, MVT::v64i8, 1},  // vpbroadcastb

      {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw
      {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw
      {TTI::SK_Reverse, MVT::v64i8, 2},  // pshufb + vshufi64x2

      {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
      {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
      {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8},  // extend to v32i16

      {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w
      {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w
      {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2},  // vpermt2w
      {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1

      {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw
      {TTI::SK_Select, MVT::v64i8, 1},  // vblendmb
  };

  if (ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512ShuffleTbl[] = {
      {TTI::SK_Broadcast, MVT::v8f64, 1},  // vbroadcastpd
      {TTI::SK_Broadcast, MVT::v16f32, 1}, // vbroadcastps
      {TTI::SK_Broadcast, MVT::v8i64, 1},  // vpbroadcastq
      {TTI::SK_Broadcast, MVT::v16i32, 1}, // vpbroadcastd
      {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
      {TTI::SK_Broadcast, MVT::v64i8, 1},  // vpbroadcastb

      {TTI::SK_Reverse, MVT::v8f64, 1},  // vpermpd
      {TTI::SK_Reverse, MVT::v16f32, 1}, // vpermps
      {TTI::SK_Reverse, MVT::v8i64, 1},  // vpermq
      {TTI::SK_Reverse, MVT::v16i32, 1}, // vpermd
      {TTI::SK_Reverse, MVT::v32i16, 7}, // per mca
      {TTI::SK_Reverse, MVT::v64i8, 7},  // per mca

      {TTI::SK_PermuteSingleSrc, MVT::v8f64, 1}, // vpermpd
{TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd 1324 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // vpermpd 1325 {TTI::SK_PermuteSingleSrc, MVT::v16f32, 1}, // vpermps 1326 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps 1327 {TTI::SK_PermuteSingleSrc, MVT::v4f32, 1}, // vpermps 1328 {TTI::SK_PermuteSingleSrc, MVT::v8i64, 1}, // vpermq 1329 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq 1330 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // vpermq 1331 {TTI::SK_PermuteSingleSrc, MVT::v16i32, 1}, // vpermd 1332 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd 1333 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // vpermd 1334 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb 1335 1336 {TTI::SK_PermuteTwoSrc, MVT::v8f64, 1}, // vpermt2pd 1337 {TTI::SK_PermuteTwoSrc, MVT::v16f32, 1}, // vpermt2ps 1338 {TTI::SK_PermuteTwoSrc, MVT::v8i64, 1}, // vpermt2q 1339 {TTI::SK_PermuteTwoSrc, MVT::v16i32, 1}, // vpermt2d 1340 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 1}, // vpermt2pd 1341 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 1}, // vpermt2ps 1342 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 1}, // vpermt2q 1343 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 1}, // vpermt2d 1344 {TTI::SK_PermuteTwoSrc, MVT::v2f64, 1}, // vpermt2pd 1345 {TTI::SK_PermuteTwoSrc, MVT::v4f32, 1}, // vpermt2ps 1346 {TTI::SK_PermuteTwoSrc, MVT::v2i64, 1}, // vpermt2q 1347 {TTI::SK_PermuteTwoSrc, MVT::v4i32, 1}, // vpermt2d 1348 1349 // FIXME: This just applies the type legalization cost rules above 1350 // assuming these completely split. 1351 {TTI::SK_PermuteSingleSrc, MVT::v32i16, 14}, 1352 {TTI::SK_PermuteSingleSrc, MVT::v64i8, 14}, 1353 {TTI::SK_PermuteTwoSrc, MVT::v32i16, 42}, 1354 {TTI::SK_PermuteTwoSrc, MVT::v64i8, 42}, 1355 1356 {TTI::SK_Select, MVT::v32i16, 1}, // vpternlogq 1357 {TTI::SK_Select, MVT::v64i8, 1}, // vpternlogq 1358 {TTI::SK_Select, MVT::v8f64, 1}, // vblendmpd 1359 {TTI::SK_Select, MVT::v16f32, 1}, // vblendmps 1360 {TTI::SK_Select, MVT::v8i64, 1}, // vblendmq 1361 {TTI::SK_Select, MVT::v16i32, 1}, // vblendmd 1362 }; 1363 1364 if (ST->hasAVX512()) 1365 if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second)) 1366 return LT.first * Entry->Cost; 1367 1368 static const CostTblEntry AVX2ShuffleTbl[] = { 1369 {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastpd 1370 {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastps 1371 {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq 1372 {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd 1373 {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw 1374 {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb 1375 1376 {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd 1377 {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps 1378 {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq 1379 {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd 1380 {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb 1381 {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb 1382 1383 {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb 1384 {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb 1385 1386 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd 1387 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps 1388 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq 1389 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd 1390 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb 1391 // + vpblendvb 1392 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb 1393 // + vpblendvb 1394 1395 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd 1396 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 
3}, // 2*vpermps + vblendps 1397 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd 1398 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd 1399 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb 1400 // + vpblendvb 1401 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb 1402 // + vpblendvb 1403 }; 1404 1405 if (ST->hasAVX2()) 1406 if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second)) 1407 return LT.first * Entry->Cost; 1408 1409 static const CostTblEntry XOPShuffleTbl[] = { 1410 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd 1411 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps 1412 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd 1413 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps 1414 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm 1415 // + vinsertf128 1416 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm 1417 // + vinsertf128 1418 1419 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm 1420 // + vinsertf128 1421 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm 1422 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm 1423 // + vinsertf128 1424 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm 1425 }; 1426 1427 if (ST->hasXOP()) 1428 if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second)) 1429 return LT.first * Entry->Cost; 1430 1431 static const CostTblEntry AVX1ShuffleTbl[] = { 1432 {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd 1433 {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps 1434 {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd 1435 {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps 1436 {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128 1437 {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128 1438 1439 {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd 1440 {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps 1441 {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd 1442 {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps 1443 {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb 1444 // + vinsertf128 1445 {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb 1446 // + vinsertf128 1447 1448 {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd 1449 {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd 1450 {TTI::SK_Select, MVT::v8i32, 1}, // vblendps 1451 {TTI::SK_Select, MVT::v8f32, 1}, // vblendps 1452 {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor 1453 {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor 1454 1455 {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd 1456 {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd 1457 {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps 1458 {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps 1459 {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb 1460 // + 2*por + vinsertf128 1461 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb 1462 // + 2*por + vinsertf128 1463 1464 {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd 1465 {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd 1466 {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps 1467 {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 
2*vshufps 1468 {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb 1469 // + 4*por + vinsertf128 1470 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb 1471 // + 4*por + vinsertf128 1472 }; 1473 1474 if (ST->hasAVX()) 1475 if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second)) 1476 return LT.first * Entry->Cost; 1477 1478 static const CostTblEntry SSE41ShuffleTbl[] = { 1479 {TTI::SK_Select, MVT::v2i64, 1}, // pblendw 1480 {TTI::SK_Select, MVT::v2f64, 1}, // movsd 1481 {TTI::SK_Select, MVT::v4i32, 1}, // pblendw 1482 {TTI::SK_Select, MVT::v4f32, 1}, // blendps 1483 {TTI::SK_Select, MVT::v8i16, 1}, // pblendw 1484 {TTI::SK_Select, MVT::v16i8, 1} // pblendvb 1485 }; 1486 1487 if (ST->hasSSE41()) 1488 if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second)) 1489 return LT.first * Entry->Cost; 1490 1491 static const CostTblEntry SSSE3ShuffleTbl[] = { 1492 {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb 1493 {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb 1494 1495 {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb 1496 {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb 1497 1498 {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por 1499 {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por 1500 1501 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb 1502 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb 1503 1504 {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por 1505 {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por 1506 }; 1507 1508 if (ST->hasSSSE3()) 1509 if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second)) 1510 return LT.first * Entry->Cost; 1511 1512 static const CostTblEntry SSE2ShuffleTbl[] = { 1513 {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd 1514 {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd 1515 {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd 1516 {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd 1517 {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd 1518 1519 {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd 1520 {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd 1521 {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd 1522 {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd 1523 {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw 1524 // + 2*pshufd + 2*unpck + packus 1525 1526 {TTI::SK_Select, MVT::v2i64, 1}, // movsd 1527 {TTI::SK_Select, MVT::v2f64, 1}, // movsd 1528 {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps 1529 {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por 1530 {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por 1531 1532 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd 1533 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd 1534 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd 1535 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw 1536 // + pshufd/unpck 1537 { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw 1538 // + 2*pshufd + 2*unpck + 2*packus 1539 1540 { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd 1541 { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd 1542 { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd} 1543 { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute 1544 { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute 1545 }; 1546 1547 if (ST->hasSSE2()) 1548 if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second)) 1549 return LT.first * Entry->Cost; 1550 1551 static const CostTblEntry SSE1ShuffleTbl[] = { 1552 { TTI::SK_Broadcast, MVT::v4f32, 1 
}, // shufps 1553 { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps 1554 { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps 1555 { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps 1556 { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps 1557 }; 1558 1559 if (ST->hasSSE1()) 1560 if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second)) 1561 return LT.first * Entry->Cost; 1562 1563 return BaseT::getShuffleCost(Kind, BaseTp, Mask, Index, SubTp); 1564 } 1565 1566 InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, 1567 Type *Src, 1568 TTI::CastContextHint CCH, 1569 TTI::TargetCostKind CostKind, 1570 const Instruction *I) { 1571 int ISD = TLI->InstructionOpcodeToISD(Opcode); 1572 assert(ISD && "Invalid opcode"); 1573 1574 // TODO: Allow non-throughput costs that aren't binary. 1575 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost { 1576 if (CostKind != TTI::TCK_RecipThroughput) 1577 return Cost == 0 ? 0 : 1; 1578 return Cost; 1579 }; 1580 1581 // The cost tables include both specific, custom (non-legal) src/dst type 1582 // conversions and generic, legalized types. We test for customs first, before 1583 // falling back to legalization. 1584 // FIXME: Need a better design of the cost table to handle non-simple types of 1585 // potential massive combinations (elem_num x src_type x dst_type). 1586 static const TypeConversionCostTblEntry AVX512BWConversionTbl[] { 1587 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 1 }, 1588 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 }, 1589 1590 // Mask sign extend has an instruction. 1591 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 }, 1592 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, 1 }, 1593 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 }, 1594 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, 1 }, 1595 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 }, 1596 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, 1 }, 1597 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 }, 1598 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, 1 }, 1599 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 }, 1600 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, 1 }, 1601 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 }, 1602 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 }, 1603 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, 1604 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 }, 1605 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, 1 }, 1606 { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, 1 }, 1607 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, 1 }, 1608 1609 // Mask zero extend is a sext + shift. 
1610 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 }, 1611 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, 2 }, 1612 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 }, 1613 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, 2 }, 1614 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 }, 1615 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, 2 }, 1616 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 }, 1617 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, 2 }, 1618 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 }, 1619 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, 2 }, 1620 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 }, 1621 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 }, 1622 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 }, 1623 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 }, 1624 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, 2 }, 1625 { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, 2 }, 1626 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, 2 }, 1627 1628 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, 1629 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, 2 }, 1630 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, 1631 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, 2 }, 1632 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, 1633 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, 2 }, 1634 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, 1635 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, 2 }, 1636 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, 1637 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 }, 1638 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, 1639 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, 1640 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, 1641 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, 1642 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, 2 }, 1643 { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, 2 }, 1644 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, 2 }, 1645 1646 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 2 }, 1647 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // widen to zmm 1648 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // vpmovwb 1649 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // vpmovwb 1650 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // vpmovwb 1651 }; 1652 1653 static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = { 1654 // Mask sign extend has an instruction. 1655 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, 1656 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, 1 }, 1657 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, 1658 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, 1659 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, 1660 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i1, 1 }, 1661 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, 1662 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, 1663 1664 // Mask zero extend is a sext + shift. 
1665 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, 1666 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, 2 }, 1667 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, 1668 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, 1669 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, 1670 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i1, 2 }, 1671 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, 1672 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, 1673 1674 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, 1675 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, 2 }, 1676 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, 1677 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, 1678 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, 1679 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, 1680 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, 1681 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i64, 2 }, 1682 1683 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 }, 1684 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 }, 1685 1686 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 }, 1687 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 }, 1688 1689 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, 1 }, 1690 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, 1 }, 1691 1692 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 }, 1693 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 }, 1694 }; 1695 1696 // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and 1697 // 256-bit wide vectors. 1698 1699 static const TypeConversionCostTblEntry AVX512FConversionTbl[] = { 1700 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 }, 1701 { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 }, 1702 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 }, 1703 1704 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd 1705 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd 1706 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd 1707 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 3 }, // sext+vpslld+vptestmd 1708 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq 1709 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq 1710 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq 1711 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 3 }, // sext+vpslld+vptestmd 1712 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // zmm vpslld+vptestmd 1713 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // zmm vpslld+vptestmd 1714 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // zmm vpslld+vptestmd 1715 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, // vpslld+vptestmd 1716 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // zmm vpsllq+vptestmq 1717 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // zmm vpsllq+vptestmq 1718 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, // vpsllq+vptestmq 1719 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 2 }, // vpmovdb 1720 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 2 }, // vpmovdb 1721 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 2 }, // vpmovdb 1722 { ISD::TRUNCATE, MVT::v32i8, MVT::v16i32, 2 }, // vpmovdb 1723 { ISD::TRUNCATE, MVT::v64i8, MVT::v16i32, 2 }, // vpmovdb 1724 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2 }, // vpmovdw 1725 { ISD::TRUNCATE, MVT::v32i16, MVT::v16i32, 2 }, // vpmovdw 1726 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 2 }, // vpmovqb 1727 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1 }, // vpshufb 1728 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 2 }, // vpmovqb 1729 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i64, 2 }, // vpmovqb 1730 { ISD::TRUNCATE, MVT::v32i8, MVT::v8i64, 2 }, // vpmovqb 1731 { ISD::TRUNCATE, MVT::v64i8, MVT::v8i64, 2 }, // vpmovqb 1732 { 
ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 2 }, // vpmovqw 1733 { ISD::TRUNCATE, MVT::v16i16, MVT::v8i64, 2 }, // vpmovqw 1734 { ISD::TRUNCATE, MVT::v32i16, MVT::v8i64, 2 }, // vpmovqw 1735 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, // vpmovqd 1736 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // zmm vpmovqd 1737 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 5 },// 2*vpmovqd+concat+vpmovdb 1738 1739 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, // extend to v16i32 1740 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 8 }, 1741 { ISD::TRUNCATE, MVT::v64i8, MVT::v32i16, 8 }, 1742 1743 // Sign extend is zmm vpternlogd+vptruncdb. 1744 // Zero extend is zmm broadcast load+vptruncdw. 1745 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 3 }, 1746 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 4 }, 1747 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 3 }, 1748 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 4 }, 1749 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 3 }, 1750 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 4 }, 1751 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 3 }, 1752 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 4 }, 1753 1754 // Sign extend is zmm vpternlogd+vptruncdw. 1755 // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw. 1756 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 3 }, 1757 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 4 }, 1758 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 3 }, 1759 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 4 }, 1760 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 3 }, 1761 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 4 }, 1762 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 3 }, 1763 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 }, 1764 1765 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // zmm vpternlogd 1766 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // zmm vpternlogd+psrld 1767 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // zmm vpternlogd 1768 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // zmm vpternlogd+psrld 1769 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // zmm vpternlogd 1770 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // zmm vpternlogd+psrld 1771 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // zmm vpternlogq 1772 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // zmm vpternlogq+psrlq 1773 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // zmm vpternlogq 1774 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // zmm vpternlogq+psrlq 1775 1776 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, // vpternlogd 1777 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, // vpternlogd+psrld 1778 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, // vpternlogq 1779 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, // vpternlogq+psrlq 1780 1781 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 1 }, 1782 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 }, 1783 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, 1784 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, 1785 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 1 }, 1786 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 1 }, 1787 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, 1788 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, 1789 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, 1790 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, 1791 1792 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right 1793 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right 1794 1795 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 }, 1796 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 }, 1797 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 }, 1798 
{ ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 }, 1799 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 }, 1800 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 }, 1801 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 }, 1802 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 }, 1803 1804 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 }, 1805 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 }, 1806 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 }, 1807 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 }, 1808 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 }, 1809 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 }, 1810 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 }, 1811 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 }, 1812 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 }, 1813 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 5 }, 1814 1815 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 }, 1816 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f64, 7 }, 1817 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f64,15 }, 1818 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f32,11 }, 1819 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f64,31 }, 1820 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, 3 }, 1821 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f64, 7 }, 1822 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f32, 5 }, 1823 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f64,15 }, 1824 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 1 }, 1825 { ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, 3 }, 1826 1827 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 }, 1828 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 3 }, 1829 { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, 3 }, 1830 { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 }, 1831 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 3 }, 1832 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, 3 }, 1833 }; 1834 1835 static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] { 1836 // Mask sign extend has an instruction. 1837 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 }, 1838 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, 1 }, 1839 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 }, 1840 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, 1 }, 1841 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 }, 1842 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, 1 }, 1843 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 }, 1844 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, 1 }, 1845 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 }, 1846 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, 1 }, 1847 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 }, 1848 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 }, 1849 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, 1850 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 }, 1851 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1, 1 }, 1852 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v64i1, 1 }, 1853 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1, 1 }, 1854 1855 // Mask zero extend is a sext + shift. 
1856 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 }, 1857 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, 2 }, 1858 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 }, 1859 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, 2 }, 1860 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 }, 1861 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, 2 }, 1862 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 }, 1863 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, 2 }, 1864 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 }, 1865 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, 2 }, 1866 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 }, 1867 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 }, 1868 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 }, 1869 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 }, 1870 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1, 2 }, 1871 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v64i1, 2 }, 1872 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1, 2 }, 1873 1874 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, 1875 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, 2 }, 1876 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, 1877 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, 2 }, 1878 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, 1879 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, 2 }, 1880 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, 1881 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, 2 }, 1882 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, 1883 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 }, 1884 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, 1885 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, 1886 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, 1887 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, 1888 { ISD::TRUNCATE, MVT::v32i1, MVT::v16i16, 2 }, 1889 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i8, 2 }, 1890 { ISD::TRUNCATE, MVT::v64i1, MVT::v16i16, 2 }, 1891 1892 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, 1893 }; 1894 1895 static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = { 1896 // Mask sign extend has an instruction. 1897 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, 1898 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, 1 }, 1899 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, 1900 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i1, 1 }, 1901 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, 1902 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i1, 1 }, 1903 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, 1 }, 1904 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, 1905 1906 // Mask zero extend is a sext + shift. 
1907 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, 1908 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, 2 }, 1909 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, 1910 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i1, 2 }, 1911 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, 1912 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i1, 2 }, 1913 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, 2 }, 1914 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, 1915 1916 { ISD::TRUNCATE, MVT::v16i1, MVT::v4i64, 2 }, 1917 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, 2 }, 1918 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, 1919 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, 2 }, 1920 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, 1921 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, 1922 { ISD::TRUNCATE, MVT::v8i1, MVT::v4i64, 2 }, 1923 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, 1924 1925 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 }, 1926 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, 1927 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 }, 1928 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 }, 1929 1930 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 }, 1931 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, 1932 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 }, 1933 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 }, 1934 1935 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v4f32, 1 }, 1936 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, 1 }, 1937 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 }, 1938 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, 1 }, 1939 1940 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v4f32, 1 }, 1941 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 }, 1942 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, 1943 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 }, 1944 }; 1945 1946 static const TypeConversionCostTblEntry AVX512VLConversionTbl[] = { 1947 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd 1948 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd 1949 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd 1950 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 8 }, // split+2*v8i8 1951 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq 1952 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq 1953 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq 1954 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 8 }, // split+2*v8i16 1955 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // vpslld+vptestmd 1956 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // vpslld+vptestmd 1957 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // vpslld+vptestmd 1958 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // vpsllq+vptestmq 1959 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // vpsllq+vptestmq 1960 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // vpmovqd 1961 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 }, // vpmovqb 1962 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 }, // vpmovqw 1963 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 }, // vpmovwb 1964 1965 // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb 1966 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb 1967 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 5 }, 1968 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 6 }, 1969 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 5 }, 1970 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 6 }, 1971 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 5 }, 1972 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 6 }, 1973 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 10 }, 1974 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 12 }, 1975 1976 // sign extend is 
vpcmpeq+maskedmove+vpmovdw 1977 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw 1978 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 4 }, 1979 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 5 }, 1980 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 4 }, 1981 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 5 }, 1982 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 4 }, 1983 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 5 }, 1984 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 10 }, 1985 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 12 }, 1986 1987 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // vpternlogd 1988 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // vpternlogd+psrld 1989 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // vpternlogd 1990 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // vpternlogd+psrld 1991 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // vpternlogd 1992 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // vpternlogd+psrld 1993 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // vpternlogq 1994 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // vpternlogq+psrlq 1995 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // vpternlogq 1996 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // vpternlogq+psrlq 1997 1998 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 1 }, 1999 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 1 }, 2000 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 1 }, 2001 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 1 }, 2002 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, 2003 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, 2004 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 1 }, 2005 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 1 }, 2006 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, 2007 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, 2008 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, 2009 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, 2010 2011 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 }, 2012 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 }, 2013 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 }, 2014 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 }, 2015 2016 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 1 }, 2017 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 1 }, 2018 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 }, 2019 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 }, 2020 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 }, 2021 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 }, 2022 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, 2023 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, 2024 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 }, 2025 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 }, 2026 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 5 }, 2027 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 }, 2028 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 5 }, 2029 2030 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, 2 }, 2031 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 }, 2032 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f32, 5 }, 2033 2034 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 1 }, 2035 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 1 }, 2036 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, 2037 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 1 }, 2038 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 1 }, 2039 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 }, 2040 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 }, 2041 }; 2042 2043 static const TypeConversionCostTblEntry AVX2ConversionTbl[] = { 2044 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 3 }, 2045 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 }, 2046 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 }, 
2047 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 }, 2048 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, 2049 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, 2050 2051 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 2 }, 2052 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 2 }, 2053 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 2 }, 2054 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 2 }, 2055 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, 2056 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, 2057 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 2 }, 2058 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 2 }, 2059 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, 2060 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, 2061 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 3 }, 2062 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 3 }, 2063 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, 2064 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, 2065 2066 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, 2067 2068 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 4 }, 2069 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 4 }, 2070 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 1 }, 2071 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 1 }, 2072 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 1 }, 2073 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, 4 }, 2074 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, 4 }, 2075 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 1 }, 2076 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 1 }, 2077 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 5 }, 2078 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, 2079 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2 }, 2080 2081 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 3 }, 2082 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 3 }, 2083 2084 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, 1 }, 2085 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, 1 }, 2086 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, 1 }, 2087 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 3 }, 2088 2089 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 3 }, 2090 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 3 }, 2091 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, 1 }, 2092 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 3 }, 2093 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 }, 2094 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4 }, 2095 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 3 }, 2096 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 4 }, 2097 2098 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 2 }, 2099 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 2 }, 2100 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 2 }, 2101 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 }, 2102 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 }, 2103 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 }, 2104 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 3 }, 2105 2106 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 2 }, 2107 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 2 }, 2108 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 2 }, 2109 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 }, 2110 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 }, 2111 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 }, 2112 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 2 }, 2113 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 }, 2114 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 }, 2115 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 4 }, 2116 }; 2117 2118 static const TypeConversionCostTblEntry AVXConversionTbl[] = { 2119 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 }, 2120 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 }, 2121 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 }, 2122 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 
}, 2123 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 4 }, 2124 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 }, 2125 2126 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 3 }, 2127 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 3 }, 2128 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 3 }, 2129 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 3 }, 2130 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 3 }, 2131 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 }, 2132 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 3 }, 2133 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 3 }, 2134 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 3 }, 2135 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 }, 2136 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 3 }, 2137 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 }, 2138 2139 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 4 }, 2140 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 5 }, 2141 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 4 }, 2142 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 9 }, 2143 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, 11 }, 2144 2145 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 }, 2146 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 }, 2147 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // and+extract+packuswb 2148 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, 5 }, 2149 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 }, 2150 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, 5 }, 2151 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 3 }, // and+extract+2*packusdw 2152 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 }, 2153 2154 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 }, 2155 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 }, 2156 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 }, 2157 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 4 }, 2158 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v16i8, 2 }, 2159 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, 2160 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v8i16, 2 }, 2161 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 }, 2162 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 }, 2163 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 4 }, 2164 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 5 }, 2165 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 8 }, 2166 2167 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 }, 2168 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 }, 2169 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 }, 2170 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 4 }, 2171 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v16i8, 2 }, 2172 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, 2173 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v8i16, 2 }, 2174 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 4 }, 2175 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 4 }, 2176 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 }, 2177 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 }, 2178 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 }, 2179 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 10 }, 2180 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 10 }, 2181 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 18 }, 2182 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 }, 2183 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 10 }, 2184 2185 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, 2 }, 2186 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f64, 2 }, 2187 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v8f32, 2 }, 2188 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v4f64, 2 }, 2189 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 2 }, 2190 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f64, 2 }, 2191 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, 2 }, 2192 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v4f64, 2 }, 2193 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, 2 }, 2194 { ISD::FP_TO_SINT, 
MVT::v8i32, MVT::v8f32, 2 }, 2195 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 5 }, 2196 2197 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v8f32, 2 }, 2198 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f64, 2 }, 2199 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v8f32, 2 }, 2200 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v4f64, 2 }, 2201 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 2 }, 2202 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f64, 2 }, 2203 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, 2 }, 2204 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v4f64, 2 }, 2205 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 3 }, 2206 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 }, 2207 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 6 }, 2208 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 7 }, 2209 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 7 }, 2210 2211 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 1 }, 2212 { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 1 }, 2213 }; 2214 2215 static const TypeConversionCostTblEntry SSE41ConversionTbl[] = { 2216 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, 1 }, 2217 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, 1 }, 2218 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, 1 }, 2219 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, 1 }, 2220 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, 1 }, 2221 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, 1 }, 2222 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, 1 }, 2223 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, 1 }, 2224 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, 1 }, 2225 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, 1 }, 2226 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, 1 }, 2227 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, 1 }, 2228 2229 // These truncates end up widening elements. 2230 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 1 }, // PMOVXZBQ 2231 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 1 }, // PMOVXZWQ 2232 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 1 }, // PMOVXZBD 2233 2234 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 2 }, 2235 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 2 }, 2236 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 2 }, 2237 2238 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 1 }, 2239 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 1 }, 2240 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 1 }, 2241 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 1 }, 2242 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 1 }, 2243 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 }, 2244 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 1 }, 2245 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 }, 2246 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, 2247 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 1 }, 2248 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 }, 2249 2250 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 1 }, 2251 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 1 }, 2252 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 4 }, 2253 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 4 }, 2254 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 1 }, 2255 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 }, 2256 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 1 }, 2257 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 }, 2258 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 3 }, 2259 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 }, 2260 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 2 }, 2261 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 12 }, 2262 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 22 }, 2263 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 4 }, 2264 2265 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 1 }, 2266 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 1 }, 2267 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 1 }, 2268 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 1 }, 2269 { ISD::FP_TO_SINT, 
MVT::v16i8, MVT::v4f32, 2 }, 2270 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, 2 }, 2271 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, 1 }, 2272 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, 1 }, 2273 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 }, 2274 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, 1 }, 2275 2276 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 1 }, 2277 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 }, 2278 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 1 }, 2279 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 4 }, 2280 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, 2 }, 2281 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, 2 }, 2282 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, 1 }, 2283 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, 1 }, 2284 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 4 }, 2285 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 }, 2286 }; 2287 2288 static const TypeConversionCostTblEntry SSE2ConversionTbl[] = { 2289 // These are somewhat magic numbers justified by comparing the 2290 // output of llvm-mca for our various supported scheduler models 2291 // and basing it off the worst case scenario. 2292 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 3 }, 2293 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 3 }, 2294 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 3 }, 2295 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 3 }, 2296 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 3 }, 2297 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 }, 2298 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 3 }, 2299 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 }, 2300 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 }, 2301 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4 }, 2302 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 8 }, 2303 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 8 }, 2304 2305 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 3 }, 2306 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 3 }, 2307 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 8 }, 2308 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 9 }, 2309 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 }, 2310 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 4 }, 2311 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 4 }, 2312 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 }, 2313 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 7 }, 2314 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 7 }, 2315 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 }, 2316 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 15 }, 2317 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 18 }, 2318 2319 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 4 }, 2320 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 4 }, 2321 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 4 }, 2322 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 4 }, 2323 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, 6 }, 2324 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, 6 }, 2325 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, 5 }, 2326 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, 5 }, 2327 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 4 }, 2328 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, 4 }, 2329 2330 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 4 }, 2331 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 }, 2332 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 4 }, 2333 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 15 }, 2334 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, 6 }, 2335 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, 6 }, 2336 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, 5 }, 2337 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, 5 }, 2338 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 8 }, 2339 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 8 }, 2340 2341 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, 4 }, 2342 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, 4 }, 2343 { 
ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, 2 }, 2344 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, 3 }, 2345 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, 1 }, 2346 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, 2 }, 2347 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, 2 }, 2348 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, 3 }, 2349 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, 1 }, 2350 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, 2 }, 2351 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, 1 }, 2352 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, 2 }, 2353 2354 // These truncates are really widening elements. 2355 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 1 }, // PSHUFD 2356 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // PUNPCKLWD+DQ 2357 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // PUNPCKLBW+WD+PSHUFD 2358 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 1 }, // PUNPCKLWD 2359 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // PUNPCKLBW+WD 2360 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 1 }, // PUNPCKLBW 2361 2362 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 2 }, // PAND+PACKUSWB 2363 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, 2364 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 3 }, // PAND+2*PACKUSWB 2365 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 }, 2366 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 1 }, 2367 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 3 }, 2368 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 }, 2369 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32,10 }, 2370 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 4 }, // PAND+3*PACKUSWB 2371 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 2 }, // PSHUFD+PSHUFLW 2372 { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, 1 }, // PSHUFD 2373 }; 2374 2375 // Attempt to map directly to (simple) MVT types to let us match custom entries. 2376 EVT SrcTy = TLI->getValueType(DL, Src); 2377 EVT DstTy = TLI->getValueType(DL, Dst); 2378 2379 // The function getSimpleVT only handles simple value types. 
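  // Note (illustrative): when either type is not a simple MVT (for example a
  // vector element count with no direct MVT equivalent), the block below is
  // skipped entirely and the cast is costed via the legalized-type lookups
  // further down.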
2380 if (SrcTy.isSimple() && DstTy.isSimple()) { 2381 MVT SimpleSrcTy = SrcTy.getSimpleVT(); 2382 MVT SimpleDstTy = DstTy.getSimpleVT(); 2383 2384 if (ST->useAVX512Regs()) { 2385 if (ST->hasBWI()) 2386 if (const auto *Entry = ConvertCostTableLookup( 2387 AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) 2388 return AdjustCost(Entry->Cost); 2389 2390 if (ST->hasDQI()) 2391 if (const auto *Entry = ConvertCostTableLookup( 2392 AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) 2393 return AdjustCost(Entry->Cost); 2394 2395 if (ST->hasAVX512()) 2396 if (const auto *Entry = ConvertCostTableLookup( 2397 AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) 2398 return AdjustCost(Entry->Cost); 2399 } 2400 2401 if (ST->hasBWI()) 2402 if (const auto *Entry = ConvertCostTableLookup( 2403 AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) 2404 return AdjustCost(Entry->Cost); 2405 2406 if (ST->hasDQI()) 2407 if (const auto *Entry = ConvertCostTableLookup( 2408 AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) 2409 return AdjustCost(Entry->Cost); 2410 2411 if (ST->hasAVX512()) 2412 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD, 2413 SimpleDstTy, SimpleSrcTy)) 2414 return AdjustCost(Entry->Cost); 2415 2416 if (ST->hasAVX2()) { 2417 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD, 2418 SimpleDstTy, SimpleSrcTy)) 2419 return AdjustCost(Entry->Cost); 2420 } 2421 2422 if (ST->hasAVX()) { 2423 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD, 2424 SimpleDstTy, SimpleSrcTy)) 2425 return AdjustCost(Entry->Cost); 2426 } 2427 2428 if (ST->hasSSE41()) { 2429 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD, 2430 SimpleDstTy, SimpleSrcTy)) 2431 return AdjustCost(Entry->Cost); 2432 } 2433 2434 if (ST->hasSSE2()) { 2435 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, 2436 SimpleDstTy, SimpleSrcTy)) 2437 return AdjustCost(Entry->Cost); 2438 } 2439 } 2440 2441 // Fall back to legalized types. 
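  // Illustrative example of the scaling used below: if the source type
  // legalizes into LTSrc.first == 4 registers and the destination into
  // LTDest.first == 2, a matching per-register table entry is multiplied by
  // std::max(4, 2) == 4 to approximate the cost of the split conversion.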
2442 std::pair<InstructionCost, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src); 2443 std::pair<InstructionCost, MVT> LTDest = 2444 TLI->getTypeLegalizationCost(DL, Dst); 2445 2446 if (ST->useAVX512Regs()) { 2447 if (ST->hasBWI()) 2448 if (const auto *Entry = ConvertCostTableLookup( 2449 AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second)) 2450 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); 2451 2452 if (ST->hasDQI()) 2453 if (const auto *Entry = ConvertCostTableLookup( 2454 AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second)) 2455 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); 2456 2457 if (ST->hasAVX512()) 2458 if (const auto *Entry = ConvertCostTableLookup( 2459 AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second)) 2460 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); 2461 } 2462 2463 if (ST->hasBWI()) 2464 if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD, 2465 LTDest.second, LTSrc.second)) 2466 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); 2467 2468 if (ST->hasDQI()) 2469 if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD, 2470 LTDest.second, LTSrc.second)) 2471 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); 2472 2473 if (ST->hasAVX512()) 2474 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD, 2475 LTDest.second, LTSrc.second)) 2476 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); 2477 2478 if (ST->hasAVX2()) 2479 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD, 2480 LTDest.second, LTSrc.second)) 2481 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); 2482 2483 if (ST->hasAVX()) 2484 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD, 2485 LTDest.second, LTSrc.second)) 2486 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); 2487 2488 if (ST->hasSSE41()) 2489 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD, 2490 LTDest.second, LTSrc.second)) 2491 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); 2492 2493 if (ST->hasSSE2()) 2494 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, 2495 LTDest.second, LTSrc.second)) 2496 return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); 2497 2498 // Fallback, for i8/i16 sitofp/uitofp cases we need to extend to i32 for 2499 // sitofp. 2500 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) && 2501 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) { 2502 Type *ExtSrc = Src->getWithNewBitWidth(32); 2503 unsigned ExtOpc = 2504 (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt; 2505 2506 // For scalar loads the extend would be free. 2507 InstructionCost ExtCost = 0; 2508 if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0)))) 2509 ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind); 2510 2511 return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc, 2512 TTI::CastContextHint::None, CostKind); 2513 } 2514 2515 // Fallback for fptosi/fptoui i8/i16 cases we need to truncate from fptosi 2516 // i32. 
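  // For example (illustrative only): fptosi v4f32 -> v4i16 is modeled below
  // as the cost of fptosi v4f32 -> v4i32 plus the cost of trunc v4i32 ->
  // v4i16.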
2517 if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) && 2518 1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) { 2519 Type *TruncDst = Dst->getWithNewBitWidth(32); 2520 return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) + 2521 getCastInstrCost(Instruction::Trunc, Dst, TruncDst, 2522 TTI::CastContextHint::None, CostKind); 2523 } 2524 2525 return AdjustCost( 2526 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); 2527 } 2528 2529 InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, 2530 Type *CondTy, 2531 CmpInst::Predicate VecPred, 2532 TTI::TargetCostKind CostKind, 2533 const Instruction *I) { 2534 // TODO: Handle other cost kinds. 2535 if (CostKind != TTI::TCK_RecipThroughput) 2536 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, 2537 I); 2538 2539 // Legalize the type. 2540 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); 2541 2542 MVT MTy = LT.second; 2543 2544 int ISD = TLI->InstructionOpcodeToISD(Opcode); 2545 assert(ISD && "Invalid opcode"); 2546 2547 unsigned ExtraCost = 0; 2548 if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) { 2549 // Some vector comparison predicates cost extra instructions. 2550 // TODO: Should we invert this and assume worst case cmp costs 2551 // and reduce for particular predicates? 2552 if (MTy.isVector() && 2553 !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) || 2554 (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) || 2555 ST->hasBWI())) { 2556 // Fallback to I if a specific predicate wasn't specified. 2557 CmpInst::Predicate Pred = VecPred; 2558 if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE || 2559 Pred == CmpInst::BAD_FCMP_PREDICATE)) 2560 Pred = cast<CmpInst>(I)->getPredicate(); 2561 2562 switch (Pred) { 2563 case CmpInst::Predicate::ICMP_NE: 2564 // xor(cmpeq(x,y),-1) 2565 ExtraCost = 1; 2566 break; 2567 case CmpInst::Predicate::ICMP_SGE: 2568 case CmpInst::Predicate::ICMP_SLE: 2569 // xor(cmpgt(x,y),-1) 2570 ExtraCost = 1; 2571 break; 2572 case CmpInst::Predicate::ICMP_ULT: 2573 case CmpInst::Predicate::ICMP_UGT: 2574 // cmpgt(xor(x,signbit),xor(y,signbit)) 2575 // xor(cmpeq(pmaxu(x,y),x),-1) 2576 ExtraCost = 2; 2577 break; 2578 case CmpInst::Predicate::ICMP_ULE: 2579 case CmpInst::Predicate::ICMP_UGE: 2580 if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) || 2581 (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) { 2582 // cmpeq(psubus(x,y),0) 2583 // cmpeq(pminu(x,y),x) 2584 ExtraCost = 1; 2585 } else { 2586 // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1) 2587 ExtraCost = 3; 2588 } 2589 break; 2590 case CmpInst::Predicate::BAD_ICMP_PREDICATE: 2591 case CmpInst::Predicate::BAD_FCMP_PREDICATE: 2592 // Assume worst case scenario and add the maximum extra cost. 
2593 ExtraCost = 3; 2594 break; 2595 default: 2596 break; 2597 } 2598 } 2599 } 2600 2601 static const CostTblEntry SLMCostTbl[] = { 2602 // slm pcmpeq/pcmpgt throughput is 2 2603 { ISD::SETCC, MVT::v2i64, 2 }, 2604 }; 2605 2606 static const CostTblEntry AVX512BWCostTbl[] = { 2607 { ISD::SETCC, MVT::v32i16, 1 }, 2608 { ISD::SETCC, MVT::v64i8, 1 }, 2609 2610 { ISD::SELECT, MVT::v32i16, 1 }, 2611 { ISD::SELECT, MVT::v64i8, 1 }, 2612 }; 2613 2614 static const CostTblEntry AVX512CostTbl[] = { 2615 { ISD::SETCC, MVT::v8i64, 1 }, 2616 { ISD::SETCC, MVT::v16i32, 1 }, 2617 { ISD::SETCC, MVT::v8f64, 1 }, 2618 { ISD::SETCC, MVT::v16f32, 1 }, 2619 2620 { ISD::SELECT, MVT::v8i64, 1 }, 2621 { ISD::SELECT, MVT::v16i32, 1 }, 2622 { ISD::SELECT, MVT::v8f64, 1 }, 2623 { ISD::SELECT, MVT::v16f32, 1 }, 2624 2625 { ISD::SETCC, MVT::v32i16, 2 }, // FIXME: should probably be 4 2626 { ISD::SETCC, MVT::v64i8, 2 }, // FIXME: should probably be 4 2627 2628 { ISD::SELECT, MVT::v32i16, 2 }, // FIXME: should be 3 2629 { ISD::SELECT, MVT::v64i8, 2 }, // FIXME: should be 3 2630 }; 2631 2632 static const CostTblEntry AVX2CostTbl[] = { 2633 { ISD::SETCC, MVT::v4i64, 1 }, 2634 { ISD::SETCC, MVT::v8i32, 1 }, 2635 { ISD::SETCC, MVT::v16i16, 1 }, 2636 { ISD::SETCC, MVT::v32i8, 1 }, 2637 2638 { ISD::SELECT, MVT::v4i64, 1 }, // pblendvb 2639 { ISD::SELECT, MVT::v8i32, 1 }, // pblendvb 2640 { ISD::SELECT, MVT::v16i16, 1 }, // pblendvb 2641 { ISD::SELECT, MVT::v32i8, 1 }, // pblendvb 2642 }; 2643 2644 static const CostTblEntry AVX1CostTbl[] = { 2645 { ISD::SETCC, MVT::v4f64, 1 }, 2646 { ISD::SETCC, MVT::v8f32, 1 }, 2647 // AVX1 does not support 8-wide integer compare. 2648 { ISD::SETCC, MVT::v4i64, 4 }, 2649 { ISD::SETCC, MVT::v8i32, 4 }, 2650 { ISD::SETCC, MVT::v16i16, 4 }, 2651 { ISD::SETCC, MVT::v32i8, 4 }, 2652 2653 { ISD::SELECT, MVT::v4f64, 1 }, // vblendvpd 2654 { ISD::SELECT, MVT::v8f32, 1 }, // vblendvps 2655 { ISD::SELECT, MVT::v4i64, 1 }, // vblendvpd 2656 { ISD::SELECT, MVT::v8i32, 1 }, // vblendvps 2657 { ISD::SELECT, MVT::v16i16, 3 }, // vandps + vandnps + vorps 2658 { ISD::SELECT, MVT::v32i8, 3 }, // vandps + vandnps + vorps 2659 }; 2660 2661 static const CostTblEntry SSE42CostTbl[] = { 2662 { ISD::SETCC, MVT::v2f64, 1 }, 2663 { ISD::SETCC, MVT::v4f32, 1 }, 2664 { ISD::SETCC, MVT::v2i64, 1 }, 2665 }; 2666 2667 static const CostTblEntry SSE41CostTbl[] = { 2668 { ISD::SELECT, MVT::v2f64, 1 }, // blendvpd 2669 { ISD::SELECT, MVT::v4f32, 1 }, // blendvps 2670 { ISD::SELECT, MVT::v2i64, 1 }, // pblendvb 2671 { ISD::SELECT, MVT::v4i32, 1 }, // pblendvb 2672 { ISD::SELECT, MVT::v8i16, 1 }, // pblendvb 2673 { ISD::SELECT, MVT::v16i8, 1 }, // pblendvb 2674 }; 2675 2676 static const CostTblEntry SSE2CostTbl[] = { 2677 { ISD::SETCC, MVT::v2f64, 2 }, 2678 { ISD::SETCC, MVT::f64, 1 }, 2679 { ISD::SETCC, MVT::v2i64, 8 }, 2680 { ISD::SETCC, MVT::v4i32, 1 }, 2681 { ISD::SETCC, MVT::v8i16, 1 }, 2682 { ISD::SETCC, MVT::v16i8, 1 }, 2683 2684 { ISD::SELECT, MVT::v2f64, 3 }, // andpd + andnpd + orpd 2685 { ISD::SELECT, MVT::v2i64, 3 }, // pand + pandn + por 2686 { ISD::SELECT, MVT::v4i32, 3 }, // pand + pandn + por 2687 { ISD::SELECT, MVT::v8i16, 3 }, // pand + pandn + por 2688 { ISD::SELECT, MVT::v16i8, 3 }, // pand + pandn + por 2689 }; 2690 2691 static const CostTblEntry SSE1CostTbl[] = { 2692 { ISD::SETCC, MVT::v4f32, 2 }, 2693 { ISD::SETCC, MVT::f32, 1 }, 2694 2695 { ISD::SELECT, MVT::v4f32, 3 }, // andps + andnps + orps 2696 }; 2697 2698 if (ST->useSLMArithCosts()) 2699 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, 
MTy)) 2700 return LT.first * (ExtraCost + Entry->Cost); 2701 2702 if (ST->hasBWI()) 2703 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) 2704 return LT.first * (ExtraCost + Entry->Cost); 2705 2706 if (ST->hasAVX512()) 2707 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) 2708 return LT.first * (ExtraCost + Entry->Cost); 2709 2710 if (ST->hasAVX2()) 2711 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy)) 2712 return LT.first * (ExtraCost + Entry->Cost); 2713 2714 if (ST->hasAVX()) 2715 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) 2716 return LT.first * (ExtraCost + Entry->Cost); 2717 2718 if (ST->hasSSE42()) 2719 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy)) 2720 return LT.first * (ExtraCost + Entry->Cost); 2721 2722 if (ST->hasSSE41()) 2723 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy)) 2724 return LT.first * (ExtraCost + Entry->Cost); 2725 2726 if (ST->hasSSE2()) 2727 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy)) 2728 return LT.first * (ExtraCost + Entry->Cost); 2729 2730 if (ST->hasSSE1()) 2731 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy)) 2732 return LT.first * (ExtraCost + Entry->Cost); 2733 2734 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I); 2735 } 2736 2737 unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; } 2738 2739 InstructionCost 2740 X86TTIImpl::getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, 2741 TTI::TargetCostKind CostKind) { 2742 2743 // Costs should match the codegen from: 2744 // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll 2745 // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll 2746 // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll 2747 // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll 2748 // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll 2749 2750 // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not 2751 // specialized in these tables yet. 
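  // Worked example (illustrative): a ctpop of <32 x i16> on a subtarget with
  // BITALG matches the { ISD::CTPOP, MVT::v32i16, 1 } entry in the
  // AVX512BITALGCostTbl below; assuming v32i16 is legal for the subtarget
  // (LT.first == 1), the reported reciprocal-throughput cost is 1 * 1 = 1.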
2752 static const CostTblEntry AVX512BITALGCostTbl[] = { 2753 { ISD::CTPOP, MVT::v32i16, 1 }, 2754 { ISD::CTPOP, MVT::v64i8, 1 }, 2755 { ISD::CTPOP, MVT::v16i16, 1 }, 2756 { ISD::CTPOP, MVT::v32i8, 1 }, 2757 { ISD::CTPOP, MVT::v8i16, 1 }, 2758 { ISD::CTPOP, MVT::v16i8, 1 }, 2759 }; 2760 static const CostTblEntry AVX512VPOPCNTDQCostTbl[] = { 2761 { ISD::CTPOP, MVT::v8i64, 1 }, 2762 { ISD::CTPOP, MVT::v16i32, 1 }, 2763 { ISD::CTPOP, MVT::v4i64, 1 }, 2764 { ISD::CTPOP, MVT::v8i32, 1 }, 2765 { ISD::CTPOP, MVT::v2i64, 1 }, 2766 { ISD::CTPOP, MVT::v4i32, 1 }, 2767 }; 2768 static const CostTblEntry AVX512CDCostTbl[] = { 2769 { ISD::CTLZ, MVT::v8i64, 1 }, 2770 { ISD::CTLZ, MVT::v16i32, 1 }, 2771 { ISD::CTLZ, MVT::v32i16, 8 }, 2772 { ISD::CTLZ, MVT::v64i8, 20 }, 2773 { ISD::CTLZ, MVT::v4i64, 1 }, 2774 { ISD::CTLZ, MVT::v8i32, 1 }, 2775 { ISD::CTLZ, MVT::v16i16, 4 }, 2776 { ISD::CTLZ, MVT::v32i8, 10 }, 2777 { ISD::CTLZ, MVT::v2i64, 1 }, 2778 { ISD::CTLZ, MVT::v4i32, 1 }, 2779 { ISD::CTLZ, MVT::v8i16, 4 }, 2780 { ISD::CTLZ, MVT::v16i8, 4 }, 2781 }; 2782 static const CostTblEntry AVX512BWCostTbl[] = { 2783 { ISD::ABS, MVT::v32i16, 1 }, 2784 { ISD::ABS, MVT::v64i8, 1 }, 2785 { ISD::BITREVERSE, MVT::v8i64, 3 }, 2786 { ISD::BITREVERSE, MVT::v16i32, 3 }, 2787 { ISD::BITREVERSE, MVT::v32i16, 3 }, 2788 { ISD::BITREVERSE, MVT::v64i8, 2 }, 2789 { ISD::BSWAP, MVT::v8i64, 1 }, 2790 { ISD::BSWAP, MVT::v16i32, 1 }, 2791 { ISD::BSWAP, MVT::v32i16, 1 }, 2792 { ISD::CTLZ, MVT::v8i64, 23 }, 2793 { ISD::CTLZ, MVT::v16i32, 22 }, 2794 { ISD::CTLZ, MVT::v32i16, 18 }, 2795 { ISD::CTLZ, MVT::v64i8, 17 }, 2796 { ISD::CTPOP, MVT::v8i64, 7 }, 2797 { ISD::CTPOP, MVT::v16i32, 11 }, 2798 { ISD::CTPOP, MVT::v32i16, 9 }, 2799 { ISD::CTPOP, MVT::v64i8, 6 }, 2800 { ISD::CTTZ, MVT::v8i64, 10 }, 2801 { ISD::CTTZ, MVT::v16i32, 14 }, 2802 { ISD::CTTZ, MVT::v32i16, 12 }, 2803 { ISD::CTTZ, MVT::v64i8, 9 }, 2804 { ISD::SADDSAT, MVT::v32i16, 1 }, 2805 { ISD::SADDSAT, MVT::v64i8, 1 }, 2806 { ISD::SMAX, MVT::v32i16, 1 }, 2807 { ISD::SMAX, MVT::v64i8, 1 }, 2808 { ISD::SMIN, MVT::v32i16, 1 }, 2809 { ISD::SMIN, MVT::v64i8, 1 }, 2810 { ISD::SSUBSAT, MVT::v32i16, 1 }, 2811 { ISD::SSUBSAT, MVT::v64i8, 1 }, 2812 { ISD::UADDSAT, MVT::v32i16, 1 }, 2813 { ISD::UADDSAT, MVT::v64i8, 1 }, 2814 { ISD::UMAX, MVT::v32i16, 1 }, 2815 { ISD::UMAX, MVT::v64i8, 1 }, 2816 { ISD::UMIN, MVT::v32i16, 1 }, 2817 { ISD::UMIN, MVT::v64i8, 1 }, 2818 { ISD::USUBSAT, MVT::v32i16, 1 }, 2819 { ISD::USUBSAT, MVT::v64i8, 1 }, 2820 }; 2821 static const CostTblEntry AVX512CostTbl[] = { 2822 { ISD::ABS, MVT::v8i64, 1 }, 2823 { ISD::ABS, MVT::v16i32, 1 }, 2824 { ISD::ABS, MVT::v32i16, 2 }, 2825 { ISD::ABS, MVT::v64i8, 2 }, 2826 { ISD::ABS, MVT::v4i64, 1 }, 2827 { ISD::ABS, MVT::v2i64, 1 }, 2828 { ISD::BITREVERSE, MVT::v8i64, 36 }, 2829 { ISD::BITREVERSE, MVT::v16i32, 24 }, 2830 { ISD::BITREVERSE, MVT::v32i16, 10 }, 2831 { ISD::BITREVERSE, MVT::v64i8, 10 }, 2832 { ISD::BSWAP, MVT::v8i64, 4 }, 2833 { ISD::BSWAP, MVT::v16i32, 4 }, 2834 { ISD::BSWAP, MVT::v32i16, 4 }, 2835 { ISD::CTLZ, MVT::v8i64, 29 }, 2836 { ISD::CTLZ, MVT::v16i32, 35 }, 2837 { ISD::CTLZ, MVT::v32i16, 28 }, 2838 { ISD::CTLZ, MVT::v64i8, 18 }, 2839 { ISD::CTPOP, MVT::v8i64, 16 }, 2840 { ISD::CTPOP, MVT::v16i32, 24 }, 2841 { ISD::CTPOP, MVT::v32i16, 18 }, 2842 { ISD::CTPOP, MVT::v64i8, 12 }, 2843 { ISD::CTTZ, MVT::v8i64, 20 }, 2844 { ISD::CTTZ, MVT::v16i32, 28 }, 2845 { ISD::CTTZ, MVT::v32i16, 24 }, 2846 { ISD::CTTZ, MVT::v64i8, 18 }, 2847 { ISD::SMAX, MVT::v8i64, 1 }, 2848 { ISD::SMAX, MVT::v16i32, 1 }, 2849 { 
ISD::SMAX, MVT::v32i16, 2 }, 2850 { ISD::SMAX, MVT::v64i8, 2 }, 2851 { ISD::SMAX, MVT::v4i64, 1 }, 2852 { ISD::SMAX, MVT::v2i64, 1 }, 2853 { ISD::SMIN, MVT::v8i64, 1 }, 2854 { ISD::SMIN, MVT::v16i32, 1 }, 2855 { ISD::SMIN, MVT::v32i16, 2 }, 2856 { ISD::SMIN, MVT::v64i8, 2 }, 2857 { ISD::SMIN, MVT::v4i64, 1 }, 2858 { ISD::SMIN, MVT::v2i64, 1 }, 2859 { ISD::UMAX, MVT::v8i64, 1 }, 2860 { ISD::UMAX, MVT::v16i32, 1 }, 2861 { ISD::UMAX, MVT::v32i16, 2 }, 2862 { ISD::UMAX, MVT::v64i8, 2 }, 2863 { ISD::UMAX, MVT::v4i64, 1 }, 2864 { ISD::UMAX, MVT::v2i64, 1 }, 2865 { ISD::UMIN, MVT::v8i64, 1 }, 2866 { ISD::UMIN, MVT::v16i32, 1 }, 2867 { ISD::UMIN, MVT::v32i16, 2 }, 2868 { ISD::UMIN, MVT::v64i8, 2 }, 2869 { ISD::UMIN, MVT::v4i64, 1 }, 2870 { ISD::UMIN, MVT::v2i64, 1 }, 2871 { ISD::USUBSAT, MVT::v16i32, 2 }, // pmaxud + psubd 2872 { ISD::USUBSAT, MVT::v2i64, 2 }, // pmaxuq + psubq 2873 { ISD::USUBSAT, MVT::v4i64, 2 }, // pmaxuq + psubq 2874 { ISD::USUBSAT, MVT::v8i64, 2 }, // pmaxuq + psubq 2875 { ISD::UADDSAT, MVT::v16i32, 3 }, // not + pminud + paddd 2876 { ISD::UADDSAT, MVT::v2i64, 3 }, // not + pminuq + paddq 2877 { ISD::UADDSAT, MVT::v4i64, 3 }, // not + pminuq + paddq 2878 { ISD::UADDSAT, MVT::v8i64, 3 }, // not + pminuq + paddq 2879 { ISD::SADDSAT, MVT::v32i16, 2 }, 2880 { ISD::SADDSAT, MVT::v64i8, 2 }, 2881 { ISD::SSUBSAT, MVT::v32i16, 2 }, 2882 { ISD::SSUBSAT, MVT::v64i8, 2 }, 2883 { ISD::UADDSAT, MVT::v32i16, 2 }, 2884 { ISD::UADDSAT, MVT::v64i8, 2 }, 2885 { ISD::USUBSAT, MVT::v32i16, 2 }, 2886 { ISD::USUBSAT, MVT::v64i8, 2 }, 2887 { ISD::FMAXNUM, MVT::f32, 2 }, 2888 { ISD::FMAXNUM, MVT::v4f32, 2 }, 2889 { ISD::FMAXNUM, MVT::v8f32, 2 }, 2890 { ISD::FMAXNUM, MVT::v16f32, 2 }, 2891 { ISD::FMAXNUM, MVT::f64, 2 }, 2892 { ISD::FMAXNUM, MVT::v2f64, 2 }, 2893 { ISD::FMAXNUM, MVT::v4f64, 2 }, 2894 { ISD::FMAXNUM, MVT::v8f64, 2 }, 2895 }; 2896 static const CostTblEntry XOPCostTbl[] = { 2897 { ISD::BITREVERSE, MVT::v4i64, 4 }, 2898 { ISD::BITREVERSE, MVT::v8i32, 4 }, 2899 { ISD::BITREVERSE, MVT::v16i16, 4 }, 2900 { ISD::BITREVERSE, MVT::v32i8, 4 }, 2901 { ISD::BITREVERSE, MVT::v2i64, 1 }, 2902 { ISD::BITREVERSE, MVT::v4i32, 1 }, 2903 { ISD::BITREVERSE, MVT::v8i16, 1 }, 2904 { ISD::BITREVERSE, MVT::v16i8, 1 }, 2905 { ISD::BITREVERSE, MVT::i64, 3 }, 2906 { ISD::BITREVERSE, MVT::i32, 3 }, 2907 { ISD::BITREVERSE, MVT::i16, 3 }, 2908 { ISD::BITREVERSE, MVT::i8, 3 } 2909 }; 2910 static const CostTblEntry AVX2CostTbl[] = { 2911 { ISD::ABS, MVT::v4i64, 2 }, // VBLENDVPD(X,VPSUBQ(0,X),X) 2912 { ISD::ABS, MVT::v8i32, 1 }, 2913 { ISD::ABS, MVT::v16i16, 1 }, 2914 { ISD::ABS, MVT::v32i8, 1 }, 2915 { ISD::BITREVERSE, MVT::v2i64, 3 }, 2916 { ISD::BITREVERSE, MVT::v4i64, 3 }, 2917 { ISD::BITREVERSE, MVT::v4i32, 3 }, 2918 { ISD::BITREVERSE, MVT::v8i32, 3 }, 2919 { ISD::BITREVERSE, MVT::v8i16, 3 }, 2920 { ISD::BITREVERSE, MVT::v16i16, 3 }, 2921 { ISD::BITREVERSE, MVT::v16i8, 3 }, 2922 { ISD::BITREVERSE, MVT::v32i8, 3 }, 2923 { ISD::BSWAP, MVT::v4i64, 1 }, 2924 { ISD::BSWAP, MVT::v8i32, 1 }, 2925 { ISD::BSWAP, MVT::v16i16, 1 }, 2926 { ISD::CTLZ, MVT::v2i64, 7 }, 2927 { ISD::CTLZ, MVT::v4i64, 7 }, 2928 { ISD::CTLZ, MVT::v4i32, 5 }, 2929 { ISD::CTLZ, MVT::v8i32, 5 }, 2930 { ISD::CTLZ, MVT::v8i16, 4 }, 2931 { ISD::CTLZ, MVT::v16i16, 4 }, 2932 { ISD::CTLZ, MVT::v16i8, 3 }, 2933 { ISD::CTLZ, MVT::v32i8, 3 }, 2934 { ISD::CTPOP, MVT::v2i64, 3 }, 2935 { ISD::CTPOP, MVT::v4i64, 3 }, 2936 { ISD::CTPOP, MVT::v4i32, 7 }, 2937 { ISD::CTPOP, MVT::v8i32, 7 }, 2938 { ISD::CTPOP, MVT::v8i16, 3 }, 2939 { ISD::CTPOP, MVT::v16i16, 3 }, 
2940 { ISD::CTPOP, MVT::v16i8, 2 }, 2941 { ISD::CTPOP, MVT::v32i8, 2 }, 2942 { ISD::CTTZ, MVT::v2i64, 4 }, 2943 { ISD::CTTZ, MVT::v4i64, 4 }, 2944 { ISD::CTTZ, MVT::v4i32, 7 }, 2945 { ISD::CTTZ, MVT::v8i32, 7 }, 2946 { ISD::CTTZ, MVT::v8i16, 4 }, 2947 { ISD::CTTZ, MVT::v16i16, 4 }, 2948 { ISD::CTTZ, MVT::v16i8, 3 }, 2949 { ISD::CTTZ, MVT::v32i8, 3 }, 2950 { ISD::SADDSAT, MVT::v16i16, 1 }, 2951 { ISD::SADDSAT, MVT::v32i8, 1 }, 2952 { ISD::SMAX, MVT::v8i32, 1 }, 2953 { ISD::SMAX, MVT::v16i16, 1 }, 2954 { ISD::SMAX, MVT::v32i8, 1 }, 2955 { ISD::SMIN, MVT::v8i32, 1 }, 2956 { ISD::SMIN, MVT::v16i16, 1 }, 2957 { ISD::SMIN, MVT::v32i8, 1 }, 2958 { ISD::SSUBSAT, MVT::v16i16, 1 }, 2959 { ISD::SSUBSAT, MVT::v32i8, 1 }, 2960 { ISD::UADDSAT, MVT::v16i16, 1 }, 2961 { ISD::UADDSAT, MVT::v32i8, 1 }, 2962 { ISD::UADDSAT, MVT::v8i32, 3 }, // not + pminud + paddd 2963 { ISD::UMAX, MVT::v8i32, 1 }, 2964 { ISD::UMAX, MVT::v16i16, 1 }, 2965 { ISD::UMAX, MVT::v32i8, 1 }, 2966 { ISD::UMIN, MVT::v8i32, 1 }, 2967 { ISD::UMIN, MVT::v16i16, 1 }, 2968 { ISD::UMIN, MVT::v32i8, 1 }, 2969 { ISD::USUBSAT, MVT::v16i16, 1 }, 2970 { ISD::USUBSAT, MVT::v32i8, 1 }, 2971 { ISD::USUBSAT, MVT::v8i32, 2 }, // pmaxud + psubd 2972 { ISD::FMAXNUM, MVT::v8f32, 3 }, // MAXPS + CMPUNORDPS + BLENDVPS 2973 { ISD::FMAXNUM, MVT::v4f64, 3 }, // MAXPD + CMPUNORDPD + BLENDVPD 2974 { ISD::FSQRT, MVT::f32, 7 }, // Haswell from http://www.agner.org/ 2975 { ISD::FSQRT, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/ 2976 { ISD::FSQRT, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/ 2977 { ISD::FSQRT, MVT::f64, 14 }, // Haswell from http://www.agner.org/ 2978 { ISD::FSQRT, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/ 2979 { ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/ 2980 }; 2981 static const CostTblEntry AVX1CostTbl[] = { 2982 { ISD::ABS, MVT::v4i64, 5 }, // VBLENDVPD(X,VPSUBQ(0,X),X) 2983 { ISD::ABS, MVT::v8i32, 3 }, 2984 { ISD::ABS, MVT::v16i16, 3 }, 2985 { ISD::ABS, MVT::v32i8, 3 }, 2986 { ISD::BITREVERSE, MVT::v4i64, 12 }, // 2 x 128-bit Op + extract/insert 2987 { ISD::BITREVERSE, MVT::v8i32, 12 }, // 2 x 128-bit Op + extract/insert 2988 { ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert 2989 { ISD::BITREVERSE, MVT::v32i8, 12 }, // 2 x 128-bit Op + extract/insert 2990 { ISD::BSWAP, MVT::v4i64, 4 }, 2991 { ISD::BSWAP, MVT::v8i32, 4 }, 2992 { ISD::BSWAP, MVT::v16i16, 4 }, 2993 { ISD::CTLZ, MVT::v4i64, 48 }, // 2 x 128-bit Op + extract/insert 2994 { ISD::CTLZ, MVT::v8i32, 38 }, // 2 x 128-bit Op + extract/insert 2995 { ISD::CTLZ, MVT::v16i16, 30 }, // 2 x 128-bit Op + extract/insert 2996 { ISD::CTLZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert 2997 { ISD::CTPOP, MVT::v4i64, 16 }, // 2 x 128-bit Op + extract/insert 2998 { ISD::CTPOP, MVT::v8i32, 24 }, // 2 x 128-bit Op + extract/insert 2999 { ISD::CTPOP, MVT::v16i16, 20 }, // 2 x 128-bit Op + extract/insert 3000 { ISD::CTPOP, MVT::v32i8, 14 }, // 2 x 128-bit Op + extract/insert 3001 { ISD::CTTZ, MVT::v4i64, 22 }, // 2 x 128-bit Op + extract/insert 3002 { ISD::CTTZ, MVT::v8i32, 30 }, // 2 x 128-bit Op + extract/insert 3003 { ISD::CTTZ, MVT::v16i16, 26 }, // 2 x 128-bit Op + extract/insert 3004 { ISD::CTTZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert 3005 { ISD::SADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert 3006 { ISD::SADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert 3007 { ISD::SMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert 3008 { ISD::SMAX, MVT::v16i16, 4 }, // 2 x 
128-bit Op + extract/insert 3009 { ISD::SMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert 3010 { ISD::SMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert 3011 { ISD::SMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert 3012 { ISD::SMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert 3013 { ISD::SSUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert 3014 { ISD::SSUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert 3015 { ISD::UADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert 3016 { ISD::UADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert 3017 { ISD::UADDSAT, MVT::v8i32, 8 }, // 2 x 128-bit Op + extract/insert 3018 { ISD::UMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert 3019 { ISD::UMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert 3020 { ISD::UMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert 3021 { ISD::UMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert 3022 { ISD::UMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert 3023 { ISD::UMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert 3024 { ISD::USUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert 3025 { ISD::USUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert 3026 { ISD::USUBSAT, MVT::v8i32, 6 }, // 2 x 128-bit Op + extract/insert 3027 { ISD::FMAXNUM, MVT::f32, 3 }, // MAXSS + CMPUNORDSS + BLENDVPS 3028 { ISD::FMAXNUM, MVT::v4f32, 3 }, // MAXPS + CMPUNORDPS + BLENDVPS 3029 { ISD::FMAXNUM, MVT::v8f32, 5 }, // MAXPS + CMPUNORDPS + BLENDVPS + ? 3030 { ISD::FMAXNUM, MVT::f64, 3 }, // MAXSD + CMPUNORDSD + BLENDVPD 3031 { ISD::FMAXNUM, MVT::v2f64, 3 }, // MAXPD + CMPUNORDPD + BLENDVPD 3032 { ISD::FMAXNUM, MVT::v4f64, 5 }, // MAXPD + CMPUNORDPD + BLENDVPD + ? 3033 { ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/ 3034 { ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/ 3035 { ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/ 3036 { ISD::FSQRT, MVT::f64, 21 }, // SNB from http://www.agner.org/ 3037 { ISD::FSQRT, MVT::v2f64, 21 }, // SNB from http://www.agner.org/ 3038 { ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/ 3039 }; 3040 static const CostTblEntry GLMCostTbl[] = { 3041 { ISD::FSQRT, MVT::f32, 19 }, // sqrtss 3042 { ISD::FSQRT, MVT::v4f32, 37 }, // sqrtps 3043 { ISD::FSQRT, MVT::f64, 34 }, // sqrtsd 3044 { ISD::FSQRT, MVT::v2f64, 67 }, // sqrtpd 3045 }; 3046 static const CostTblEntry SLMCostTbl[] = { 3047 { ISD::FSQRT, MVT::f32, 20 }, // sqrtss 3048 { ISD::FSQRT, MVT::v4f32, 40 }, // sqrtps 3049 { ISD::FSQRT, MVT::f64, 35 }, // sqrtsd 3050 { ISD::FSQRT, MVT::v2f64, 70 }, // sqrtpd 3051 }; 3052 static const CostTblEntry SSE42CostTbl[] = { 3053 { ISD::USUBSAT, MVT::v4i32, 2 }, // pmaxud + psubd 3054 { ISD::UADDSAT, MVT::v4i32, 3 }, // not + pminud + paddd 3055 { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/ 3056 { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/ 3057 }; 3058 static const CostTblEntry SSE41CostTbl[] = { 3059 { ISD::ABS, MVT::v2i64, 2 }, // BLENDVPD(X,PSUBQ(0,X),X) 3060 { ISD::SMAX, MVT::v4i32, 1 }, 3061 { ISD::SMAX, MVT::v16i8, 1 }, 3062 { ISD::SMIN, MVT::v4i32, 1 }, 3063 { ISD::SMIN, MVT::v16i8, 1 }, 3064 { ISD::UMAX, MVT::v4i32, 1 }, 3065 { ISD::UMAX, MVT::v8i16, 1 }, 3066 { ISD::UMIN, MVT::v4i32, 1 }, 3067 { ISD::UMIN, MVT::v8i16, 1 }, 3068 }; 3069 static const CostTblEntry SSSE3CostTbl[] = { 3070 { ISD::ABS, MVT::v4i32, 1 }, 3071 { ISD::ABS, MVT::v8i16, 1 }, 3072 { ISD::ABS, MVT::v16i8, 1 }, 3073 { 
ISD::BITREVERSE, MVT::v2i64, 5 }, 3074 { ISD::BITREVERSE, MVT::v4i32, 5 }, 3075 { ISD::BITREVERSE, MVT::v8i16, 5 }, 3076 { ISD::BITREVERSE, MVT::v16i8, 5 }, 3077 { ISD::BSWAP, MVT::v2i64, 1 }, 3078 { ISD::BSWAP, MVT::v4i32, 1 }, 3079 { ISD::BSWAP, MVT::v8i16, 1 }, 3080 { ISD::CTLZ, MVT::v2i64, 23 }, 3081 { ISD::CTLZ, MVT::v4i32, 18 }, 3082 { ISD::CTLZ, MVT::v8i16, 14 }, 3083 { ISD::CTLZ, MVT::v16i8, 9 }, 3084 { ISD::CTPOP, MVT::v2i64, 7 }, 3085 { ISD::CTPOP, MVT::v4i32, 11 }, 3086 { ISD::CTPOP, MVT::v8i16, 9 }, 3087 { ISD::CTPOP, MVT::v16i8, 6 }, 3088 { ISD::CTTZ, MVT::v2i64, 10 }, 3089 { ISD::CTTZ, MVT::v4i32, 14 }, 3090 { ISD::CTTZ, MVT::v8i16, 12 }, 3091 { ISD::CTTZ, MVT::v16i8, 9 } 3092 }; 3093 static const CostTblEntry SSE2CostTbl[] = { 3094 { ISD::ABS, MVT::v2i64, 4 }, 3095 { ISD::ABS, MVT::v4i32, 3 }, 3096 { ISD::ABS, MVT::v8i16, 2 }, 3097 { ISD::ABS, MVT::v16i8, 2 }, 3098 { ISD::BITREVERSE, MVT::v2i64, 29 }, 3099 { ISD::BITREVERSE, MVT::v4i32, 27 }, 3100 { ISD::BITREVERSE, MVT::v8i16, 27 }, 3101 { ISD::BITREVERSE, MVT::v16i8, 20 }, 3102 { ISD::BSWAP, MVT::v2i64, 7 }, 3103 { ISD::BSWAP, MVT::v4i32, 7 }, 3104 { ISD::BSWAP, MVT::v8i16, 7 }, 3105 { ISD::CTLZ, MVT::v2i64, 25 }, 3106 { ISD::CTLZ, MVT::v4i32, 26 }, 3107 { ISD::CTLZ, MVT::v8i16, 20 }, 3108 { ISD::CTLZ, MVT::v16i8, 17 }, 3109 { ISD::CTPOP, MVT::v2i64, 12 }, 3110 { ISD::CTPOP, MVT::v4i32, 15 }, 3111 { ISD::CTPOP, MVT::v8i16, 13 }, 3112 { ISD::CTPOP, MVT::v16i8, 10 }, 3113 { ISD::CTTZ, MVT::v2i64, 14 }, 3114 { ISD::CTTZ, MVT::v4i32, 18 }, 3115 { ISD::CTTZ, MVT::v8i16, 16 }, 3116 { ISD::CTTZ, MVT::v16i8, 13 }, 3117 { ISD::SADDSAT, MVT::v8i16, 1 }, 3118 { ISD::SADDSAT, MVT::v16i8, 1 }, 3119 { ISD::SMAX, MVT::v8i16, 1 }, 3120 { ISD::SMIN, MVT::v8i16, 1 }, 3121 { ISD::SSUBSAT, MVT::v8i16, 1 }, 3122 { ISD::SSUBSAT, MVT::v16i8, 1 }, 3123 { ISD::UADDSAT, MVT::v8i16, 1 }, 3124 { ISD::UADDSAT, MVT::v16i8, 1 }, 3125 { ISD::UMAX, MVT::v8i16, 2 }, 3126 { ISD::UMAX, MVT::v16i8, 1 }, 3127 { ISD::UMIN, MVT::v8i16, 2 }, 3128 { ISD::UMIN, MVT::v16i8, 1 }, 3129 { ISD::USUBSAT, MVT::v8i16, 1 }, 3130 { ISD::USUBSAT, MVT::v16i8, 1 }, 3131 { ISD::FMAXNUM, MVT::f64, 4 }, 3132 { ISD::FMAXNUM, MVT::v2f64, 4 }, 3133 { ISD::FSQRT, MVT::f64, 32 }, // Nehalem from http://www.agner.org/ 3134 { ISD::FSQRT, MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/ 3135 }; 3136 static const CostTblEntry SSE1CostTbl[] = { 3137 { ISD::FMAXNUM, MVT::f32, 4 }, 3138 { ISD::FMAXNUM, MVT::v4f32, 4 }, 3139 { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/ 3140 { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/ 3141 }; 3142 static const CostTblEntry BMI64CostTbl[] = { // 64-bit targets 3143 { ISD::CTTZ, MVT::i64, 1 }, 3144 }; 3145 static const CostTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets 3146 { ISD::CTTZ, MVT::i32, 1 }, 3147 { ISD::CTTZ, MVT::i16, 1 }, 3148 { ISD::CTTZ, MVT::i8, 1 }, 3149 }; 3150 static const CostTblEntry LZCNT64CostTbl[] = { // 64-bit targets 3151 { ISD::CTLZ, MVT::i64, 1 }, 3152 }; 3153 static const CostTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets 3154 { ISD::CTLZ, MVT::i32, 1 }, 3155 { ISD::CTLZ, MVT::i16, 1 }, 3156 { ISD::CTLZ, MVT::i8, 1 }, 3157 }; 3158 static const CostTblEntry POPCNT64CostTbl[] = { // 64-bit targets 3159 { ISD::CTPOP, MVT::i64, 1 }, 3160 }; 3161 static const CostTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets 3162 { ISD::CTPOP, MVT::i32, 1 }, 3163 { ISD::CTPOP, MVT::i16, 1 }, 3164 { ISD::CTPOP, MVT::i8, 1 }, 3165 }; 3166 static const CostTblEntry 
X64CostTbl[] = { // 64-bit targets 3167 { ISD::ABS, MVT::i64, 2 }, // SUB+CMOV 3168 { ISD::BITREVERSE, MVT::i64, 14 }, 3169 { ISD::BSWAP, MVT::i64, 1 }, 3170 { ISD::CTLZ, MVT::i64, 4 }, // BSR+XOR or BSR+XOR+CMOV 3171 { ISD::CTTZ, MVT::i64, 3 }, // TEST+BSF+CMOV/BRANCH 3172 { ISD::CTPOP, MVT::i64, 10 }, 3173 { ISD::SADDO, MVT::i64, 1 }, 3174 { ISD::UADDO, MVT::i64, 1 }, 3175 { ISD::UMULO, MVT::i64, 2 }, // mulq + seto 3176 }; 3177 static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets 3178 { ISD::ABS, MVT::i32, 2 }, // SUB+CMOV 3179 { ISD::ABS, MVT::i16, 2 }, // SUB+CMOV 3180 { ISD::BITREVERSE, MVT::i32, 14 }, 3181 { ISD::BITREVERSE, MVT::i16, 14 }, 3182 { ISD::BITREVERSE, MVT::i8, 11 }, 3183 { ISD::BSWAP, MVT::i32, 1 }, 3184 { ISD::BSWAP, MVT::i16, 1 }, // ROL 3185 { ISD::CTLZ, MVT::i32, 4 }, // BSR+XOR or BSR+XOR+CMOV 3186 { ISD::CTLZ, MVT::i16, 4 }, // BSR+XOR or BSR+XOR+CMOV 3187 { ISD::CTLZ, MVT::i8, 4 }, // BSR+XOR or BSR+XOR+CMOV 3188 { ISD::CTTZ, MVT::i32, 3 }, // TEST+BSF+CMOV/BRANCH 3189 { ISD::CTTZ, MVT::i16, 3 }, // TEST+BSF+CMOV/BRANCH 3190 { ISD::CTTZ, MVT::i8, 3 }, // TEST+BSF+CMOV/BRANCH 3191 { ISD::CTPOP, MVT::i32, 8 }, 3192 { ISD::CTPOP, MVT::i16, 9 }, 3193 { ISD::CTPOP, MVT::i8, 7 }, 3194 { ISD::SADDO, MVT::i32, 1 }, 3195 { ISD::SADDO, MVT::i16, 1 }, 3196 { ISD::SADDO, MVT::i8, 1 }, 3197 { ISD::UADDO, MVT::i32, 1 }, 3198 { ISD::UADDO, MVT::i16, 1 }, 3199 { ISD::UADDO, MVT::i8, 1 }, 3200 { ISD::UMULO, MVT::i32, 2 }, // mul + seto 3201 { ISD::UMULO, MVT::i16, 2 }, 3202 { ISD::UMULO, MVT::i8, 2 }, 3203 }; 3204 3205 Type *RetTy = ICA.getReturnType(); 3206 Type *OpTy = RetTy; 3207 Intrinsic::ID IID = ICA.getID(); 3208 unsigned ISD = ISD::DELETED_NODE; 3209 switch (IID) { 3210 default: 3211 break; 3212 case Intrinsic::abs: 3213 ISD = ISD::ABS; 3214 break; 3215 case Intrinsic::bitreverse: 3216 ISD = ISD::BITREVERSE; 3217 break; 3218 case Intrinsic::bswap: 3219 ISD = ISD::BSWAP; 3220 break; 3221 case Intrinsic::ctlz: 3222 ISD = ISD::CTLZ; 3223 break; 3224 case Intrinsic::ctpop: 3225 ISD = ISD::CTPOP; 3226 break; 3227 case Intrinsic::cttz: 3228 ISD = ISD::CTTZ; 3229 break; 3230 case Intrinsic::maxnum: 3231 case Intrinsic::minnum: 3232 // FMINNUM has same costs so don't duplicate. 3233 ISD = ISD::FMAXNUM; 3234 break; 3235 case Intrinsic::sadd_sat: 3236 ISD = ISD::SADDSAT; 3237 break; 3238 case Intrinsic::smax: 3239 ISD = ISD::SMAX; 3240 break; 3241 case Intrinsic::smin: 3242 ISD = ISD::SMIN; 3243 break; 3244 case Intrinsic::ssub_sat: 3245 ISD = ISD::SSUBSAT; 3246 break; 3247 case Intrinsic::uadd_sat: 3248 ISD = ISD::UADDSAT; 3249 break; 3250 case Intrinsic::umax: 3251 ISD = ISD::UMAX; 3252 break; 3253 case Intrinsic::umin: 3254 ISD = ISD::UMIN; 3255 break; 3256 case Intrinsic::usub_sat: 3257 ISD = ISD::USUBSAT; 3258 break; 3259 case Intrinsic::sqrt: 3260 ISD = ISD::FSQRT; 3261 break; 3262 case Intrinsic::sadd_with_overflow: 3263 case Intrinsic::ssub_with_overflow: 3264 // SSUBO has same costs so don't duplicate. 3265 ISD = ISD::SADDO; 3266 OpTy = RetTy->getContainedType(0); 3267 break; 3268 case Intrinsic::uadd_with_overflow: 3269 case Intrinsic::usub_with_overflow: 3270 // USUBO has same costs so don't duplicate. 3271 ISD = ISD::UADDO; 3272 OpTy = RetTy->getContainedType(0); 3273 break; 3274 case Intrinsic::umul_with_overflow: 3275 case Intrinsic::smul_with_overflow: 3276 // SMULO has same costs so don't duplicate. 3277 ISD = ISD::UMULO; 3278 OpTy = RetTy->getContainedType(0); 3279 break; 3280 } 3281 3282 if (ISD != ISD::DELETED_NODE) { 3283 // Legalize the type. 
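    // For example (rough sketch): on an SSE2-only target a ctpop of <32 x i8>
    // legalizes to two <16 x i8> halves, i.e. LT = {2, v16i8}; the SSE2 table
    // entry { ISD::CTPOP, MVT::v16i8, 10 } then yields a cost of 2 * 10 = 20.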
3284 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, OpTy); 3285 MVT MTy = LT.second; 3286 3287 // Attempt to lookup cost. 3288 if (ISD == ISD::BITREVERSE && ST->hasGFNI() && ST->hasSSSE3() && 3289 MTy.isVector()) { 3290 // With PSHUFB the code is very similar for all types. If we have integer 3291 // byte operations, we just need a GF2P8AFFINEQB for vXi8. For other types 3292 // we also need a PSHUFB. 3293 unsigned Cost = MTy.getVectorElementType() == MVT::i8 ? 1 : 2; 3294 3295 // Without byte operations, we need twice as many GF2P8AFFINEQB and PSHUFB 3296 // instructions. We also need an extract and an insert. 3297 if (!(MTy.is128BitVector() || (ST->hasAVX2() && MTy.is256BitVector()) || 3298 (ST->hasBWI() && MTy.is512BitVector()))) 3299 Cost = Cost * 2 + 2; 3300 3301 return LT.first * Cost; 3302 } 3303 3304 auto adjustTableCost = [](const CostTblEntry &Entry, 3305 InstructionCost LegalizationCost, 3306 FastMathFlags FMF) { 3307 // If there are no NANs to deal with, then these are reduced to a 3308 // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we 3309 // assume is used in the non-fast case. 3310 if (Entry.ISD == ISD::FMAXNUM || Entry.ISD == ISD::FMINNUM) { 3311 if (FMF.noNaNs()) 3312 return LegalizationCost * 1; 3313 } 3314 return LegalizationCost * (int)Entry.Cost; 3315 }; 3316 3317 if (ST->useGLMDivSqrtCosts()) 3318 if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy)) 3319 return adjustTableCost(*Entry, LT.first, ICA.getFlags()); 3320 3321 if (ST->useSLMArithCosts()) 3322 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy)) 3323 return adjustTableCost(*Entry, LT.first, ICA.getFlags()); 3324 3325 if (ST->hasBITALG()) 3326 if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy)) 3327 return adjustTableCost(*Entry, LT.first, ICA.getFlags()); 3328 3329 if (ST->hasVPOPCNTDQ()) 3330 if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy)) 3331 return adjustTableCost(*Entry, LT.first, ICA.getFlags()); 3332 3333 if (ST->hasCDI()) 3334 if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy)) 3335 return adjustTableCost(*Entry, LT.first, ICA.getFlags()); 3336 3337 if (ST->hasBWI()) 3338 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) 3339 return adjustTableCost(*Entry, LT.first, ICA.getFlags()); 3340 3341 if (ST->hasAVX512()) 3342 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) 3343 return adjustTableCost(*Entry, LT.first, ICA.getFlags()); 3344 3345 if (ST->hasXOP()) 3346 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy)) 3347 return adjustTableCost(*Entry, LT.first, ICA.getFlags()); 3348 3349 if (ST->hasAVX2()) 3350 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy)) 3351 return adjustTableCost(*Entry, LT.first, ICA.getFlags()); 3352 3353 if (ST->hasAVX()) 3354 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) 3355 return adjustTableCost(*Entry, LT.first, ICA.getFlags()); 3356 3357 if (ST->hasSSE42()) 3358 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy)) 3359 return adjustTableCost(*Entry, LT.first, ICA.getFlags()); 3360 3361 if (ST->hasSSE41()) 3362 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy)) 3363 return adjustTableCost(*Entry, LT.first, ICA.getFlags()); 3364 3365 if (ST->hasSSSE3()) 3366 if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy)) 3367 return adjustTableCost(*Entry, LT.first, ICA.getFlags()); 3368 3369 if (ST->hasSSE2()) 3370 if (const auto *Entry = 
CostTableLookup(SSE2CostTbl, ISD, MTy)) 3371 return adjustTableCost(*Entry, LT.first, ICA.getFlags()); 3372 3373 if (ST->hasSSE1()) 3374 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy)) 3375 return adjustTableCost(*Entry, LT.first, ICA.getFlags()); 3376 3377 if (ST->hasBMI()) { 3378 if (ST->is64Bit()) 3379 if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy)) 3380 return adjustTableCost(*Entry, LT.first, ICA.getFlags()); 3381 3382 if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy)) 3383 return adjustTableCost(*Entry, LT.first, ICA.getFlags()); 3384 } 3385 3386 if (ST->hasLZCNT()) { 3387 if (ST->is64Bit()) 3388 if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy)) 3389 return adjustTableCost(*Entry, LT.first, ICA.getFlags()); 3390 3391 if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy)) 3392 return adjustTableCost(*Entry, LT.first, ICA.getFlags()); 3393 } 3394 3395 if (ST->hasPOPCNT()) { 3396 if (ST->is64Bit()) 3397 if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy)) 3398 return adjustTableCost(*Entry, LT.first, ICA.getFlags()); 3399 3400 if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy)) 3401 return adjustTableCost(*Entry, LT.first, ICA.getFlags()); 3402 } 3403 3404 if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) { 3405 if (const Instruction *II = ICA.getInst()) { 3406 if (II->hasOneUse() && isa<StoreInst>(II->user_back())) 3407 return TTI::TCC_Free; 3408 if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) { 3409 if (LI->hasOneUse()) 3410 return TTI::TCC_Free; 3411 } 3412 } 3413 } 3414 3415 if (ST->is64Bit()) 3416 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy)) 3417 return adjustTableCost(*Entry, LT.first, ICA.getFlags()); 3418 3419 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy)) 3420 return adjustTableCost(*Entry, LT.first, ICA.getFlags()); 3421 } 3422 3423 return BaseT::getIntrinsicInstrCost(ICA, CostKind); 3424 } 3425 3426 InstructionCost 3427 X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, 3428 TTI::TargetCostKind CostKind) { 3429 if (ICA.isTypeBasedOnly()) 3430 return getTypeBasedIntrinsicInstrCost(ICA, CostKind); 3431 3432 static const CostTblEntry AVX512CostTbl[] = { 3433 { ISD::ROTL, MVT::v8i64, 1 }, 3434 { ISD::ROTL, MVT::v4i64, 1 }, 3435 { ISD::ROTL, MVT::v2i64, 1 }, 3436 { ISD::ROTL, MVT::v16i32, 1 }, 3437 { ISD::ROTL, MVT::v8i32, 1 }, 3438 { ISD::ROTL, MVT::v4i32, 1 }, 3439 { ISD::ROTR, MVT::v8i64, 1 }, 3440 { ISD::ROTR, MVT::v4i64, 1 }, 3441 { ISD::ROTR, MVT::v2i64, 1 }, 3442 { ISD::ROTR, MVT::v16i32, 1 }, 3443 { ISD::ROTR, MVT::v8i32, 1 }, 3444 { ISD::ROTR, MVT::v4i32, 1 } 3445 }; 3446 // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y)) 3447 static const CostTblEntry XOPCostTbl[] = { 3448 { ISD::ROTL, MVT::v4i64, 4 }, 3449 { ISD::ROTL, MVT::v8i32, 4 }, 3450 { ISD::ROTL, MVT::v16i16, 4 }, 3451 { ISD::ROTL, MVT::v32i8, 4 }, 3452 { ISD::ROTL, MVT::v2i64, 1 }, 3453 { ISD::ROTL, MVT::v4i32, 1 }, 3454 { ISD::ROTL, MVT::v8i16, 1 }, 3455 { ISD::ROTL, MVT::v16i8, 1 }, 3456 { ISD::ROTR, MVT::v4i64, 6 }, 3457 { ISD::ROTR, MVT::v8i32, 6 }, 3458 { ISD::ROTR, MVT::v16i16, 6 }, 3459 { ISD::ROTR, MVT::v32i8, 6 }, 3460 { ISD::ROTR, MVT::v2i64, 2 }, 3461 { ISD::ROTR, MVT::v4i32, 2 }, 3462 { ISD::ROTR, MVT::v8i16, 2 }, 3463 { ISD::ROTR, MVT::v16i8, 2 } 3464 }; 3465 static const CostTblEntry X64CostTbl[] = { // 64-bit targets 3466 { ISD::ROTL, MVT::i64, 1 }, 3467 { ISD::ROTR, MVT::i64, 1 }, 3468 { ISD::FSHL, MVT::i64, 4 } 3469 }; 
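  // Note (illustrative): fshl(x, x, s) is a rotate, so it is costed via the
  // ROTL entries (a single rotate instruction), whereas a funnel shift with
  // distinct operands keeps the ISD::FSHL cost of 4, approximating a shld or
  // a shift/shift/or expansion.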
3470 static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets 3471 { ISD::ROTL, MVT::i32, 1 }, 3472 { ISD::ROTL, MVT::i16, 1 }, 3473 { ISD::ROTL, MVT::i8, 1 }, 3474 { ISD::ROTR, MVT::i32, 1 }, 3475 { ISD::ROTR, MVT::i16, 1 }, 3476 { ISD::ROTR, MVT::i8, 1 }, 3477 { ISD::FSHL, MVT::i32, 4 }, 3478 { ISD::FSHL, MVT::i16, 4 }, 3479 { ISD::FSHL, MVT::i8, 4 } 3480 }; 3481 3482 Intrinsic::ID IID = ICA.getID(); 3483 Type *RetTy = ICA.getReturnType(); 3484 const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); 3485 unsigned ISD = ISD::DELETED_NODE; 3486 switch (IID) { 3487 default: 3488 break; 3489 case Intrinsic::fshl: 3490 ISD = ISD::FSHL; 3491 if (Args[0] == Args[1]) 3492 ISD = ISD::ROTL; 3493 break; 3494 case Intrinsic::fshr: 3495 // FSHR has same costs so don't duplicate. 3496 ISD = ISD::FSHL; 3497 if (Args[0] == Args[1]) 3498 ISD = ISD::ROTR; 3499 break; 3500 } 3501 3502 if (ISD != ISD::DELETED_NODE) { 3503 // Legalize the type. 3504 std::pair<InstructionCost, MVT> LT = 3505 TLI->getTypeLegalizationCost(DL, RetTy); 3506 MVT MTy = LT.second; 3507 3508 // Attempt to lookup cost. 3509 if (ST->hasAVX512()) 3510 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) 3511 return LT.first * Entry->Cost; 3512 3513 if (ST->hasXOP()) 3514 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy)) 3515 return LT.first * Entry->Cost; 3516 3517 if (ST->is64Bit()) 3518 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy)) 3519 return LT.first * Entry->Cost; 3520 3521 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy)) 3522 return LT.first * Entry->Cost; 3523 } 3524 3525 return BaseT::getIntrinsicInstrCost(ICA, CostKind); 3526 } 3527 3528 InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, 3529 unsigned Index) { 3530 static const CostTblEntry SLMCostTbl[] = { 3531 { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 }, 3532 { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 }, 3533 { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 }, 3534 { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 } 3535 }; 3536 3537 assert(Val->isVectorTy() && "This must be a vector type"); 3538 Type *ScalarType = Val->getScalarType(); 3539 int RegisterFileMoveCost = 0; 3540 3541 // Non-immediate extraction/insertion can be handled as a sequence of 3542 // aliased loads+stores via the stack. 3543 if (Index == -1U && (Opcode == Instruction::ExtractElement || 3544 Opcode == Instruction::InsertElement)) { 3545 // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns: 3546 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0. 3547 3548 // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling. 3549 assert(isa<FixedVectorType>(Val) && "Fixed vector type expected"); 3550 Align VecAlign = DL.getPrefTypeAlign(Val); 3551 Align SclAlign = DL.getPrefTypeAlign(ScalarType); 3552 3553 // Extract - store vector to stack, load scalar. 3554 if (Opcode == Instruction::ExtractElement) { 3555 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, 3556 TTI::TargetCostKind::TCK_RecipThroughput) + 3557 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0, 3558 TTI::TargetCostKind::TCK_RecipThroughput); 3559 } 3560 // Insert - store vector to stack, store scalar, load vector. 
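    // For example (rough model): a variable-index insertelement into
    // <4 x i32> is costed as one <4 x i32> store to a stack slot, one i32
    // store to the selected lane's address, and one <4 x i32> reload.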
3561 if (Opcode == Instruction::InsertElement) { 3562 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, 3563 TTI::TargetCostKind::TCK_RecipThroughput) + 3564 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0, 3565 TTI::TargetCostKind::TCK_RecipThroughput) + 3566 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, 3567 TTI::TargetCostKind::TCK_RecipThroughput); 3568 } 3569 } 3570 3571 if (Index != -1U && (Opcode == Instruction::ExtractElement || 3572 Opcode == Instruction::InsertElement)) { 3573 // Legalize the type. 3574 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Val); 3575 3576 // This type is legalized to a scalar type. 3577 if (!LT.second.isVector()) 3578 return 0; 3579 3580 // The type may be split. Normalize the index to the new type. 3581 unsigned NumElts = LT.second.getVectorNumElements(); 3582 unsigned SubNumElts = NumElts; 3583 Index = Index % NumElts; 3584 3585 // For >128-bit vectors, we need to extract higher 128-bit subvectors. 3586 // For inserts, we also need to insert the subvector back. 3587 if (LT.second.getSizeInBits() > 128) { 3588 assert((LT.second.getSizeInBits() % 128) == 0 && "Illegal vector"); 3589 unsigned NumSubVecs = LT.second.getSizeInBits() / 128; 3590 SubNumElts = NumElts / NumSubVecs; 3591 if (SubNumElts <= Index) { 3592 RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1); 3593 Index %= SubNumElts; 3594 } 3595 } 3596 3597 if (Index == 0) { 3598 // Floating point scalars are already located in index #0. 3599 // Many insertions to #0 can fold away for scalar fp-ops, so let's assume 3600 // true for all. 3601 if (ScalarType->isFloatingPointTy()) 3602 return RegisterFileMoveCost; 3603 3604 // Assume movd/movq XMM -> GPR is relatively cheap on all targets. 3605 if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement) 3606 return 1 + RegisterFileMoveCost; 3607 } 3608 3609 int ISD = TLI->InstructionOpcodeToISD(Opcode); 3610 assert(ISD && "Unexpected vector opcode"); 3611 MVT MScalarTy = LT.second.getScalarType(); 3612 if (ST->useSLMArithCosts()) 3613 if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy)) 3614 return Entry->Cost + RegisterFileMoveCost; 3615 3616 // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets. 3617 if ((MScalarTy == MVT::i16 && ST->hasSSE2()) || 3618 (MScalarTy.isInteger() && ST->hasSSE41())) 3619 return 1 + RegisterFileMoveCost; 3620 3621 // Assume insertps is relatively cheap on all targets. 3622 if (MScalarTy == MVT::f32 && ST->hasSSE41() && 3623 Opcode == Instruction::InsertElement) 3624 return 1 + RegisterFileMoveCost; 3625 3626 // For extractions we just need to shuffle the element to index 0, which 3627 // should be very cheap (assume cost = 1). For insertions we need to shuffle 3628 // the elements to its destination. In both cases we must handle the 3629 // subvector move(s). 3630 // If the vector type is already less than 128-bits then don't reduce it. 3631 // TODO: Under what circumstances should we shuffle using the full width? 3632 InstructionCost ShuffleCost = 1; 3633 if (Opcode == Instruction::InsertElement) { 3634 auto *SubTy = cast<VectorType>(Val); 3635 EVT VT = TLI->getValueType(DL, Val); 3636 if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128) 3637 SubTy = FixedVectorType::get(ScalarType, SubNumElts); 3638 ShuffleCost = 3639 getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, None, 0, SubTy); 3640 } 3641 int IntOrFpCost = ScalarType->isFloatingPointTy() ? 
0 : 1;
3642     return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
3643   }
3644 
3645   // Add to the base cost if we know that the extracted element of a vector is
3646   // destined to be moved to and used in the integer register file.
3647   if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
3648     RegisterFileMoveCost += 1;
3649 
3650   return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
3651 }
3652 
3653 InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
3654                                                      const APInt &DemandedElts,
3655                                                      bool Insert,
3656                                                      bool Extract) {
3657   InstructionCost Cost = 0;
3658 
3659   // For insertions, an ISD::BUILD_VECTOR-style vector initialization can be
3660   // much cheaper than an accumulation of ISD::INSERT_VECTOR_ELT.
3661   if (Insert) {
3662     std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
3663     MVT MScalarTy = LT.second.getScalarType();
3664 
3665     if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
3666         (MScalarTy.isInteger() && ST->hasSSE41()) ||
3667         (MScalarTy == MVT::f32 && ST->hasSSE41())) {
3668       // For types we can insert directly, insertion into 128-bit subvectors is
3669       // cheap, followed by a cheap chain of concatenations.
3670       if (LT.second.getSizeInBits() <= 128) {
3671         Cost +=
3672             BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, false);
3673       } else {
3674         // In each 128-bit lane, if at least one index is demanded but not all
3675         // indices are demanded and this lane is not the first 128-bit lane of
3676         // the legalized vector, then this lane needs an extracti128; if a
3677         // 128-bit lane has at least one demanded index, it also needs an
3678         // inserti128.
3679 
3680         // The following cases will help you build a better understanding:
3681         // Assume we insert several elements into a v8i32 vector with AVX2,
3682         // Case#1: inserting into the 1st index needs vpinsrd + inserti128.
3683         // Case#2: inserting into the 5th index needs extracti128 + vpinsrd +
3684         //         inserti128.
3685         // Case#3: inserting into indices 4,5,6,7 needs 4*vpinsrd + inserti128.
3686         const int CostValue = *LT.first.getValue();
3687         assert(CostValue >= 0 && "Negative cost!");
3688         unsigned Num128Lanes = LT.second.getSizeInBits() / 128 * CostValue;
3689         unsigned NumElts = LT.second.getVectorNumElements() * CostValue;
3690         APInt WidenedDemandedElts = DemandedElts.zextOrSelf(NumElts);
3691         unsigned Scale = NumElts / Num128Lanes;
3692         // We iterate over each 128-bit lane and check whether it needs an
3693         // extracti128/inserti128.
3694         for (unsigned I = 0; I < NumElts; I += Scale) {
3695           APInt Mask = WidenedDemandedElts.getBitsSet(NumElts, I, I + Scale);
3696           APInt MaskedDE = Mask & WidenedDemandedElts;
3697           unsigned Population = MaskedDE.countPopulation();
3698           Cost += (Population > 0 && Population != Scale &&
3699                    I % LT.second.getVectorNumElements() != 0);
3700           Cost += Population > 0;
3701         }
3702         Cost += DemandedElts.countPopulation();
3703 
3704         // For vXf32 cases, insertion into the 0'th index in each v4f32
3705         // 128-bit vector is free.
3706         // NOTE: This assumes legalization widens vXf32 vectors.
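        // For example (illustrative): when building a <8 x float> where lanes
        // 0 and 4 are both demanded, the two lane-0 insertions into the
        // respective 128-bit halves are treated as free (e.g. movss), so the
        // cost computed above is reduced by 2.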
3707 if (MScalarTy == MVT::f32) 3708 for (unsigned i = 0, e = cast<FixedVectorType>(Ty)->getNumElements(); 3709 i < e; i += 4) 3710 if (DemandedElts[i]) 3711 Cost--; 3712 } 3713 } else if (LT.second.isVector()) { 3714 // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded 3715 // integer element as a SCALAR_TO_VECTOR, then we build the vector as a 3716 // series of UNPCK followed by CONCAT_VECTORS - all of these can be 3717 // considered cheap. 3718 if (Ty->isIntOrIntVectorTy()) 3719 Cost += DemandedElts.countPopulation(); 3720 3721 // Get the smaller of the legalized or original pow2-extended number of 3722 // vector elements, which represents the number of unpacks we'll end up 3723 // performing. 3724 unsigned NumElts = LT.second.getVectorNumElements(); 3725 unsigned Pow2Elts = 3726 PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements()); 3727 Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first; 3728 } 3729 } 3730 3731 // TODO: Use default extraction for now, but we should investigate extending this 3732 // to handle repeated subvector extraction. 3733 if (Extract) 3734 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, false, Extract); 3735 3736 return Cost; 3737 } 3738 3739 InstructionCost 3740 X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, 3741 int VF, const APInt &DemandedDstElts, 3742 TTI::TargetCostKind CostKind) { 3743 const unsigned EltTyBits = DL.getTypeSizeInBits(EltTy); 3744 // We don't differentiate element types here, only element bit width. 3745 EltTy = IntegerType::getIntNTy(EltTy->getContext(), EltTyBits); 3746 3747 auto bailout = [&]() { 3748 return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF, 3749 DemandedDstElts, CostKind); 3750 }; 3751 3752 // For now, only deal with AVX512 cases. 3753 if (!ST->hasAVX512()) 3754 return bailout(); 3755 3756 // Do we have a native shuffle for this element type, or should we promote? 3757 unsigned PromEltTyBits = EltTyBits; 3758 switch (EltTyBits) { 3759 case 32: 3760 case 64: 3761 break; // AVX512F. 3762 case 16: 3763 if (!ST->hasBWI()) 3764 PromEltTyBits = 32; // promote to i32, AVX512F. 3765 break; // AVX512BW 3766 case 8: 3767 if (!ST->hasVBMI()) 3768 PromEltTyBits = 32; // promote to i32, AVX512F. 3769 break; // AVX512VBMI 3770 case 1: 3771 // There is no support for shuffling i1 elements. We *must* promote. 3772 if (ST->hasBWI()) { 3773 if (ST->hasVBMI()) 3774 PromEltTyBits = 8; // promote to i8, AVX512VBMI. 3775 else 3776 PromEltTyBits = 16; // promote to i16, AVX512BW. 3777 break; 3778 } 3779 if (ST->hasDQI()) { 3780 PromEltTyBits = 32; // promote to i32, AVX512F. 3781 break; 3782 } 3783 return bailout(); 3784 default: 3785 return bailout(); 3786 } 3787 auto *PromEltTy = IntegerType::getIntNTy(EltTy->getContext(), PromEltTyBits); 3788 3789 auto *SrcVecTy = FixedVectorType::get(EltTy, VF); 3790 auto *PromSrcVecTy = FixedVectorType::get(PromEltTy, VF); 3791 3792 int NumDstElements = VF * ReplicationFactor; 3793 auto *PromDstVecTy = FixedVectorType::get(PromEltTy, NumDstElements); 3794 auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements); 3795 3796 // Legalize the types. 3797 MVT LegalSrcVecTy = TLI->getTypeLegalizationCost(DL, SrcVecTy).second; 3798 MVT LegalPromSrcVecTy = TLI->getTypeLegalizationCost(DL, PromSrcVecTy).second; 3799 MVT LegalPromDstVecTy = TLI->getTypeLegalizationCost(DL, PromDstVecTy).second; 3800 MVT LegalDstVecTy = TLI->getTypeLegalizationCost(DL, DstVecTy).second; 3801 // They should have legalized into vector types. 
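  // For illustration (assuming an AVX512BW subtarget without VBMI): with
  // EltTy == i1, VF == 16 and ReplicationFactor == 2, the element type is
  // promoted to i16, so PromSrcVecTy is <16 x i16> and PromDstVecTy is
  // <32 x i16>, all of which legalize to vector MVTs and pass the check below.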
3802 if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() || 3803 !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector()) 3804 return bailout(); 3805 3806 if (PromEltTyBits != EltTyBits) { 3807 // If we have to perform the shuffle with wider elt type than our data type, 3808 // then we will first need to anyext (we don't care about the new bits) 3809 // the source elements, and then truncate Dst elements. 3810 InstructionCost PromotionCost; 3811 PromotionCost += getCastInstrCost( 3812 Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy, 3813 TargetTransformInfo::CastContextHint::None, CostKind); 3814 PromotionCost += 3815 getCastInstrCost(Instruction::Trunc, /*Dst=*/DstVecTy, 3816 /*Src=*/PromDstVecTy, 3817 TargetTransformInfo::CastContextHint::None, CostKind); 3818 return PromotionCost + getReplicationShuffleCost(PromEltTy, 3819 ReplicationFactor, VF, 3820 DemandedDstElts, CostKind); 3821 } 3822 3823 assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits && 3824 LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() && 3825 "We expect that the legalization doesn't affect the element width, " 3826 "doesn't coalesce/split elements."); 3827 3828 unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements(); 3829 unsigned NumDstVectors = 3830 divideCeil(DstVecTy->getNumElements(), NumEltsPerDstVec); 3831 3832 auto *SingleDstVecTy = FixedVectorType::get(EltTy, NumEltsPerDstVec); 3833 3834 // Not all the produced Dst elements may be demanded. In our case, 3835 // given that a single Dst vector is formed by a single shuffle, 3836 // if all elements that will form a single Dst vector aren't demanded, 3837 // then we won't need to do that shuffle, so adjust the cost accordingly. 3838 APInt DemandedDstVectors = APIntOps::ScaleBitMask( 3839 DemandedDstElts.zextOrSelf(NumDstVectors * NumEltsPerDstVec), 3840 NumDstVectors); 3841 unsigned NumDstVectorsDemanded = DemandedDstVectors.countPopulation(); 3842 3843 InstructionCost SingleShuffleCost = 3844 getShuffleCost(TTI::SK_PermuteSingleSrc, SingleDstVecTy, 3845 /*Mask=*/None, /*Index=*/0, /*SubTp=*/nullptr); 3846 return NumDstVectorsDemanded * SingleShuffleCost; 3847 } 3848 3849 InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, 3850 MaybeAlign Alignment, 3851 unsigned AddressSpace, 3852 TTI::TargetCostKind CostKind, 3853 const Instruction *I) { 3854 // TODO: Handle other cost kinds. 3855 if (CostKind != TTI::TCK_RecipThroughput) { 3856 if (auto *SI = dyn_cast_or_null<StoreInst>(I)) { 3857 // Store instruction with index and scale costs 2 Uops. 3858 // Check the preceding GEP to identify non-const indices. 3859 if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) { 3860 if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); })) 3861 return TTI::TCC_Basic * 2; 3862 } 3863 } 3864 return TTI::TCC_Basic; 3865 } 3866 3867 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && 3868 "Invalid Opcode"); 3869 // Type legalization can't handle structs 3870 if (TLI->getValueType(DL, Src, true) == MVT::Other) 3871 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, 3872 CostKind); 3873 3874 // Legalize the type. 3875 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); 3876 3877 auto *VTy = dyn_cast<FixedVectorType>(Src); 3878 3879 // Handle the simple case of non-vectors. 3880 // NOTE: this assumes that legalization never creates vector from scalars! 3881 if (!VTy || !LT.second.isVector()) 3882 // Each load/store unit costs 1. 
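    // For example (rough model): a plain i32 load is LT = {1, i32} and costs
    // 1, while an i128 load on a 64-bit target is split into two i64 ops, so
    // LT.first == 2 and the returned cost is 2.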
3883     return LT.first * 1;
3884 
3885   bool IsLoad = Opcode == Instruction::Load;
3886 
3887   Type *EltTy = VTy->getElementType();
3888 
3889   const int EltTyBits = DL.getTypeSizeInBits(EltTy);
3890 
3891   InstructionCost Cost = 0;
3892 
3893   // Source of truth: how many elements were there in the original IR vector?
3894   const unsigned SrcNumElt = VTy->getNumElements();
3895 
3896   // How far have we gotten?
3897   int NumEltRemaining = SrcNumElt;
3898   // Note that we intentionally capture by-reference, NumEltRemaining changes.
3899   auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };
3900 
3901   const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8);
3902 
3903   // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
3904   const unsigned XMMBits = 128;
3905   if (XMMBits % EltTyBits != 0)
3906     // Vector size must be a multiple of the element size. I.e. no padding.
3907     return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
3908                                   CostKind);
3909   const int NumEltPerXMM = XMMBits / EltTyBits;
3910 
3911   auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM);
3912 
3913   for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0;
3914        NumEltRemaining > 0; CurrOpSizeBytes /= 2) {
3915     // How many elements would a single op deal with at once?
3916     if ((8 * CurrOpSizeBytes) % EltTyBits != 0)
3917       // Vector size must be a multiple of the element size. I.e. no padding.
3918       return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
3919                                     CostKind);
3920     int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;
3921 
3922     assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?");
3923     assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||
3924             (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
3925            "Unless we haven't halved the op size yet, "
3926            "we have less than two ops' worth of work left.");
3927 
3928     auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
3929                           ? FixedVectorType::get(EltTy, CurrNumEltPerOp)
3930                           : XMMVecTy;
3931 
3932     assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
3933            "After halving sizes, the vector elt count is no longer a multiple "
3934            "of number of elements per operation?");
3935     auto *CoalescedVecTy =
3936         CurrNumEltPerOp == 1
3937             ? CurrVecTy
3938             : FixedVectorType::get(
3939                   IntegerType::get(Src->getContext(),
3940                                    EltTyBits * CurrNumEltPerOp),
3941                   CurrVecTy->getNumElements() / CurrNumEltPerOp);
3942     assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
3943                DL.getTypeSizeInBits(CurrVecTy) &&
3944            "coalescing elements doesn't change vector width.");
3945 
3946     while (NumEltRemaining > 0) {
3947       assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumption?");
3948 
3949       // Can we use this vector size, as per the remaining element count?
3950       // Iff the vector is naturally aligned, we can do a wide load regardless.
3951       if (NumEltRemaining < CurrNumEltPerOp &&
3952           (!IsLoad || Alignment.valueOrOne() < CurrOpSizeBytes) &&
3953           CurrOpSizeBytes != 1)
3954         break; // Try a smaller vector size.
3955 
3956       bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;
3957 
3958       // If we have fully processed the previous reg, we need to replenish it.
3959       if (SubVecEltsLeft == 0) {
3960         SubVecEltsLeft += CurrVecTy->getNumElements();
3961         // And that's free only for the 0'th subvector of a legalized vector.
3962         if (!Is0thSubVec)
3963           Cost += getShuffleCost(IsLoad ?
TTI::ShuffleKind::SK_InsertSubvector 3964 : TTI::ShuffleKind::SK_ExtractSubvector, 3965 VTy, None, NumEltDone(), CurrVecTy); 3966 } 3967 3968 // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM, 3969 // for smaller widths (32/16/8) we have to insert/extract them separately. 3970 // Again, it's free for the 0'th subreg (if op is 32/64 bit wide, 3971 // but let's pretend that it is also true for 16/8 bit wide ops...) 3972 if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) { 3973 int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM; 3974 assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && ""); 3975 int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp; 3976 APInt DemandedElts = 3977 APInt::getBitsSet(CoalescedVecTy->getNumElements(), 3978 CoalescedVecEltIdx, CoalescedVecEltIdx + 1); 3979 assert(DemandedElts.countPopulation() == 1 && "Inserting single value"); 3980 Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad, 3981 !IsLoad); 3982 } 3983 3984 // This isn't exactly right. We're using slow unaligned 32-byte accesses 3985 // as a proxy for a double-pumped AVX memory interface such as on 3986 // Sandybridge. 3987 if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow()) 3988 Cost += 2; 3989 else 3990 Cost += 1; 3991 3992 SubVecEltsLeft -= CurrNumEltPerOp; 3993 NumEltRemaining -= CurrNumEltPerOp; 3994 Alignment = commonAlignment(Alignment.valueOrOne(), CurrOpSizeBytes); 3995 } 3996 } 3997 3998 assert(NumEltRemaining <= 0 && "Should have processed all the elements."); 3999 4000 return Cost; 4001 } 4002 4003 InstructionCost 4004 X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment, 4005 unsigned AddressSpace, 4006 TTI::TargetCostKind CostKind) { 4007 bool IsLoad = (Instruction::Load == Opcode); 4008 bool IsStore = (Instruction::Store == Opcode); 4009 4010 auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy); 4011 if (!SrcVTy) 4012 // To calculate scalar take the regular cost, without mask 4013 return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind); 4014 4015 unsigned NumElem = SrcVTy->getNumElements(); 4016 auto *MaskTy = 4017 FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem); 4018 if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) || 4019 (IsStore && !isLegalMaskedStore(SrcVTy, Alignment))) { 4020 // Scalarization 4021 APInt DemandedElts = APInt::getAllOnes(NumElem); 4022 InstructionCost MaskSplitCost = 4023 getScalarizationOverhead(MaskTy, DemandedElts, false, true); 4024 InstructionCost ScalarCompareCost = getCmpSelInstrCost( 4025 Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr, 4026 CmpInst::BAD_ICMP_PREDICATE, CostKind); 4027 InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind); 4028 InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost); 4029 InstructionCost ValueSplitCost = 4030 getScalarizationOverhead(SrcVTy, DemandedElts, IsLoad, IsStore); 4031 InstructionCost MemopCost = 4032 NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(), 4033 Alignment, AddressSpace, CostKind); 4034 return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost; 4035 } 4036 4037 // Legalize the type. 4038 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy); 4039 auto VT = TLI->getValueType(DL, SrcVTy); 4040 InstructionCost Cost = 0; 4041 if (VT.isSimple() && LT.second != VT.getSimpleVT() && 4042 LT.second.getVectorNumElements() == NumElem) 4043 // Promotion requires extend/truncate for data and a shuffle for mask. 
4044 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, None, 0, nullptr) +
4045 getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, None, 0, nullptr);
4046
4047 else if (LT.first * LT.second.getVectorNumElements() > NumElem) {
4048 auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
4049 LT.second.getVectorNumElements());
4050 // Expanding requires filling the mask with zeroes.
4051 Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, None, 0, MaskTy);
4052 }
4053
4054 // Pre-AVX512 - each maskmov load costs 2 + store costs ~8.
4055 if (!ST->hasAVX512())
4056 return Cost + LT.first * (IsLoad ? 2 : 8);
4057
4058 // AVX-512 masked load/store is cheaper.
4059 return Cost + LT.first;
4060 }
4061
4062 InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty,
4063 ScalarEvolution *SE,
4064 const SCEV *Ptr) {
4065 // Address computations in vectorized code with non-consecutive addresses will
4066 // likely result in more instructions compared to scalar code where the
4067 // computation can more often be merged into the index mode. The resulting
4068 // extra micro-ops can significantly decrease throughput.
4069 const unsigned NumVectorInstToHideOverhead = 10;
4070
4071 // Cost modeling of Strided Access Computation is hidden by the indexing
4072 // modes of X86 regardless of the stride value. We don't believe that there
4073 // is a difference between constant strided access in general and a constant
4074 // stride value which is less than or equal to 64.
4075 // Even in the case of (loop invariant) stride whose value is not known at
4076 // compile time, the address computation will not incur more than one extra
4077 // ADD instruction.
4078 if (Ty->isVectorTy() && SE && !ST->hasAVX2()) {
4079 // TODO: AVX2 is the current cut-off because we don't have correct
4080 // interleaving costs for prior ISA's.
4081 if (!BaseT::isStridedAccess(Ptr))
4082 return NumVectorInstToHideOverhead;
4083 if (!BaseT::getConstantStrideStep(SE, Ptr))
4084 return 1;
4085 }
4086
4087 return BaseT::getAddressComputationCost(Ty, SE, Ptr);
4088 }
4089
4090 InstructionCost
4091 X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
4092 Optional<FastMathFlags> FMF,
4093 TTI::TargetCostKind CostKind) {
4094 if (TTI::requiresOrderedReduction(FMF))
4095 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
4096
4097 // We use the Intel Architecture Code Analyzer (IACA) to measure the
4098 // throughput and use that as the cost.
4099
4100 static const CostTblEntry SLMCostTblNoPairWise[] = {
4101 { ISD::FADD, MVT::v2f64, 3 },
4102 { ISD::ADD, MVT::v2i64, 5 },
4103 };
4104
4105 static const CostTblEntry SSE2CostTblNoPairWise[] = {
4106 { ISD::FADD, MVT::v2f64, 2 },
4107 { ISD::FADD, MVT::v2f32, 2 },
4108 { ISD::FADD, MVT::v4f32, 4 },
4109 { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
4110 { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
4111 { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
4112 { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
4113 { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
4114 { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
4115 { ISD::ADD, MVT::v2i8, 2 }, 4116 { ISD::ADD, MVT::v4i8, 2 }, 4117 { ISD::ADD, MVT::v8i8, 2 }, 4118 { ISD::ADD, MVT::v16i8, 3 }, 4119 }; 4120 4121 static const CostTblEntry AVX1CostTblNoPairWise[] = { 4122 { ISD::FADD, MVT::v4f64, 3 }, 4123 { ISD::FADD, MVT::v4f32, 3 }, 4124 { ISD::FADD, MVT::v8f32, 4 }, 4125 { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5". 4126 { ISD::ADD, MVT::v4i64, 3 }, 4127 { ISD::ADD, MVT::v8i32, 5 }, 4128 { ISD::ADD, MVT::v16i16, 5 }, 4129 { ISD::ADD, MVT::v32i8, 4 }, 4130 }; 4131 4132 int ISD = TLI->InstructionOpcodeToISD(Opcode); 4133 assert(ISD && "Invalid opcode"); 4134 4135 // Before legalizing the type, give a chance to look up illegal narrow types 4136 // in the table. 4137 // FIXME: Is there a better way to do this? 4138 EVT VT = TLI->getValueType(DL, ValTy); 4139 if (VT.isSimple()) { 4140 MVT MTy = VT.getSimpleVT(); 4141 if (ST->useSLMArithCosts()) 4142 if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy)) 4143 return Entry->Cost; 4144 4145 if (ST->hasAVX()) 4146 if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) 4147 return Entry->Cost; 4148 4149 if (ST->hasSSE2()) 4150 if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy)) 4151 return Entry->Cost; 4152 } 4153 4154 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); 4155 4156 MVT MTy = LT.second; 4157 4158 auto *ValVTy = cast<FixedVectorType>(ValTy); 4159 4160 // Special case: vXi8 mul reductions are performed as vXi16. 4161 if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) { 4162 auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16); 4163 auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements()); 4164 return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy, 4165 TargetTransformInfo::CastContextHint::None, 4166 CostKind) + 4167 getArithmeticReductionCost(Opcode, WideVecTy, FMF, CostKind); 4168 } 4169 4170 InstructionCost ArithmeticCost = 0; 4171 if (LT.first != 1 && MTy.isVector() && 4172 MTy.getVectorNumElements() < ValVTy->getNumElements()) { 4173 // Type needs to be split. We need LT.first - 1 arithmetic ops. 4174 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(), 4175 MTy.getVectorNumElements()); 4176 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind); 4177 ArithmeticCost *= LT.first - 1; 4178 } 4179 4180 if (ST->useSLMArithCosts()) 4181 if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy)) 4182 return ArithmeticCost + Entry->Cost; 4183 4184 if (ST->hasAVX()) 4185 if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) 4186 return ArithmeticCost + Entry->Cost; 4187 4188 if (ST->hasSSE2()) 4189 if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy)) 4190 return ArithmeticCost + Entry->Cost; 4191 4192 // FIXME: These assume a naive kshift+binop lowering, which is probably 4193 // conservative in most cases. 
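// (Illustrative sketch of that assumed lowering, e.g. an AND reduction of
// v8i1 held in a mask register:
//   kshift k1, k0, 4 ; kand k0, k0, k1
//   kshift k1, k0, 2 ; kand k0, k0, k1
//   kshift k1, k0, 1 ; kand k0, k0, k1
//   move/test the low bit
// i.e. 2 * log2(8) + 1 = 7 ops, which is where the v8i1 entries below come
// from; the exact k-instruction flavours depend on the subtarget.)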
4194 static const CostTblEntry AVX512BoolReduction[] = { 4195 { ISD::AND, MVT::v2i1, 3 }, 4196 { ISD::AND, MVT::v4i1, 5 }, 4197 { ISD::AND, MVT::v8i1, 7 }, 4198 { ISD::AND, MVT::v16i1, 9 }, 4199 { ISD::AND, MVT::v32i1, 11 }, 4200 { ISD::AND, MVT::v64i1, 13 }, 4201 { ISD::OR, MVT::v2i1, 3 }, 4202 { ISD::OR, MVT::v4i1, 5 }, 4203 { ISD::OR, MVT::v8i1, 7 }, 4204 { ISD::OR, MVT::v16i1, 9 }, 4205 { ISD::OR, MVT::v32i1, 11 }, 4206 { ISD::OR, MVT::v64i1, 13 }, 4207 }; 4208 4209 static const CostTblEntry AVX2BoolReduction[] = { 4210 { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp 4211 { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp 4212 { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp 4213 { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp 4214 }; 4215 4216 static const CostTblEntry AVX1BoolReduction[] = { 4217 { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp 4218 { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp 4219 { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp 4220 { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp 4221 { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp 4222 { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp 4223 { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp 4224 { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp 4225 }; 4226 4227 static const CostTblEntry SSE2BoolReduction[] = { 4228 { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp 4229 { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp 4230 { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp 4231 { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp 4232 { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp 4233 { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp 4234 { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp 4235 { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp 4236 }; 4237 4238 // Handle bool allof/anyof patterns. 4239 if (ValVTy->getElementType()->isIntegerTy(1)) { 4240 InstructionCost ArithmeticCost = 0; 4241 if (LT.first != 1 && MTy.isVector() && 4242 MTy.getVectorNumElements() < ValVTy->getNumElements()) { 4243 // Type needs to be split. We need LT.first - 1 arithmetic ops. 4244 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(), 4245 MTy.getVectorNumElements()); 4246 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind); 4247 ArithmeticCost *= LT.first - 1; 4248 } 4249 4250 if (ST->hasAVX512()) 4251 if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy)) 4252 return ArithmeticCost + Entry->Cost; 4253 if (ST->hasAVX2()) 4254 if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy)) 4255 return ArithmeticCost + Entry->Cost; 4256 if (ST->hasAVX()) 4257 if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy)) 4258 return ArithmeticCost + Entry->Cost; 4259 if (ST->hasSSE2()) 4260 if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy)) 4261 return ArithmeticCost + Entry->Cost; 4262 4263 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind); 4264 } 4265 4266 unsigned NumVecElts = ValVTy->getNumElements(); 4267 unsigned ScalarSize = ValVTy->getScalarSizeInBits(); 4268 4269 // Special case power of 2 reductions where the scalar type isn't changed 4270 // by type legalization. 
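// (For illustration, a sketch of the ladder costed below rather than measured
// data: a v8f32 fadd reduction is modelled as
//   extract upper 128 bits + <4 x float> fadd   (256 -> 128 bits)
//   128-bit permute         + <4 x float> fadd   (128 -> 64 bits)
//   64-bit shuffle          + <4 x float> fadd   (64 -> 32 bits)
//   extractelement of lane 0
// with each step priced via getShuffleCost / getArithmeticInstrCost.)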
4271 if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits()) 4272 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind); 4273 4274 InstructionCost ReductionCost = 0; 4275 4276 auto *Ty = ValVTy; 4277 if (LT.first != 1 && MTy.isVector() && 4278 MTy.getVectorNumElements() < ValVTy->getNumElements()) { 4279 // Type needs to be split. We need LT.first - 1 arithmetic ops. 4280 Ty = FixedVectorType::get(ValVTy->getElementType(), 4281 MTy.getVectorNumElements()); 4282 ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind); 4283 ReductionCost *= LT.first - 1; 4284 NumVecElts = MTy.getVectorNumElements(); 4285 } 4286 4287 // Now handle reduction with the legal type, taking into account size changes 4288 // at each level. 4289 while (NumVecElts > 1) { 4290 // Determine the size of the remaining vector we need to reduce. 4291 unsigned Size = NumVecElts * ScalarSize; 4292 NumVecElts /= 2; 4293 // If we're reducing from 256/512 bits, use an extract_subvector. 4294 if (Size > 128) { 4295 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts); 4296 ReductionCost += 4297 getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, NumVecElts, SubTy); 4298 Ty = SubTy; 4299 } else if (Size == 128) { 4300 // Reducing from 128 bits is a permute of v2f64/v2i64. 4301 FixedVectorType *ShufTy; 4302 if (ValVTy->isFloatingPointTy()) 4303 ShufTy = 4304 FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2); 4305 else 4306 ShufTy = 4307 FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2); 4308 ReductionCost += 4309 getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr); 4310 } else if (Size == 64) { 4311 // Reducing from 64 bits is a shuffle of v4f32/v4i32. 4312 FixedVectorType *ShufTy; 4313 if (ValVTy->isFloatingPointTy()) 4314 ShufTy = 4315 FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4); 4316 else 4317 ShufTy = 4318 FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4); 4319 ReductionCost += 4320 getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr); 4321 } else { 4322 // Reducing from smaller size is a shift by immediate. 4323 auto *ShiftTy = FixedVectorType::get( 4324 Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size); 4325 ReductionCost += getArithmeticInstrCost( 4326 Instruction::LShr, ShiftTy, CostKind, 4327 TargetTransformInfo::OK_AnyValue, 4328 TargetTransformInfo::OK_UniformConstantValue, 4329 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); 4330 } 4331 4332 // Add the arithmetic op for this level. 4333 ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind); 4334 } 4335 4336 // Add the final extract element to the cost. 4337 return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0); 4338 } 4339 4340 InstructionCost X86TTIImpl::getMinMaxCost(Type *Ty, Type *CondTy, 4341 bool IsUnsigned) { 4342 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); 4343 4344 MVT MTy = LT.second; 4345 4346 int ISD; 4347 if (Ty->isIntOrIntVectorTy()) { 4348 ISD = IsUnsigned ? 
ISD::UMIN : ISD::SMIN; 4349 } else { 4350 assert(Ty->isFPOrFPVectorTy() && 4351 "Expected float point or integer vector type."); 4352 ISD = ISD::FMINNUM; 4353 } 4354 4355 static const CostTblEntry SSE1CostTbl[] = { 4356 {ISD::FMINNUM, MVT::v4f32, 1}, 4357 }; 4358 4359 static const CostTblEntry SSE2CostTbl[] = { 4360 {ISD::FMINNUM, MVT::v2f64, 1}, 4361 {ISD::SMIN, MVT::v8i16, 1}, 4362 {ISD::UMIN, MVT::v16i8, 1}, 4363 }; 4364 4365 static const CostTblEntry SSE41CostTbl[] = { 4366 {ISD::SMIN, MVT::v4i32, 1}, 4367 {ISD::UMIN, MVT::v4i32, 1}, 4368 {ISD::UMIN, MVT::v8i16, 1}, 4369 {ISD::SMIN, MVT::v16i8, 1}, 4370 }; 4371 4372 static const CostTblEntry SSE42CostTbl[] = { 4373 {ISD::UMIN, MVT::v2i64, 3}, // xor+pcmpgtq+blendvpd 4374 }; 4375 4376 static const CostTblEntry AVX1CostTbl[] = { 4377 {ISD::FMINNUM, MVT::v8f32, 1}, 4378 {ISD::FMINNUM, MVT::v4f64, 1}, 4379 {ISD::SMIN, MVT::v8i32, 3}, 4380 {ISD::UMIN, MVT::v8i32, 3}, 4381 {ISD::SMIN, MVT::v16i16, 3}, 4382 {ISD::UMIN, MVT::v16i16, 3}, 4383 {ISD::SMIN, MVT::v32i8, 3}, 4384 {ISD::UMIN, MVT::v32i8, 3}, 4385 }; 4386 4387 static const CostTblEntry AVX2CostTbl[] = { 4388 {ISD::SMIN, MVT::v8i32, 1}, 4389 {ISD::UMIN, MVT::v8i32, 1}, 4390 {ISD::SMIN, MVT::v16i16, 1}, 4391 {ISD::UMIN, MVT::v16i16, 1}, 4392 {ISD::SMIN, MVT::v32i8, 1}, 4393 {ISD::UMIN, MVT::v32i8, 1}, 4394 }; 4395 4396 static const CostTblEntry AVX512CostTbl[] = { 4397 {ISD::FMINNUM, MVT::v16f32, 1}, 4398 {ISD::FMINNUM, MVT::v8f64, 1}, 4399 {ISD::SMIN, MVT::v2i64, 1}, 4400 {ISD::UMIN, MVT::v2i64, 1}, 4401 {ISD::SMIN, MVT::v4i64, 1}, 4402 {ISD::UMIN, MVT::v4i64, 1}, 4403 {ISD::SMIN, MVT::v8i64, 1}, 4404 {ISD::UMIN, MVT::v8i64, 1}, 4405 {ISD::SMIN, MVT::v16i32, 1}, 4406 {ISD::UMIN, MVT::v16i32, 1}, 4407 }; 4408 4409 static const CostTblEntry AVX512BWCostTbl[] = { 4410 {ISD::SMIN, MVT::v32i16, 1}, 4411 {ISD::UMIN, MVT::v32i16, 1}, 4412 {ISD::SMIN, MVT::v64i8, 1}, 4413 {ISD::UMIN, MVT::v64i8, 1}, 4414 }; 4415 4416 // If we have a native MIN/MAX instruction for this type, use it. 4417 if (ST->hasBWI()) 4418 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) 4419 return LT.first * Entry->Cost; 4420 4421 if (ST->hasAVX512()) 4422 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) 4423 return LT.first * Entry->Cost; 4424 4425 if (ST->hasAVX2()) 4426 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy)) 4427 return LT.first * Entry->Cost; 4428 4429 if (ST->hasAVX()) 4430 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) 4431 return LT.first * Entry->Cost; 4432 4433 if (ST->hasSSE42()) 4434 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy)) 4435 return LT.first * Entry->Cost; 4436 4437 if (ST->hasSSE41()) 4438 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy)) 4439 return LT.first * Entry->Cost; 4440 4441 if (ST->hasSSE2()) 4442 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy)) 4443 return LT.first * Entry->Cost; 4444 4445 if (ST->hasSSE1()) 4446 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy)) 4447 return LT.first * Entry->Cost; 4448 4449 unsigned CmpOpcode; 4450 if (Ty->isFPOrFPVectorTy()) { 4451 CmpOpcode = Instruction::FCmp; 4452 } else { 4453 assert(Ty->isIntOrIntVectorTy() && 4454 "expecting floating point or integer type for min/max reduction"); 4455 CmpOpcode = Instruction::ICmp; 4456 } 4457 4458 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 4459 // Otherwise fall back to cmp+select. 
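// (Hedged example: SMIN of v2i64 before AVX512 has no table entry above, so
// it is priced below as one vector compare plus one vector select.)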
4460 InstructionCost Result = 4461 getCmpSelInstrCost(CmpOpcode, Ty, CondTy, CmpInst::BAD_ICMP_PREDICATE, 4462 CostKind) + 4463 getCmpSelInstrCost(Instruction::Select, Ty, CondTy, 4464 CmpInst::BAD_ICMP_PREDICATE, CostKind); 4465 return Result; 4466 } 4467 4468 InstructionCost 4469 X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy, 4470 bool IsUnsigned, 4471 TTI::TargetCostKind CostKind) { 4472 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); 4473 4474 MVT MTy = LT.second; 4475 4476 int ISD; 4477 if (ValTy->isIntOrIntVectorTy()) { 4478 ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN; 4479 } else { 4480 assert(ValTy->isFPOrFPVectorTy() && 4481 "Expected float point or integer vector type."); 4482 ISD = ISD::FMINNUM; 4483 } 4484 4485 // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput 4486 // and make it as the cost. 4487 4488 static const CostTblEntry SSE2CostTblNoPairWise[] = { 4489 {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw 4490 {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw 4491 {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw 4492 }; 4493 4494 static const CostTblEntry SSE41CostTblNoPairWise[] = { 4495 {ISD::SMIN, MVT::v2i16, 3}, // same as sse2 4496 {ISD::SMIN, MVT::v4i16, 5}, // same as sse2 4497 {ISD::UMIN, MVT::v2i16, 5}, // same as sse2 4498 {ISD::UMIN, MVT::v4i16, 7}, // same as sse2 4499 {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor 4500 {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax 4501 {ISD::SMIN, MVT::v2i8, 3}, // pminsb 4502 {ISD::SMIN, MVT::v4i8, 5}, // pminsb 4503 {ISD::SMIN, MVT::v8i8, 7}, // pminsb 4504 {ISD::SMIN, MVT::v16i8, 6}, 4505 {ISD::UMIN, MVT::v2i8, 3}, // same as sse2 4506 {ISD::UMIN, MVT::v4i8, 5}, // same as sse2 4507 {ISD::UMIN, MVT::v8i8, 7}, // same as sse2 4508 {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax 4509 }; 4510 4511 static const CostTblEntry AVX1CostTblNoPairWise[] = { 4512 {ISD::SMIN, MVT::v16i16, 6}, 4513 {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax 4514 {ISD::SMIN, MVT::v32i8, 8}, 4515 {ISD::UMIN, MVT::v32i8, 8}, 4516 }; 4517 4518 static const CostTblEntry AVX512BWCostTblNoPairWise[] = { 4519 {ISD::SMIN, MVT::v32i16, 8}, 4520 {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax 4521 {ISD::SMIN, MVT::v64i8, 10}, 4522 {ISD::UMIN, MVT::v64i8, 10}, 4523 }; 4524 4525 // Before legalizing the type, give a chance to look up illegal narrow types 4526 // in the table. 4527 // FIXME: Is there a better way to do this? 4528 EVT VT = TLI->getValueType(DL, ValTy); 4529 if (VT.isSimple()) { 4530 MVT MTy = VT.getSimpleVT(); 4531 if (ST->hasBWI()) 4532 if (const auto *Entry = CostTableLookup(AVX512BWCostTblNoPairWise, ISD, MTy)) 4533 return Entry->Cost; 4534 4535 if (ST->hasAVX()) 4536 if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) 4537 return Entry->Cost; 4538 4539 if (ST->hasSSE41()) 4540 if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy)) 4541 return Entry->Cost; 4542 4543 if (ST->hasSSE2()) 4544 if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy)) 4545 return Entry->Cost; 4546 } 4547 4548 auto *ValVTy = cast<FixedVectorType>(ValTy); 4549 unsigned NumVecElts = ValVTy->getNumElements(); 4550 4551 auto *Ty = ValVTy; 4552 InstructionCost MinMaxCost = 0; 4553 if (LT.first != 1 && MTy.isVector() && 4554 MTy.getVectorNumElements() < ValVTy->getNumElements()) { 4555 // Type needs to be split. 
We need LT.first - 1 operations ops. 4556 Ty = FixedVectorType::get(ValVTy->getElementType(), 4557 MTy.getVectorNumElements()); 4558 auto *SubCondTy = FixedVectorType::get(CondTy->getElementType(), 4559 MTy.getVectorNumElements()); 4560 MinMaxCost = getMinMaxCost(Ty, SubCondTy, IsUnsigned); 4561 MinMaxCost *= LT.first - 1; 4562 NumVecElts = MTy.getVectorNumElements(); 4563 } 4564 4565 if (ST->hasBWI()) 4566 if (const auto *Entry = CostTableLookup(AVX512BWCostTblNoPairWise, ISD, MTy)) 4567 return MinMaxCost + Entry->Cost; 4568 4569 if (ST->hasAVX()) 4570 if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) 4571 return MinMaxCost + Entry->Cost; 4572 4573 if (ST->hasSSE41()) 4574 if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy)) 4575 return MinMaxCost + Entry->Cost; 4576 4577 if (ST->hasSSE2()) 4578 if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy)) 4579 return MinMaxCost + Entry->Cost; 4580 4581 unsigned ScalarSize = ValTy->getScalarSizeInBits(); 4582 4583 // Special case power of 2 reductions where the scalar type isn't changed 4584 // by type legalization. 4585 if (!isPowerOf2_32(ValVTy->getNumElements()) || 4586 ScalarSize != MTy.getScalarSizeInBits()) 4587 return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsUnsigned, CostKind); 4588 4589 // Now handle reduction with the legal type, taking into account size changes 4590 // at each level. 4591 while (NumVecElts > 1) { 4592 // Determine the size of the remaining vector we need to reduce. 4593 unsigned Size = NumVecElts * ScalarSize; 4594 NumVecElts /= 2; 4595 // If we're reducing from 256/512 bits, use an extract_subvector. 4596 if (Size > 128) { 4597 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts); 4598 MinMaxCost += 4599 getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, NumVecElts, SubTy); 4600 Ty = SubTy; 4601 } else if (Size == 128) { 4602 // Reducing from 128 bits is a permute of v2f64/v2i64. 4603 VectorType *ShufTy; 4604 if (ValTy->isFloatingPointTy()) 4605 ShufTy = 4606 FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2); 4607 else 4608 ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2); 4609 MinMaxCost += 4610 getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr); 4611 } else if (Size == 64) { 4612 // Reducing from 64 bits is a shuffle of v4f32/v4i32. 4613 FixedVectorType *ShufTy; 4614 if (ValTy->isFloatingPointTy()) 4615 ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4); 4616 else 4617 ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4); 4618 MinMaxCost += 4619 getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr); 4620 } else { 4621 // Reducing from smaller size is a shift by immediate. 4622 auto *ShiftTy = FixedVectorType::get( 4623 Type::getIntNTy(ValTy->getContext(), Size), 128 / Size); 4624 MinMaxCost += getArithmeticInstrCost( 4625 Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput, 4626 TargetTransformInfo::OK_AnyValue, 4627 TargetTransformInfo::OK_UniformConstantValue, 4628 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); 4629 } 4630 4631 // Add the arithmetic op for this level. 4632 auto *SubCondTy = 4633 FixedVectorType::get(CondTy->getElementType(), Ty->getNumElements()); 4634 MinMaxCost += getMinMaxCost(Ty, SubCondTy, IsUnsigned); 4635 } 4636 4637 // Add the final extract element to the cost. 
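// (Illustrative walk-through, not measured data: a v8i32 smin reduction on
// AVX2 accumulates extract-upper-half + pminsd, a 128-bit shuffle + pminsd,
// a 64-bit shuffle + pminsd, and then the final extractelement below.)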
4638 return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0); 4639 } 4640 4641 /// Calculate the cost of materializing a 64-bit value. This helper 4642 /// method might only calculate a fraction of a larger immediate. Therefore it 4643 /// is valid to return a cost of ZERO. 4644 InstructionCost X86TTIImpl::getIntImmCost(int64_t Val) { 4645 if (Val == 0) 4646 return TTI::TCC_Free; 4647 4648 if (isInt<32>(Val)) 4649 return TTI::TCC_Basic; 4650 4651 return 2 * TTI::TCC_Basic; 4652 } 4653 4654 InstructionCost X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, 4655 TTI::TargetCostKind CostKind) { 4656 assert(Ty->isIntegerTy()); 4657 4658 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 4659 if (BitSize == 0) 4660 return ~0U; 4661 4662 // Never hoist constants larger than 128bit, because this might lead to 4663 // incorrect code generation or assertions in codegen. 4664 // Fixme: Create a cost model for types larger than i128 once the codegen 4665 // issues have been fixed. 4666 if (BitSize > 128) 4667 return TTI::TCC_Free; 4668 4669 if (Imm == 0) 4670 return TTI::TCC_Free; 4671 4672 // Sign-extend all constants to a multiple of 64-bit. 4673 APInt ImmVal = Imm; 4674 if (BitSize % 64 != 0) 4675 ImmVal = Imm.sext(alignTo(BitSize, 64)); 4676 4677 // Split the constant into 64-bit chunks and calculate the cost for each 4678 // chunk. 4679 InstructionCost Cost = 0; 4680 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { 4681 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64); 4682 int64_t Val = Tmp.getSExtValue(); 4683 Cost += getIntImmCost(Val); 4684 } 4685 // We need at least one instruction to materialize the constant. 4686 return std::max<InstructionCost>(1, Cost); 4687 } 4688 4689 InstructionCost X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, 4690 const APInt &Imm, Type *Ty, 4691 TTI::TargetCostKind CostKind, 4692 Instruction *Inst) { 4693 assert(Ty->isIntegerTy()); 4694 4695 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 4696 // There is no cost model for constants with a bit size of 0. Return TCC_Free 4697 // here, so that constant hoisting will ignore this constant. 4698 if (BitSize == 0) 4699 return TTI::TCC_Free; 4700 4701 unsigned ImmIdx = ~0U; 4702 switch (Opcode) { 4703 default: 4704 return TTI::TCC_Free; 4705 case Instruction::GetElementPtr: 4706 // Always hoist the base address of a GetElementPtr. This prevents the 4707 // creation of new constants for every base constant that gets constant 4708 // folded with the offset. 4709 if (Idx == 0) 4710 return 2 * TTI::TCC_Basic; 4711 return TTI::TCC_Free; 4712 case Instruction::Store: 4713 ImmIdx = 0; 4714 break; 4715 case Instruction::ICmp: 4716 // This is an imperfect hack to prevent constant hoisting of 4717 // compares that might be trying to check if a 64-bit value fits in 4718 // 32-bits. The backend can optimize these cases using a right shift by 32. 4719 // Ideally we would check the compare predicate here. There also other 4720 // similar immediates the backend can use shifts for. 4721 if (Idx == 1 && Imm.getBitWidth() == 64) { 4722 uint64_t ImmVal = Imm.getZExtValue(); 4723 if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff) 4724 return TTI::TCC_Free; 4725 } 4726 ImmIdx = 1; 4727 break; 4728 case Instruction::And: 4729 // We support 64-bit ANDs with immediates with 32-bits of leading zeroes 4730 // by using a 32-bit operation with implicit zero extension. Detect such 4731 // immediates here as the normal path expects bit 31 to be sign extended. 
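// (Sketch of the case being detected: a 64-bit "and rax, 0x00000000ffff0000"
// can be emitted as the 32-bit "and eax, 0xffff0000", whose result is
// implicitly zero-extended, so the immediate needs no separate
// materialization.)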
4732 if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue())) 4733 return TTI::TCC_Free; 4734 ImmIdx = 1; 4735 break; 4736 case Instruction::Add: 4737 case Instruction::Sub: 4738 // For add/sub, we can use the opposite instruction for INT32_MIN. 4739 if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.getZExtValue() == 0x80000000) 4740 return TTI::TCC_Free; 4741 ImmIdx = 1; 4742 break; 4743 case Instruction::UDiv: 4744 case Instruction::SDiv: 4745 case Instruction::URem: 4746 case Instruction::SRem: 4747 // Division by constant is typically expanded later into a different 4748 // instruction sequence. This completely changes the constants. 4749 // Report them as "free" to stop ConstantHoist from marking them as opaque. 4750 return TTI::TCC_Free; 4751 case Instruction::Mul: 4752 case Instruction::Or: 4753 case Instruction::Xor: 4754 ImmIdx = 1; 4755 break; 4756 // Always return TCC_Free for the shift value of a shift instruction. 4757 case Instruction::Shl: 4758 case Instruction::LShr: 4759 case Instruction::AShr: 4760 if (Idx == 1) 4761 return TTI::TCC_Free; 4762 break; 4763 case Instruction::Trunc: 4764 case Instruction::ZExt: 4765 case Instruction::SExt: 4766 case Instruction::IntToPtr: 4767 case Instruction::PtrToInt: 4768 case Instruction::BitCast: 4769 case Instruction::PHI: 4770 case Instruction::Call: 4771 case Instruction::Select: 4772 case Instruction::Ret: 4773 case Instruction::Load: 4774 break; 4775 } 4776 4777 if (Idx == ImmIdx) { 4778 int NumConstants = divideCeil(BitSize, 64); 4779 InstructionCost Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind); 4780 return (Cost <= NumConstants * TTI::TCC_Basic) 4781 ? static_cast<int>(TTI::TCC_Free) 4782 : Cost; 4783 } 4784 4785 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind); 4786 } 4787 4788 InstructionCost X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, 4789 const APInt &Imm, Type *Ty, 4790 TTI::TargetCostKind CostKind) { 4791 assert(Ty->isIntegerTy()); 4792 4793 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 4794 // There is no cost model for constants with a bit size of 0. Return TCC_Free 4795 // here, so that constant hoisting will ignore this constant. 4796 if (BitSize == 0) 4797 return TTI::TCC_Free; 4798 4799 switch (IID) { 4800 default: 4801 return TTI::TCC_Free; 4802 case Intrinsic::sadd_with_overflow: 4803 case Intrinsic::uadd_with_overflow: 4804 case Intrinsic::ssub_with_overflow: 4805 case Intrinsic::usub_with_overflow: 4806 case Intrinsic::smul_with_overflow: 4807 case Intrinsic::umul_with_overflow: 4808 if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue())) 4809 return TTI::TCC_Free; 4810 break; 4811 case Intrinsic::experimental_stackmap: 4812 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) 4813 return TTI::TCC_Free; 4814 break; 4815 case Intrinsic::experimental_patchpoint_void: 4816 case Intrinsic::experimental_patchpoint_i64: 4817 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) 4818 return TTI::TCC_Free; 4819 break; 4820 } 4821 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind); 4822 } 4823 4824 InstructionCost X86TTIImpl::getCFInstrCost(unsigned Opcode, 4825 TTI::TargetCostKind CostKind, 4826 const Instruction *I) { 4827 if (CostKind != TTI::TCK_RecipThroughput) 4828 return Opcode == Instruction::PHI ? 0 : 1; 4829 // Branches are assumed to be predicted. 4830 return 0; 4831 } 4832 4833 int X86TTIImpl::getGatherOverhead() const { 4834 // Some CPUs have more overhead for gather. 
The specified overhead is relative 4835 // to the Load operation. "2" is the number provided by Intel architects. This 4836 // parameter is used for cost estimation of Gather Op and comparison with 4837 // other alternatives. 4838 // TODO: Remove the explicit hasAVX512()?, That would mean we would only 4839 // enable gather with a -march. 4840 if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather())) 4841 return 2; 4842 4843 return 1024; 4844 } 4845 4846 int X86TTIImpl::getScatterOverhead() const { 4847 if (ST->hasAVX512()) 4848 return 2; 4849 4850 return 1024; 4851 } 4852 4853 // Return an average cost of Gather / Scatter instruction, maybe improved later. 4854 // FIXME: Add TargetCostKind support. 4855 InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, 4856 const Value *Ptr, Align Alignment, 4857 unsigned AddressSpace) { 4858 4859 assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost"); 4860 unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements(); 4861 4862 // Try to reduce index size from 64 bit (default for GEP) 4863 // to 32. It is essential for VF 16. If the index can't be reduced to 32, the 4864 // operation will use 16 x 64 indices which do not fit in a zmm and needs 4865 // to split. Also check that the base pointer is the same for all lanes, 4866 // and that there's at most one variable index. 4867 auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) { 4868 unsigned IndexSize = DL.getPointerSizeInBits(); 4869 const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr); 4870 if (IndexSize < 64 || !GEP) 4871 return IndexSize; 4872 4873 unsigned NumOfVarIndices = 0; 4874 const Value *Ptrs = GEP->getPointerOperand(); 4875 if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs)) 4876 return IndexSize; 4877 for (unsigned i = 1; i < GEP->getNumOperands(); ++i) { 4878 if (isa<Constant>(GEP->getOperand(i))) 4879 continue; 4880 Type *IndxTy = GEP->getOperand(i)->getType(); 4881 if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy)) 4882 IndxTy = IndexVTy->getElementType(); 4883 if ((IndxTy->getPrimitiveSizeInBits() == 64 && 4884 !isa<SExtInst>(GEP->getOperand(i))) || 4885 ++NumOfVarIndices > 1) 4886 return IndexSize; // 64 4887 } 4888 return (unsigned)32; 4889 }; 4890 4891 // Trying to reduce IndexSize to 32 bits for vector 16. 4892 // By default the IndexSize is equal to pointer size. 4893 unsigned IndexSize = (ST->hasAVX512() && VF >= 16) 4894 ? getIndexSizeInBits(Ptr, DL) 4895 : DL.getPointerSizeInBits(); 4896 4897 auto *IndexVTy = FixedVectorType::get( 4898 IntegerType::get(SrcVTy->getContext(), IndexSize), VF); 4899 std::pair<InstructionCost, MVT> IdxsLT = 4900 TLI->getTypeLegalizationCost(DL, IndexVTy); 4901 std::pair<InstructionCost, MVT> SrcLT = 4902 TLI->getTypeLegalizationCost(DL, SrcVTy); 4903 InstructionCost::CostType SplitFactor = 4904 *std::max(IdxsLT.first, SrcLT.first).getValue(); 4905 if (SplitFactor > 1) { 4906 // Handle splitting of vector of pointers 4907 auto *SplitSrcTy = 4908 FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor); 4909 return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment, 4910 AddressSpace); 4911 } 4912 4913 // The gather / scatter cost is given by Intel architects. It is a rough 4914 // number since we are looking at one instruction in a time. 4915 const int GSOverhead = (Opcode == Instruction::Load) 4916 ? 
getGatherOverhead() 4917 : getScatterOverhead(); 4918 return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(), 4919 MaybeAlign(Alignment), AddressSpace, 4920 TTI::TCK_RecipThroughput); 4921 } 4922 4923 /// Return the cost of full scalarization of gather / scatter operation. 4924 /// 4925 /// Opcode - Load or Store instruction. 4926 /// SrcVTy - The type of the data vector that should be gathered or scattered. 4927 /// VariableMask - The mask is non-constant at compile time. 4928 /// Alignment - Alignment for one element. 4929 /// AddressSpace - pointer[s] address space. 4930 /// 4931 /// FIXME: Add TargetCostKind support. 4932 InstructionCost X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy, 4933 bool VariableMask, Align Alignment, 4934 unsigned AddressSpace) { 4935 Type *ScalarTy = SrcVTy->getScalarType(); 4936 unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements(); 4937 APInt DemandedElts = APInt::getAllOnes(VF); 4938 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 4939 4940 InstructionCost MaskUnpackCost = 0; 4941 if (VariableMask) { 4942 auto *MaskTy = 4943 FixedVectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF); 4944 MaskUnpackCost = getScalarizationOverhead( 4945 MaskTy, DemandedElts, /*Insert=*/false, /*Extract=*/true); 4946 InstructionCost ScalarCompareCost = getCmpSelInstrCost( 4947 Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()), nullptr, 4948 CmpInst::BAD_ICMP_PREDICATE, CostKind); 4949 InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind); 4950 MaskUnpackCost += VF * (BranchCost + ScalarCompareCost); 4951 } 4952 4953 InstructionCost AddressUnpackCost = getScalarizationOverhead( 4954 FixedVectorType::get(ScalarTy->getPointerTo(), VF), DemandedElts, 4955 /*Insert=*/false, /*Extract=*/true); 4956 4957 // The cost of the scalar loads/stores. 4958 InstructionCost MemoryOpCost = 4959 VF * getMemoryOpCost(Opcode, ScalarTy, MaybeAlign(Alignment), 4960 AddressSpace, CostKind); 4961 4962 // The cost of forming the vector from loaded scalars/ 4963 // scalarizing the vector to perform scalar stores. 
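// (Illustrative breakdown, assuming a <4 x float> gather with a variable
// mask: 4 mask extracts + compares + branches, 4 pointer extracts, 4 scalar
// loads, and 4 insertelements to rebuild the result; those are the terms
// accumulated in this function.)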
4964 InstructionCost InsertExtractCost = 4965 getScalarizationOverhead(cast<FixedVectorType>(SrcVTy), DemandedElts, 4966 /*Insert=*/Opcode == Instruction::Load, 4967 /*Extract=*/Opcode == Instruction::Store); 4968 4969 return AddressUnpackCost + MemoryOpCost + MaskUnpackCost + InsertExtractCost; 4970 } 4971 4972 /// Calculate the cost of Gather / Scatter operation 4973 InstructionCost X86TTIImpl::getGatherScatterOpCost( 4974 unsigned Opcode, Type *SrcVTy, const Value *Ptr, bool VariableMask, 4975 Align Alignment, TTI::TargetCostKind CostKind, 4976 const Instruction *I = nullptr) { 4977 if (CostKind != TTI::TCK_RecipThroughput) { 4978 if ((Opcode == Instruction::Load && 4979 isLegalMaskedGather(SrcVTy, Align(Alignment))) || 4980 (Opcode == Instruction::Store && 4981 isLegalMaskedScatter(SrcVTy, Align(Alignment)))) 4982 return 1; 4983 return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask, 4984 Alignment, CostKind, I); 4985 } 4986 4987 assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter"); 4988 PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType()); 4989 if (!PtrTy && Ptr->getType()->isVectorTy()) 4990 PtrTy = dyn_cast<PointerType>( 4991 cast<VectorType>(Ptr->getType())->getElementType()); 4992 assert(PtrTy && "Unexpected type for Ptr argument"); 4993 unsigned AddressSpace = PtrTy->getAddressSpace(); 4994 4995 if ((Opcode == Instruction::Load && 4996 !isLegalMaskedGather(SrcVTy, Align(Alignment))) || 4997 (Opcode == Instruction::Store && 4998 !isLegalMaskedScatter(SrcVTy, Align(Alignment)))) 4999 return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment, 5000 AddressSpace); 5001 5002 return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace); 5003 } 5004 5005 bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1, 5006 TargetTransformInfo::LSRCost &C2) { 5007 // X86 specific here are "instruction number 1st priority". 5008 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost, 5009 C1.NumIVMuls, C1.NumBaseAdds, 5010 C1.ScaleCost, C1.ImmCost, C1.SetupCost) < 5011 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost, 5012 C2.NumIVMuls, C2.NumBaseAdds, 5013 C2.ScaleCost, C2.ImmCost, C2.SetupCost); 5014 } 5015 5016 bool X86TTIImpl::canMacroFuseCmp() { 5017 return ST->hasMacroFusion() || ST->hasBranchFusion(); 5018 } 5019 5020 bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) { 5021 if (!ST->hasAVX()) 5022 return false; 5023 5024 // The backend can't handle a single element vector. 5025 if (isa<VectorType>(DataTy) && 5026 cast<FixedVectorType>(DataTy)->getNumElements() == 1) 5027 return false; 5028 Type *ScalarTy = DataTy->getScalarType(); 5029 5030 if (ScalarTy->isPointerTy()) 5031 return true; 5032 5033 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) 5034 return true; 5035 5036 if (ScalarTy->isHalfTy() && ST->hasBWI() && ST->hasFP16()) 5037 return true; 5038 5039 if (!ScalarTy->isIntegerTy()) 5040 return false; 5041 5042 unsigned IntWidth = ScalarTy->getIntegerBitWidth(); 5043 return IntWidth == 32 || IntWidth == 64 || 5044 ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI()); 5045 } 5046 5047 bool X86TTIImpl::isLegalMaskedStore(Type *DataType, Align Alignment) { 5048 return isLegalMaskedLoad(DataType, Alignment); 5049 } 5050 5051 bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) { 5052 unsigned DataSize = DL.getTypeStoreSize(DataType); 5053 // The only supported nontemporal loads are for aligned vectors of 16 or 32 5054 // bytes. 
Note that 32-byte nontemporal vector loads are supported by AVX2
5055 // (the equivalent stores only require AVX).
5056 if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
5057 return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();
5058
5059 return false;
5060 }
5061
5062 bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) {
5063 unsigned DataSize = DL.getTypeStoreSize(DataType);
5064
5065 // SSE4A supports nontemporal stores of float and double at arbitrary
5066 // alignment.
5067 if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
5068 return true;
5069
5070 // Besides the SSE4A subtarget exception above, only aligned stores are
5071 // available nontemporally on any other subtarget, and only stores with a size
5072 // of 4..32 bytes (powers of 2 only) are permitted.
5073 if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
5074 !isPowerOf2_32(DataSize))
5075 return false;
5076
5077 // 32-byte vector nontemporal stores are supported by AVX (the equivalent
5078 // loads require AVX2).
5079 if (DataSize == 32)
5080 return ST->hasAVX();
5081 if (DataSize == 16)
5082 return ST->hasSSE1();
5083 return true;
5084 }
5085
5086 bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) {
5087 if (!isa<VectorType>(DataTy))
5088 return false;
5089
5090 if (!ST->hasAVX512())
5091 return false;
5092
5093 // The backend can't handle a single element vector.
5094 if (cast<FixedVectorType>(DataTy)->getNumElements() == 1)
5095 return false;
5096
5097 Type *ScalarTy = cast<VectorType>(DataTy)->getElementType();
5098
5099 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
5100 return true;
5101
5102 if (!ScalarTy->isIntegerTy())
5103 return false;
5104
5105 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
5106 return IntWidth == 32 || IntWidth == 64 ||
5107 ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
5108 }
5109
5110 bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy) {
5111 return isLegalMaskedExpandLoad(DataTy);
5112 }
5113
5114 bool X86TTIImpl::supportsGather() const {
5115 // Some CPUs have better gather performance than others.
5116 // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
5117 // enable gather with a -march.
5118 return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2());
5119 }
5120
5121 bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
5122 if (!supportsGather())
5123 return false;
5124
5125 // This function is currently called in two cases: from the Loop Vectorizer
5126 // and from the Scalarizer.
5127 // When the Loop Vectorizer asks about legality of the feature,
5128 // the vectorization factor is not calculated yet. The Loop Vectorizer
5129 // sends a scalar type and the decision is based on the width of the
5130 // scalar element.
5131 // Later on, the cost model will estimate usage of this intrinsic based on
5132 // the vector type.
5133 // The Scalarizer asks again about legality. It sends a vector type.
5134 // In this case we can reject non-power-of-2 vectors.
5135 // We also reject single element vectors as the type legalizer can't
5136 // scalarize them.
5137 if (auto *DataVTy = dyn_cast<FixedVectorType>(DataTy)) {
5138 unsigned NumElts = DataVTy->getNumElements();
5139 if (NumElts == 1)
5140 return false;
5141 // Gather / Scatter for vector 2 is not profitable on KNL / SKX.
5142 // Vector-4 of gather/scatter instruction does not exist on KNL.
5143 // We can extend it to 8 elements, but zeroing upper bits of
5144 // the mask vector will add more instructions.

Right now we give the scalar 5145 // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter 5146 // instruction is better in the VariableMask case. 5147 if (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX()))) 5148 return false; 5149 } 5150 Type *ScalarTy = DataTy->getScalarType(); 5151 if (ScalarTy->isPointerTy()) 5152 return true; 5153 5154 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) 5155 return true; 5156 5157 if (!ScalarTy->isIntegerTy()) 5158 return false; 5159 5160 unsigned IntWidth = ScalarTy->getIntegerBitWidth(); 5161 return IntWidth == 32 || IntWidth == 64; 5162 } 5163 5164 bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) { 5165 // AVX2 doesn't support scatter 5166 if (!ST->hasAVX512()) 5167 return false; 5168 return isLegalMaskedGather(DataType, Alignment); 5169 } 5170 5171 bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) { 5172 EVT VT = TLI->getValueType(DL, DataType); 5173 return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT); 5174 } 5175 5176 bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) { 5177 return false; 5178 } 5179 5180 bool X86TTIImpl::areInlineCompatible(const Function *Caller, 5181 const Function *Callee) const { 5182 const TargetMachine &TM = getTLI()->getTargetMachine(); 5183 5184 // Work this as a subsetting of subtarget features. 5185 const FeatureBitset &CallerBits = 5186 TM.getSubtargetImpl(*Caller)->getFeatureBits(); 5187 const FeatureBitset &CalleeBits = 5188 TM.getSubtargetImpl(*Callee)->getFeatureBits(); 5189 5190 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList; 5191 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList; 5192 return (RealCallerBits & RealCalleeBits) == RealCalleeBits; 5193 } 5194 5195 bool X86TTIImpl::areFunctionArgsABICompatible( 5196 const Function *Caller, const Function *Callee, 5197 SmallPtrSetImpl<Argument *> &Args) const { 5198 if (!BaseT::areFunctionArgsABICompatible(Caller, Callee, Args)) 5199 return false; 5200 5201 // If we get here, we know the target features match. If one function 5202 // considers 512-bit vectors legal and the other does not, consider them 5203 // incompatible. 5204 const TargetMachine &TM = getTLI()->getTargetMachine(); 5205 5206 if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() == 5207 TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs()) 5208 return true; 5209 5210 // Consider the arguments compatible if they aren't vectors or aggregates. 5211 // FIXME: Look at the size of vectors. 5212 // FIXME: Look at the element types of aggregates to see if there are vectors. 5213 // FIXME: The API of this function seems intended to allow arguments 5214 // to be removed from the set, but the caller doesn't check if the set 5215 // becomes empty so that may not work in practice. 5216 return llvm::none_of(Args, [](Argument *A) { 5217 auto *EltTy = cast<PointerType>(A->getType())->getElementType(); 5218 return EltTy->isVectorTy() || EltTy->isAggregateType(); 5219 }); 5220 } 5221 5222 X86TTIImpl::TTI::MemCmpExpansionOptions 5223 X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { 5224 TTI::MemCmpExpansionOptions Options; 5225 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); 5226 Options.NumLoadsPerBlock = 2; 5227 // All GPR and vector loads can be unaligned. 5228 Options.AllowOverlappingLoads = true; 5229 if (IsZeroCmp) { 5230 // Only enable vector loads for equality comparison. 
Right now the vector 5231 // version is not as fast for three way compare (see #33329). 5232 const unsigned PreferredWidth = ST->getPreferVectorWidth(); 5233 if (PreferredWidth >= 512 && ST->hasAVX512()) Options.LoadSizes.push_back(64); 5234 if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32); 5235 if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16); 5236 } 5237 if (ST->is64Bit()) { 5238 Options.LoadSizes.push_back(8); 5239 } 5240 Options.LoadSizes.push_back(4); 5241 Options.LoadSizes.push_back(2); 5242 Options.LoadSizes.push_back(1); 5243 return Options; 5244 } 5245 5246 bool X86TTIImpl::prefersVectorizedAddressing() const { 5247 return supportsGather(); 5248 } 5249 5250 bool X86TTIImpl::supportsEfficientVectorElementLoadStore() const { 5251 return false; 5252 } 5253 5254 bool X86TTIImpl::enableInterleavedAccessVectorization() { 5255 // TODO: We expect this to be beneficial regardless of arch, 5256 // but there are currently some unexplained performance artifacts on Atom. 5257 // As a temporary solution, disable on Atom. 5258 return !(ST->isAtom()); 5259 } 5260 5261 // Get estimation for interleaved load/store operations and strided load. 5262 // \p Indices contains indices for strided load. 5263 // \p Factor - the factor of interleaving. 5264 // AVX-512 provides 3-src shuffles that significantly reduces the cost. 5265 InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512( 5266 unsigned Opcode, FixedVectorType *VecTy, unsigned Factor, 5267 ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace, 5268 TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) { 5269 // VecTy for interleave memop is <VF*Factor x Elt>. 5270 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have 5271 // VecTy = <12 x i32>. 5272 5273 // Calculate the number of memory operations (NumOfMemOps), required 5274 // for load/store the VecTy. 5275 MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second; 5276 unsigned VecTySize = DL.getTypeStoreSize(VecTy); 5277 unsigned LegalVTSize = LegalVT.getStoreSize(); 5278 unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize; 5279 5280 // Get the cost of one memory operation. 5281 auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(), 5282 LegalVT.getVectorNumElements()); 5283 InstructionCost MemOpCost; 5284 bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps; 5285 if (UseMaskedMemOp) 5286 MemOpCost = getMaskedMemoryOpCost(Opcode, SingleMemOpTy, Alignment, 5287 AddressSpace, CostKind); 5288 else 5289 MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, MaybeAlign(Alignment), 5290 AddressSpace, CostKind); 5291 5292 unsigned VF = VecTy->getNumElements() / Factor; 5293 MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF); 5294 5295 InstructionCost MaskCost; 5296 if (UseMaskedMemOp) { 5297 APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements()); 5298 for (unsigned Index : Indices) { 5299 assert(Index < Factor && "Invalid index for interleaved memory op"); 5300 for (unsigned Elm = 0; Elm < VF; Elm++) 5301 DemandedLoadStoreElts.setBit(Index + Elm * Factor); 5302 } 5303 5304 Type *I1Type = Type::getInt1Ty(VecTy->getContext()); 5305 5306 MaskCost = getReplicationShuffleCost( 5307 I1Type, Factor, VF, 5308 UseMaskForGaps ? 
DemandedLoadStoreElts 5309 : APInt::getAllOnes(VecTy->getNumElements()), 5310 CostKind); 5311 5312 // The Gaps mask is invariant and created outside the loop, therefore the 5313 // cost of creating it is not accounted for here. However if we have both 5314 // a MaskForGaps and some other mask that guards the execution of the 5315 // memory access, we need to account for the cost of And-ing the two masks 5316 // inside the loop. 5317 if (UseMaskForGaps) { 5318 auto *MaskVT = FixedVectorType::get(I1Type, VecTy->getNumElements()); 5319 MaskCost += getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind); 5320 } 5321 } 5322 5323 if (Opcode == Instruction::Load) { 5324 // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl) 5325 // contain the cost of the optimized shuffle sequence that the 5326 // X86InterleavedAccess pass will generate. 5327 // The cost of loads and stores are computed separately from the table. 5328 5329 // X86InterleavedAccess support only the following interleaved-access group. 5330 static const CostTblEntry AVX512InterleavedLoadTbl[] = { 5331 {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8 5332 {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8 5333 {3, MVT::v64i8, 22}, //(load 96i8 and) deinterleave into 3 x 32i8 5334 }; 5335 5336 if (const auto *Entry = 5337 CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT)) 5338 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost; 5339 //If an entry does not exist, fallback to the default implementation. 5340 5341 // Kind of shuffle depends on number of loaded values. 5342 // If we load the entire data in one register, we can use a 1-src shuffle. 5343 // Otherwise, we'll merge 2 sources in each operation. 5344 TTI::ShuffleKind ShuffleKind = 5345 (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc; 5346 5347 InstructionCost ShuffleCost = 5348 getShuffleCost(ShuffleKind, SingleMemOpTy, None, 0, nullptr); 5349 5350 unsigned NumOfLoadsInInterleaveGrp = 5351 Indices.size() ? Indices.size() : Factor; 5352 auto *ResultTy = FixedVectorType::get(VecTy->getElementType(), 5353 VecTy->getNumElements() / Factor); 5354 InstructionCost NumOfResults = 5355 getTLI()->getTypeLegalizationCost(DL, ResultTy).first * 5356 NumOfLoadsInInterleaveGrp; 5357 5358 // About a half of the loads may be folded in shuffles when we have only 5359 // one result. If we have more than one result, or the loads are masked, 5360 // we do not fold loads at all. 5361 unsigned NumOfUnfoldedLoads = 5362 UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2; 5363 5364 // Get a number of shuffle operations per result. 5365 unsigned NumOfShufflesPerResult = 5366 std::max((unsigned)1, (unsigned)(NumOfMemOps - 1)); 5367 5368 // The SK_MergeTwoSrc shuffle clobbers one of src operands. 5369 // When we have more than one destination, we need additional instructions 5370 // to keep sources. 5371 InstructionCost NumOfMoves = 0; 5372 if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc) 5373 NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2; 5374 5375 InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost + 5376 MaskCost + NumOfUnfoldedLoads * MemOpCost + 5377 NumOfMoves; 5378 5379 return Cost; 5380 } 5381 5382 // Store. 5383 assert(Opcode == Instruction::Store && 5384 "Expected Store Instruction at this point"); 5385 // X86InterleavedAccess support only the following interleaved-access group. 
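// (For example, a stride-3 store group of 3 x <16 x i8> (Factor=3, VF=16)
// matches {3, v16i8} below: the interleaving shuffle sequence is costed at
// 12, with the memory operations and any mask cost added on top.)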
5386 static const CostTblEntry AVX512InterleavedStoreTbl[] = {
5387 {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
5388 {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
5389 {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 192i8 (and store)
5390
5391 {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store)
5392 {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
5393 {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
5394 {4, MVT::v64i8, 24} // interleave 4 x 64i8 into 256i8 (and store)
5395 };
5396
5397 if (const auto *Entry =
5398 CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
5399 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
5400 // If an entry does not exist, fall back to the default implementation.
5401
5402 // There are no strided stores at the moment, and a store can't be folded
5403 // into a shuffle.
5404 unsigned NumOfSources = Factor; // The number of values to be merged.
5405 InstructionCost ShuffleCost =
5406 getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, None, 0, nullptr);
5407 unsigned NumOfShufflesPerStore = NumOfSources - 1;
5408
5409 // The SK_PermuteTwoSrc shuffle clobbers one of the src operands.
5410 // We need additional instructions to keep sources.
5411 unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
5412 InstructionCost Cost =
5413 MaskCost +
5414 NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
5415 NumOfMoves;
5416 return Cost;
5417 }
5418
5419 InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
5420 unsigned Opcode, Type *BaseTy, unsigned Factor, ArrayRef<unsigned> Indices,
5421 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
5422 bool UseMaskForCond, bool UseMaskForGaps) {
5423 auto *VecTy = cast<FixedVectorType>(BaseTy);
5424
5425 auto isSupportedOnAVX512 = [&](Type *VecTy, bool HasBW) {
5426 Type *EltTy = cast<VectorType>(VecTy)->getElementType();
5427 if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
5428 EltTy->isIntegerTy(32) || EltTy->isPointerTy())
5429 return true;
5430 if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) ||
5431 (!ST->useSoftFloat() && ST->hasFP16() && EltTy->isHalfTy()))
5432 return HasBW;
5433 return false;
5434 };
5435 if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
5436 return getInterleavedMemoryOpCostAVX512(
5437 Opcode, VecTy, Factor, Indices, Alignment,
5438 AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
5439
5440 if (UseMaskForCond || UseMaskForGaps)
5441 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
5442 Alignment, AddressSpace, CostKind,
5443 UseMaskForCond, UseMaskForGaps);
5444
5445 // Get estimation for interleaved load/store operations for SSE-AVX2.
5446 // As opposed to AVX-512, SSE-AVX2 do not have generic shuffles that allow
5447 // computing the cost using a generic formula as a function of generic
5448 // shuffles. We therefore use a lookup table instead, filled according to
5449 // the instruction sequences that codegen currently generates.
5450
5451 // VecTy for interleave memop is <VF*Factor x Elt>.
5452 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
5453 // VecTy = <12 x i32>.
5454 MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
5455
5456 // This function can be called with VecTy=<6xi128>, Factor=3, in which case
5457 // the VF=2, while v2i128 is an unsupported MVT vector type
5458 // (see MachineValueType.h::getVectorVT()).
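// (Hedged example of the table-driven costing further below: a stride-2
// load of <16 x i32> (Factor=2, VF=8) on AVX2 hits the {2, v8i32, 4} entry,
// i.e. 4 shuffle ops for the deinterleave on top of the plain memory-op
// cost.)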
5459 if (!LegalVT.isVector()) 5460 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, 5461 Alignment, AddressSpace, CostKind); 5462 5463 unsigned VF = VecTy->getNumElements() / Factor; 5464 Type *ScalarTy = VecTy->getElementType(); 5465 // Deduplicate entries, model floats/pointers as appropriately-sized integers. 5466 if (!ScalarTy->isIntegerTy()) 5467 ScalarTy = 5468 Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy)); 5469 5470 // Get the cost of all the memory operations. 5471 // FIXME: discount dead loads. 5472 InstructionCost MemOpCosts = getMemoryOpCost( 5473 Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, CostKind); 5474 5475 auto *VT = FixedVectorType::get(ScalarTy, VF); 5476 EVT ETy = TLI->getValueType(DL, VT); 5477 if (!ETy.isSimple()) 5478 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, 5479 Alignment, AddressSpace, CostKind); 5480 5481 // TODO: Complete for other data-types and strides. 5482 // Each combination of Stride, element bit width and VF results in a different 5483 // sequence; The cost tables are therefore accessed with: 5484 // Factor (stride) and VectorType=VFxiN. 5485 // The Cost accounts only for the shuffle sequence; 5486 // The cost of the loads/stores is accounted for separately. 5487 // 5488 static const CostTblEntry AVX2InterleavedLoadTbl[] = { 5489 {2, MVT::v2i8, 2}, // (load 4i8 and) deinterleave into 2 x 2i8 5490 {2, MVT::v4i8, 2}, // (load 8i8 and) deinterleave into 2 x 4i8 5491 {2, MVT::v8i8, 2}, // (load 16i8 and) deinterleave into 2 x 8i8 5492 {2, MVT::v16i8, 4}, // (load 32i8 and) deinterleave into 2 x 16i8 5493 {2, MVT::v32i8, 6}, // (load 64i8 and) deinterleave into 2 x 32i8 5494 5495 {2, MVT::v8i16, 6}, // (load 16i16 and) deinterleave into 2 x 8i16 5496 {2, MVT::v16i16, 9}, // (load 32i16 and) deinterleave into 2 x 16i16 5497 {2, MVT::v32i16, 18}, // (load 64i16 and) deinterleave into 2 x 32i16 5498 5499 {2, MVT::v8i32, 4}, // (load 16i32 and) deinterleave into 2 x 8i32 5500 {2, MVT::v16i32, 8}, // (load 32i32 and) deinterleave into 2 x 16i32 5501 {2, MVT::v32i32, 16}, // (load 64i32 and) deinterleave into 2 x 32i32 5502 5503 {2, MVT::v4i64, 4}, // (load 8i64 and) deinterleave into 2 x 4i64 5504 {2, MVT::v8i64, 8}, // (load 16i64 and) deinterleave into 2 x 8i64 5505 {2, MVT::v16i64, 16}, // (load 32i64 and) deinterleave into 2 x 16i64 5506 {2, MVT::v32i64, 32}, // (load 64i64 and) deinterleave into 2 x 32i64 5507 5508 {3, MVT::v2i8, 3}, // (load 6i8 and) deinterleave into 3 x 2i8 5509 {3, MVT::v4i8, 3}, // (load 12i8 and) deinterleave into 3 x 4i8 5510 {3, MVT::v8i8, 6}, // (load 24i8 and) deinterleave into 3 x 8i8 5511 {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8 5512 {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8 5513 5514 {3, MVT::v2i16, 5}, // (load 6i16 and) deinterleave into 3 x 2i16 5515 {3, MVT::v4i16, 7}, // (load 12i16 and) deinterleave into 3 x 4i16 5516 {3, MVT::v8i16, 9}, // (load 24i16 and) deinterleave into 3 x 8i16 5517 {3, MVT::v16i16, 28}, // (load 48i16 and) deinterleave into 3 x 16i16 5518 {3, MVT::v32i16, 56}, // (load 96i16 and) deinterleave into 3 x 32i16 5519 5520 {3, MVT::v2i32, 3}, // (load 6i32 and) deinterleave into 3 x 2i32 5521 {3, MVT::v4i32, 3}, // (load 12i32 and) deinterleave into 3 x 4i32 5522 {3, MVT::v8i32, 7}, // (load 24i32 and) deinterleave into 3 x 8i32 5523 {3, MVT::v16i32, 14}, // (load 48i32 and) deinterleave into 3 x 16i32 5524 {3, MVT::v32i32, 32}, // (load 96i32 and) deinterleave into 3 x 
  static const CostTblEntry AVX2InterleavedLoadTbl[] = {
      {2, MVT::v2i8, 2},  // (load 4i8 and) deinterleave into 2 x 2i8
      {2, MVT::v4i8, 2},  // (load 8i8 and) deinterleave into 2 x 4i8
      {2, MVT::v8i8, 2},  // (load 16i8 and) deinterleave into 2 x 8i8
      {2, MVT::v16i8, 4}, // (load 32i8 and) deinterleave into 2 x 16i8
      {2, MVT::v32i8, 6}, // (load 64i8 and) deinterleave into 2 x 32i8

      {2, MVT::v8i16, 6},   // (load 16i16 and) deinterleave into 2 x 8i16
      {2, MVT::v16i16, 9},  // (load 32i16 and) deinterleave into 2 x 16i16
      {2, MVT::v32i16, 18}, // (load 64i16 and) deinterleave into 2 x 32i16

      {2, MVT::v8i32, 4},   // (load 16i32 and) deinterleave into 2 x 8i32
      {2, MVT::v16i32, 8},  // (load 32i32 and) deinterleave into 2 x 16i32
      {2, MVT::v32i32, 16}, // (load 64i32 and) deinterleave into 2 x 32i32

      {2, MVT::v4i64, 4},   // (load 8i64 and) deinterleave into 2 x 4i64
      {2, MVT::v8i64, 8},   // (load 16i64 and) deinterleave into 2 x 8i64
      {2, MVT::v16i64, 16}, // (load 32i64 and) deinterleave into 2 x 16i64
      {2, MVT::v32i64, 32}, // (load 64i64 and) deinterleave into 2 x 32i64

      {3, MVT::v2i8, 3},   // (load 6i8 and) deinterleave into 3 x 2i8
      {3, MVT::v4i8, 3},   // (load 12i8 and) deinterleave into 3 x 4i8
      {3, MVT::v8i8, 6},   // (load 24i8 and) deinterleave into 3 x 8i8
      {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8
      {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8

      {3, MVT::v2i16, 5},   // (load 6i16 and) deinterleave into 3 x 2i16
      {3, MVT::v4i16, 7},   // (load 12i16 and) deinterleave into 3 x 4i16
      {3, MVT::v8i16, 9},   // (load 24i16 and) deinterleave into 3 x 8i16
      {3, MVT::v16i16, 28}, // (load 48i16 and) deinterleave into 3 x 16i16
      {3, MVT::v32i16, 56}, // (load 96i16 and) deinterleave into 3 x 32i16

      {3, MVT::v2i32, 3},   // (load 6i32 and) deinterleave into 3 x 2i32
      {3, MVT::v4i32, 3},   // (load 12i32 and) deinterleave into 3 x 4i32
      {3, MVT::v8i32, 7},   // (load 24i32 and) deinterleave into 3 x 8i32
      {3, MVT::v16i32, 14}, // (load 48i32 and) deinterleave into 3 x 16i32
      {3, MVT::v32i32, 32}, // (load 96i32 and) deinterleave into 3 x 32i32

      {3, MVT::v2i64, 1},   // (load 6i64 and) deinterleave into 3 x 2i64
      {3, MVT::v4i64, 5},   // (load 12i64 and) deinterleave into 3 x 4i64
      {3, MVT::v8i64, 10},  // (load 24i64 and) deinterleave into 3 x 8i64
      {3, MVT::v16i64, 20}, // (load 48i64 and) deinterleave into 3 x 16i64

      {4, MVT::v2i8, 4},   // (load 8i8 and) deinterleave into 4 x 2i8
      {4, MVT::v4i8, 4},   // (load 16i8 and) deinterleave into 4 x 4i8
      {4, MVT::v8i8, 12},  // (load 32i8 and) deinterleave into 4 x 8i8
      {4, MVT::v16i8, 24}, // (load 64i8 and) deinterleave into 4 x 16i8
      {4, MVT::v32i8, 56}, // (load 128i8 and) deinterleave into 4 x 32i8

      {4, MVT::v2i16, 6},    // (load 8i16 and) deinterleave into 4 x 2i16
      {4, MVT::v4i16, 17},   // (load 16i16 and) deinterleave into 4 x 4i16
      {4, MVT::v8i16, 33},   // (load 32i16 and) deinterleave into 4 x 8i16
      {4, MVT::v16i16, 75},  // (load 64i16 and) deinterleave into 4 x 16i16
      {4, MVT::v32i16, 150}, // (load 128i16 and) deinterleave into 4 x 32i16

      {4, MVT::v2i32, 4},   // (load 8i32 and) deinterleave into 4 x 2i32
      {4, MVT::v4i32, 8},   // (load 16i32 and) deinterleave into 4 x 4i32
      {4, MVT::v8i32, 16},  // (load 32i32 and) deinterleave into 4 x 8i32
      {4, MVT::v16i32, 32}, // (load 64i32 and) deinterleave into 4 x 16i32
      {4, MVT::v32i32, 68}, // (load 128i32 and) deinterleave into 4 x 32i32

      {4, MVT::v2i64, 6},   // (load 8i64 and) deinterleave into 4 x 2i64
      {4, MVT::v4i64, 8},   // (load 16i64 and) deinterleave into 4 x 4i64
      {4, MVT::v8i64, 20},  // (load 32i64 and) deinterleave into 4 x 8i64
      {4, MVT::v16i64, 40}, // (load 64i64 and) deinterleave into 4 x 16i64

      {6, MVT::v2i8, 6},   // (load 12i8 and) deinterleave into 6 x 2i8
      {6, MVT::v4i8, 14},  // (load 24i8 and) deinterleave into 6 x 4i8
      {6, MVT::v8i8, 18},  // (load 48i8 and) deinterleave into 6 x 8i8
      {6, MVT::v16i8, 43}, // (load 96i8 and) deinterleave into 6 x 16i8
      {6, MVT::v32i8, 82}, // (load 192i8 and) deinterleave into 6 x 32i8

      {6, MVT::v2i16, 13},   // (load 12i16 and) deinterleave into 6 x 2i16
      {6, MVT::v4i16, 9},    // (load 24i16 and) deinterleave into 6 x 4i16
      {6, MVT::v8i16, 39},   // (load 48i16 and) deinterleave into 6 x 8i16
      {6, MVT::v16i16, 106}, // (load 96i16 and) deinterleave into 6 x 16i16
      {6, MVT::v32i16, 212}, // (load 192i16 and) deinterleave into 6 x 32i16

      {6, MVT::v2i32, 6},   // (load 12i32 and) deinterleave into 6 x 2i32
      {6, MVT::v4i32, 15},  // (load 24i32 and) deinterleave into 6 x 4i32
      {6, MVT::v8i32, 31},  // (load 48i32 and) deinterleave into 6 x 8i32
      {6, MVT::v16i32, 64}, // (load 96i32 and) deinterleave into 6 x 16i32

      {6, MVT::v2i64, 6},  // (load 12i64 and) deinterleave into 6 x 2i64
      {6, MVT::v4i64, 18}, // (load 24i64 and) deinterleave into 6 x 4i64
      {6, MVT::v8i64, 36}, // (load 48i64 and) deinterleave into 6 x 8i64

      {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32
  };

  static const CostTblEntry SSSE3InterleavedLoadTbl[] = {
      {2, MVT::v4i16, 2}, // (load 8i16 and) deinterleave into 2 x 4i16
  };

  static const CostTblEntry SSE2InterleavedLoadTbl[] = {
      {2, MVT::v2i16, 2}, // (load 4i16 and) deinterleave into 2 x 2i16
      {2, MVT::v4i16, 7}, // (load 8i16 and) deinterleave into 2 x 4i16

      {2, MVT::v2i32, 2}, // (load 4i32 and) deinterleave into 2 x 2i32
      {2, MVT::v4i32, 2}, // (load 8i32 and) deinterleave into 2 x 4i32

      {2, MVT::v2i64, 2}, // (load 4i64 and) deinterleave into 2 x 2i64
  };

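  // Analogously (again only a sketch), a Factor=2 interleaved store with VF=4
  // and i32 elements merges the group members with a single shuffle before
  // one wide store:
  //   %ilv = shufflevector <4 x i32> %a, <4 x i32> %b,
  //                        <8 x i32> <i32 0, i32 4, i32 1, i32 5,
  //                                   i32 2, i32 6, i32 3, i32 7>
  //   store <8 x i32> %ilv, <8 x i32>* %ptr
  // The store tables below likewise account only for the interleaving
  // shuffles; the wide store is covered by MemOpCosts.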
  static const CostTblEntry AVX2InterleavedStoreTbl[] = {
      {2, MVT::v16i8, 3}, // interleave 2 x 16i8 into 32i8 (and store)
      {2, MVT::v32i8, 4}, // interleave 2 x 32i8 into 64i8 (and store)

      {2, MVT::v8i16, 3},  // interleave 2 x 8i16 into 16i16 (and store)
      {2, MVT::v16i16, 4}, // interleave 2 x 16i16 into 32i16 (and store)
      {2, MVT::v32i16, 8}, // interleave 2 x 32i16 into 64i16 (and store)

      {2, MVT::v4i32, 2},   // interleave 2 x 4i32 into 8i32 (and store)
      {2, MVT::v8i32, 4},   // interleave 2 x 8i32 into 16i32 (and store)
      {2, MVT::v16i32, 8},  // interleave 2 x 16i32 into 32i32 (and store)
      {2, MVT::v32i32, 16}, // interleave 2 x 32i32 into 64i32 (and store)

      {2, MVT::v2i64, 2},   // interleave 2 x 2i64 into 4i64 (and store)
      {2, MVT::v4i64, 4},   // interleave 2 x 4i64 into 8i64 (and store)
      {2, MVT::v8i64, 8},   // interleave 2 x 8i64 into 16i64 (and store)
      {2, MVT::v16i64, 16}, // interleave 2 x 16i64 into 32i64 (and store)
      {2, MVT::v32i64, 32}, // interleave 2 x 32i64 into 64i64 (and store)

      {3, MVT::v2i8, 4},   // interleave 3 x 2i8 into 6i8 (and store)
      {3, MVT::v4i8, 4},   // interleave 3 x 4i8 into 12i8 (and store)
      {3, MVT::v8i8, 6},   // interleave 3 x 8i8 into 24i8 (and store)
      {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store)
      {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store)

      {3, MVT::v2i16, 4},   // interleave 3 x 2i16 into 6i16 (and store)
      {3, MVT::v4i16, 6},   // interleave 3 x 4i16 into 12i16 (and store)
      {3, MVT::v8i16, 12},  // interleave 3 x 8i16 into 24i16 (and store)
      {3, MVT::v16i16, 27}, // interleave 3 x 16i16 into 48i16 (and store)
      {3, MVT::v32i16, 54}, // interleave 3 x 32i16 into 96i16 (and store)

      {3, MVT::v2i32, 4},   // interleave 3 x 2i32 into 6i32 (and store)
      {3, MVT::v4i32, 5},   // interleave 3 x 4i32 into 12i32 (and store)
      {3, MVT::v8i32, 11},  // interleave 3 x 8i32 into 24i32 (and store)
      {3, MVT::v16i32, 22}, // interleave 3 x 16i32 into 48i32 (and store)
      {3, MVT::v32i32, 48}, // interleave 3 x 32i32 into 96i32 (and store)

      {3, MVT::v2i64, 4},   // interleave 3 x 2i64 into 6i64 (and store)
      {3, MVT::v4i64, 6},   // interleave 3 x 4i64 into 12i64 (and store)
      {3, MVT::v8i64, 12},  // interleave 3 x 8i64 into 24i64 (and store)
      {3, MVT::v16i64, 24}, // interleave 3 x 16i64 into 48i64 (and store)

      {4, MVT::v2i8, 4},   // interleave 4 x 2i8 into 8i8 (and store)
      {4, MVT::v4i8, 4},   // interleave 4 x 4i8 into 16i8 (and store)
      {4, MVT::v8i8, 4},   // interleave 4 x 8i8 into 32i8 (and store)
      {4, MVT::v16i8, 8},  // interleave 4 x 16i8 into 64i8 (and store)
      {4, MVT::v32i8, 12}, // interleave 4 x 32i8 into 128i8 (and store)

      {4, MVT::v2i16, 2},   // interleave 4 x 2i16 into 8i16 (and store)
      {4, MVT::v4i16, 6},   // interleave 4 x 4i16 into 16i16 (and store)
      {4, MVT::v8i16, 10},  // interleave 4 x 8i16 into 32i16 (and store)
      {4, MVT::v16i16, 32}, // interleave 4 x 16i16 into 64i16 (and store)
      {4, MVT::v32i16, 64}, // interleave 4 x 32i16 into 128i16 (and store)

      {4, MVT::v2i32, 5},   // interleave 4 x 2i32 into 8i32 (and store)
      {4, MVT::v4i32, 6},   // interleave 4 x 4i32 into 16i32 (and store)
      {4, MVT::v8i32, 16},  // interleave 4 x 8i32 into 32i32 (and store)
      {4, MVT::v16i32, 32}, // interleave 4 x 16i32 into 64i32 (and store)
      {4, MVT::v32i32, 64}, // interleave 4 x 32i32 into 128i32 (and store)

      {4, MVT::v2i64, 6},   // interleave 4 x 2i64 into 8i64 (and store)
      {4, MVT::v4i64, 8},   // interleave 4 x 4i64 into 16i64 (and store)
      {4, MVT::v8i64, 20},  // interleave 4 x 8i64 into 32i64 (and store)
      {4, MVT::v16i64, 40}, // interleave 4 x 16i64 into 64i64 (and store)

      {6, MVT::v2i8, 7},   // interleave 6 x 2i8 into 12i8 (and store)
      {6, MVT::v4i8, 9},   // interleave 6 x 4i8 into 24i8 (and store)
      {6, MVT::v8i8, 16},  // interleave 6 x 8i8 into 48i8 (and store)
      {6, MVT::v16i8, 27}, // interleave 6 x 16i8 into 96i8 (and store)
      {6, MVT::v32i8, 90}, // interleave 6 x 32i8 into 192i8 (and store)

      {6, MVT::v2i16, 10},  // interleave 6 x 2i16 into 12i16 (and store)
      {6, MVT::v4i16, 15},  // interleave 6 x 4i16 into 24i16 (and store)
      {6, MVT::v8i16, 21},  // interleave 6 x 8i16 into 48i16 (and store)
      {6, MVT::v16i16, 58}, // interleave 6 x 16i16 into 96i16 (and store)
      {6, MVT::v32i16, 90}, // interleave 6 x 32i16 into 192i16 (and store)

      {6, MVT::v2i32, 9},   // interleave 6 x 2i32 into 12i32 (and store)
      {6, MVT::v4i32, 12},  // interleave 6 x 4i32 into 24i32 (and store)
      {6, MVT::v8i32, 33},  // interleave 6 x 8i32 into 48i32 (and store)
      {6, MVT::v16i32, 66}, // interleave 6 x 16i32 into 96i32 (and store)

      {6, MVT::v2i64, 8},  // interleave 6 x 2i64 into 12i64 (and store)
      {6, MVT::v4i64, 15}, // interleave 6 x 4i64 into 24i64 (and store)
      {6, MVT::v8i64, 30}, // interleave 6 x 8i64 into 48i64 (and store)
  };

  static const CostTblEntry SSE2InterleavedStoreTbl[] = {
      {2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8 (and store)
      {2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8 (and store)
      {2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8 (and store)

      {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16 (and store)
      {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16 (and store)

      {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32 (and store)
  };

  if (Opcode == Instruction::Load) {
    auto GetDiscountedCost = [Factor, NumMembers = Indices.size(),
                              MemOpCosts](const CostTblEntry *Entry) {
      // NOTE: this is just an approximation!
      // It can over/under-estimate the cost!
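      // For example (hypothetical numbers): with Factor = 4, a table cost of
      // 24 for the full 4-way deinterleave, and only NumMembers = 2 of the
      // group's members actually used, the formula below charges
      //   MemOpCosts + divideCeil(2 * 24, 4) = MemOpCosts + 12,
      // i.e. roughly half of the full shuffle sequence.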
      return MemOpCosts + divideCeil(NumMembers * Entry->Cost, Factor);
    };

    if (ST->hasAVX2())
      if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor,
                                              ETy.getSimpleVT()))
        return GetDiscountedCost(Entry);

    if (ST->hasSSSE3())
      if (const auto *Entry = CostTableLookup(SSSE3InterleavedLoadTbl, Factor,
                                              ETy.getSimpleVT()))
        return GetDiscountedCost(Entry);

    if (ST->hasSSE2())
      if (const auto *Entry = CostTableLookup(SSE2InterleavedLoadTbl, Factor,
                                              ETy.getSimpleVT()))
        return GetDiscountedCost(Entry);
  } else {
    assert(Opcode == Instruction::Store &&
           "Expected Store Instruction at this point");
    assert((!Indices.size() || Indices.size() == Factor) &&
           "Interleaved store only supports fully-interleaved groups.");
    if (ST->hasAVX2())
      if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor,
                                              ETy.getSimpleVT()))
        return MemOpCosts + Entry->Cost;

    if (ST->hasSSE2())
      if (const auto *Entry = CostTableLookup(SSE2InterleavedStoreTbl, Factor,
                                              ETy.getSimpleVT()))
        return MemOpCosts + Entry->Cost;
  }

  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                           Alignment, AddressSpace, CostKind,
                                           UseMaskForCond, UseMaskForGaps);
}
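// A minimal client-side usage sketch (illustrative only; the context, TTI
// reference and concrete arguments below are assumptions, not taken from
// this file):
//   const TargetTransformInfo &TTI = ...; // e.g. obtained from an analysis manager
//   auto *WideTy = FixedVectorType::get(Type::getInt32Ty(Ctx), /*NumElts=*/8);
//   InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
//       Instruction::Load, WideTy, /*Factor=*/2, /*Indices=*/{0, 1},
//       Align(4), /*AddressSpace=*/0, TargetTransformInfo::TCK_RecipThroughput);
// On an SSE2-capable target this query would match the {2, MVT::v4i32, 2}
// entry in SSE2InterleavedLoadTbl above and add the (discounted) deinterleave
// shuffle cost on top of the wide-load cost returned by getMemoryOpCost.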