1 //===- InstCombineCalls.cpp -----------------------------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file implements the visitCall and visitInvoke functions. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "InstCombineInternal.h" 15 #include "llvm/ADT/APFloat.h" 16 #include "llvm/ADT/APInt.h" 17 #include "llvm/ADT/ArrayRef.h" 18 #include "llvm/ADT/None.h" 19 #include "llvm/ADT/STLExtras.h" 20 #include "llvm/ADT/SmallVector.h" 21 #include "llvm/ADT/Statistic.h" 22 #include "llvm/ADT/Twine.h" 23 #include "llvm/Analysis/InstructionSimplify.h" 24 #include "llvm/Analysis/MemoryBuiltins.h" 25 #include "llvm/Analysis/ValueTracking.h" 26 #include "llvm/IR/BasicBlock.h" 27 #include "llvm/IR/CallSite.h" 28 #include "llvm/IR/Constant.h" 29 #include "llvm/IR/DataLayout.h" 30 #include "llvm/IR/DerivedTypes.h" 31 #include "llvm/IR/Function.h" 32 #include "llvm/IR/GlobalVariable.h" 33 #include "llvm/IR/InstrTypes.h" 34 #include "llvm/IR/Instruction.h" 35 #include "llvm/IR/Instructions.h" 36 #include "llvm/IR/IntrinsicInst.h" 37 #include "llvm/IR/Intrinsics.h" 38 #include "llvm/IR/LLVMContext.h" 39 #include "llvm/IR/Metadata.h" 40 #include "llvm/IR/PatternMatch.h" 41 #include "llvm/IR/Statepoint.h" 42 #include "llvm/IR/Type.h" 43 #include "llvm/IR/Value.h" 44 #include "llvm/IR/ValueHandle.h" 45 #include "llvm/Support/Casting.h" 46 #include "llvm/Support/Debug.h" 47 #include "llvm/Support/KnownBits.h" 48 #include "llvm/Support/MathExtras.h" 49 #include "llvm/Transforms/Utils/Local.h" 50 #include "llvm/Transforms/Utils/SimplifyLibCalls.h" 51 #include <algorithm> 52 #include <cassert> 53 #include <cstdint> 54 #include <cstring> 55 #include <vector> 56 57 using namespace llvm; 58 
using namespace PatternMatch; 59 60 #define DEBUG_TYPE "instcombine" 61 62 STATISTIC(NumSimplified, "Number of library calls simplified"); 63 64 static cl::opt<unsigned> UnfoldElementAtomicMemcpyMaxElements( 65 "unfold-element-atomic-memcpy-max-elements", 66 cl::init(16), 67 cl::desc("Maximum number of elements in atomic memcpy the optimizer is " 68 "allowed to unfold")); 69 70 /// Return the specified type promoted as it would be to pass though a va_arg 71 /// area. 72 static Type *getPromotedType(Type *Ty) { 73 if (IntegerType* ITy = dyn_cast<IntegerType>(Ty)) { 74 if (ITy->getBitWidth() < 32) 75 return Type::getInt32Ty(Ty->getContext()); 76 } 77 return Ty; 78 } 79 80 /// Return a constant boolean vector that has true elements in all positions 81 /// where the input constant data vector has an element with the sign bit set. 82 static Constant *getNegativeIsTrueBoolVec(ConstantDataVector *V) { 83 SmallVector<Constant *, 32> BoolVec; 84 IntegerType *BoolTy = Type::getInt1Ty(V->getContext()); 85 for (unsigned I = 0, E = V->getNumElements(); I != E; ++I) { 86 Constant *Elt = V->getElementAsConstant(I); 87 assert((isa<ConstantInt>(Elt) || isa<ConstantFP>(Elt)) && 88 "Unexpected constant data vector element type"); 89 bool Sign = V->getElementType()->isIntegerTy() 90 ? cast<ConstantInt>(Elt)->isNegative() 91 : cast<ConstantFP>(Elt)->isNegative(); 92 BoolVec.push_back(ConstantInt::get(BoolTy, Sign)); 93 } 94 return ConstantVector::get(BoolVec); 95 } 96 97 Instruction *InstCombiner::SimplifyElementUnorderedAtomicMemCpy( 98 ElementUnorderedAtomicMemCpyInst *AMI) { 99 // Try to unfold this intrinsic into sequence of explicit atomic loads and 100 // stores. 101 // First check that number of elements is compile time constant. 102 auto *LengthCI = dyn_cast<ConstantInt>(AMI->getLength()); 103 if (!LengthCI) 104 return nullptr; 105 106 // Check that there are not too many elements. 
107 uint64_t LengthInBytes = LengthCI->getZExtValue(); 108 uint32_t ElementSizeInBytes = AMI->getElementSizeInBytes(); 109 uint64_t NumElements = LengthInBytes / ElementSizeInBytes; 110 if (NumElements >= UnfoldElementAtomicMemcpyMaxElements) 111 return nullptr; 112 113 // Only expand if there are elements to copy. 114 if (NumElements > 0) { 115 // Don't unfold into illegal integers 116 uint64_t ElementSizeInBits = ElementSizeInBytes * 8; 117 if (!getDataLayout().isLegalInteger(ElementSizeInBits)) 118 return nullptr; 119 120 // Cast source and destination to the correct type. Intrinsic input 121 // arguments are usually represented as i8*. Often operands will be 122 // explicitly casted to i8* and we can just strip those casts instead of 123 // inserting new ones. However it's easier to rely on other InstCombine 124 // rules which will cover trivial cases anyway. 125 Value *Src = AMI->getRawSource(); 126 Value *Dst = AMI->getRawDest(); 127 Type *ElementPointerType = 128 Type::getIntNPtrTy(AMI->getContext(), ElementSizeInBits, 129 Src->getType()->getPointerAddressSpace()); 130 131 Value *SrcCasted = Builder.CreatePointerCast(Src, ElementPointerType, 132 "memcpy_unfold.src_casted"); 133 Value *DstCasted = Builder.CreatePointerCast(Dst, ElementPointerType, 134 "memcpy_unfold.dst_casted"); 135 136 for (uint64_t i = 0; i < NumElements; ++i) { 137 // Get current element addresses 138 ConstantInt *ElementIdxCI = 139 ConstantInt::get(AMI->getContext(), APInt(64, i)); 140 Value *SrcElementAddr = 141 Builder.CreateGEP(SrcCasted, ElementIdxCI, "memcpy_unfold.src_addr"); 142 Value *DstElementAddr = 143 Builder.CreateGEP(DstCasted, ElementIdxCI, "memcpy_unfold.dst_addr"); 144 145 // Load from the source. Transfer alignment information and mark load as 146 // unordered atomic. 147 LoadInst *Load = Builder.CreateLoad(SrcElementAddr, "memcpy_unfold.val"); 148 Load->setOrdering(AtomicOrdering::Unordered); 149 // We know alignment of the first element. 
It is also guaranteed by the 150 // verifier that element size is less or equal than first element 151 // alignment and both of this values are powers of two. This means that 152 // all subsequent accesses are at least element size aligned. 153 // TODO: We can infer better alignment but there is no evidence that this 154 // will matter. 155 Load->setAlignment(i == 0 ? AMI->getParamAlignment(1) 156 : ElementSizeInBytes); 157 Load->setDebugLoc(AMI->getDebugLoc()); 158 159 // Store loaded value via unordered atomic store. 160 StoreInst *Store = Builder.CreateStore(Load, DstElementAddr); 161 Store->setOrdering(AtomicOrdering::Unordered); 162 Store->setAlignment(i == 0 ? AMI->getParamAlignment(0) 163 : ElementSizeInBytes); 164 Store->setDebugLoc(AMI->getDebugLoc()); 165 } 166 } 167 168 // Set the number of elements of the copy to 0, it will be deleted on the 169 // next iteration. 170 AMI->setLength(Constant::getNullValue(LengthCI->getType())); 171 return AMI; 172 } 173 174 Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) { 175 unsigned DstAlign = getKnownAlignment(MI->getArgOperand(0), DL, MI, &AC, &DT); 176 unsigned SrcAlign = getKnownAlignment(MI->getArgOperand(1), DL, MI, &AC, &DT); 177 unsigned MinAlign = std::min(DstAlign, SrcAlign); 178 unsigned CopyAlign = MI->getAlignment(); 179 180 if (CopyAlign < MinAlign) { 181 MI->setAlignment(ConstantInt::get(MI->getAlignmentType(), MinAlign, false)); 182 return MI; 183 } 184 185 // If MemCpyInst length is 1/2/4/8 bytes then replace memcpy with 186 // load/store. 187 ConstantInt *MemOpLength = dyn_cast<ConstantInt>(MI->getArgOperand(2)); 188 if (!MemOpLength) return nullptr; 189 190 // Source and destination pointer types are always "i8*" for intrinsic. See 191 // if the size is something we can handle with a single primitive load/store. 192 // A single load+store correctly handles overlapping memory in the memmove 193 // case. 
194 uint64_t Size = MemOpLength->getLimitedValue(); 195 assert(Size && "0-sized memory transferring should be removed already."); 196 197 if (Size > 8 || (Size&(Size-1))) 198 return nullptr; // If not 1/2/4/8 bytes, exit. 199 200 // Use an integer load+store unless we can find something better. 201 unsigned SrcAddrSp = 202 cast<PointerType>(MI->getArgOperand(1)->getType())->getAddressSpace(); 203 unsigned DstAddrSp = 204 cast<PointerType>(MI->getArgOperand(0)->getType())->getAddressSpace(); 205 206 IntegerType* IntType = IntegerType::get(MI->getContext(), Size<<3); 207 Type *NewSrcPtrTy = PointerType::get(IntType, SrcAddrSp); 208 Type *NewDstPtrTy = PointerType::get(IntType, DstAddrSp); 209 210 // If the memcpy has metadata describing the members, see if we can get the 211 // TBAA tag describing our copy. 212 MDNode *CopyMD = nullptr; 213 if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa_struct)) { 214 if (M->getNumOperands() == 3 && M->getOperand(0) && 215 mdconst::hasa<ConstantInt>(M->getOperand(0)) && 216 mdconst::extract<ConstantInt>(M->getOperand(0))->isZero() && 217 M->getOperand(1) && 218 mdconst::hasa<ConstantInt>(M->getOperand(1)) && 219 mdconst::extract<ConstantInt>(M->getOperand(1))->getValue() == 220 Size && 221 M->getOperand(2) && isa<MDNode>(M->getOperand(2))) 222 CopyMD = cast<MDNode>(M->getOperand(2)); 223 } 224 225 // If the memcpy/memmove provides better alignment info than we can 226 // infer, use it. 
227 SrcAlign = std::max(SrcAlign, CopyAlign); 228 DstAlign = std::max(DstAlign, CopyAlign); 229 230 Value *Src = Builder.CreateBitCast(MI->getArgOperand(1), NewSrcPtrTy); 231 Value *Dest = Builder.CreateBitCast(MI->getArgOperand(0), NewDstPtrTy); 232 LoadInst *L = Builder.CreateLoad(Src, MI->isVolatile()); 233 L->setAlignment(SrcAlign); 234 if (CopyMD) 235 L->setMetadata(LLVMContext::MD_tbaa, CopyMD); 236 MDNode *LoopMemParallelMD = 237 MI->getMetadata(LLVMContext::MD_mem_parallel_loop_access); 238 if (LoopMemParallelMD) 239 L->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD); 240 241 StoreInst *S = Builder.CreateStore(L, Dest, MI->isVolatile()); 242 S->setAlignment(DstAlign); 243 if (CopyMD) 244 S->setMetadata(LLVMContext::MD_tbaa, CopyMD); 245 if (LoopMemParallelMD) 246 S->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD); 247 248 // Set the size of the copy to 0, it will be deleted on the next iteration. 249 MI->setArgOperand(2, Constant::getNullValue(MemOpLength->getType())); 250 return MI; 251 } 252 253 Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) { 254 unsigned Alignment = getKnownAlignment(MI->getDest(), DL, MI, &AC, &DT); 255 if (MI->getAlignment() < Alignment) { 256 MI->setAlignment(ConstantInt::get(MI->getAlignmentType(), 257 Alignment, false)); 258 return MI; 259 } 260 261 // Extract the length and alignment and fill if they are constant. 262 ConstantInt *LenC = dyn_cast<ConstantInt>(MI->getLength()); 263 ConstantInt *FillC = dyn_cast<ConstantInt>(MI->getValue()); 264 if (!LenC || !FillC || !FillC->getType()->isIntegerTy(8)) 265 return nullptr; 266 uint64_t Len = LenC->getLimitedValue(); 267 Alignment = MI->getAlignment(); 268 assert(Len && "0-sized memory setting should be removed already."); 269 270 // memset(s,c,n) -> store s, c (for n=1,2,4,8) 271 if (Len <= 8 && isPowerOf2_32((uint32_t)Len)) { 272 Type *ITy = IntegerType::get(MI->getContext(), Len*8); // n=1 -> i8. 
273 274 Value *Dest = MI->getDest(); 275 unsigned DstAddrSp = cast<PointerType>(Dest->getType())->getAddressSpace(); 276 Type *NewDstPtrTy = PointerType::get(ITy, DstAddrSp); 277 Dest = Builder.CreateBitCast(Dest, NewDstPtrTy); 278 279 // Alignment 0 is identity for alignment 1 for memset, but not store. 280 if (Alignment == 0) Alignment = 1; 281 282 // Extract the fill value and store. 283 uint64_t Fill = FillC->getZExtValue()*0x0101010101010101ULL; 284 StoreInst *S = Builder.CreateStore(ConstantInt::get(ITy, Fill), Dest, 285 MI->isVolatile()); 286 S->setAlignment(Alignment); 287 288 // Set the size of the copy to 0, it will be deleted on the next iteration. 289 MI->setLength(Constant::getNullValue(LenC->getType())); 290 return MI; 291 } 292 293 return nullptr; 294 } 295 296 static Value *simplifyX86immShift(const IntrinsicInst &II, 297 InstCombiner::BuilderTy &Builder) { 298 bool LogicalShift = false; 299 bool ShiftLeft = false; 300 301 switch (II.getIntrinsicID()) { 302 default: llvm_unreachable("Unexpected intrinsic!"); 303 case Intrinsic::x86_sse2_psra_d: 304 case Intrinsic::x86_sse2_psra_w: 305 case Intrinsic::x86_sse2_psrai_d: 306 case Intrinsic::x86_sse2_psrai_w: 307 case Intrinsic::x86_avx2_psra_d: 308 case Intrinsic::x86_avx2_psra_w: 309 case Intrinsic::x86_avx2_psrai_d: 310 case Intrinsic::x86_avx2_psrai_w: 311 case Intrinsic::x86_avx512_psra_q_128: 312 case Intrinsic::x86_avx512_psrai_q_128: 313 case Intrinsic::x86_avx512_psra_q_256: 314 case Intrinsic::x86_avx512_psrai_q_256: 315 case Intrinsic::x86_avx512_psra_d_512: 316 case Intrinsic::x86_avx512_psra_q_512: 317 case Intrinsic::x86_avx512_psra_w_512: 318 case Intrinsic::x86_avx512_psrai_d_512: 319 case Intrinsic::x86_avx512_psrai_q_512: 320 case Intrinsic::x86_avx512_psrai_w_512: 321 LogicalShift = false; ShiftLeft = false; 322 break; 323 case Intrinsic::x86_sse2_psrl_d: 324 case Intrinsic::x86_sse2_psrl_q: 325 case Intrinsic::x86_sse2_psrl_w: 326 case Intrinsic::x86_sse2_psrli_d: 327 case 
Intrinsic::x86_sse2_psrli_q: 328 case Intrinsic::x86_sse2_psrli_w: 329 case Intrinsic::x86_avx2_psrl_d: 330 case Intrinsic::x86_avx2_psrl_q: 331 case Intrinsic::x86_avx2_psrl_w: 332 case Intrinsic::x86_avx2_psrli_d: 333 case Intrinsic::x86_avx2_psrli_q: 334 case Intrinsic::x86_avx2_psrli_w: 335 case Intrinsic::x86_avx512_psrl_d_512: 336 case Intrinsic::x86_avx512_psrl_q_512: 337 case Intrinsic::x86_avx512_psrl_w_512: 338 case Intrinsic::x86_avx512_psrli_d_512: 339 case Intrinsic::x86_avx512_psrli_q_512: 340 case Intrinsic::x86_avx512_psrli_w_512: 341 LogicalShift = true; ShiftLeft = false; 342 break; 343 case Intrinsic::x86_sse2_psll_d: 344 case Intrinsic::x86_sse2_psll_q: 345 case Intrinsic::x86_sse2_psll_w: 346 case Intrinsic::x86_sse2_pslli_d: 347 case Intrinsic::x86_sse2_pslli_q: 348 case Intrinsic::x86_sse2_pslli_w: 349 case Intrinsic::x86_avx2_psll_d: 350 case Intrinsic::x86_avx2_psll_q: 351 case Intrinsic::x86_avx2_psll_w: 352 case Intrinsic::x86_avx2_pslli_d: 353 case Intrinsic::x86_avx2_pslli_q: 354 case Intrinsic::x86_avx2_pslli_w: 355 case Intrinsic::x86_avx512_psll_d_512: 356 case Intrinsic::x86_avx512_psll_q_512: 357 case Intrinsic::x86_avx512_psll_w_512: 358 case Intrinsic::x86_avx512_pslli_d_512: 359 case Intrinsic::x86_avx512_pslli_q_512: 360 case Intrinsic::x86_avx512_pslli_w_512: 361 LogicalShift = true; ShiftLeft = true; 362 break; 363 } 364 assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left"); 365 366 // Simplify if count is constant. 367 auto Arg1 = II.getArgOperand(1); 368 auto CAZ = dyn_cast<ConstantAggregateZero>(Arg1); 369 auto CDV = dyn_cast<ConstantDataVector>(Arg1); 370 auto CInt = dyn_cast<ConstantInt>(Arg1); 371 if (!CAZ && !CDV && !CInt) 372 return nullptr; 373 374 APInt Count(64, 0); 375 if (CDV) { 376 // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector 377 // operand to compute the shift amount. 
378 auto VT = cast<VectorType>(CDV->getType()); 379 unsigned BitWidth = VT->getElementType()->getPrimitiveSizeInBits(); 380 assert((64 % BitWidth) == 0 && "Unexpected packed shift size"); 381 unsigned NumSubElts = 64 / BitWidth; 382 383 // Concatenate the sub-elements to create the 64-bit value. 384 for (unsigned i = 0; i != NumSubElts; ++i) { 385 unsigned SubEltIdx = (NumSubElts - 1) - i; 386 auto SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx)); 387 Count <<= BitWidth; 388 Count |= SubElt->getValue().zextOrTrunc(64); 389 } 390 } 391 else if (CInt) 392 Count = CInt->getValue(); 393 394 auto Vec = II.getArgOperand(0); 395 auto VT = cast<VectorType>(Vec->getType()); 396 auto SVT = VT->getElementType(); 397 unsigned VWidth = VT->getNumElements(); 398 unsigned BitWidth = SVT->getPrimitiveSizeInBits(); 399 400 // If shift-by-zero then just return the original value. 401 if (Count.isNullValue()) 402 return Vec; 403 404 // Handle cases when Shift >= BitWidth. 405 if (Count.uge(BitWidth)) { 406 // If LogicalShift - just return zero. 407 if (LogicalShift) 408 return ConstantAggregateZero::get(VT); 409 410 // If ArithmeticShift - clamp Shift to (BitWidth - 1). 411 Count = APInt(64, BitWidth - 1); 412 } 413 414 // Get a constant vector of the same type as the first operand. 415 auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth)); 416 auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt); 417 418 if (ShiftLeft) 419 return Builder.CreateShl(Vec, ShiftVec); 420 421 if (LogicalShift) 422 return Builder.CreateLShr(Vec, ShiftVec); 423 424 return Builder.CreateAShr(Vec, ShiftVec); 425 } 426 427 // Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift. 428 // Unlike the generic IR shifts, the intrinsics have defined behaviour for out 429 // of range shift amounts (logical - set to zero, arithmetic - splat sign bit). 
430 static Value *simplifyX86varShift(const IntrinsicInst &II, 431 InstCombiner::BuilderTy &Builder) { 432 bool LogicalShift = false; 433 bool ShiftLeft = false; 434 435 switch (II.getIntrinsicID()) { 436 default: llvm_unreachable("Unexpected intrinsic!"); 437 case Intrinsic::x86_avx2_psrav_d: 438 case Intrinsic::x86_avx2_psrav_d_256: 439 case Intrinsic::x86_avx512_psrav_q_128: 440 case Intrinsic::x86_avx512_psrav_q_256: 441 case Intrinsic::x86_avx512_psrav_d_512: 442 case Intrinsic::x86_avx512_psrav_q_512: 443 case Intrinsic::x86_avx512_psrav_w_128: 444 case Intrinsic::x86_avx512_psrav_w_256: 445 case Intrinsic::x86_avx512_psrav_w_512: 446 LogicalShift = false; 447 ShiftLeft = false; 448 break; 449 case Intrinsic::x86_avx2_psrlv_d: 450 case Intrinsic::x86_avx2_psrlv_d_256: 451 case Intrinsic::x86_avx2_psrlv_q: 452 case Intrinsic::x86_avx2_psrlv_q_256: 453 case Intrinsic::x86_avx512_psrlv_d_512: 454 case Intrinsic::x86_avx512_psrlv_q_512: 455 case Intrinsic::x86_avx512_psrlv_w_128: 456 case Intrinsic::x86_avx512_psrlv_w_256: 457 case Intrinsic::x86_avx512_psrlv_w_512: 458 LogicalShift = true; 459 ShiftLeft = false; 460 break; 461 case Intrinsic::x86_avx2_psllv_d: 462 case Intrinsic::x86_avx2_psllv_d_256: 463 case Intrinsic::x86_avx2_psllv_q: 464 case Intrinsic::x86_avx2_psllv_q_256: 465 case Intrinsic::x86_avx512_psllv_d_512: 466 case Intrinsic::x86_avx512_psllv_q_512: 467 case Intrinsic::x86_avx512_psllv_w_128: 468 case Intrinsic::x86_avx512_psllv_w_256: 469 case Intrinsic::x86_avx512_psllv_w_512: 470 LogicalShift = true; 471 ShiftLeft = true; 472 break; 473 } 474 assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left"); 475 476 // Simplify if all shift amounts are constant/undef. 
477 auto *CShift = dyn_cast<Constant>(II.getArgOperand(1)); 478 if (!CShift) 479 return nullptr; 480 481 auto Vec = II.getArgOperand(0); 482 auto VT = cast<VectorType>(II.getType()); 483 auto SVT = VT->getVectorElementType(); 484 int NumElts = VT->getNumElements(); 485 int BitWidth = SVT->getIntegerBitWidth(); 486 487 // Collect each element's shift amount. 488 // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth. 489 bool AnyOutOfRange = false; 490 SmallVector<int, 8> ShiftAmts; 491 for (int I = 0; I < NumElts; ++I) { 492 auto *CElt = CShift->getAggregateElement(I); 493 if (CElt && isa<UndefValue>(CElt)) { 494 ShiftAmts.push_back(-1); 495 continue; 496 } 497 498 auto *COp = dyn_cast_or_null<ConstantInt>(CElt); 499 if (!COp) 500 return nullptr; 501 502 // Handle out of range shifts. 503 // If LogicalShift - set to BitWidth (special case). 504 // If ArithmeticShift - set to (BitWidth - 1) (sign splat). 505 APInt ShiftVal = COp->getValue(); 506 if (ShiftVal.uge(BitWidth)) { 507 AnyOutOfRange = LogicalShift; 508 ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1); 509 continue; 510 } 511 512 ShiftAmts.push_back((int)ShiftVal.getZExtValue()); 513 } 514 515 // If all elements out of range or UNDEF, return vector of zeros/undefs. 516 // ArithmeticShift should only hit this if they are all UNDEF. 517 auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); }; 518 if (all_of(ShiftAmts, OutOfRange)) { 519 SmallVector<Constant *, 8> ConstantVec; 520 for (int Idx : ShiftAmts) { 521 if (Idx < 0) { 522 ConstantVec.push_back(UndefValue::get(SVT)); 523 } else { 524 assert(LogicalShift && "Logical shift expected"); 525 ConstantVec.push_back(ConstantInt::getNullValue(SVT)); 526 } 527 } 528 return ConstantVector::get(ConstantVec); 529 } 530 531 // We can't handle only some out of range values with generic logical shifts. 532 if (AnyOutOfRange) 533 return nullptr; 534 535 // Build the shift amount constant vector. 
536 SmallVector<Constant *, 8> ShiftVecAmts; 537 for (int Idx : ShiftAmts) { 538 if (Idx < 0) 539 ShiftVecAmts.push_back(UndefValue::get(SVT)); 540 else 541 ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx)); 542 } 543 auto ShiftVec = ConstantVector::get(ShiftVecAmts); 544 545 if (ShiftLeft) 546 return Builder.CreateShl(Vec, ShiftVec); 547 548 if (LogicalShift) 549 return Builder.CreateLShr(Vec, ShiftVec); 550 551 return Builder.CreateAShr(Vec, ShiftVec); 552 } 553 554 static Value *simplifyX86muldq(const IntrinsicInst &II, 555 InstCombiner::BuilderTy &Builder) { 556 Value *Arg0 = II.getArgOperand(0); 557 Value *Arg1 = II.getArgOperand(1); 558 Type *ResTy = II.getType(); 559 assert(Arg0->getType()->getScalarSizeInBits() == 32 && 560 Arg1->getType()->getScalarSizeInBits() == 32 && 561 ResTy->getScalarSizeInBits() == 64 && "Unexpected muldq/muludq types"); 562 563 // muldq/muludq(undef, undef) -> zero (matches generic mul behavior) 564 if (isa<UndefValue>(Arg0) || isa<UndefValue>(Arg1)) 565 return ConstantAggregateZero::get(ResTy); 566 567 // Constant folding. 
568 // PMULDQ = (mul(vXi64 sext(shuffle<0,2,..>(Arg0)), 569 // vXi64 sext(shuffle<0,2,..>(Arg1)))) 570 // PMULUDQ = (mul(vXi64 zext(shuffle<0,2,..>(Arg0)), 571 // vXi64 zext(shuffle<0,2,..>(Arg1)))) 572 if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1)) 573 return nullptr; 574 575 unsigned NumElts = ResTy->getVectorNumElements(); 576 assert(Arg0->getType()->getVectorNumElements() == (2 * NumElts) && 577 Arg1->getType()->getVectorNumElements() == (2 * NumElts) && 578 "Unexpected muldq/muludq types"); 579 580 unsigned IntrinsicID = II.getIntrinsicID(); 581 bool IsSigned = (Intrinsic::x86_sse41_pmuldq == IntrinsicID || 582 Intrinsic::x86_avx2_pmul_dq == IntrinsicID || 583 Intrinsic::x86_avx512_pmul_dq_512 == IntrinsicID); 584 585 SmallVector<unsigned, 16> ShuffleMask; 586 for (unsigned i = 0; i != NumElts; ++i) 587 ShuffleMask.push_back(i * 2); 588 589 auto *LHS = Builder.CreateShuffleVector(Arg0, Arg0, ShuffleMask); 590 auto *RHS = Builder.CreateShuffleVector(Arg1, Arg1, ShuffleMask); 591 592 if (IsSigned) { 593 LHS = Builder.CreateSExt(LHS, ResTy); 594 RHS = Builder.CreateSExt(RHS, ResTy); 595 } else { 596 LHS = Builder.CreateZExt(LHS, ResTy); 597 RHS = Builder.CreateZExt(RHS, ResTy); 598 } 599 600 return Builder.CreateMul(LHS, RHS); 601 } 602 603 static Value *simplifyX86pack(IntrinsicInst &II, bool IsSigned) { 604 Value *Arg0 = II.getArgOperand(0); 605 Value *Arg1 = II.getArgOperand(1); 606 Type *ResTy = II.getType(); 607 608 // Fast all undef handling. 
609 if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1)) 610 return UndefValue::get(ResTy); 611 612 Type *ArgTy = Arg0->getType(); 613 unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128; 614 unsigned NumDstElts = ResTy->getVectorNumElements(); 615 unsigned NumSrcElts = ArgTy->getVectorNumElements(); 616 assert(NumDstElts == (2 * NumSrcElts) && "Unexpected packing types"); 617 618 unsigned NumDstEltsPerLane = NumDstElts / NumLanes; 619 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes; 620 unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits(); 621 assert(ArgTy->getScalarSizeInBits() == (2 * DstScalarSizeInBits) && 622 "Unexpected packing types"); 623 624 // Constant folding. 625 auto *Cst0 = dyn_cast<Constant>(Arg0); 626 auto *Cst1 = dyn_cast<Constant>(Arg1); 627 if (!Cst0 || !Cst1) 628 return nullptr; 629 630 SmallVector<Constant *, 32> Vals; 631 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { 632 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) { 633 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane; 634 auto *Cst = (Elt >= NumSrcEltsPerLane) ? Cst1 : Cst0; 635 auto *COp = Cst->getAggregateElement(SrcIdx); 636 if (COp && isa<UndefValue>(COp)) { 637 Vals.push_back(UndefValue::get(ResTy->getScalarType())); 638 continue; 639 } 640 641 auto *CInt = dyn_cast_or_null<ConstantInt>(COp); 642 if (!CInt) 643 return nullptr; 644 645 APInt Val = CInt->getValue(); 646 assert(Val.getBitWidth() == ArgTy->getScalarSizeInBits() && 647 "Unexpected constant bitwidth"); 648 649 if (IsSigned) { 650 // PACKSS: Truncate signed value with signed saturation. 651 // Source values less than dst minint are saturated to minint. 652 // Source values greater than dst maxint are saturated to maxint. 
653 if (Val.isSignedIntN(DstScalarSizeInBits)) 654 Val = Val.trunc(DstScalarSizeInBits); 655 else if (Val.isNegative()) 656 Val = APInt::getSignedMinValue(DstScalarSizeInBits); 657 else 658 Val = APInt::getSignedMaxValue(DstScalarSizeInBits); 659 } else { 660 // PACKUS: Truncate signed value with unsigned saturation. 661 // Source values less than zero are saturated to zero. 662 // Source values greater than dst maxuint are saturated to maxuint. 663 if (Val.isIntN(DstScalarSizeInBits)) 664 Val = Val.trunc(DstScalarSizeInBits); 665 else if (Val.isNegative()) 666 Val = APInt::getNullValue(DstScalarSizeInBits); 667 else 668 Val = APInt::getAllOnesValue(DstScalarSizeInBits); 669 } 670 671 Vals.push_back(ConstantInt::get(ResTy->getScalarType(), Val)); 672 } 673 } 674 675 return ConstantVector::get(Vals); 676 } 677 678 static Value *simplifyX86movmsk(const IntrinsicInst &II) { 679 Value *Arg = II.getArgOperand(0); 680 Type *ResTy = II.getType(); 681 Type *ArgTy = Arg->getType(); 682 683 // movmsk(undef) -> zero as we must ensure the upper bits are zero. 684 if (isa<UndefValue>(Arg)) 685 return Constant::getNullValue(ResTy); 686 687 // We can't easily peek through x86_mmx types. 688 if (!ArgTy->isVectorTy()) 689 return nullptr; 690 691 auto *C = dyn_cast<Constant>(Arg); 692 if (!C) 693 return nullptr; 694 695 // Extract signbits of the vector input and pack into integer result. 
696 APInt Result(ResTy->getPrimitiveSizeInBits(), 0); 697 for (unsigned I = 0, E = ArgTy->getVectorNumElements(); I != E; ++I) { 698 auto *COp = C->getAggregateElement(I); 699 if (!COp) 700 return nullptr; 701 if (isa<UndefValue>(COp)) 702 continue; 703 704 auto *CInt = dyn_cast<ConstantInt>(COp); 705 auto *CFp = dyn_cast<ConstantFP>(COp); 706 if (!CInt && !CFp) 707 return nullptr; 708 709 if ((CInt && CInt->isNegative()) || (CFp && CFp->isNegative())) 710 Result.setBit(I); 711 } 712 713 return Constant::getIntegerValue(ResTy, Result); 714 } 715 716 static Value *simplifyX86insertps(const IntrinsicInst &II, 717 InstCombiner::BuilderTy &Builder) { 718 auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2)); 719 if (!CInt) 720 return nullptr; 721 722 VectorType *VecTy = cast<VectorType>(II.getType()); 723 assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type"); 724 725 // The immediate permute control byte looks like this: 726 // [3:0] - zero mask for each 32-bit lane 727 // [5:4] - select one 32-bit destination lane 728 // [7:6] - select one 32-bit source lane 729 730 uint8_t Imm = CInt->getZExtValue(); 731 uint8_t ZMask = Imm & 0xf; 732 uint8_t DestLane = (Imm >> 4) & 0x3; 733 uint8_t SourceLane = (Imm >> 6) & 0x3; 734 735 ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy); 736 737 // If all zero mask bits are set, this was just a weird way to 738 // generate a zero vector. 739 if (ZMask == 0xf) 740 return ZeroVector; 741 742 // Initialize by passing all of the first source bits through. 743 uint32_t ShuffleMask[4] = { 0, 1, 2, 3 }; 744 745 // We may replace the second operand with the zero vector. 746 Value *V1 = II.getArgOperand(1); 747 748 if (ZMask) { 749 // If the zero mask is being used with a single input or the zero mask 750 // overrides the destination lane, this is a shuffle with the zero vector. 
751 if ((II.getArgOperand(0) == II.getArgOperand(1)) || 752 (ZMask & (1 << DestLane))) { 753 V1 = ZeroVector; 754 // We may still move 32-bits of the first source vector from one lane 755 // to another. 756 ShuffleMask[DestLane] = SourceLane; 757 // The zero mask may override the previous insert operation. 758 for (unsigned i = 0; i < 4; ++i) 759 if ((ZMask >> i) & 0x1) 760 ShuffleMask[i] = i + 4; 761 } else { 762 // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle? 763 return nullptr; 764 } 765 } else { 766 // Replace the selected destination lane with the selected source lane. 767 ShuffleMask[DestLane] = SourceLane + 4; 768 } 769 770 return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask); 771 } 772 773 /// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding 774 /// or conversion to a shuffle vector. 775 static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0, 776 ConstantInt *CILength, ConstantInt *CIIndex, 777 InstCombiner::BuilderTy &Builder) { 778 auto LowConstantHighUndef = [&](uint64_t Val) { 779 Type *IntTy64 = Type::getInt64Ty(II.getContext()); 780 Constant *Args[] = {ConstantInt::get(IntTy64, Val), 781 UndefValue::get(IntTy64)}; 782 return ConstantVector::get(Args); 783 }; 784 785 // See if we're dealing with constant values. 786 Constant *C0 = dyn_cast<Constant>(Op0); 787 ConstantInt *CI0 = 788 C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0)) 789 : nullptr; 790 791 // Attempt to constant fold. 792 if (CILength && CIIndex) { 793 // From AMD documentation: "The bit index and field length are each six 794 // bits in length other bits of the field are ignored." 795 APInt APIndex = CIIndex->getValue().zextOrTrunc(6); 796 APInt APLength = CILength->getValue().zextOrTrunc(6); 797 798 unsigned Index = APIndex.getZExtValue(); 799 800 // From AMD documentation: "a value of zero in the field length is 801 // defined as length of 64". 802 unsigned Length = APLength == 0 ? 
64 : APLength.getZExtValue(); 803 804 // From AMD documentation: "If the sum of the bit index + length field 805 // is greater than 64, the results are undefined". 806 unsigned End = Index + Length; 807 808 // Note that both field index and field length are 8-bit quantities. 809 // Since variables 'Index' and 'Length' are unsigned values 810 // obtained from zero-extending field index and field length 811 // respectively, their sum should never wrap around. 812 if (End > 64) 813 return UndefValue::get(II.getType()); 814 815 // If we are inserting whole bytes, we can convert this to a shuffle. 816 // Lowering can recognize EXTRQI shuffle masks. 817 if ((Length % 8) == 0 && (Index % 8) == 0) { 818 // Convert bit indices to byte indices. 819 Length /= 8; 820 Index /= 8; 821 822 Type *IntTy8 = Type::getInt8Ty(II.getContext()); 823 Type *IntTy32 = Type::getInt32Ty(II.getContext()); 824 VectorType *ShufTy = VectorType::get(IntTy8, 16); 825 826 SmallVector<Constant *, 16> ShuffleMask; 827 for (int i = 0; i != (int)Length; ++i) 828 ShuffleMask.push_back( 829 Constant::getIntegerValue(IntTy32, APInt(32, i + Index))); 830 for (int i = Length; i != 8; ++i) 831 ShuffleMask.push_back( 832 Constant::getIntegerValue(IntTy32, APInt(32, i + 16))); 833 for (int i = 8; i != 16; ++i) 834 ShuffleMask.push_back(UndefValue::get(IntTy32)); 835 836 Value *SV = Builder.CreateShuffleVector( 837 Builder.CreateBitCast(Op0, ShufTy), 838 ConstantAggregateZero::get(ShufTy), ConstantVector::get(ShuffleMask)); 839 return Builder.CreateBitCast(SV, II.getType()); 840 } 841 842 // Constant Fold - shift Index'th bit to lowest position and mask off 843 // Length bits. 844 if (CI0) { 845 APInt Elt = CI0->getValue(); 846 Elt.lshrInPlace(Index); 847 Elt = Elt.zextOrTrunc(Length); 848 return LowConstantHighUndef(Elt.getZExtValue()); 849 } 850 851 // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI. 
852 if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) { 853 Value *Args[] = {Op0, CILength, CIIndex}; 854 Module *M = II.getModule(); 855 Value *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi); 856 return Builder.CreateCall(F, Args); 857 } 858 } 859 860 // Constant Fold - extraction from zero is always {zero, undef}. 861 if (CI0 && CI0->isZero()) 862 return LowConstantHighUndef(0); 863 864 return nullptr; 865 } 866 867 /// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant 868 /// folding or conversion to a shuffle vector. 869 static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1, 870 APInt APLength, APInt APIndex, 871 InstCombiner::BuilderTy &Builder) { 872 // From AMD documentation: "The bit index and field length are each six bits 873 // in length other bits of the field are ignored." 874 APIndex = APIndex.zextOrTrunc(6); 875 APLength = APLength.zextOrTrunc(6); 876 877 // Attempt to constant fold. 878 unsigned Index = APIndex.getZExtValue(); 879 880 // From AMD documentation: "a value of zero in the field length is 881 // defined as length of 64". 882 unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue(); 883 884 // From AMD documentation: "If the sum of the bit index + length field 885 // is greater than 64, the results are undefined". 886 unsigned End = Index + Length; 887 888 // Note that both field index and field length are 8-bit quantities. 889 // Since variables 'Index' and 'Length' are unsigned values 890 // obtained from zero-extending field index and field length 891 // respectively, their sum should never wrap around. 892 if (End > 64) 893 return UndefValue::get(II.getType()); 894 895 // If we are inserting whole bytes, we can convert this to a shuffle. 896 // Lowering can recognize INSERTQI shuffle masks. 897 if ((Length % 8) == 0 && (Index % 8) == 0) { 898 // Convert bit indices to byte indices. 
899 Length /= 8; 900 Index /= 8; 901 902 Type *IntTy8 = Type::getInt8Ty(II.getContext()); 903 Type *IntTy32 = Type::getInt32Ty(II.getContext()); 904 VectorType *ShufTy = VectorType::get(IntTy8, 16); 905 906 SmallVector<Constant *, 16> ShuffleMask; 907 for (int i = 0; i != (int)Index; ++i) 908 ShuffleMask.push_back(Constant::getIntegerValue(IntTy32, APInt(32, i))); 909 for (int i = 0; i != (int)Length; ++i) 910 ShuffleMask.push_back( 911 Constant::getIntegerValue(IntTy32, APInt(32, i + 16))); 912 for (int i = Index + Length; i != 8; ++i) 913 ShuffleMask.push_back(Constant::getIntegerValue(IntTy32, APInt(32, i))); 914 for (int i = 8; i != 16; ++i) 915 ShuffleMask.push_back(UndefValue::get(IntTy32)); 916 917 Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy), 918 Builder.CreateBitCast(Op1, ShufTy), 919 ConstantVector::get(ShuffleMask)); 920 return Builder.CreateBitCast(SV, II.getType()); 921 } 922 923 // See if we're dealing with constant values. 924 Constant *C0 = dyn_cast<Constant>(Op0); 925 Constant *C1 = dyn_cast<Constant>(Op1); 926 ConstantInt *CI00 = 927 C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0)) 928 : nullptr; 929 ConstantInt *CI10 = 930 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0)) 931 : nullptr; 932 933 // Constant Fold - insert bottom Length bits starting at the Index'th bit. 934 if (CI00 && CI10) { 935 APInt V00 = CI00->getValue(); 936 APInt V10 = CI10->getValue(); 937 APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index); 938 V00 = V00 & ~Mask; 939 V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index); 940 APInt Val = V00 | V10; 941 Type *IntTy64 = Type::getInt64Ty(II.getContext()); 942 Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()), 943 UndefValue::get(IntTy64)}; 944 return ConstantVector::get(Args); 945 } 946 947 // If we were an INSERTQ call, we'll save demanded elements if we convert to 948 // INSERTQI. 
949 if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) { 950 Type *IntTy8 = Type::getInt8Ty(II.getContext()); 951 Constant *CILength = ConstantInt::get(IntTy8, Length, false); 952 Constant *CIIndex = ConstantInt::get(IntTy8, Index, false); 953 954 Value *Args[] = {Op0, Op1, CILength, CIIndex}; 955 Module *M = II.getModule(); 956 Value *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi); 957 return Builder.CreateCall(F, Args); 958 } 959 960 return nullptr; 961 } 962 963 /// Attempt to convert pshufb* to shufflevector if the mask is constant. 964 static Value *simplifyX86pshufb(const IntrinsicInst &II, 965 InstCombiner::BuilderTy &Builder) { 966 Constant *V = dyn_cast<Constant>(II.getArgOperand(1)); 967 if (!V) 968 return nullptr; 969 970 auto *VecTy = cast<VectorType>(II.getType()); 971 auto *MaskEltTy = Type::getInt32Ty(II.getContext()); 972 unsigned NumElts = VecTy->getNumElements(); 973 assert((NumElts == 16 || NumElts == 32 || NumElts == 64) && 974 "Unexpected number of elements in shuffle mask!"); 975 976 // Construct a shuffle mask from constant integers or UNDEFs. 977 Constant *Indexes[64] = {nullptr}; 978 979 // Each byte in the shuffle control mask forms an index to permute the 980 // corresponding byte in the destination operand. 981 for (unsigned I = 0; I < NumElts; ++I) { 982 Constant *COp = V->getAggregateElement(I); 983 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) 984 return nullptr; 985 986 if (isa<UndefValue>(COp)) { 987 Indexes[I] = UndefValue::get(MaskEltTy); 988 continue; 989 } 990 991 int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue(); 992 993 // If the most significant bit (bit[7]) of each byte of the shuffle 994 // control mask is set, then zero is written in the result byte. 995 // The zero vector is in the right-hand side of the resulting 996 // shufflevector. 
997 998 // The value of each index for the high 128-bit lane is the least 999 // significant 4 bits of the respective shuffle control byte. 1000 Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0); 1001 Indexes[I] = ConstantInt::get(MaskEltTy, Index); 1002 } 1003 1004 auto ShuffleMask = ConstantVector::get(makeArrayRef(Indexes, NumElts)); 1005 auto V1 = II.getArgOperand(0); 1006 auto V2 = Constant::getNullValue(VecTy); 1007 return Builder.CreateShuffleVector(V1, V2, ShuffleMask); 1008 } 1009 1010 /// Attempt to convert vpermilvar* to shufflevector if the mask is constant. 1011 static Value *simplifyX86vpermilvar(const IntrinsicInst &II, 1012 InstCombiner::BuilderTy &Builder) { 1013 Constant *V = dyn_cast<Constant>(II.getArgOperand(1)); 1014 if (!V) 1015 return nullptr; 1016 1017 auto *VecTy = cast<VectorType>(II.getType()); 1018 auto *MaskEltTy = Type::getInt32Ty(II.getContext()); 1019 unsigned NumElts = VecTy->getVectorNumElements(); 1020 bool IsPD = VecTy->getScalarType()->isDoubleTy(); 1021 unsigned NumLaneElts = IsPD ? 2 : 4; 1022 assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2); 1023 1024 // Construct a shuffle mask from constant integers or UNDEFs. 1025 Constant *Indexes[16] = {nullptr}; 1026 1027 // The intrinsics only read one or two bits, clear the rest. 1028 for (unsigned I = 0; I < NumElts; ++I) { 1029 Constant *COp = V->getAggregateElement(I); 1030 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) 1031 return nullptr; 1032 1033 if (isa<UndefValue>(COp)) { 1034 Indexes[I] = UndefValue::get(MaskEltTy); 1035 continue; 1036 } 1037 1038 APInt Index = cast<ConstantInt>(COp)->getValue(); 1039 Index = Index.zextOrTrunc(32).getLoBits(2); 1040 1041 // The PD variants uses bit 1 to select per-lane element index, so 1042 // shift down to convert to generic shuffle mask index. 
1043 if (IsPD) 1044 Index.lshrInPlace(1); 1045 1046 // The _256 variants are a bit trickier since the mask bits always index 1047 // into the corresponding 128 half. In order to convert to a generic 1048 // shuffle, we have to make that explicit. 1049 Index += APInt(32, (I / NumLaneElts) * NumLaneElts); 1050 1051 Indexes[I] = ConstantInt::get(MaskEltTy, Index); 1052 } 1053 1054 auto ShuffleMask = ConstantVector::get(makeArrayRef(Indexes, NumElts)); 1055 auto V1 = II.getArgOperand(0); 1056 auto V2 = UndefValue::get(V1->getType()); 1057 return Builder.CreateShuffleVector(V1, V2, ShuffleMask); 1058 } 1059 1060 /// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant. 1061 static Value *simplifyX86vpermv(const IntrinsicInst &II, 1062 InstCombiner::BuilderTy &Builder) { 1063 auto *V = dyn_cast<Constant>(II.getArgOperand(1)); 1064 if (!V) 1065 return nullptr; 1066 1067 auto *VecTy = cast<VectorType>(II.getType()); 1068 auto *MaskEltTy = Type::getInt32Ty(II.getContext()); 1069 unsigned Size = VecTy->getNumElements(); 1070 assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) && 1071 "Unexpected shuffle mask size"); 1072 1073 // Construct a shuffle mask from constant integers or UNDEFs. 
1074 Constant *Indexes[64] = {nullptr}; 1075 1076 for (unsigned I = 0; I < Size; ++I) { 1077 Constant *COp = V->getAggregateElement(I); 1078 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) 1079 return nullptr; 1080 1081 if (isa<UndefValue>(COp)) { 1082 Indexes[I] = UndefValue::get(MaskEltTy); 1083 continue; 1084 } 1085 1086 uint32_t Index = cast<ConstantInt>(COp)->getZExtValue(); 1087 Index &= Size - 1; 1088 Indexes[I] = ConstantInt::get(MaskEltTy, Index); 1089 } 1090 1091 auto ShuffleMask = ConstantVector::get(makeArrayRef(Indexes, Size)); 1092 auto V1 = II.getArgOperand(0); 1093 auto V2 = UndefValue::get(VecTy); 1094 return Builder.CreateShuffleVector(V1, V2, ShuffleMask); 1095 } 1096 1097 /// The shuffle mask for a perm2*128 selects any two halves of two 256-bit 1098 /// source vectors, unless a zero bit is set. If a zero bit is set, 1099 /// then ignore that half of the mask and clear that half of the vector. 1100 static Value *simplifyX86vperm2(const IntrinsicInst &II, 1101 InstCombiner::BuilderTy &Builder) { 1102 auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2)); 1103 if (!CInt) 1104 return nullptr; 1105 1106 VectorType *VecTy = cast<VectorType>(II.getType()); 1107 ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy); 1108 1109 // The immediate permute control byte looks like this: 1110 // [1:0] - select 128 bits from sources for low half of destination 1111 // [2] - ignore 1112 // [3] - zero low half of destination 1113 // [5:4] - select 128 bits from sources for high half of destination 1114 // [6] - ignore 1115 // [7] - zero high half of destination 1116 1117 uint8_t Imm = CInt->getZExtValue(); 1118 1119 bool LowHalfZero = Imm & 0x08; 1120 bool HighHalfZero = Imm & 0x80; 1121 1122 // If both zero mask bits are set, this was just a weird way to 1123 // generate a zero vector. 1124 if (LowHalfZero && HighHalfZero) 1125 return ZeroVector; 1126 1127 // If 0 or 1 zero mask bits are set, this is a simple shuffle. 
1128 unsigned NumElts = VecTy->getNumElements(); 1129 unsigned HalfSize = NumElts / 2; 1130 SmallVector<uint32_t, 8> ShuffleMask(NumElts); 1131 1132 // The high bit of the selection field chooses the 1st or 2nd operand. 1133 bool LowInputSelect = Imm & 0x02; 1134 bool HighInputSelect = Imm & 0x20; 1135 1136 // The low bit of the selection field chooses the low or high half 1137 // of the selected operand. 1138 bool LowHalfSelect = Imm & 0x01; 1139 bool HighHalfSelect = Imm & 0x10; 1140 1141 // Determine which operand(s) are actually in use for this instruction. 1142 Value *V0 = LowInputSelect ? II.getArgOperand(1) : II.getArgOperand(0); 1143 Value *V1 = HighInputSelect ? II.getArgOperand(1) : II.getArgOperand(0); 1144 1145 // If needed, replace operands based on zero mask. 1146 V0 = LowHalfZero ? ZeroVector : V0; 1147 V1 = HighHalfZero ? ZeroVector : V1; 1148 1149 // Permute low half of result. 1150 unsigned StartIndex = LowHalfSelect ? HalfSize : 0; 1151 for (unsigned i = 0; i < HalfSize; ++i) 1152 ShuffleMask[i] = StartIndex + i; 1153 1154 // Permute high half of result. 1155 StartIndex = HighHalfSelect ? HalfSize : 0; 1156 StartIndex += NumElts; 1157 for (unsigned i = 0; i < HalfSize; ++i) 1158 ShuffleMask[i + HalfSize] = StartIndex + i; 1159 1160 return Builder.CreateShuffleVector(V0, V1, ShuffleMask); 1161 } 1162 1163 /// Decode XOP integer vector comparison intrinsics. 1164 static Value *simplifyX86vpcom(const IntrinsicInst &II, 1165 InstCombiner::BuilderTy &Builder, 1166 bool IsSigned) { 1167 if (auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2))) { 1168 uint64_t Imm = CInt->getZExtValue() & 0x7; 1169 VectorType *VecTy = cast<VectorType>(II.getType()); 1170 CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE; 1171 1172 switch (Imm) { 1173 case 0x0: 1174 Pred = IsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT; 1175 break; 1176 case 0x1: 1177 Pred = IsSigned ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE; 1178 break; 1179 case 0x2: 1180 Pred = IsSigned ? 
ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT; 1181 break; 1182 case 0x3: 1183 Pred = IsSigned ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE; 1184 break; 1185 case 0x4: 1186 Pred = ICmpInst::ICMP_EQ; break; 1187 case 0x5: 1188 Pred = ICmpInst::ICMP_NE; break; 1189 case 0x6: 1190 return ConstantInt::getSigned(VecTy, 0); // FALSE 1191 case 0x7: 1192 return ConstantInt::getSigned(VecTy, -1); // TRUE 1193 } 1194 1195 if (Value *Cmp = Builder.CreateICmp(Pred, II.getArgOperand(0), 1196 II.getArgOperand(1))) 1197 return Builder.CreateSExtOrTrunc(Cmp, VecTy); 1198 } 1199 return nullptr; 1200 } 1201 1202 // Emit a select instruction and appropriate bitcasts to help simplify 1203 // masked intrinsics. 1204 static Value *emitX86MaskSelect(Value *Mask, Value *Op0, Value *Op1, 1205 InstCombiner::BuilderTy &Builder) { 1206 unsigned VWidth = Op0->getType()->getVectorNumElements(); 1207 1208 // If the mask is all ones we don't need the select. But we need to check 1209 // only the bit thats will be used in case VWidth is less than 8. 1210 if (auto *C = dyn_cast<ConstantInt>(Mask)) 1211 if (C->getValue().zextOrTrunc(VWidth).isAllOnesValue()) 1212 return Op0; 1213 1214 auto *MaskTy = VectorType::get(Builder.getInt1Ty(), 1215 cast<IntegerType>(Mask->getType())->getBitWidth()); 1216 Mask = Builder.CreateBitCast(Mask, MaskTy); 1217 1218 // If we have less than 8 elements, then the starting mask was an i8 and 1219 // we need to extract down to the right number of elements. 
1220 if (VWidth < 8) { 1221 uint32_t Indices[4]; 1222 for (unsigned i = 0; i != VWidth; ++i) 1223 Indices[i] = i; 1224 Mask = Builder.CreateShuffleVector(Mask, Mask, 1225 makeArrayRef(Indices, VWidth), 1226 "extract"); 1227 } 1228 1229 return Builder.CreateSelect(Mask, Op0, Op1); 1230 } 1231 1232 static Value *simplifyMinnumMaxnum(const IntrinsicInst &II) { 1233 Value *Arg0 = II.getArgOperand(0); 1234 Value *Arg1 = II.getArgOperand(1); 1235 1236 // fmin(x, x) -> x 1237 if (Arg0 == Arg1) 1238 return Arg0; 1239 1240 const auto *C1 = dyn_cast<ConstantFP>(Arg1); 1241 1242 // fmin(x, nan) -> x 1243 if (C1 && C1->isNaN()) 1244 return Arg0; 1245 1246 // This is the value because if undef were NaN, we would return the other 1247 // value and cannot return a NaN unless both operands are. 1248 // 1249 // fmin(undef, x) -> x 1250 if (isa<UndefValue>(Arg0)) 1251 return Arg1; 1252 1253 // fmin(x, undef) -> x 1254 if (isa<UndefValue>(Arg1)) 1255 return Arg0; 1256 1257 Value *X = nullptr; 1258 Value *Y = nullptr; 1259 if (II.getIntrinsicID() == Intrinsic::minnum) { 1260 // fmin(x, fmin(x, y)) -> fmin(x, y) 1261 // fmin(y, fmin(x, y)) -> fmin(x, y) 1262 if (match(Arg1, m_FMin(m_Value(X), m_Value(Y)))) { 1263 if (Arg0 == X || Arg0 == Y) 1264 return Arg1; 1265 } 1266 1267 // fmin(fmin(x, y), x) -> fmin(x, y) 1268 // fmin(fmin(x, y), y) -> fmin(x, y) 1269 if (match(Arg0, m_FMin(m_Value(X), m_Value(Y)))) { 1270 if (Arg1 == X || Arg1 == Y) 1271 return Arg0; 1272 } 1273 1274 // TODO: fmin(nnan x, inf) -> x 1275 // TODO: fmin(nnan ninf x, flt_max) -> x 1276 if (C1 && C1->isInfinity()) { 1277 // fmin(x, -inf) -> -inf 1278 if (C1->isNegative()) 1279 return Arg1; 1280 } 1281 } else { 1282 assert(II.getIntrinsicID() == Intrinsic::maxnum); 1283 // fmax(x, fmax(x, y)) -> fmax(x, y) 1284 // fmax(y, fmax(x, y)) -> fmax(x, y) 1285 if (match(Arg1, m_FMax(m_Value(X), m_Value(Y)))) { 1286 if (Arg0 == X || Arg0 == Y) 1287 return Arg1; 1288 } 1289 1290 // fmax(fmax(x, y), x) -> fmax(x, y) 1291 // 
fmax(fmax(x, y), y) -> fmax(x, y) 1292 if (match(Arg0, m_FMax(m_Value(X), m_Value(Y)))) { 1293 if (Arg1 == X || Arg1 == Y) 1294 return Arg0; 1295 } 1296 1297 // TODO: fmax(nnan x, -inf) -> x 1298 // TODO: fmax(nnan ninf x, -flt_max) -> x 1299 if (C1 && C1->isInfinity()) { 1300 // fmax(x, inf) -> inf 1301 if (!C1->isNegative()) 1302 return Arg1; 1303 } 1304 } 1305 return nullptr; 1306 } 1307 1308 static bool maskIsAllOneOrUndef(Value *Mask) { 1309 auto *ConstMask = dyn_cast<Constant>(Mask); 1310 if (!ConstMask) 1311 return false; 1312 if (ConstMask->isAllOnesValue() || isa<UndefValue>(ConstMask)) 1313 return true; 1314 for (unsigned I = 0, E = ConstMask->getType()->getVectorNumElements(); I != E; 1315 ++I) { 1316 if (auto *MaskElt = ConstMask->getAggregateElement(I)) 1317 if (MaskElt->isAllOnesValue() || isa<UndefValue>(MaskElt)) 1318 continue; 1319 return false; 1320 } 1321 return true; 1322 } 1323 1324 static Value *simplifyMaskedLoad(const IntrinsicInst &II, 1325 InstCombiner::BuilderTy &Builder) { 1326 // If the mask is all ones or undefs, this is a plain vector load of the 1st 1327 // argument. 1328 if (maskIsAllOneOrUndef(II.getArgOperand(2))) { 1329 Value *LoadPtr = II.getArgOperand(0); 1330 unsigned Alignment = cast<ConstantInt>(II.getArgOperand(1))->getZExtValue(); 1331 return Builder.CreateAlignedLoad(LoadPtr, Alignment, "unmaskedload"); 1332 } 1333 1334 return nullptr; 1335 } 1336 1337 static Instruction *simplifyMaskedStore(IntrinsicInst &II, InstCombiner &IC) { 1338 auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3)); 1339 if (!ConstMask) 1340 return nullptr; 1341 1342 // If the mask is all zeros, this instruction does nothing. 1343 if (ConstMask->isNullValue()) 1344 return IC.eraseInstFromFunction(II); 1345 1346 // If the mask is all ones, this is a plain vector store of the 1st argument. 
1347 if (ConstMask->isAllOnesValue()) { 1348 Value *StorePtr = II.getArgOperand(1); 1349 unsigned Alignment = cast<ConstantInt>(II.getArgOperand(2))->getZExtValue(); 1350 return new StoreInst(II.getArgOperand(0), StorePtr, false, Alignment); 1351 } 1352 1353 return nullptr; 1354 } 1355 1356 static Instruction *simplifyMaskedGather(IntrinsicInst &II, InstCombiner &IC) { 1357 // If the mask is all zeros, return the "passthru" argument of the gather. 1358 auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(2)); 1359 if (ConstMask && ConstMask->isNullValue()) 1360 return IC.replaceInstUsesWith(II, II.getArgOperand(3)); 1361 1362 return nullptr; 1363 } 1364 1365 static Instruction *simplifyMaskedScatter(IntrinsicInst &II, InstCombiner &IC) { 1366 // If the mask is all zeros, a scatter does nothing. 1367 auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3)); 1368 if (ConstMask && ConstMask->isNullValue()) 1369 return IC.eraseInstFromFunction(II); 1370 1371 return nullptr; 1372 } 1373 1374 static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombiner &IC) { 1375 assert((II.getIntrinsicID() == Intrinsic::cttz || 1376 II.getIntrinsicID() == Intrinsic::ctlz) && 1377 "Expected cttz or ctlz intrinsic"); 1378 Value *Op0 = II.getArgOperand(0); 1379 1380 KnownBits Known = IC.computeKnownBits(Op0, 0, &II); 1381 1382 // Create a mask for bits above (ctlz) or below (cttz) the first known one. 1383 bool IsTZ = II.getIntrinsicID() == Intrinsic::cttz; 1384 unsigned PossibleZeros = IsTZ ? Known.countMaxTrailingZeros() 1385 : Known.countMaxLeadingZeros(); 1386 unsigned DefiniteZeros = IsTZ ? Known.countMinTrailingZeros() 1387 : Known.countMinLeadingZeros(); 1388 1389 // If all bits above (ctlz) or below (cttz) the first known one are known 1390 // zero, this value is constant. 1391 // FIXME: This should be in InstSimplify because we're replacing an 1392 // instruction with a constant. 
1393 if (PossibleZeros == DefiniteZeros) { 1394 auto *C = ConstantInt::get(Op0->getType(), DefiniteZeros); 1395 return IC.replaceInstUsesWith(II, C); 1396 } 1397 1398 // If the input to cttz/ctlz is known to be non-zero, 1399 // then change the 'ZeroIsUndef' parameter to 'true' 1400 // because we know the zero behavior can't affect the result. 1401 if (!Known.One.isNullValue() || 1402 isKnownNonZero(Op0, IC.getDataLayout(), 0, &IC.getAssumptionCache(), &II, 1403 &IC.getDominatorTree())) { 1404 if (!match(II.getArgOperand(1), m_One())) { 1405 II.setOperand(1, IC.Builder.getTrue()); 1406 return &II; 1407 } 1408 } 1409 1410 // Add range metadata since known bits can't completely reflect what we know. 1411 // TODO: Handle splat vectors. 1412 auto *IT = dyn_cast<IntegerType>(Op0->getType()); 1413 if (IT && IT->getBitWidth() != 1 && !II.getMetadata(LLVMContext::MD_range)) { 1414 Metadata *LowAndHigh[] = { 1415 ConstantAsMetadata::get(ConstantInt::get(IT, DefiniteZeros)), 1416 ConstantAsMetadata::get(ConstantInt::get(IT, PossibleZeros + 1))}; 1417 II.setMetadata(LLVMContext::MD_range, 1418 MDNode::get(II.getContext(), LowAndHigh)); 1419 return &II; 1420 } 1421 1422 return nullptr; 1423 } 1424 1425 static Instruction *foldCtpop(IntrinsicInst &II, InstCombiner &IC) { 1426 assert(II.getIntrinsicID() == Intrinsic::ctpop && 1427 "Expected ctpop intrinsic"); 1428 Value *Op0 = II.getArgOperand(0); 1429 // FIXME: Try to simplify vectors of integers. 1430 auto *IT = dyn_cast<IntegerType>(Op0->getType()); 1431 if (!IT) 1432 return nullptr; 1433 1434 unsigned BitWidth = IT->getBitWidth(); 1435 KnownBits Known(BitWidth); 1436 IC.computeKnownBits(Op0, Known, 0, &II); 1437 1438 unsigned MinCount = Known.countMinPopulation(); 1439 unsigned MaxCount = Known.countMaxPopulation(); 1440 1441 // Add range metadata since known bits can't completely reflect what we know. 
1442 if (IT->getBitWidth() != 1 && !II.getMetadata(LLVMContext::MD_range)) { 1443 Metadata *LowAndHigh[] = { 1444 ConstantAsMetadata::get(ConstantInt::get(IT, MinCount)), 1445 ConstantAsMetadata::get(ConstantInt::get(IT, MaxCount + 1))}; 1446 II.setMetadata(LLVMContext::MD_range, 1447 MDNode::get(II.getContext(), LowAndHigh)); 1448 return &II; 1449 } 1450 1451 return nullptr; 1452 } 1453 1454 // TODO: If the x86 backend knew how to convert a bool vector mask back to an 1455 // XMM register mask efficiently, we could transform all x86 masked intrinsics 1456 // to LLVM masked intrinsics and remove the x86 masked intrinsic defs. 1457 static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) { 1458 Value *Ptr = II.getOperand(0); 1459 Value *Mask = II.getOperand(1); 1460 Constant *ZeroVec = Constant::getNullValue(II.getType()); 1461 1462 // Special case a zero mask since that's not a ConstantDataVector. 1463 // This masked load instruction creates a zero vector. 1464 if (isa<ConstantAggregateZero>(Mask)) 1465 return IC.replaceInstUsesWith(II, ZeroVec); 1466 1467 auto *ConstMask = dyn_cast<ConstantDataVector>(Mask); 1468 if (!ConstMask) 1469 return nullptr; 1470 1471 // The mask is constant. Convert this x86 intrinsic to the LLVM instrinsic 1472 // to allow target-independent optimizations. 1473 1474 // First, cast the x86 intrinsic scalar pointer to a vector pointer to match 1475 // the LLVM intrinsic definition for the pointer argument. 1476 unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace(); 1477 PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace); 1478 Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec"); 1479 1480 // Second, convert the x86 XMM integer vector mask to a vector of bools based 1481 // on each element's most significant bit (the sign bit). 1482 Constant *BoolMask = getNegativeIsTrueBoolVec(ConstMask); 1483 1484 // The pass-through vector for an x86 masked load is a zero vector. 
1485 CallInst *NewMaskedLoad = 1486 IC.Builder.CreateMaskedLoad(PtrCast, 1, BoolMask, ZeroVec); 1487 return IC.replaceInstUsesWith(II, NewMaskedLoad); 1488 } 1489 1490 // TODO: If the x86 backend knew how to convert a bool vector mask back to an 1491 // XMM register mask efficiently, we could transform all x86 masked intrinsics 1492 // to LLVM masked intrinsics and remove the x86 masked intrinsic defs. 1493 static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) { 1494 Value *Ptr = II.getOperand(0); 1495 Value *Mask = II.getOperand(1); 1496 Value *Vec = II.getOperand(2); 1497 1498 // Special case a zero mask since that's not a ConstantDataVector: 1499 // this masked store instruction does nothing. 1500 if (isa<ConstantAggregateZero>(Mask)) { 1501 IC.eraseInstFromFunction(II); 1502 return true; 1503 } 1504 1505 // The SSE2 version is too weird (eg, unaligned but non-temporal) to do 1506 // anything else at this level. 1507 if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu) 1508 return false; 1509 1510 auto *ConstMask = dyn_cast<ConstantDataVector>(Mask); 1511 if (!ConstMask) 1512 return false; 1513 1514 // The mask is constant. Convert this x86 intrinsic to the LLVM instrinsic 1515 // to allow target-independent optimizations. 1516 1517 // First, cast the x86 intrinsic scalar pointer to a vector pointer to match 1518 // the LLVM intrinsic definition for the pointer argument. 1519 unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace(); 1520 PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace); 1521 Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec"); 1522 1523 // Second, convert the x86 XMM integer vector mask to a vector of bools based 1524 // on each element's most significant bit (the sign bit). 1525 Constant *BoolMask = getNegativeIsTrueBoolVec(ConstMask); 1526 1527 IC.Builder.CreateMaskedStore(Vec, PtrCast, 1, BoolMask); 1528 1529 // 'Replace uses' doesn't work for stores. 
Erase the original masked store. 1530 IC.eraseInstFromFunction(II); 1531 return true; 1532 } 1533 1534 // Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs. 1535 // 1536 // A single NaN input is folded to minnum, so we rely on that folding for 1537 // handling NaNs. 1538 static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1, 1539 const APFloat &Src2) { 1540 APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2); 1541 1542 APFloat::cmpResult Cmp0 = Max3.compare(Src0); 1543 assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately"); 1544 if (Cmp0 == APFloat::cmpEqual) 1545 return maxnum(Src1, Src2); 1546 1547 APFloat::cmpResult Cmp1 = Max3.compare(Src1); 1548 assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately"); 1549 if (Cmp1 == APFloat::cmpEqual) 1550 return maxnum(Src0, Src2); 1551 1552 return maxnum(Src0, Src1); 1553 } 1554 1555 // Returns true iff the 2 intrinsics have the same operands, limiting the 1556 // comparison to the first NumOperands. 1557 static bool haveSameOperands(const IntrinsicInst &I, const IntrinsicInst &E, 1558 unsigned NumOperands) { 1559 assert(I.getNumArgOperands() >= NumOperands && "Not enough operands"); 1560 assert(E.getNumArgOperands() >= NumOperands && "Not enough operands"); 1561 for (unsigned i = 0; i < NumOperands; i++) 1562 if (I.getArgOperand(i) != E.getArgOperand(i)) 1563 return false; 1564 return true; 1565 } 1566 1567 // Remove trivially empty start/end intrinsic ranges, i.e. a start 1568 // immediately followed by an end (ignoring debuginfo or other 1569 // start/end intrinsics in between). 
As this handles only the most trivial 1570 // cases, tracking the nesting level is not needed: 1571 // 1572 // call @llvm.foo.start(i1 0) ; &I 1573 // call @llvm.foo.start(i1 0) 1574 // call @llvm.foo.end(i1 0) ; This one will not be skipped: it will be removed 1575 // call @llvm.foo.end(i1 0) 1576 static bool removeTriviallyEmptyRange(IntrinsicInst &I, unsigned StartID, 1577 unsigned EndID, InstCombiner &IC) { 1578 assert(I.getIntrinsicID() == StartID && 1579 "Start intrinsic does not have expected ID"); 1580 BasicBlock::iterator BI(I), BE(I.getParent()->end()); 1581 for (++BI; BI != BE; ++BI) { 1582 if (auto *E = dyn_cast<IntrinsicInst>(BI)) { 1583 if (isa<DbgInfoIntrinsic>(E) || E->getIntrinsicID() == StartID) 1584 continue; 1585 if (E->getIntrinsicID() == EndID && 1586 haveSameOperands(I, *E, E->getNumArgOperands())) { 1587 IC.eraseInstFromFunction(*E); 1588 IC.eraseInstFromFunction(I); 1589 return true; 1590 } 1591 } 1592 break; 1593 } 1594 1595 return false; 1596 } 1597 1598 // Convert NVVM intrinsics to target-generic LLVM code where possible. 1599 static Instruction *SimplifyNVVMIntrinsic(IntrinsicInst *II, InstCombiner &IC) { 1600 // Each NVVM intrinsic we can simplify can be replaced with one of: 1601 // 1602 // * an LLVM intrinsic, 1603 // * an LLVM cast operation, 1604 // * an LLVM binary operation, or 1605 // * ad-hoc LLVM IR for the particular operation. 1606 1607 // Some transformations are only valid when the module's 1608 // flush-denormals-to-zero (ftz) setting is true/false, whereas other 1609 // transformations are valid regardless of the module's ftz setting. 1610 enum FtzRequirementTy { 1611 FTZ_Any, // Any ftz setting is ok. 1612 FTZ_MustBeOn, // Transformation is valid only if ftz is on. 1613 FTZ_MustBeOff, // Transformation is valid only if ftz is off. 1614 }; 1615 // Classes of NVVM intrinsics that can't be replaced one-to-one with a 1616 // target-generic intrinsic, cast op, or binary op but that we can nonetheless 1617 // simplify. 
1618 enum SpecialCase { 1619 SPC_Reciprocal, 1620 }; 1621 1622 // SimplifyAction is a poor-man's variant (plus an additional flag) that 1623 // represents how to replace an NVVM intrinsic with target-generic LLVM IR. 1624 struct SimplifyAction { 1625 // Invariant: At most one of these Optionals has a value. 1626 Optional<Intrinsic::ID> IID; 1627 Optional<Instruction::CastOps> CastOp; 1628 Optional<Instruction::BinaryOps> BinaryOp; 1629 Optional<SpecialCase> Special; 1630 1631 FtzRequirementTy FtzRequirement = FTZ_Any; 1632 1633 SimplifyAction() = default; 1634 1635 SimplifyAction(Intrinsic::ID IID, FtzRequirementTy FtzReq) 1636 : IID(IID), FtzRequirement(FtzReq) {} 1637 1638 // Cast operations don't have anything to do with FTZ, so we skip that 1639 // argument. 1640 SimplifyAction(Instruction::CastOps CastOp) : CastOp(CastOp) {} 1641 1642 SimplifyAction(Instruction::BinaryOps BinaryOp, FtzRequirementTy FtzReq) 1643 : BinaryOp(BinaryOp), FtzRequirement(FtzReq) {} 1644 1645 SimplifyAction(SpecialCase Special, FtzRequirementTy FtzReq) 1646 : Special(Special), FtzRequirement(FtzReq) {} 1647 }; 1648 1649 // Try to generate a SimplifyAction describing how to replace our 1650 // IntrinsicInstr with target-generic LLVM IR. 1651 const SimplifyAction Action = [II]() -> SimplifyAction { 1652 switch (II->getIntrinsicID()) { 1653 1654 // NVVM intrinsics that map directly to LLVM intrinsics. 
1655 case Intrinsic::nvvm_ceil_d: 1656 return {Intrinsic::ceil, FTZ_Any}; 1657 case Intrinsic::nvvm_ceil_f: 1658 return {Intrinsic::ceil, FTZ_MustBeOff}; 1659 case Intrinsic::nvvm_ceil_ftz_f: 1660 return {Intrinsic::ceil, FTZ_MustBeOn}; 1661 case Intrinsic::nvvm_fabs_d: 1662 return {Intrinsic::fabs, FTZ_Any}; 1663 case Intrinsic::nvvm_fabs_f: 1664 return {Intrinsic::fabs, FTZ_MustBeOff}; 1665 case Intrinsic::nvvm_fabs_ftz_f: 1666 return {Intrinsic::fabs, FTZ_MustBeOn}; 1667 case Intrinsic::nvvm_floor_d: 1668 return {Intrinsic::floor, FTZ_Any}; 1669 case Intrinsic::nvvm_floor_f: 1670 return {Intrinsic::floor, FTZ_MustBeOff}; 1671 case Intrinsic::nvvm_floor_ftz_f: 1672 return {Intrinsic::floor, FTZ_MustBeOn}; 1673 case Intrinsic::nvvm_fma_rn_d: 1674 return {Intrinsic::fma, FTZ_Any}; 1675 case Intrinsic::nvvm_fma_rn_f: 1676 return {Intrinsic::fma, FTZ_MustBeOff}; 1677 case Intrinsic::nvvm_fma_rn_ftz_f: 1678 return {Intrinsic::fma, FTZ_MustBeOn}; 1679 case Intrinsic::nvvm_fmax_d: 1680 return {Intrinsic::maxnum, FTZ_Any}; 1681 case Intrinsic::nvvm_fmax_f: 1682 return {Intrinsic::maxnum, FTZ_MustBeOff}; 1683 case Intrinsic::nvvm_fmax_ftz_f: 1684 return {Intrinsic::maxnum, FTZ_MustBeOn}; 1685 case Intrinsic::nvvm_fmin_d: 1686 return {Intrinsic::minnum, FTZ_Any}; 1687 case Intrinsic::nvvm_fmin_f: 1688 return {Intrinsic::minnum, FTZ_MustBeOff}; 1689 case Intrinsic::nvvm_fmin_ftz_f: 1690 return {Intrinsic::minnum, FTZ_MustBeOn}; 1691 case Intrinsic::nvvm_round_d: 1692 return {Intrinsic::round, FTZ_Any}; 1693 case Intrinsic::nvvm_round_f: 1694 return {Intrinsic::round, FTZ_MustBeOff}; 1695 case Intrinsic::nvvm_round_ftz_f: 1696 return {Intrinsic::round, FTZ_MustBeOn}; 1697 case Intrinsic::nvvm_sqrt_rn_d: 1698 return {Intrinsic::sqrt, FTZ_Any}; 1699 case Intrinsic::nvvm_sqrt_f: 1700 // nvvm_sqrt_f is a special case. For most intrinsics, foo_ftz_f is the 1701 // ftz version, and foo_f is the non-ftz version. But nvvm_sqrt_f adopts 1702 // the ftz-ness of the surrounding code. 
sqrt_rn_f and sqrt_rn_ftz_f are 1703 // the versions with explicit ftz-ness. 1704 return {Intrinsic::sqrt, FTZ_Any}; 1705 case Intrinsic::nvvm_sqrt_rn_f: 1706 return {Intrinsic::sqrt, FTZ_MustBeOff}; 1707 case Intrinsic::nvvm_sqrt_rn_ftz_f: 1708 return {Intrinsic::sqrt, FTZ_MustBeOn}; 1709 case Intrinsic::nvvm_trunc_d: 1710 return {Intrinsic::trunc, FTZ_Any}; 1711 case Intrinsic::nvvm_trunc_f: 1712 return {Intrinsic::trunc, FTZ_MustBeOff}; 1713 case Intrinsic::nvvm_trunc_ftz_f: 1714 return {Intrinsic::trunc, FTZ_MustBeOn}; 1715 1716 // NVVM intrinsics that map to LLVM cast operations. 1717 // 1718 // Note that llvm's target-generic conversion operators correspond to the rz 1719 // (round to zero) versions of the nvvm conversion intrinsics, even though 1720 // most everything else here uses the rn (round to nearest even) nvvm ops. 1721 case Intrinsic::nvvm_d2i_rz: 1722 case Intrinsic::nvvm_f2i_rz: 1723 case Intrinsic::nvvm_d2ll_rz: 1724 case Intrinsic::nvvm_f2ll_rz: 1725 return {Instruction::FPToSI}; 1726 case Intrinsic::nvvm_d2ui_rz: 1727 case Intrinsic::nvvm_f2ui_rz: 1728 case Intrinsic::nvvm_d2ull_rz: 1729 case Intrinsic::nvvm_f2ull_rz: 1730 return {Instruction::FPToUI}; 1731 case Intrinsic::nvvm_i2d_rz: 1732 case Intrinsic::nvvm_i2f_rz: 1733 case Intrinsic::nvvm_ll2d_rz: 1734 case Intrinsic::nvvm_ll2f_rz: 1735 return {Instruction::SIToFP}; 1736 case Intrinsic::nvvm_ui2d_rz: 1737 case Intrinsic::nvvm_ui2f_rz: 1738 case Intrinsic::nvvm_ull2d_rz: 1739 case Intrinsic::nvvm_ull2f_rz: 1740 return {Instruction::UIToFP}; 1741 1742 // NVVM intrinsics that map to LLVM binary ops. 
1743 case Intrinsic::nvvm_add_rn_d: 1744 return {Instruction::FAdd, FTZ_Any}; 1745 case Intrinsic::nvvm_add_rn_f: 1746 return {Instruction::FAdd, FTZ_MustBeOff}; 1747 case Intrinsic::nvvm_add_rn_ftz_f: 1748 return {Instruction::FAdd, FTZ_MustBeOn}; 1749 case Intrinsic::nvvm_mul_rn_d: 1750 return {Instruction::FMul, FTZ_Any}; 1751 case Intrinsic::nvvm_mul_rn_f: 1752 return {Instruction::FMul, FTZ_MustBeOff}; 1753 case Intrinsic::nvvm_mul_rn_ftz_f: 1754 return {Instruction::FMul, FTZ_MustBeOn}; 1755 case Intrinsic::nvvm_div_rn_d: 1756 return {Instruction::FDiv, FTZ_Any}; 1757 case Intrinsic::nvvm_div_rn_f: 1758 return {Instruction::FDiv, FTZ_MustBeOff}; 1759 case Intrinsic::nvvm_div_rn_ftz_f: 1760 return {Instruction::FDiv, FTZ_MustBeOn}; 1761 1762 // The remainder of cases are NVVM intrinsics that map to LLVM idioms, but 1763 // need special handling. 1764 // 1765 // We seem to be missing intrinsics for rcp.approx.{ftz.}f32, which is just 1766 // as well. 1767 case Intrinsic::nvvm_rcp_rn_d: 1768 return {SPC_Reciprocal, FTZ_Any}; 1769 case Intrinsic::nvvm_rcp_rn_f: 1770 return {SPC_Reciprocal, FTZ_MustBeOff}; 1771 case Intrinsic::nvvm_rcp_rn_ftz_f: 1772 return {SPC_Reciprocal, FTZ_MustBeOn}; 1773 1774 // We do not currently simplify intrinsics that give an approximate answer. 1775 // These include: 1776 // 1777 // - nvvm_cos_approx_{f,ftz_f} 1778 // - nvvm_ex2_approx_{d,f,ftz_f} 1779 // - nvvm_lg2_approx_{d,f,ftz_f} 1780 // - nvvm_sin_approx_{f,ftz_f} 1781 // - nvvm_sqrt_approx_{f,ftz_f} 1782 // - nvvm_rsqrt_approx_{d,f,ftz_f} 1783 // - nvvm_div_approx_{ftz_d,ftz_f,f} 1784 // - nvvm_rcp_approx_ftz_d 1785 // 1786 // Ideally we'd encode them as e.g. "fast call @llvm.cos", where "fast" 1787 // means that fastmath is enabled in the intrinsic. Unfortunately only 1788 // binary operators (currently) have a fastmath bit in SelectionDAG, so this 1789 // information gets lost and we can't select on it. 
1790 // 1791 // TODO: div and rcp are lowered to a binary op, so these we could in theory 1792 // lower them to "fast fdiv". 1793 1794 default: 1795 return {}; 1796 } 1797 }(); 1798 1799 // If Action.FtzRequirementTy is not satisfied by the module's ftz state, we 1800 // can bail out now. (Notice that in the case that IID is not an NVVM 1801 // intrinsic, we don't have to look up any module metadata, as 1802 // FtzRequirementTy will be FTZ_Any.) 1803 if (Action.FtzRequirement != FTZ_Any) { 1804 bool FtzEnabled = 1805 II->getFunction()->getFnAttribute("nvptx-f32ftz").getValueAsString() == 1806 "true"; 1807 1808 if (FtzEnabled != (Action.FtzRequirement == FTZ_MustBeOn)) 1809 return nullptr; 1810 } 1811 1812 // Simplify to target-generic intrinsic. 1813 if (Action.IID) { 1814 SmallVector<Value *, 4> Args(II->arg_operands()); 1815 // All the target-generic intrinsics currently of interest to us have one 1816 // type argument, equal to that of the nvvm intrinsic's argument. 1817 Type *Tys[] = {II->getArgOperand(0)->getType()}; 1818 return CallInst::Create( 1819 Intrinsic::getDeclaration(II->getModule(), *Action.IID, Tys), Args); 1820 } 1821 1822 // Simplify to target-generic binary op. 1823 if (Action.BinaryOp) 1824 return BinaryOperator::Create(*Action.BinaryOp, II->getArgOperand(0), 1825 II->getArgOperand(1), II->getName()); 1826 1827 // Simplify to target-generic cast op. 1828 if (Action.CastOp) 1829 return CastInst::Create(*Action.CastOp, II->getArgOperand(0), II->getType(), 1830 II->getName()); 1831 1832 // All that's left are the special cases. 1833 if (!Action.Special) 1834 return nullptr; 1835 1836 switch (*Action.Special) { 1837 case SPC_Reciprocal: 1838 // Simplify reciprocal. 
1839 return BinaryOperator::Create( 1840 Instruction::FDiv, ConstantFP::get(II->getArgOperand(0)->getType(), 1), 1841 II->getArgOperand(0), II->getName()); 1842 } 1843 llvm_unreachable("All SpecialCase enumerators should be handled in switch."); 1844 } 1845 /* Visits a VAStartInst: passes I, the Intrinsic::vastart / Intrinsic::vaend ID pair and *this to removeTriviallyEmptyRange, then unconditionally returns nullptr (nothing the helper may return is inspected here). NOTE(review): going by the helper's name, this presumably drops a va_start..va_end range that has nothing in between; the helper is defined outside this view -- confirm there. */ 1846 Instruction *InstCombiner::visitVAStartInst(VAStartInst &I) { 1847 removeTriviallyEmptyRange(I, Intrinsic::vastart, Intrinsic::vaend, *this); 1848 return nullptr; 1849 } 1850 /* Visits a VACopyInst: same shape as visitVAStartInst, but the ID pair handed to removeTriviallyEmptyRange is Intrinsic::vacopy / Intrinsic::vaend. Always returns nullptr. NOTE(review): presumably removes a va_copy..va_end range with nothing in between; the helper is defined outside this view -- confirm there. */ 1851 Instruction *InstCombiner::visitVACopyInst(VACopyInst &I) { 1852 removeTriviallyEmptyRange(I, Intrinsic::vacopy, Intrinsic::vaend, *this); 1853 return nullptr; 1854 } 1855 1856 /// CallInst simplification. This mostly only handles folding of intrinsic 1857 /// instructions. For normal calls, it allows visitCallSite to do the heavy 1858 /// lifting. 1859 Instruction *InstCombiner::visitCallInst(CallInst &CI) { 1860 auto Args = CI.arg_operands(); 1861 if (Value *V = SimplifyCall(&CI, CI.getCalledValue(), Args.begin(), 1862 Args.end(), SQ.getWithInstruction(&CI))) 1863 return replaceInstUsesWith(CI, V); 1864 1865 if (isFreeCall(&CI, &TLI)) 1866 return visitFree(CI); 1867 1868 // If the caller function is nounwind, mark the call as nounwind, even if the 1869 // callee isn't. 1870 if (CI.getFunction()->doesNotThrow() && !CI.doesNotThrow()) { 1871 CI.setDoesNotThrow(); 1872 return &CI; 1873 } 1874 1875 IntrinsicInst *II = dyn_cast<IntrinsicInst>(&CI); 1876 if (!II) return visitCallSite(&CI); 1877 1878 // Intrinsics cannot occur in an invoke, so handle them here instead of in 1879 // visitCallSite. 1880 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(II)) { 1881 bool Changed = false; 1882 1883 // memmove/cpy/set of zero bytes is a noop. 1884 if (Constant *NumBytes = dyn_cast<Constant>(MI->getLength())) { 1885 if (NumBytes->isNullValue()) 1886 return eraseInstFromFunction(CI); 1887 1888 if (ConstantInt *CI = dyn_cast<ConstantInt>(NumBytes)) 1889 if (CI->getZExtValue() == 1) { 1890 // Replace the instruction with just byte operations.
We would 1891 // transform other cases to loads/stores, but we don't know if 1892 // alignment is sufficient. 1893 } 1894 } 1895 1896 // No other transformations apply to volatile transfers. 1897 if (MI->isVolatile()) 1898 return nullptr; 1899 1900 // If we have a memmove and the source operation is a constant global, 1901 // then the source and dest pointers can't alias, so we can change this 1902 // into a call to memcpy. 1903 if (MemMoveInst *MMI = dyn_cast<MemMoveInst>(MI)) { 1904 if (GlobalVariable *GVSrc = dyn_cast<GlobalVariable>(MMI->getSource())) 1905 if (GVSrc->isConstant()) { 1906 Module *M = CI.getModule(); 1907 Intrinsic::ID MemCpyID = Intrinsic::memcpy; 1908 Type *Tys[3] = { CI.getArgOperand(0)->getType(), 1909 CI.getArgOperand(1)->getType(), 1910 CI.getArgOperand(2)->getType() }; 1911 CI.setCalledFunction(Intrinsic::getDeclaration(M, MemCpyID, Tys)); 1912 Changed = true; 1913 } 1914 } 1915 1916 if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) { 1917 // memmove(x,x,size) -> noop. 1918 if (MTI->getSource() == MTI->getDest()) 1919 return eraseInstFromFunction(CI); 1920 } 1921 1922 // If we can determine a pointer alignment that is bigger than currently 1923 // set, update the alignment. 
1924 if (isa<MemTransferInst>(MI)) { 1925 if (Instruction *I = SimplifyMemTransfer(MI)) 1926 return I; 1927 } else if (MemSetInst *MSI = dyn_cast<MemSetInst>(MI)) { 1928 if (Instruction *I = SimplifyMemSet(MSI)) 1929 return I; 1930 } 1931 1932 if (Changed) return II; 1933 } 1934 1935 if (auto *AMI = dyn_cast<ElementUnorderedAtomicMemCpyInst>(II)) { 1936 if (Constant *C = dyn_cast<Constant>(AMI->getLength())) 1937 if (C->isNullValue()) 1938 return eraseInstFromFunction(*AMI); 1939 1940 if (Instruction *I = SimplifyElementUnorderedAtomicMemCpy(AMI)) 1941 return I; 1942 } 1943 1944 if (Instruction *I = SimplifyNVVMIntrinsic(II, *this)) 1945 return I; 1946 1947 auto SimplifyDemandedVectorEltsLow = [this](Value *Op, unsigned Width, 1948 unsigned DemandedWidth) { 1949 APInt UndefElts(Width, 0); 1950 APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth); 1951 return SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts); 1952 }; 1953 1954 switch (II->getIntrinsicID()) { 1955 default: break; 1956 case Intrinsic::objectsize: 1957 if (ConstantInt *N = 1958 lowerObjectSizeCall(II, DL, &TLI, /*MustSucceed=*/false)) 1959 return replaceInstUsesWith(CI, N); 1960 return nullptr; 1961 1962 case Intrinsic::bswap: { 1963 Value *IIOperand = II->getArgOperand(0); 1964 Value *X = nullptr; 1965 1966 // TODO should this be in InstSimplify? 
1967 // bswap(bswap(x)) -> x 1968 if (match(IIOperand, m_BSwap(m_Value(X)))) 1969 return replaceInstUsesWith(CI, X); 1970 1971 // bswap(trunc(bswap(x))) -> trunc(lshr(x, c)) 1972 if (match(IIOperand, m_Trunc(m_BSwap(m_Value(X))))) { 1973 unsigned C = X->getType()->getPrimitiveSizeInBits() - 1974 IIOperand->getType()->getPrimitiveSizeInBits(); 1975 Value *CV = ConstantInt::get(X->getType(), C); 1976 Value *V = Builder.CreateLShr(X, CV); 1977 return new TruncInst(V, IIOperand->getType()); 1978 } 1979 break; 1980 } 1981 1982 case Intrinsic::bitreverse: { 1983 Value *IIOperand = II->getArgOperand(0); 1984 Value *X = nullptr; 1985 1986 // TODO should this be in InstSimplify? 1987 // bitreverse(bitreverse(x)) -> x 1988 if (match(IIOperand, m_BitReverse(m_Value(X)))) 1989 return replaceInstUsesWith(CI, X); 1990 break; 1991 } 1992 1993 case Intrinsic::masked_load: 1994 if (Value *SimplifiedMaskedOp = simplifyMaskedLoad(*II, Builder)) 1995 return replaceInstUsesWith(CI, SimplifiedMaskedOp); 1996 break; 1997 case Intrinsic::masked_store: 1998 return simplifyMaskedStore(*II, *this); 1999 case Intrinsic::masked_gather: 2000 return simplifyMaskedGather(*II, *this); 2001 case Intrinsic::masked_scatter: 2002 return simplifyMaskedScatter(*II, *this); 2003 2004 case Intrinsic::powi: 2005 if (ConstantInt *Power = dyn_cast<ConstantInt>(II->getArgOperand(1))) { 2006 // powi(x, 0) -> 1.0 2007 if (Power->isZero()) 2008 return replaceInstUsesWith(CI, ConstantFP::get(CI.getType(), 1.0)); 2009 // powi(x, 1) -> x 2010 if (Power->isOne()) 2011 return replaceInstUsesWith(CI, II->getArgOperand(0)); 2012 // powi(x, -1) -> 1/x 2013 if (Power->isMinusOne()) 2014 return BinaryOperator::CreateFDiv(ConstantFP::get(CI.getType(), 1.0), 2015 II->getArgOperand(0)); 2016 } 2017 break; 2018 2019 case Intrinsic::cttz: 2020 case Intrinsic::ctlz: 2021 if (auto *I = foldCttzCtlz(*II, *this)) 2022 return I; 2023 break; 2024 2025 case Intrinsic::ctpop: 2026 if (auto *I = foldCtpop(*II, *this)) 2027 return I; 
2028 break; 2029 2030 case Intrinsic::uadd_with_overflow: 2031 case Intrinsic::sadd_with_overflow: 2032 case Intrinsic::umul_with_overflow: 2033 case Intrinsic::smul_with_overflow: 2034 if (isa<Constant>(II->getArgOperand(0)) && 2035 !isa<Constant>(II->getArgOperand(1))) { 2036 // Canonicalize constants into the RHS. 2037 Value *LHS = II->getArgOperand(0); 2038 II->setArgOperand(0, II->getArgOperand(1)); 2039 II->setArgOperand(1, LHS); 2040 return II; 2041 } 2042 LLVM_FALLTHROUGH; 2043 2044 case Intrinsic::usub_with_overflow: 2045 case Intrinsic::ssub_with_overflow: { 2046 OverflowCheckFlavor OCF = 2047 IntrinsicIDToOverflowCheckFlavor(II->getIntrinsicID()); 2048 assert(OCF != OCF_INVALID && "unexpected!"); 2049 2050 Value *OperationResult = nullptr; 2051 Constant *OverflowResult = nullptr; 2052 if (OptimizeOverflowCheck(OCF, II->getArgOperand(0), II->getArgOperand(1), 2053 *II, OperationResult, OverflowResult)) 2054 return CreateOverflowTuple(II, OperationResult, OverflowResult); 2055 2056 break; 2057 } 2058 2059 case Intrinsic::minnum: 2060 case Intrinsic::maxnum: { 2061 Value *Arg0 = II->getArgOperand(0); 2062 Value *Arg1 = II->getArgOperand(1); 2063 // Canonicalize constants to the RHS. 2064 if (isa<ConstantFP>(Arg0) && !isa<ConstantFP>(Arg1)) { 2065 II->setArgOperand(0, Arg1); 2066 II->setArgOperand(1, Arg0); 2067 return II; 2068 } 2069 if (Value *V = simplifyMinnumMaxnum(*II)) 2070 return replaceInstUsesWith(*II, V); 2071 break; 2072 } 2073 case Intrinsic::fmuladd: { 2074 // Canonicalize fast fmuladd to the separate fmul + fadd. 
2075 if (II->hasUnsafeAlgebra()) { 2076 BuilderTy::FastMathFlagGuard Guard(Builder); 2077 Builder.setFastMathFlags(II->getFastMathFlags()); 2078 Value *Mul = Builder.CreateFMul(II->getArgOperand(0), 2079 II->getArgOperand(1)); 2080 Value *Add = Builder.CreateFAdd(Mul, II->getArgOperand(2)); 2081 Add->takeName(II); 2082 return replaceInstUsesWith(*II, Add); 2083 } 2084 2085 LLVM_FALLTHROUGH; 2086 } 2087 case Intrinsic::fma: { 2088 Value *Src0 = II->getArgOperand(0); 2089 Value *Src1 = II->getArgOperand(1); 2090 2091 // Canonicalize constants into the RHS. 2092 if (isa<Constant>(Src0) && !isa<Constant>(Src1)) { 2093 II->setArgOperand(0, Src1); 2094 II->setArgOperand(1, Src0); 2095 std::swap(Src0, Src1); 2096 } 2097 2098 Value *LHS = nullptr; 2099 Value *RHS = nullptr; 2100 2101 // fma fneg(x), fneg(y), z -> fma x, y, z 2102 if (match(Src0, m_FNeg(m_Value(LHS))) && 2103 match(Src1, m_FNeg(m_Value(RHS)))) { 2104 II->setArgOperand(0, LHS); 2105 II->setArgOperand(1, RHS); 2106 return II; 2107 } 2108 2109 // fma fabs(x), fabs(x), z -> fma x, x, z 2110 if (match(Src0, m_Intrinsic<Intrinsic::fabs>(m_Value(LHS))) && 2111 match(Src1, m_Intrinsic<Intrinsic::fabs>(m_Value(RHS))) && LHS == RHS) { 2112 II->setArgOperand(0, LHS); 2113 II->setArgOperand(1, RHS); 2114 return II; 2115 } 2116 2117 // fma x, 1, z -> fadd x, z 2118 if (match(Src1, m_FPOne())) { 2119 Instruction *RI = BinaryOperator::CreateFAdd(Src0, II->getArgOperand(2)); 2120 RI->copyFastMathFlags(II); 2121 return RI; 2122 } 2123 2124 break; 2125 } 2126 case Intrinsic::fabs: { 2127 Value *Cond; 2128 Constant *LHS, *RHS; 2129 if (match(II->getArgOperand(0), 2130 m_Select(m_Value(Cond), m_Constant(LHS), m_Constant(RHS)))) { 2131 CallInst *Call0 = Builder.CreateCall(II->getCalledFunction(), {LHS}); 2132 CallInst *Call1 = Builder.CreateCall(II->getCalledFunction(), {RHS}); 2133 return SelectInst::Create(Cond, Call0, Call1); 2134 } 2135 2136 LLVM_FALLTHROUGH; 2137 } 2138 case Intrinsic::ceil: 2139 case Intrinsic::floor: 
2140 case Intrinsic::round: 2141 case Intrinsic::nearbyint: 2142 case Intrinsic::rint: 2143 case Intrinsic::trunc: { 2144 Value *ExtSrc; 2145 if (match(II->getArgOperand(0), m_FPExt(m_Value(ExtSrc))) && 2146 II->getArgOperand(0)->hasOneUse()) { 2147 // fabs (fpext x) -> fpext (fabs x) 2148 Value *F = Intrinsic::getDeclaration(II->getModule(), II->getIntrinsicID(), 2149 { ExtSrc->getType() }); 2150 CallInst *NewFabs = Builder.CreateCall(F, ExtSrc); 2151 NewFabs->copyFastMathFlags(II); 2152 NewFabs->takeName(II); 2153 return new FPExtInst(NewFabs, II->getType()); 2154 } 2155 2156 break; 2157 } 2158 case Intrinsic::cos: 2159 case Intrinsic::amdgcn_cos: { 2160 Value *SrcSrc; 2161 Value *Src = II->getArgOperand(0); 2162 if (match(Src, m_FNeg(m_Value(SrcSrc))) || 2163 match(Src, m_Intrinsic<Intrinsic::fabs>(m_Value(SrcSrc)))) { 2164 // cos(-x) -> cos(x) 2165 // cos(fabs(x)) -> cos(x) 2166 II->setArgOperand(0, SrcSrc); 2167 return II; 2168 } 2169 2170 break; 2171 } 2172 case Intrinsic::ppc_altivec_lvx: 2173 case Intrinsic::ppc_altivec_lvxl: 2174 // Turn PPC lvx -> load if the pointer is known aligned. 2175 if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, II, &AC, 2176 &DT) >= 16) { 2177 Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0), 2178 PointerType::getUnqual(II->getType())); 2179 return new LoadInst(Ptr); 2180 } 2181 break; 2182 case Intrinsic::ppc_vsx_lxvw4x: 2183 case Intrinsic::ppc_vsx_lxvd2x: { 2184 // Turn PPC VSX loads into normal loads. 2185 Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0), 2186 PointerType::getUnqual(II->getType())); 2187 return new LoadInst(Ptr, Twine(""), false, 1); 2188 } 2189 case Intrinsic::ppc_altivec_stvx: 2190 case Intrinsic::ppc_altivec_stvxl: 2191 // Turn stvx -> store if the pointer is known aligned. 
2192 if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL, II, &AC, 2193 &DT) >= 16) { 2194 Type *OpPtrTy = 2195 PointerType::getUnqual(II->getArgOperand(0)->getType()); 2196 Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy); 2197 return new StoreInst(II->getArgOperand(0), Ptr); 2198 } 2199 break; 2200 case Intrinsic::ppc_vsx_stxvw4x: 2201 case Intrinsic::ppc_vsx_stxvd2x: { 2202 // Turn PPC VSX stores into normal stores. 2203 Type *OpPtrTy = PointerType::getUnqual(II->getArgOperand(0)->getType()); 2204 Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy); 2205 return new StoreInst(II->getArgOperand(0), Ptr, false, 1); 2206 } 2207 case Intrinsic::ppc_qpx_qvlfs: 2208 // Turn PPC QPX qvlfs -> load if the pointer is known aligned. 2209 if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, II, &AC, 2210 &DT) >= 16) { 2211 Type *VTy = VectorType::get(Builder.getFloatTy(), 2212 II->getType()->getVectorNumElements()); 2213 Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0), 2214 PointerType::getUnqual(VTy)); 2215 Value *Load = Builder.CreateLoad(Ptr); 2216 return new FPExtInst(Load, II->getType()); 2217 } 2218 break; 2219 case Intrinsic::ppc_qpx_qvlfd: 2220 // Turn PPC QPX qvlfd -> load if the pointer is known aligned. 2221 if (getOrEnforceKnownAlignment(II->getArgOperand(0), 32, DL, II, &AC, 2222 &DT) >= 32) { 2223 Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0), 2224 PointerType::getUnqual(II->getType())); 2225 return new LoadInst(Ptr); 2226 } 2227 break; 2228 case Intrinsic::ppc_qpx_qvstfs: 2229 // Turn PPC QPX qvstfs -> store if the pointer is known aligned. 
2230 if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL, II, &AC, 2231 &DT) >= 16) { 2232 Type *VTy = VectorType::get(Builder.getFloatTy(), 2233 II->getArgOperand(0)->getType()->getVectorNumElements()); 2234 Value *TOp = Builder.CreateFPTrunc(II->getArgOperand(0), VTy); 2235 Type *OpPtrTy = PointerType::getUnqual(VTy); 2236 Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy); 2237 return new StoreInst(TOp, Ptr); 2238 } 2239 break; 2240 case Intrinsic::ppc_qpx_qvstfd: 2241 // Turn PPC QPX qvstfd -> store if the pointer is known aligned. 2242 if (getOrEnforceKnownAlignment(II->getArgOperand(1), 32, DL, II, &AC, 2243 &DT) >= 32) { 2244 Type *OpPtrTy = 2245 PointerType::getUnqual(II->getArgOperand(0)->getType()); 2246 Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy); 2247 return new StoreInst(II->getArgOperand(0), Ptr); 2248 } 2249 break; 2250 2251 case Intrinsic::x86_vcvtph2ps_128: 2252 case Intrinsic::x86_vcvtph2ps_256: { 2253 auto Arg = II->getArgOperand(0); 2254 auto ArgType = cast<VectorType>(Arg->getType()); 2255 auto RetType = cast<VectorType>(II->getType()); 2256 unsigned ArgWidth = ArgType->getNumElements(); 2257 unsigned RetWidth = RetType->getNumElements(); 2258 assert(RetWidth <= ArgWidth && "Unexpected input/return vector widths"); 2259 assert(ArgType->isIntOrIntVectorTy() && 2260 ArgType->getScalarSizeInBits() == 16 && 2261 "CVTPH2PS input type should be 16-bit integer vector"); 2262 assert(RetType->getScalarType()->isFloatTy() && 2263 "CVTPH2PS output type should be 32-bit float vector"); 2264 2265 // Constant folding: Convert to generic half to single conversion. 
2266 if (isa<ConstantAggregateZero>(Arg)) 2267 return replaceInstUsesWith(*II, ConstantAggregateZero::get(RetType)); 2268 2269 if (isa<ConstantDataVector>(Arg)) { 2270 auto VectorHalfAsShorts = Arg; 2271 if (RetWidth < ArgWidth) { 2272 SmallVector<uint32_t, 8> SubVecMask; 2273 for (unsigned i = 0; i != RetWidth; ++i) 2274 SubVecMask.push_back((int)i); 2275 VectorHalfAsShorts = Builder.CreateShuffleVector( 2276 Arg, UndefValue::get(ArgType), SubVecMask); 2277 } 2278 2279 auto VectorHalfType = 2280 VectorType::get(Type::getHalfTy(II->getContext()), RetWidth); 2281 auto VectorHalfs = 2282 Builder.CreateBitCast(VectorHalfAsShorts, VectorHalfType); 2283 auto VectorFloats = Builder.CreateFPExt(VectorHalfs, RetType); 2284 return replaceInstUsesWith(*II, VectorFloats); 2285 } 2286 2287 // We only use the lowest lanes of the argument. 2288 if (Value *V = SimplifyDemandedVectorEltsLow(Arg, ArgWidth, RetWidth)) { 2289 II->setArgOperand(0, V); 2290 return II; 2291 } 2292 break; 2293 } 2294 2295 case Intrinsic::x86_sse_cvtss2si: 2296 case Intrinsic::x86_sse_cvtss2si64: 2297 case Intrinsic::x86_sse_cvttss2si: 2298 case Intrinsic::x86_sse_cvttss2si64: 2299 case Intrinsic::x86_sse2_cvtsd2si: 2300 case Intrinsic::x86_sse2_cvtsd2si64: 2301 case Intrinsic::x86_sse2_cvttsd2si: 2302 case Intrinsic::x86_sse2_cvttsd2si64: 2303 case Intrinsic::x86_avx512_vcvtss2si32: 2304 case Intrinsic::x86_avx512_vcvtss2si64: 2305 case Intrinsic::x86_avx512_vcvtss2usi32: 2306 case Intrinsic::x86_avx512_vcvtss2usi64: 2307 case Intrinsic::x86_avx512_vcvtsd2si32: 2308 case Intrinsic::x86_avx512_vcvtsd2si64: 2309 case Intrinsic::x86_avx512_vcvtsd2usi32: 2310 case Intrinsic::x86_avx512_vcvtsd2usi64: 2311 case Intrinsic::x86_avx512_cvttss2si: 2312 case Intrinsic::x86_avx512_cvttss2si64: 2313 case Intrinsic::x86_avx512_cvttss2usi: 2314 case Intrinsic::x86_avx512_cvttss2usi64: 2315 case Intrinsic::x86_avx512_cvttsd2si: 2316 case Intrinsic::x86_avx512_cvttsd2si64: 2317 case Intrinsic::x86_avx512_cvttsd2usi: 2318 
case Intrinsic::x86_avx512_cvttsd2usi64: { 2319 // These intrinsics only demand the 0th element of their input vectors. If 2320 // we can simplify the input based on that, do so now. 2321 Value *Arg = II->getArgOperand(0); 2322 unsigned VWidth = Arg->getType()->getVectorNumElements(); 2323 if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) { 2324 II->setArgOperand(0, V); 2325 return II; 2326 } 2327 break; 2328 } 2329 2330 case Intrinsic::x86_mmx_pmovmskb: 2331 case Intrinsic::x86_sse_movmsk_ps: 2332 case Intrinsic::x86_sse2_movmsk_pd: 2333 case Intrinsic::x86_sse2_pmovmskb_128: 2334 case Intrinsic::x86_avx_movmsk_pd_256: 2335 case Intrinsic::x86_avx_movmsk_ps_256: 2336 case Intrinsic::x86_avx2_pmovmskb: { 2337 if (Value *V = simplifyX86movmsk(*II)) 2338 return replaceInstUsesWith(*II, V); 2339 break; 2340 } 2341 2342 case Intrinsic::x86_sse_comieq_ss: 2343 case Intrinsic::x86_sse_comige_ss: 2344 case Intrinsic::x86_sse_comigt_ss: 2345 case Intrinsic::x86_sse_comile_ss: 2346 case Intrinsic::x86_sse_comilt_ss: 2347 case Intrinsic::x86_sse_comineq_ss: 2348 case Intrinsic::x86_sse_ucomieq_ss: 2349 case Intrinsic::x86_sse_ucomige_ss: 2350 case Intrinsic::x86_sse_ucomigt_ss: 2351 case Intrinsic::x86_sse_ucomile_ss: 2352 case Intrinsic::x86_sse_ucomilt_ss: 2353 case Intrinsic::x86_sse_ucomineq_ss: 2354 case Intrinsic::x86_sse2_comieq_sd: 2355 case Intrinsic::x86_sse2_comige_sd: 2356 case Intrinsic::x86_sse2_comigt_sd: 2357 case Intrinsic::x86_sse2_comile_sd: 2358 case Intrinsic::x86_sse2_comilt_sd: 2359 case Intrinsic::x86_sse2_comineq_sd: 2360 case Intrinsic::x86_sse2_ucomieq_sd: 2361 case Intrinsic::x86_sse2_ucomige_sd: 2362 case Intrinsic::x86_sse2_ucomigt_sd: 2363 case Intrinsic::x86_sse2_ucomile_sd: 2364 case Intrinsic::x86_sse2_ucomilt_sd: 2365 case Intrinsic::x86_sse2_ucomineq_sd: 2366 case Intrinsic::x86_avx512_vcomi_ss: 2367 case Intrinsic::x86_avx512_vcomi_sd: 2368 case Intrinsic::x86_avx512_mask_cmp_ss: 2369 case Intrinsic::x86_avx512_mask_cmp_sd: { 
2370 // These intrinsics only demand the 0th element of their input vectors. If 2371 // we can simplify the input based on that, do so now. 2372 bool MadeChange = false; 2373 Value *Arg0 = II->getArgOperand(0); 2374 Value *Arg1 = II->getArgOperand(1); 2375 unsigned VWidth = Arg0->getType()->getVectorNumElements(); 2376 if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) { 2377 II->setArgOperand(0, V); 2378 MadeChange = true; 2379 } 2380 if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) { 2381 II->setArgOperand(1, V); 2382 MadeChange = true; 2383 } 2384 if (MadeChange) 2385 return II; 2386 break; 2387 } 2388 case Intrinsic::x86_avx512_mask_cmp_pd_128: 2389 case Intrinsic::x86_avx512_mask_cmp_pd_256: 2390 case Intrinsic::x86_avx512_mask_cmp_pd_512: 2391 case Intrinsic::x86_avx512_mask_cmp_ps_128: 2392 case Intrinsic::x86_avx512_mask_cmp_ps_256: 2393 case Intrinsic::x86_avx512_mask_cmp_ps_512: { 2394 // Folding cmp(sub(a,b),0) -> cmp(a,b) and cmp(0,sub(a,b)) -> cmp(b,a) 2395 Value *Arg0 = II->getArgOperand(0); 2396 Value *Arg1 = II->getArgOperand(1); 2397 bool Arg0IsZero = match(Arg0, m_Zero()); 2398 if (Arg0IsZero) 2399 std::swap(Arg0, Arg1); 2400 Value *A, *B; 2401 // This fold requires only the NINF(not +/- inf) since inf minus 2402 // inf is nan. 2403 // NSZ(No Signed Zeros) is not needed because zeros of any sign are 2404 // equal for both compares. 2405 // NNAN is not needed because nans compare the same for both compares. 2406 // The compare intrinsic uses the above assumptions and therefore 2407 // doesn't require additional flags. 
2408 if ((match(Arg0, m_OneUse(m_FSub(m_Value(A), m_Value(B)))) && 2409 match(Arg1, m_Zero()) && 2410 cast<Instruction>(Arg0)->getFastMathFlags().noInfs())) { 2411 if (Arg0IsZero) 2412 std::swap(A, B); 2413 II->setArgOperand(0, A); 2414 II->setArgOperand(1, B); 2415 return II; 2416 } 2417 break; 2418 } 2419 2420 case Intrinsic::x86_avx512_mask_add_ps_512: 2421 case Intrinsic::x86_avx512_mask_div_ps_512: 2422 case Intrinsic::x86_avx512_mask_mul_ps_512: 2423 case Intrinsic::x86_avx512_mask_sub_ps_512: 2424 case Intrinsic::x86_avx512_mask_add_pd_512: 2425 case Intrinsic::x86_avx512_mask_div_pd_512: 2426 case Intrinsic::x86_avx512_mask_mul_pd_512: 2427 case Intrinsic::x86_avx512_mask_sub_pd_512: 2428 // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular 2429 // IR operations. 2430 if (auto *R = dyn_cast<ConstantInt>(II->getArgOperand(4))) { 2431 if (R->getValue() == 4) { 2432 Value *Arg0 = II->getArgOperand(0); 2433 Value *Arg1 = II->getArgOperand(1); 2434 2435 Value *V; 2436 switch (II->getIntrinsicID()) { 2437 default: llvm_unreachable("Case stmts out of sync!"); 2438 case Intrinsic::x86_avx512_mask_add_ps_512: 2439 case Intrinsic::x86_avx512_mask_add_pd_512: 2440 V = Builder.CreateFAdd(Arg0, Arg1); 2441 break; 2442 case Intrinsic::x86_avx512_mask_sub_ps_512: 2443 case Intrinsic::x86_avx512_mask_sub_pd_512: 2444 V = Builder.CreateFSub(Arg0, Arg1); 2445 break; 2446 case Intrinsic::x86_avx512_mask_mul_ps_512: 2447 case Intrinsic::x86_avx512_mask_mul_pd_512: 2448 V = Builder.CreateFMul(Arg0, Arg1); 2449 break; 2450 case Intrinsic::x86_avx512_mask_div_ps_512: 2451 case Intrinsic::x86_avx512_mask_div_pd_512: 2452 V = Builder.CreateFDiv(Arg0, Arg1); 2453 break; 2454 } 2455 2456 // Create a select for the masking. 
2457 V = emitX86MaskSelect(II->getArgOperand(3), V, II->getArgOperand(2), 2458 Builder); 2459 return replaceInstUsesWith(*II, V); 2460 } 2461 } 2462 break; 2463 2464 case Intrinsic::x86_avx512_mask_add_ss_round: 2465 case Intrinsic::x86_avx512_mask_div_ss_round: 2466 case Intrinsic::x86_avx512_mask_mul_ss_round: 2467 case Intrinsic::x86_avx512_mask_sub_ss_round: 2468 case Intrinsic::x86_avx512_mask_add_sd_round: 2469 case Intrinsic::x86_avx512_mask_div_sd_round: 2470 case Intrinsic::x86_avx512_mask_mul_sd_round: 2471 case Intrinsic::x86_avx512_mask_sub_sd_round: 2472 // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular 2473 // IR operations. 2474 if (auto *R = dyn_cast<ConstantInt>(II->getArgOperand(4))) { 2475 if (R->getValue() == 4) { 2476 // Extract the element as scalars. 2477 Value *Arg0 = II->getArgOperand(0); 2478 Value *Arg1 = II->getArgOperand(1); 2479 Value *LHS = Builder.CreateExtractElement(Arg0, (uint64_t)0); 2480 Value *RHS = Builder.CreateExtractElement(Arg1, (uint64_t)0); 2481 2482 Value *V; 2483 switch (II->getIntrinsicID()) { 2484 default: llvm_unreachable("Case stmts out of sync!"); 2485 case Intrinsic::x86_avx512_mask_add_ss_round: 2486 case Intrinsic::x86_avx512_mask_add_sd_round: 2487 V = Builder.CreateFAdd(LHS, RHS); 2488 break; 2489 case Intrinsic::x86_avx512_mask_sub_ss_round: 2490 case Intrinsic::x86_avx512_mask_sub_sd_round: 2491 V = Builder.CreateFSub(LHS, RHS); 2492 break; 2493 case Intrinsic::x86_avx512_mask_mul_ss_round: 2494 case Intrinsic::x86_avx512_mask_mul_sd_round: 2495 V = Builder.CreateFMul(LHS, RHS); 2496 break; 2497 case Intrinsic::x86_avx512_mask_div_ss_round: 2498 case Intrinsic::x86_avx512_mask_div_sd_round: 2499 V = Builder.CreateFDiv(LHS, RHS); 2500 break; 2501 } 2502 2503 // Handle the masking aspect of the intrinsic. 2504 Value *Mask = II->getArgOperand(3); 2505 auto *C = dyn_cast<ConstantInt>(Mask); 2506 // We don't need a select if we know the mask bit is a 1. 
2507 if (!C || !C->getValue()[0]) { 2508 // Cast the mask to an i1 vector and then extract the lowest element. 2509 auto *MaskTy = VectorType::get(Builder.getInt1Ty(), 2510 cast<IntegerType>(Mask->getType())->getBitWidth()); 2511 Mask = Builder.CreateBitCast(Mask, MaskTy); 2512 Mask = Builder.CreateExtractElement(Mask, (uint64_t)0); 2513 // Extract the lowest element from the passthru operand. 2514 Value *Passthru = Builder.CreateExtractElement(II->getArgOperand(2), 2515 (uint64_t)0); 2516 V = Builder.CreateSelect(Mask, V, Passthru); 2517 } 2518 2519 // Insert the result back into the original argument 0. 2520 V = Builder.CreateInsertElement(Arg0, V, (uint64_t)0); 2521 2522 return replaceInstUsesWith(*II, V); 2523 } 2524 } 2525 LLVM_FALLTHROUGH; 2526 2527 // X86 scalar intrinsics simplified with SimplifyDemandedVectorElts. 2528 case Intrinsic::x86_avx512_mask_max_ss_round: 2529 case Intrinsic::x86_avx512_mask_min_ss_round: 2530 case Intrinsic::x86_avx512_mask_max_sd_round: 2531 case Intrinsic::x86_avx512_mask_min_sd_round: 2532 case Intrinsic::x86_avx512_mask_vfmadd_ss: 2533 case Intrinsic::x86_avx512_mask_vfmadd_sd: 2534 case Intrinsic::x86_avx512_maskz_vfmadd_ss: 2535 case Intrinsic::x86_avx512_maskz_vfmadd_sd: 2536 case Intrinsic::x86_avx512_mask3_vfmadd_ss: 2537 case Intrinsic::x86_avx512_mask3_vfmadd_sd: 2538 case Intrinsic::x86_avx512_mask3_vfmsub_ss: 2539 case Intrinsic::x86_avx512_mask3_vfmsub_sd: 2540 case Intrinsic::x86_avx512_mask3_vfnmsub_ss: 2541 case Intrinsic::x86_avx512_mask3_vfnmsub_sd: 2542 case Intrinsic::x86_fma_vfmadd_ss: 2543 case Intrinsic::x86_fma_vfmsub_ss: 2544 case Intrinsic::x86_fma_vfnmadd_ss: 2545 case Intrinsic::x86_fma_vfnmsub_ss: 2546 case Intrinsic::x86_fma_vfmadd_sd: 2547 case Intrinsic::x86_fma_vfmsub_sd: 2548 case Intrinsic::x86_fma_vfnmadd_sd: 2549 case Intrinsic::x86_fma_vfnmsub_sd: 2550 case Intrinsic::x86_sse_cmp_ss: 2551 case Intrinsic::x86_sse_min_ss: 2552 case Intrinsic::x86_sse_max_ss: 2553 case 
Intrinsic::x86_sse2_cmp_sd: 2554 case Intrinsic::x86_sse2_min_sd: 2555 case Intrinsic::x86_sse2_max_sd: 2556 case Intrinsic::x86_sse41_round_ss: 2557 case Intrinsic::x86_sse41_round_sd: 2558 case Intrinsic::x86_xop_vfrcz_ss: 2559 case Intrinsic::x86_xop_vfrcz_sd: { 2560 unsigned VWidth = II->getType()->getVectorNumElements(); 2561 APInt UndefElts(VWidth, 0); 2562 APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth)); 2563 if (Value *V = SimplifyDemandedVectorElts(II, AllOnesEltMask, UndefElts)) { 2564 if (V != II) 2565 return replaceInstUsesWith(*II, V); 2566 return II; 2567 } 2568 break; 2569 } 2570 2571 // Constant fold ashr( <A x Bi>, Ci ). 2572 // Constant fold lshr( <A x Bi>, Ci ). 2573 // Constant fold shl( <A x Bi>, Ci ). 2574 case Intrinsic::x86_sse2_psrai_d: 2575 case Intrinsic::x86_sse2_psrai_w: 2576 case Intrinsic::x86_avx2_psrai_d: 2577 case Intrinsic::x86_avx2_psrai_w: 2578 case Intrinsic::x86_avx512_psrai_q_128: 2579 case Intrinsic::x86_avx512_psrai_q_256: 2580 case Intrinsic::x86_avx512_psrai_d_512: 2581 case Intrinsic::x86_avx512_psrai_q_512: 2582 case Intrinsic::x86_avx512_psrai_w_512: 2583 case Intrinsic::x86_sse2_psrli_d: 2584 case Intrinsic::x86_sse2_psrli_q: 2585 case Intrinsic::x86_sse2_psrli_w: 2586 case Intrinsic::x86_avx2_psrli_d: 2587 case Intrinsic::x86_avx2_psrli_q: 2588 case Intrinsic::x86_avx2_psrli_w: 2589 case Intrinsic::x86_avx512_psrli_d_512: 2590 case Intrinsic::x86_avx512_psrli_q_512: 2591 case Intrinsic::x86_avx512_psrli_w_512: 2592 case Intrinsic::x86_sse2_pslli_d: 2593 case Intrinsic::x86_sse2_pslli_q: 2594 case Intrinsic::x86_sse2_pslli_w: 2595 case Intrinsic::x86_avx2_pslli_d: 2596 case Intrinsic::x86_avx2_pslli_q: 2597 case Intrinsic::x86_avx2_pslli_w: 2598 case Intrinsic::x86_avx512_pslli_d_512: 2599 case Intrinsic::x86_avx512_pslli_q_512: 2600 case Intrinsic::x86_avx512_pslli_w_512: 2601 if (Value *V = simplifyX86immShift(*II, Builder)) 2602 return replaceInstUsesWith(*II, V); 2603 break; 2604 2605 case 
Intrinsic::x86_sse2_psra_d: 2606 case Intrinsic::x86_sse2_psra_w: 2607 case Intrinsic::x86_avx2_psra_d: 2608 case Intrinsic::x86_avx2_psra_w: 2609 case Intrinsic::x86_avx512_psra_q_128: 2610 case Intrinsic::x86_avx512_psra_q_256: 2611 case Intrinsic::x86_avx512_psra_d_512: 2612 case Intrinsic::x86_avx512_psra_q_512: 2613 case Intrinsic::x86_avx512_psra_w_512: 2614 case Intrinsic::x86_sse2_psrl_d: 2615 case Intrinsic::x86_sse2_psrl_q: 2616 case Intrinsic::x86_sse2_psrl_w: 2617 case Intrinsic::x86_avx2_psrl_d: 2618 case Intrinsic::x86_avx2_psrl_q: 2619 case Intrinsic::x86_avx2_psrl_w: 2620 case Intrinsic::x86_avx512_psrl_d_512: 2621 case Intrinsic::x86_avx512_psrl_q_512: 2622 case Intrinsic::x86_avx512_psrl_w_512: 2623 case Intrinsic::x86_sse2_psll_d: 2624 case Intrinsic::x86_sse2_psll_q: 2625 case Intrinsic::x86_sse2_psll_w: 2626 case Intrinsic::x86_avx2_psll_d: 2627 case Intrinsic::x86_avx2_psll_q: 2628 case Intrinsic::x86_avx2_psll_w: 2629 case Intrinsic::x86_avx512_psll_d_512: 2630 case Intrinsic::x86_avx512_psll_q_512: 2631 case Intrinsic::x86_avx512_psll_w_512: { 2632 if (Value *V = simplifyX86immShift(*II, Builder)) 2633 return replaceInstUsesWith(*II, V); 2634 2635 // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector 2636 // operand to compute the shift amount. 
2637 Value *Arg1 = II->getArgOperand(1); 2638 assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 && 2639 "Unexpected packed shift size"); 2640 unsigned VWidth = Arg1->getType()->getVectorNumElements(); 2641 2642 if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) { 2643 II->setArgOperand(1, V); 2644 return II; 2645 } 2646 break; 2647 } 2648 2649 case Intrinsic::x86_avx2_psllv_d: 2650 case Intrinsic::x86_avx2_psllv_d_256: 2651 case Intrinsic::x86_avx2_psllv_q: 2652 case Intrinsic::x86_avx2_psllv_q_256: 2653 case Intrinsic::x86_avx512_psllv_d_512: 2654 case Intrinsic::x86_avx512_psllv_q_512: 2655 case Intrinsic::x86_avx512_psllv_w_128: 2656 case Intrinsic::x86_avx512_psllv_w_256: 2657 case Intrinsic::x86_avx512_psllv_w_512: 2658 case Intrinsic::x86_avx2_psrav_d: 2659 case Intrinsic::x86_avx2_psrav_d_256: 2660 case Intrinsic::x86_avx512_psrav_q_128: 2661 case Intrinsic::x86_avx512_psrav_q_256: 2662 case Intrinsic::x86_avx512_psrav_d_512: 2663 case Intrinsic::x86_avx512_psrav_q_512: 2664 case Intrinsic::x86_avx512_psrav_w_128: 2665 case Intrinsic::x86_avx512_psrav_w_256: 2666 case Intrinsic::x86_avx512_psrav_w_512: 2667 case Intrinsic::x86_avx2_psrlv_d: 2668 case Intrinsic::x86_avx2_psrlv_d_256: 2669 case Intrinsic::x86_avx2_psrlv_q: 2670 case Intrinsic::x86_avx2_psrlv_q_256: 2671 case Intrinsic::x86_avx512_psrlv_d_512: 2672 case Intrinsic::x86_avx512_psrlv_q_512: 2673 case Intrinsic::x86_avx512_psrlv_w_128: 2674 case Intrinsic::x86_avx512_psrlv_w_256: 2675 case Intrinsic::x86_avx512_psrlv_w_512: 2676 if (Value *V = simplifyX86varShift(*II, Builder)) 2677 return replaceInstUsesWith(*II, V); 2678 break; 2679 2680 case Intrinsic::x86_sse2_pmulu_dq: 2681 case Intrinsic::x86_sse41_pmuldq: 2682 case Intrinsic::x86_avx2_pmul_dq: 2683 case Intrinsic::x86_avx2_pmulu_dq: 2684 case Intrinsic::x86_avx512_pmul_dq_512: 2685 case Intrinsic::x86_avx512_pmulu_dq_512: { 2686 if (Value *V = simplifyX86muldq(*II, Builder)) 2687 return replaceInstUsesWith(*II, V); 
2688 2689 unsigned VWidth = II->getType()->getVectorNumElements(); 2690 APInt UndefElts(VWidth, 0); 2691 APInt DemandedElts = APInt::getAllOnesValue(VWidth); 2692 if (Value *V = SimplifyDemandedVectorElts(II, DemandedElts, UndefElts)) { 2693 if (V != II) 2694 return replaceInstUsesWith(*II, V); 2695 return II; 2696 } 2697 break; 2698 } 2699 2700 case Intrinsic::x86_sse2_packssdw_128: 2701 case Intrinsic::x86_sse2_packsswb_128: 2702 case Intrinsic::x86_avx2_packssdw: 2703 case Intrinsic::x86_avx2_packsswb: 2704 case Intrinsic::x86_avx512_packssdw_512: 2705 case Intrinsic::x86_avx512_packsswb_512: 2706 if (Value *V = simplifyX86pack(*II, true)) 2707 return replaceInstUsesWith(*II, V); 2708 break; 2709 2710 case Intrinsic::x86_sse2_packuswb_128: 2711 case Intrinsic::x86_sse41_packusdw: 2712 case Intrinsic::x86_avx2_packusdw: 2713 case Intrinsic::x86_avx2_packuswb: 2714 case Intrinsic::x86_avx512_packusdw_512: 2715 case Intrinsic::x86_avx512_packuswb_512: 2716 if (Value *V = simplifyX86pack(*II, false)) 2717 return replaceInstUsesWith(*II, V); 2718 break; 2719 2720 case Intrinsic::x86_pclmulqdq: { 2721 if (auto *C = dyn_cast<ConstantInt>(II->getArgOperand(2))) { 2722 unsigned Imm = C->getZExtValue(); 2723 2724 bool MadeChange = false; 2725 Value *Arg0 = II->getArgOperand(0); 2726 Value *Arg1 = II->getArgOperand(1); 2727 unsigned VWidth = Arg0->getType()->getVectorNumElements(); 2728 APInt DemandedElts(VWidth, 0); 2729 2730 APInt UndefElts1(VWidth, 0); 2731 DemandedElts = (Imm & 0x01) ? 2 : 1; 2732 if (Value *V = SimplifyDemandedVectorElts(Arg0, DemandedElts, 2733 UndefElts1)) { 2734 II->setArgOperand(0, V); 2735 MadeChange = true; 2736 } 2737 2738 APInt UndefElts2(VWidth, 0); 2739 DemandedElts = (Imm & 0x10) ? 2 : 1; 2740 if (Value *V = SimplifyDemandedVectorElts(Arg1, DemandedElts, 2741 UndefElts2)) { 2742 II->setArgOperand(1, V); 2743 MadeChange = true; 2744 } 2745 2746 // If both input elements are undef, the result is undef. 2747 if (UndefElts1[(Imm & 0x01) ? 
1 : 0] || 2748 UndefElts2[(Imm & 0x10) ? 1 : 0]) 2749 return replaceInstUsesWith(*II, 2750 ConstantAggregateZero::get(II->getType())); 2751 2752 if (MadeChange) 2753 return II; 2754 } 2755 break; 2756 } 2757 2758 case Intrinsic::x86_sse41_insertps: 2759 if (Value *V = simplifyX86insertps(*II, Builder)) 2760 return replaceInstUsesWith(*II, V); 2761 break; 2762 2763 case Intrinsic::x86_sse4a_extrq: { 2764 Value *Op0 = II->getArgOperand(0); 2765 Value *Op1 = II->getArgOperand(1); 2766 unsigned VWidth0 = Op0->getType()->getVectorNumElements(); 2767 unsigned VWidth1 = Op1->getType()->getVectorNumElements(); 2768 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && 2769 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && 2770 VWidth1 == 16 && "Unexpected operand sizes"); 2771 2772 // See if we're dealing with constant values. 2773 Constant *C1 = dyn_cast<Constant>(Op1); 2774 ConstantInt *CILength = 2775 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0)) 2776 : nullptr; 2777 ConstantInt *CIIndex = 2778 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1)) 2779 : nullptr; 2780 2781 // Attempt to simplify to a constant, shuffle vector or EXTRQI call. 2782 if (Value *V = simplifyX86extrq(*II, Op0, CILength, CIIndex, Builder)) 2783 return replaceInstUsesWith(*II, V); 2784 2785 // EXTRQ only uses the lowest 64-bits of the first 128-bit vector 2786 // operands and the lowest 16-bits of the second. 2787 bool MadeChange = false; 2788 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { 2789 II->setArgOperand(0, V); 2790 MadeChange = true; 2791 } 2792 if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) { 2793 II->setArgOperand(1, V); 2794 MadeChange = true; 2795 } 2796 if (MadeChange) 2797 return II; 2798 break; 2799 } 2800 2801 case Intrinsic::x86_sse4a_extrqi: { 2802 // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining 2803 // bits of the lower 64-bits. 
The upper 64-bits are undefined. 2804 Value *Op0 = II->getArgOperand(0); 2805 unsigned VWidth = Op0->getType()->getVectorNumElements(); 2806 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && 2807 "Unexpected operand size"); 2808 2809 // See if we're dealing with constant values. 2810 ConstantInt *CILength = dyn_cast<ConstantInt>(II->getArgOperand(1)); 2811 ConstantInt *CIIndex = dyn_cast<ConstantInt>(II->getArgOperand(2)); 2812 2813 // Attempt to simplify to a constant or shuffle vector. 2814 if (Value *V = simplifyX86extrq(*II, Op0, CILength, CIIndex, Builder)) 2815 return replaceInstUsesWith(*II, V); 2816 2817 // EXTRQI only uses the lowest 64-bits of the first 128-bit vector 2818 // operand. 2819 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { 2820 II->setArgOperand(0, V); 2821 return II; 2822 } 2823 break; 2824 } 2825 2826 case Intrinsic::x86_sse4a_insertq: { 2827 Value *Op0 = II->getArgOperand(0); 2828 Value *Op1 = II->getArgOperand(1); 2829 unsigned VWidth = Op0->getType()->getVectorNumElements(); 2830 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && 2831 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && 2832 Op1->getType()->getVectorNumElements() == 2 && 2833 "Unexpected operand size"); 2834 2835 // See if we're dealing with constant values. 2836 Constant *C1 = dyn_cast<Constant>(Op1); 2837 ConstantInt *CI11 = 2838 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1)) 2839 : nullptr; 2840 2841 // Attempt to simplify to a constant, shuffle vector or INSERTQI call. 2842 if (CI11) { 2843 const APInt &V11 = CI11->getValue(); 2844 APInt Len = V11.zextOrTrunc(6); 2845 APInt Idx = V11.lshr(8).zextOrTrunc(6); 2846 if (Value *V = simplifyX86insertq(*II, Op0, Op1, Len, Idx, Builder)) 2847 return replaceInstUsesWith(*II, V); 2848 } 2849 2850 // INSERTQ only uses the lowest 64-bits of the first 128-bit vector 2851 // operand. 
2852 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { 2853 II->setArgOperand(0, V); 2854 return II; 2855 } 2856 break; 2857 } 2858 2859 case Intrinsic::x86_sse4a_insertqi: { 2860 // INSERTQI: Extract lowest Length bits from lower half of second source and 2861 // insert over first source starting at Index bit. The upper 64-bits are 2862 // undefined. 2863 Value *Op0 = II->getArgOperand(0); 2864 Value *Op1 = II->getArgOperand(1); 2865 unsigned VWidth0 = Op0->getType()->getVectorNumElements(); 2866 unsigned VWidth1 = Op1->getType()->getVectorNumElements(); 2867 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && 2868 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && 2869 VWidth1 == 2 && "Unexpected operand sizes"); 2870 2871 // See if we're dealing with constant values. 2872 ConstantInt *CILength = dyn_cast<ConstantInt>(II->getArgOperand(2)); 2873 ConstantInt *CIIndex = dyn_cast<ConstantInt>(II->getArgOperand(3)); 2874 2875 // Attempt to simplify to a constant or shuffle vector. 2876 if (CILength && CIIndex) { 2877 APInt Len = CILength->getValue().zextOrTrunc(6); 2878 APInt Idx = CIIndex->getValue().zextOrTrunc(6); 2879 if (Value *V = simplifyX86insertq(*II, Op0, Op1, Len, Idx, Builder)) 2880 return replaceInstUsesWith(*II, V); 2881 } 2882 2883 // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector 2884 // operands. 
2885 bool MadeChange = false; 2886 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { 2887 II->setArgOperand(0, V); 2888 MadeChange = true; 2889 } 2890 if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) { 2891 II->setArgOperand(1, V); 2892 MadeChange = true; 2893 } 2894 if (MadeChange) 2895 return II; 2896 break; 2897 } 2898 2899 case Intrinsic::x86_sse41_pblendvb: 2900 case Intrinsic::x86_sse41_blendvps: 2901 case Intrinsic::x86_sse41_blendvpd: 2902 case Intrinsic::x86_avx_blendv_ps_256: 2903 case Intrinsic::x86_avx_blendv_pd_256: 2904 case Intrinsic::x86_avx2_pblendvb: { 2905 // Convert blendv* to vector selects if the mask is constant. 2906 // This optimization is convoluted because the intrinsic is defined as 2907 // getting a vector of floats or doubles for the ps and pd versions. 2908 // FIXME: That should be changed. 2909 2910 Value *Op0 = II->getArgOperand(0); 2911 Value *Op1 = II->getArgOperand(1); 2912 Value *Mask = II->getArgOperand(2); 2913 2914 // fold (blend A, A, Mask) -> A 2915 if (Op0 == Op1) 2916 return replaceInstUsesWith(CI, Op0); 2917 2918 // Zero Mask - select 1st argument. 2919 if (isa<ConstantAggregateZero>(Mask)) 2920 return replaceInstUsesWith(CI, Op0); 2921 2922 // Constant Mask - select 1st/2nd argument lane based on top bit of mask. 
2923 if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) { 2924 Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask); 2925 return SelectInst::Create(NewSelector, Op1, Op0, "blendv"); 2926 } 2927 break; 2928 } 2929 2930 case Intrinsic::x86_ssse3_pshuf_b_128: 2931 case Intrinsic::x86_avx2_pshuf_b: 2932 case Intrinsic::x86_avx512_pshuf_b_512: 2933 if (Value *V = simplifyX86pshufb(*II, Builder)) 2934 return replaceInstUsesWith(*II, V); 2935 break; 2936 2937 case Intrinsic::x86_avx_vpermilvar_ps: 2938 case Intrinsic::x86_avx_vpermilvar_ps_256: 2939 case Intrinsic::x86_avx512_vpermilvar_ps_512: 2940 case Intrinsic::x86_avx_vpermilvar_pd: 2941 case Intrinsic::x86_avx_vpermilvar_pd_256: 2942 case Intrinsic::x86_avx512_vpermilvar_pd_512: 2943 if (Value *V = simplifyX86vpermilvar(*II, Builder)) 2944 return replaceInstUsesWith(*II, V); 2945 break; 2946 2947 case Intrinsic::x86_avx2_permd: 2948 case Intrinsic::x86_avx2_permps: 2949 if (Value *V = simplifyX86vpermv(*II, Builder)) 2950 return replaceInstUsesWith(*II, V); 2951 break; 2952 2953 case Intrinsic::x86_avx512_mask_permvar_df_256: 2954 case Intrinsic::x86_avx512_mask_permvar_df_512: 2955 case Intrinsic::x86_avx512_mask_permvar_di_256: 2956 case Intrinsic::x86_avx512_mask_permvar_di_512: 2957 case Intrinsic::x86_avx512_mask_permvar_hi_128: 2958 case Intrinsic::x86_avx512_mask_permvar_hi_256: 2959 case Intrinsic::x86_avx512_mask_permvar_hi_512: 2960 case Intrinsic::x86_avx512_mask_permvar_qi_128: 2961 case Intrinsic::x86_avx512_mask_permvar_qi_256: 2962 case Intrinsic::x86_avx512_mask_permvar_qi_512: 2963 case Intrinsic::x86_avx512_mask_permvar_sf_256: 2964 case Intrinsic::x86_avx512_mask_permvar_sf_512: 2965 case Intrinsic::x86_avx512_mask_permvar_si_256: 2966 case Intrinsic::x86_avx512_mask_permvar_si_512: 2967 if (Value *V = simplifyX86vpermv(*II, Builder)) { 2968 // We simplified the permuting, now create a select for the masking. 
2969 V = emitX86MaskSelect(II->getArgOperand(3), V, II->getArgOperand(2), 2970 Builder); 2971 return replaceInstUsesWith(*II, V); 2972 } 2973 break; 2974 2975 case Intrinsic::x86_avx_vperm2f128_pd_256: 2976 case Intrinsic::x86_avx_vperm2f128_ps_256: 2977 case Intrinsic::x86_avx_vperm2f128_si_256: 2978 case Intrinsic::x86_avx2_vperm2i128: 2979 if (Value *V = simplifyX86vperm2(*II, Builder)) 2980 return replaceInstUsesWith(*II, V); 2981 break; 2982 2983 case Intrinsic::x86_avx_maskload_ps: 2984 case Intrinsic::x86_avx_maskload_pd: 2985 case Intrinsic::x86_avx_maskload_ps_256: 2986 case Intrinsic::x86_avx_maskload_pd_256: 2987 case Intrinsic::x86_avx2_maskload_d: 2988 case Intrinsic::x86_avx2_maskload_q: 2989 case Intrinsic::x86_avx2_maskload_d_256: 2990 case Intrinsic::x86_avx2_maskload_q_256: 2991 if (Instruction *I = simplifyX86MaskedLoad(*II, *this)) 2992 return I; 2993 break; 2994 2995 case Intrinsic::x86_sse2_maskmov_dqu: 2996 case Intrinsic::x86_avx_maskstore_ps: 2997 case Intrinsic::x86_avx_maskstore_pd: 2998 case Intrinsic::x86_avx_maskstore_ps_256: 2999 case Intrinsic::x86_avx_maskstore_pd_256: 3000 case Intrinsic::x86_avx2_maskstore_d: 3001 case Intrinsic::x86_avx2_maskstore_q: 3002 case Intrinsic::x86_avx2_maskstore_d_256: 3003 case Intrinsic::x86_avx2_maskstore_q_256: 3004 if (simplifyX86MaskedStore(*II, *this)) 3005 return nullptr; 3006 break; 3007 3008 case Intrinsic::x86_xop_vpcomb: 3009 case Intrinsic::x86_xop_vpcomd: 3010 case Intrinsic::x86_xop_vpcomq: 3011 case Intrinsic::x86_xop_vpcomw: 3012 if (Value *V = simplifyX86vpcom(*II, Builder, true)) 3013 return replaceInstUsesWith(*II, V); 3014 break; 3015 3016 case Intrinsic::x86_xop_vpcomub: 3017 case Intrinsic::x86_xop_vpcomud: 3018 case Intrinsic::x86_xop_vpcomuq: 3019 case Intrinsic::x86_xop_vpcomuw: 3020 if (Value *V = simplifyX86vpcom(*II, Builder, false)) 3021 return replaceInstUsesWith(*II, V); 3022 break; 3023 3024 case Intrinsic::ppc_altivec_vperm: 3025 // Turn vperm(V1,V2,mask) -> 
shuffle(V1,V2,mask) if mask is a constant. 3026 // Note that ppc_altivec_vperm has a big-endian bias, so when creating 3027 // a vectorshuffle for little endian, we must undo the transformation 3028 // performed on vec_perm in altivec.h. That is, we must complement 3029 // the permutation mask with respect to 31 and reverse the order of 3030 // V1 and V2. 3031 if (Constant *Mask = dyn_cast<Constant>(II->getArgOperand(2))) { 3032 assert(Mask->getType()->getVectorNumElements() == 16 && 3033 "Bad type for intrinsic!"); 3034 3035 // Check that all of the elements are integer constants or undefs. 3036 bool AllEltsOk = true; 3037 for (unsigned i = 0; i != 16; ++i) { 3038 Constant *Elt = Mask->getAggregateElement(i); 3039 if (!Elt || !(isa<ConstantInt>(Elt) || isa<UndefValue>(Elt))) { 3040 AllEltsOk = false; 3041 break; 3042 } 3043 } 3044 3045 if (AllEltsOk) { 3046 // Cast the input vectors to byte vectors. 3047 Value *Op0 = Builder.CreateBitCast(II->getArgOperand(0), 3048 Mask->getType()); 3049 Value *Op1 = Builder.CreateBitCast(II->getArgOperand(1), 3050 Mask->getType()); 3051 Value *Result = UndefValue::get(Op0->getType()); 3052 3053 // Only extract each element once. 3054 Value *ExtractedElts[32]; 3055 memset(ExtractedElts, 0, sizeof(ExtractedElts)); 3056 3057 for (unsigned i = 0; i != 16; ++i) { 3058 if (isa<UndefValue>(Mask->getAggregateElement(i))) 3059 continue; 3060 unsigned Idx = 3061 cast<ConstantInt>(Mask->getAggregateElement(i))->getZExtValue(); 3062 Idx &= 31; // Match the hardware behavior. 3063 if (DL.isLittleEndian()) 3064 Idx = 31 - Idx; 3065 3066 if (!ExtractedElts[Idx]) { 3067 Value *Op0ToUse = (DL.isLittleEndian()) ? Op1 : Op0; 3068 Value *Op1ToUse = (DL.isLittleEndian()) ? Op0 : Op1; 3069 ExtractedElts[Idx] = 3070 Builder.CreateExtractElement(Idx < 16 ? Op0ToUse : Op1ToUse, 3071 Builder.getInt32(Idx&15)); 3072 } 3073 3074 // Insert this value into the result vector. 
3075 Result = Builder.CreateInsertElement(Result, ExtractedElts[Idx], 3076 Builder.getInt32(i)); 3077 } 3078 return CastInst::Create(Instruction::BitCast, Result, CI.getType()); 3079 } 3080 } 3081 break; 3082 3083 case Intrinsic::arm_neon_vld1: 3084 case Intrinsic::arm_neon_vld2: 3085 case Intrinsic::arm_neon_vld3: 3086 case Intrinsic::arm_neon_vld4: 3087 case Intrinsic::arm_neon_vld2lane: 3088 case Intrinsic::arm_neon_vld3lane: 3089 case Intrinsic::arm_neon_vld4lane: 3090 case Intrinsic::arm_neon_vst1: 3091 case Intrinsic::arm_neon_vst2: 3092 case Intrinsic::arm_neon_vst3: 3093 case Intrinsic::arm_neon_vst4: 3094 case Intrinsic::arm_neon_vst2lane: 3095 case Intrinsic::arm_neon_vst3lane: 3096 case Intrinsic::arm_neon_vst4lane: { 3097 unsigned MemAlign = 3098 getKnownAlignment(II->getArgOperand(0), DL, II, &AC, &DT); 3099 unsigned AlignArg = II->getNumArgOperands() - 1; 3100 ConstantInt *IntrAlign = dyn_cast<ConstantInt>(II->getArgOperand(AlignArg)); 3101 if (IntrAlign && IntrAlign->getZExtValue() < MemAlign) { 3102 II->setArgOperand(AlignArg, 3103 ConstantInt::get(Type::getInt32Ty(II->getContext()), 3104 MemAlign, false)); 3105 return II; 3106 } 3107 break; 3108 } 3109 3110 case Intrinsic::arm_neon_vmulls: 3111 case Intrinsic::arm_neon_vmullu: 3112 case Intrinsic::aarch64_neon_smull: 3113 case Intrinsic::aarch64_neon_umull: { 3114 Value *Arg0 = II->getArgOperand(0); 3115 Value *Arg1 = II->getArgOperand(1); 3116 3117 // Handle mul by zero first: 3118 if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1)) { 3119 return replaceInstUsesWith(CI, ConstantAggregateZero::get(II->getType())); 3120 } 3121 3122 // Check for constant LHS & RHS - in this case we just simplify. 
3123 bool Zext = (II->getIntrinsicID() == Intrinsic::arm_neon_vmullu || 3124 II->getIntrinsicID() == Intrinsic::aarch64_neon_umull); 3125 VectorType *NewVT = cast<VectorType>(II->getType()); 3126 if (Constant *CV0 = dyn_cast<Constant>(Arg0)) { 3127 if (Constant *CV1 = dyn_cast<Constant>(Arg1)) { 3128 CV0 = ConstantExpr::getIntegerCast(CV0, NewVT, /*isSigned=*/!Zext); 3129 CV1 = ConstantExpr::getIntegerCast(CV1, NewVT, /*isSigned=*/!Zext); 3130 3131 return replaceInstUsesWith(CI, ConstantExpr::getMul(CV0, CV1)); 3132 } 3133 3134 // Couldn't simplify - canonicalize constant to the RHS. 3135 std::swap(Arg0, Arg1); 3136 } 3137 3138 // Handle mul by one: 3139 if (Constant *CV1 = dyn_cast<Constant>(Arg1)) 3140 if (ConstantInt *Splat = 3141 dyn_cast_or_null<ConstantInt>(CV1->getSplatValue())) 3142 if (Splat->isOne()) 3143 return CastInst::CreateIntegerCast(Arg0, II->getType(), 3144 /*isSigned=*/!Zext); 3145 3146 break; 3147 } 3148 case Intrinsic::amdgcn_rcp: { 3149 Value *Src = II->getArgOperand(0); 3150 3151 // TODO: Move to ConstantFolding/InstSimplify? 3152 if (isa<UndefValue>(Src)) 3153 return replaceInstUsesWith(CI, Src); 3154 3155 if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) { 3156 const APFloat &ArgVal = C->getValueAPF(); 3157 APFloat Val(ArgVal.getSemantics(), 1.0); 3158 APFloat::opStatus Status = Val.divide(ArgVal, 3159 APFloat::rmNearestTiesToEven); 3160 // Only do this if it was exact and therefore not dependent on the 3161 // rounding mode. 3162 if (Status == APFloat::opOK) 3163 return replaceInstUsesWith(CI, ConstantFP::get(II->getContext(), Val)); 3164 } 3165 3166 break; 3167 } 3168 case Intrinsic::amdgcn_rsq: { 3169 Value *Src = II->getArgOperand(0); 3170 3171 // TODO: Move to ConstantFolding/InstSimplify? 
3172 if (isa<UndefValue>(Src)) 3173 return replaceInstUsesWith(CI, Src); 3174 break; 3175 } 3176 case Intrinsic::amdgcn_frexp_mant: 3177 case Intrinsic::amdgcn_frexp_exp: { 3178 Value *Src = II->getArgOperand(0); 3179 if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) { 3180 int Exp; 3181 APFloat Significand = frexp(C->getValueAPF(), Exp, 3182 APFloat::rmNearestTiesToEven); 3183 3184 if (II->getIntrinsicID() == Intrinsic::amdgcn_frexp_mant) { 3185 return replaceInstUsesWith(CI, ConstantFP::get(II->getContext(), 3186 Significand)); 3187 } 3188 3189 // Match instruction special case behavior. 3190 if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf) 3191 Exp = 0; 3192 3193 return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Exp)); 3194 } 3195 3196 if (isa<UndefValue>(Src)) 3197 return replaceInstUsesWith(CI, UndefValue::get(II->getType())); 3198 3199 break; 3200 } 3201 case Intrinsic::amdgcn_class: { 3202 enum { 3203 S_NAN = 1 << 0, // Signaling NaN 3204 Q_NAN = 1 << 1, // Quiet NaN 3205 N_INFINITY = 1 << 2, // Negative infinity 3206 N_NORMAL = 1 << 3, // Negative normal 3207 N_SUBNORMAL = 1 << 4, // Negative subnormal 3208 N_ZERO = 1 << 5, // Negative zero 3209 P_ZERO = 1 << 6, // Positive zero 3210 P_SUBNORMAL = 1 << 7, // Positive subnormal 3211 P_NORMAL = 1 << 8, // Positive normal 3212 P_INFINITY = 1 << 9 // Positive infinity 3213 }; 3214 3215 const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL | 3216 N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL | P_NORMAL | P_INFINITY; 3217 3218 Value *Src0 = II->getArgOperand(0); 3219 Value *Src1 = II->getArgOperand(1); 3220 const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1); 3221 if (!CMask) { 3222 if (isa<UndefValue>(Src0)) 3223 return replaceInstUsesWith(*II, UndefValue::get(II->getType())); 3224 3225 if (isa<UndefValue>(Src1)) 3226 return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), false)); 3227 break; 3228 } 3229 3230 uint32_t Mask = CMask->getZExtValue(); 3231 3232 // If all 
tests are made, it doesn't matter what the value is. 3233 if ((Mask & FullMask) == FullMask) 3234 return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), true)); 3235 3236 if ((Mask & FullMask) == 0) 3237 return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), false)); 3238 3239 if (Mask == (S_NAN | Q_NAN)) { 3240 // Equivalent of isnan. Replace with standard fcmp. 3241 Value *FCmp = Builder.CreateFCmpUNO(Src0, Src0); 3242 FCmp->takeName(II); 3243 return replaceInstUsesWith(*II, FCmp); 3244 } 3245 3246 const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0); 3247 if (!CVal) { 3248 if (isa<UndefValue>(Src0)) 3249 return replaceInstUsesWith(*II, UndefValue::get(II->getType())); 3250 3251 // Clamp mask to used bits 3252 if ((Mask & FullMask) != Mask) { 3253 CallInst *NewCall = Builder.CreateCall(II->getCalledFunction(), 3254 { Src0, ConstantInt::get(Src1->getType(), Mask & FullMask) } 3255 ); 3256 3257 NewCall->takeName(II); 3258 return replaceInstUsesWith(*II, NewCall); 3259 } 3260 3261 break; 3262 } 3263 3264 const APFloat &Val = CVal->getValueAPF(); 3265 3266 bool Result = 3267 ((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) || 3268 ((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) || 3269 ((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) || 3270 ((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) || 3271 ((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) || 3272 ((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) || 3273 ((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) || 3274 ((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) || 3275 ((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) || 3276 ((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative()); 3277 3278 return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), Result)); 3279 } 3280 case Intrinsic::amdgcn_cvt_pkrtz: { 3281 Value *Src0 = II->getArgOperand(0); 3282 Value *Src1 = II->getArgOperand(1); 3283 if (const 
ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) { 3284 if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) { 3285 const fltSemantics &HalfSem 3286 = II->getType()->getScalarType()->getFltSemantics(); 3287 bool LosesInfo; 3288 APFloat Val0 = C0->getValueAPF(); 3289 APFloat Val1 = C1->getValueAPF(); 3290 Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo); 3291 Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo); 3292 3293 Constant *Folded = ConstantVector::get({ 3294 ConstantFP::get(II->getContext(), Val0), 3295 ConstantFP::get(II->getContext(), Val1) }); 3296 return replaceInstUsesWith(*II, Folded); 3297 } 3298 } 3299 3300 if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) 3301 return replaceInstUsesWith(*II, UndefValue::get(II->getType())); 3302 3303 break; 3304 } 3305 case Intrinsic::amdgcn_ubfe: 3306 case Intrinsic::amdgcn_sbfe: { 3307 // Decompose simple cases into standard shifts. 3308 Value *Src = II->getArgOperand(0); 3309 if (isa<UndefValue>(Src)) 3310 return replaceInstUsesWith(*II, Src); 3311 3312 unsigned Width; 3313 Type *Ty = II->getType(); 3314 unsigned IntSize = Ty->getIntegerBitWidth(); 3315 3316 ConstantInt *CWidth = dyn_cast<ConstantInt>(II->getArgOperand(2)); 3317 if (CWidth) { 3318 Width = CWidth->getZExtValue(); 3319 if ((Width & (IntSize - 1)) == 0) 3320 return replaceInstUsesWith(*II, ConstantInt::getNullValue(Ty)); 3321 3322 if (Width >= IntSize) { 3323 // Hardware ignores high bits, so remove those. 
3324 II->setArgOperand(2, ConstantInt::get(CWidth->getType(), 3325 Width & (IntSize - 1))); 3326 return II; 3327 } 3328 } 3329 3330 unsigned Offset; 3331 ConstantInt *COffset = dyn_cast<ConstantInt>(II->getArgOperand(1)); 3332 if (COffset) { 3333 Offset = COffset->getZExtValue(); 3334 if (Offset >= IntSize) { 3335 II->setArgOperand(1, ConstantInt::get(COffset->getType(), 3336 Offset & (IntSize - 1))); 3337 return II; 3338 } 3339 } 3340 3341 bool Signed = II->getIntrinsicID() == Intrinsic::amdgcn_sbfe; 3342 3343 // TODO: Also emit sub if only width is constant. 3344 if (!CWidth && COffset && Offset == 0) { 3345 Constant *KSize = ConstantInt::get(COffset->getType(), IntSize); 3346 Value *ShiftVal = Builder.CreateSub(KSize, II->getArgOperand(2)); 3347 ShiftVal = Builder.CreateZExt(ShiftVal, II->getType()); 3348 3349 Value *Shl = Builder.CreateShl(Src, ShiftVal); 3350 Value *RightShift = Signed ? Builder.CreateAShr(Shl, ShiftVal) 3351 : Builder.CreateLShr(Shl, ShiftVal); 3352 RightShift->takeName(II); 3353 return replaceInstUsesWith(*II, RightShift); 3354 } 3355 3356 if (!CWidth || !COffset) 3357 break; 3358 3359 // TODO: This allows folding to undef when the hardware has specific 3360 // behavior? 3361 if (Offset + Width < IntSize) { 3362 Value *Shl = Builder.CreateShl(Src, IntSize - Offset - Width); 3363 Value *RightShift = Signed ? Builder.CreateAShr(Shl, IntSize - Width) 3364 : Builder.CreateLShr(Shl, IntSize - Width); 3365 RightShift->takeName(II); 3366 return replaceInstUsesWith(*II, RightShift); 3367 } 3368 3369 Value *RightShift = Signed ? Builder.CreateAShr(Src, Offset) 3370 : Builder.CreateLShr(Src, Offset); 3371 3372 RightShift->takeName(II); 3373 return replaceInstUsesWith(*II, RightShift); 3374 } 3375 case Intrinsic::amdgcn_exp: 3376 case Intrinsic::amdgcn_exp_compr: { 3377 ConstantInt *En = dyn_cast<ConstantInt>(II->getArgOperand(1)); 3378 if (!En) // Illegal. 
3379 break; 3380 3381 unsigned EnBits = En->getZExtValue(); 3382 if (EnBits == 0xf) 3383 break; // All inputs enabled. 3384 3385 bool IsCompr = II->getIntrinsicID() == Intrinsic::amdgcn_exp_compr; 3386 bool Changed = false; 3387 for (int I = 0; I < (IsCompr ? 2 : 4); ++I) { 3388 if ((!IsCompr && (EnBits & (1 << I)) == 0) || 3389 (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) { 3390 Value *Src = II->getArgOperand(I + 2); 3391 if (!isa<UndefValue>(Src)) { 3392 II->setArgOperand(I + 2, UndefValue::get(Src->getType())); 3393 Changed = true; 3394 } 3395 } 3396 } 3397 3398 if (Changed) 3399 return II; 3400 3401 break; 3402 3403 } 3404 case Intrinsic::amdgcn_fmed3: { 3405 // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled 3406 // for the shader. 3407 3408 Value *Src0 = II->getArgOperand(0); 3409 Value *Src1 = II->getArgOperand(1); 3410 Value *Src2 = II->getArgOperand(2); 3411 3412 bool Swap = false; 3413 // Canonicalize constants to RHS operands. 3414 // 3415 // fmed3(c0, x, c1) -> fmed3(x, c0, c1) 3416 if (isa<Constant>(Src0) && !isa<Constant>(Src1)) { 3417 std::swap(Src0, Src1); 3418 Swap = true; 3419 } 3420 3421 if (isa<Constant>(Src1) && !isa<Constant>(Src2)) { 3422 std::swap(Src1, Src2); 3423 Swap = true; 3424 } 3425 3426 if (isa<Constant>(Src0) && !isa<Constant>(Src1)) { 3427 std::swap(Src0, Src1); 3428 Swap = true; 3429 } 3430 3431 if (Swap) { 3432 II->setArgOperand(0, Src0); 3433 II->setArgOperand(1, Src1); 3434 II->setArgOperand(2, Src2); 3435 return II; 3436 } 3437 3438 if (match(Src2, m_NaN()) || isa<UndefValue>(Src2)) { 3439 CallInst *NewCall = Builder.CreateMinNum(Src0, Src1); 3440 NewCall->copyFastMathFlags(II); 3441 NewCall->takeName(II); 3442 return replaceInstUsesWith(*II, NewCall); 3443 } 3444 3445 if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) { 3446 if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) { 3447 if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) { 3448 APFloat Result = fmed3AMDGCN(C0->getValueAPF(), 
C1->getValueAPF(), 3449 C2->getValueAPF()); 3450 return replaceInstUsesWith(*II, 3451 ConstantFP::get(Builder.getContext(), Result)); 3452 } 3453 } 3454 } 3455 3456 break; 3457 } 3458 case Intrinsic::amdgcn_icmp: 3459 case Intrinsic::amdgcn_fcmp: { 3460 const ConstantInt *CC = dyn_cast<ConstantInt>(II->getArgOperand(2)); 3461 if (!CC) 3462 break; 3463 3464 // Guard against invalid arguments. 3465 int64_t CCVal = CC->getZExtValue(); 3466 bool IsInteger = II->getIntrinsicID() == Intrinsic::amdgcn_icmp; 3467 if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE || 3468 CCVal > CmpInst::LAST_ICMP_PREDICATE)) || 3469 (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE || 3470 CCVal > CmpInst::LAST_FCMP_PREDICATE))) 3471 break; 3472 3473 Value *Src0 = II->getArgOperand(0); 3474 Value *Src1 = II->getArgOperand(1); 3475 3476 if (auto *CSrc0 = dyn_cast<Constant>(Src0)) { 3477 if (auto *CSrc1 = dyn_cast<Constant>(Src1)) { 3478 Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1); 3479 if (CCmp->isNullValue()) { 3480 return replaceInstUsesWith( 3481 *II, ConstantExpr::getSExt(CCmp, II->getType())); 3482 } 3483 3484 // The result of V_ICMP/V_FCMP assembly instructions (which this 3485 // intrinsic exposes) is one bit per thread, masked with the EXEC 3486 // register (which contains the bitmask of live threads). So a 3487 // comparison that always returns true is the same as a read of the 3488 // EXEC register. 
3489 Value *NewF = Intrinsic::getDeclaration( 3490 II->getModule(), Intrinsic::read_register, II->getType()); 3491 Metadata *MDArgs[] = {MDString::get(II->getContext(), "exec")}; 3492 MDNode *MD = MDNode::get(II->getContext(), MDArgs); 3493 Value *Args[] = {MetadataAsValue::get(II->getContext(), MD)}; 3494 CallInst *NewCall = Builder.CreateCall(NewF, Args); 3495 NewCall->addAttribute(AttributeList::FunctionIndex, 3496 Attribute::Convergent); 3497 NewCall->takeName(II); 3498 return replaceInstUsesWith(*II, NewCall); 3499 } 3500 3501 // Canonicalize constants to RHS. 3502 CmpInst::Predicate SwapPred 3503 = CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal)); 3504 II->setArgOperand(0, Src1); 3505 II->setArgOperand(1, Src0); 3506 II->setArgOperand(2, ConstantInt::get(CC->getType(), 3507 static_cast<int>(SwapPred))); 3508 return II; 3509 } 3510 3511 if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE) 3512 break; 3513 3514 // Canonicalize compare eq with true value to compare != 0 3515 // llvm.amdgcn.icmp(zext (i1 x), 1, eq) 3516 // -> llvm.amdgcn.icmp(zext (i1 x), 0, ne) 3517 // llvm.amdgcn.icmp(sext (i1 x), -1, eq) 3518 // -> llvm.amdgcn.icmp(sext (i1 x), 0, ne) 3519 Value *ExtSrc; 3520 if (CCVal == CmpInst::ICMP_EQ && 3521 ((match(Src1, m_One()) && match(Src0, m_ZExt(m_Value(ExtSrc)))) || 3522 (match(Src1, m_AllOnes()) && match(Src0, m_SExt(m_Value(ExtSrc))))) && 3523 ExtSrc->getType()->isIntegerTy(1)) { 3524 II->setArgOperand(1, ConstantInt::getNullValue(Src1->getType())); 3525 II->setArgOperand(2, ConstantInt::get(CC->getType(), CmpInst::ICMP_NE)); 3526 return II; 3527 } 3528 3529 CmpInst::Predicate SrcPred; 3530 Value *SrcLHS; 3531 Value *SrcRHS; 3532 3533 // Fold compare eq/ne with 0 from a compare result as the predicate to the 3534 // intrinsic. The typical use is a wave vote function in the library, which 3535 // will be fed from a user code condition compared with 0. Fold in the 3536 // redundant compare. 
3537 3538 // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne) 3539 // -> llvm.amdgcn.[if]cmp(a, b, pred) 3540 // 3541 // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq) 3542 // -> llvm.amdgcn.[if]cmp(a, b, inv pred) 3543 if (match(Src1, m_Zero()) && 3544 match(Src0, 3545 m_ZExtOrSExt(m_Cmp(SrcPred, m_Value(SrcLHS), m_Value(SrcRHS))))) { 3546 if (CCVal == CmpInst::ICMP_EQ) 3547 SrcPred = CmpInst::getInversePredicate(SrcPred); 3548 3549 Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred) ? 3550 Intrinsic::amdgcn_fcmp : Intrinsic::amdgcn_icmp; 3551 3552 Value *NewF = Intrinsic::getDeclaration(II->getModule(), NewIID, 3553 SrcLHS->getType()); 3554 Value *Args[] = { SrcLHS, SrcRHS, 3555 ConstantInt::get(CC->getType(), SrcPred) }; 3556 CallInst *NewCall = Builder.CreateCall(NewF, Args); 3557 NewCall->takeName(II); 3558 return replaceInstUsesWith(*II, NewCall); 3559 } 3560 3561 break; 3562 } 3563 case Intrinsic::stackrestore: { 3564 // If the save is right next to the restore, remove the restore. This can 3565 // happen when variable allocas are DCE'd. 3566 if (IntrinsicInst *SS = dyn_cast<IntrinsicInst>(II->getArgOperand(0))) { 3567 if (SS->getIntrinsicID() == Intrinsic::stacksave) { 3568 if (&*++SS->getIterator() == II) 3569 return eraseInstFromFunction(CI); 3570 } 3571 } 3572 3573 // Scan down this block to see if there is another stack restore in the 3574 // same block without an intervening call/alloca. 3575 BasicBlock::iterator BI(II); 3576 TerminatorInst *TI = II->getParent()->getTerminator(); 3577 bool CannotRemove = false; 3578 for (++BI; &*BI != TI; ++BI) { 3579 if (isa<AllocaInst>(BI)) { 3580 CannotRemove = true; 3581 break; 3582 } 3583 if (CallInst *BCI = dyn_cast<CallInst>(BI)) { 3584 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(BCI)) { 3585 // If there is a stackrestore below this one, remove this one. 
3586 if (II->getIntrinsicID() == Intrinsic::stackrestore) 3587 return eraseInstFromFunction(CI); 3588 3589 // Bail if we cross over an intrinsic with side effects, such as 3590 // llvm.stacksave, llvm.read_register, or llvm.setjmp. 3591 if (II->mayHaveSideEffects()) { 3592 CannotRemove = true; 3593 break; 3594 } 3595 } else { 3596 // If we found a non-intrinsic call, we can't remove the stack 3597 // restore. 3598 CannotRemove = true; 3599 break; 3600 } 3601 } 3602 } 3603 3604 // If the stack restore is in a return, resume, or unwind block and if there 3605 // are no allocas or calls between the restore and the return, nuke the 3606 // restore. 3607 if (!CannotRemove && (isa<ReturnInst>(TI) || isa<ResumeInst>(TI))) 3608 return eraseInstFromFunction(CI); 3609 break; 3610 } 3611 case Intrinsic::lifetime_start: 3612 // Asan needs to poison memory to detect invalid access which is possible 3613 // even for empty lifetime range. 3614 if (II->getFunction()->hasFnAttribute(Attribute::SanitizeAddress)) 3615 break; 3616 3617 if (removeTriviallyEmptyRange(*II, Intrinsic::lifetime_start, 3618 Intrinsic::lifetime_end, *this)) 3619 return nullptr; 3620 break; 3621 case Intrinsic::assume: { 3622 Value *IIOperand = II->getArgOperand(0); 3623 // Remove an assume if it is immediately followed by an identical assume. 3624 if (match(II->getNextNode(), 3625 m_Intrinsic<Intrinsic::assume>(m_Specific(IIOperand)))) 3626 return eraseInstFromFunction(CI); 3627 3628 // Canonicalize assume(a && b) -> assume(a); assume(b); 3629 // Note: New assumption intrinsics created here are registered by 3630 // the InstCombineIRInserter object. 
3631 Value *AssumeIntrinsic = II->getCalledValue(), *A, *B; 3632 if (match(IIOperand, m_And(m_Value(A), m_Value(B)))) { 3633 Builder.CreateCall(AssumeIntrinsic, A, II->getName()); 3634 Builder.CreateCall(AssumeIntrinsic, B, II->getName()); 3635 return eraseInstFromFunction(*II); 3636 } 3637 // assume(!(a || b)) -> assume(!a); assume(!b); 3638 if (match(IIOperand, m_Not(m_Or(m_Value(A), m_Value(B))))) { 3639 Builder.CreateCall(AssumeIntrinsic, Builder.CreateNot(A), II->getName()); 3640 Builder.CreateCall(AssumeIntrinsic, Builder.CreateNot(B), II->getName()); 3641 return eraseInstFromFunction(*II); 3642 } 3643 3644 // assume( (load addr) != null ) -> add 'nonnull' metadata to load 3645 // (if assume is valid at the load) 3646 CmpInst::Predicate Pred; 3647 Instruction *LHS; 3648 if (match(IIOperand, m_ICmp(Pred, m_Instruction(LHS), m_Zero())) && 3649 Pred == ICmpInst::ICMP_NE && LHS->getOpcode() == Instruction::Load && 3650 LHS->getType()->isPointerTy() && 3651 isValidAssumeForContext(II, LHS, &DT)) { 3652 MDNode *MD = MDNode::get(II->getContext(), None); 3653 LHS->setMetadata(LLVMContext::MD_nonnull, MD); 3654 return eraseInstFromFunction(*II); 3655 3656 // TODO: apply nonnull return attributes to calls and invokes 3657 // TODO: apply range metadata for range check patterns? 3658 } 3659 3660 // If there is a dominating assume with the same condition as this one, 3661 // then this one is redundant, and should be removed. 3662 KnownBits Known(1); 3663 computeKnownBits(IIOperand, Known, 0, II); 3664 if (Known.isAllOnes()) 3665 return eraseInstFromFunction(*II); 3666 3667 // Update the cache of affected values for this assumption (we might be 3668 // here because we just simplified the condition). 3669 AC.updateAffectedValues(II); 3670 break; 3671 } 3672 case Intrinsic::experimental_gc_relocate: { 3673 // Translate facts known about a pointer before relocating into 3674 // facts about the relocate value, while being careful to 3675 // preserve relocation semantics. 
3676 Value *DerivedPtr = cast<GCRelocateInst>(II)->getDerivedPtr(); 3677 3678 // Remove the relocation if unused, note that this check is required 3679 // to prevent the cases below from looping forever. 3680 if (II->use_empty()) 3681 return eraseInstFromFunction(*II); 3682 3683 // Undef is undef, even after relocation. 3684 // TODO: provide a hook for this in GCStrategy. This is clearly legal for 3685 // most practical collectors, but there was discussion in the review thread 3686 // about whether it was legal for all possible collectors. 3687 if (isa<UndefValue>(DerivedPtr)) 3688 // Use undef of gc_relocate's type to replace it. 3689 return replaceInstUsesWith(*II, UndefValue::get(II->getType())); 3690 3691 if (auto *PT = dyn_cast<PointerType>(II->getType())) { 3692 // The relocation of null will be null for most any collector. 3693 // TODO: provide a hook for this in GCStrategy. There might be some 3694 // weird collector this property does not hold for. 3695 if (isa<ConstantPointerNull>(DerivedPtr)) 3696 // Use null-pointer of gc_relocate's type to replace it. 3697 return replaceInstUsesWith(*II, ConstantPointerNull::get(PT)); 3698 3699 // isKnownNonNull -> nonnull attribute 3700 if (isKnownNonNullAt(DerivedPtr, II, &DT)) 3701 II->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull); 3702 } 3703 3704 // TODO: bitcast(relocate(p)) -> relocate(bitcast(p)) 3705 // Canonicalize on the type from the uses to the defs 3706 3707 // TODO: relocate((gep p, C, C2, ...)) -> gep(relocate(p), C, C2, ...) 3708 break; 3709 } 3710 3711 case Intrinsic::experimental_guard: { 3712 // Is this guard followed by another guard? 3713 Instruction *NextInst = II->getNextNode(); 3714 Value *NextCond = nullptr; 3715 if (match(NextInst, 3716 m_Intrinsic<Intrinsic::experimental_guard>(m_Value(NextCond)))) { 3717 Value *CurrCond = II->getArgOperand(0); 3718 3719 // Remove a guard that it is immediately preceded by an identical guard. 
3720 if (CurrCond == NextCond) 3721 return eraseInstFromFunction(*NextInst); 3722 3723 // Otherwise canonicalize guard(a); guard(b) -> guard(a & b). 3724 II->setArgOperand(0, Builder.CreateAnd(CurrCond, NextCond)); 3725 return eraseInstFromFunction(*NextInst); 3726 } 3727 break; 3728 } 3729 } 3730 return visitCallSite(II); 3731 } 3732 3733 // Fence instruction simplification 3734 Instruction *InstCombiner::visitFenceInst(FenceInst &FI) { 3735 // Remove identical consecutive fences. 3736 if (auto *NFI = dyn_cast<FenceInst>(FI.getNextNode())) 3737 if (FI.isIdenticalTo(NFI)) 3738 return eraseInstFromFunction(FI); 3739 return nullptr; 3740 } 3741 3742 // InvokeInst simplification 3743 // 3744 Instruction *InstCombiner::visitInvokeInst(InvokeInst &II) { 3745 return visitCallSite(&II); 3746 } 3747 3748 /// If this cast does not affect the value passed through the varargs area, we 3749 /// can eliminate the use of the cast. 3750 static bool isSafeToEliminateVarargsCast(const CallSite CS, 3751 const DataLayout &DL, 3752 const CastInst *const CI, 3753 const int ix) { 3754 if (!CI->isLosslessCast()) 3755 return false; 3756 3757 // If this is a GC intrinsic, avoid munging types. We need types for 3758 // statepoint reconstruction in SelectionDAG. 3759 // TODO: This is probably something which should be expanded to all 3760 // intrinsics since the entire point of intrinsics is that 3761 // they are understandable by the optimizer. 3762 if (isStatepoint(CS) || isGCRelocate(CS) || isGCResult(CS)) 3763 return false; 3764 3765 // The size of ByVal or InAlloca arguments is derived from the type, so we 3766 // can't change to a type with a different size. If the size were 3767 // passed explicitly we could avoid this check. 
3768 if (!CS.isByValOrInAllocaArgument(ix)) 3769 return true; 3770 3771 Type* SrcTy = 3772 cast<PointerType>(CI->getOperand(0)->getType())->getElementType(); 3773 Type* DstTy = cast<PointerType>(CI->getType())->getElementType(); 3774 if (!SrcTy->isSized() || !DstTy->isSized()) 3775 return false; 3776 if (DL.getTypeAllocSize(SrcTy) != DL.getTypeAllocSize(DstTy)) 3777 return false; 3778 return true; 3779 } 3780 3781 Instruction *InstCombiner::tryOptimizeCall(CallInst *CI) { 3782 if (!CI->getCalledFunction()) return nullptr; 3783 3784 auto InstCombineRAUW = [this](Instruction *From, Value *With) { 3785 replaceInstUsesWith(*From, With); 3786 }; 3787 LibCallSimplifier Simplifier(DL, &TLI, InstCombineRAUW); 3788 if (Value *With = Simplifier.optimizeCall(CI)) { 3789 ++NumSimplified; 3790 return CI->use_empty() ? CI : replaceInstUsesWith(*CI, With); 3791 } 3792 3793 return nullptr; 3794 } 3795 3796 static IntrinsicInst *findInitTrampolineFromAlloca(Value *TrampMem) { 3797 // Strip off at most one level of pointer casts, looking for an alloca. This 3798 // is good enough in practice and simpler than handling any number of casts. 3799 Value *Underlying = TrampMem->stripPointerCasts(); 3800 if (Underlying != TrampMem && 3801 (!Underlying->hasOneUse() || Underlying->user_back() != TrampMem)) 3802 return nullptr; 3803 if (!isa<AllocaInst>(Underlying)) 3804 return nullptr; 3805 3806 IntrinsicInst *InitTrampoline = nullptr; 3807 for (User *U : TrampMem->users()) { 3808 IntrinsicInst *II = dyn_cast<IntrinsicInst>(U); 3809 if (!II) 3810 return nullptr; 3811 if (II->getIntrinsicID() == Intrinsic::init_trampoline) { 3812 if (InitTrampoline) 3813 // More than one init_trampoline writes to this value. Give up. 3814 return nullptr; 3815 InitTrampoline = II; 3816 continue; 3817 } 3818 if (II->getIntrinsicID() == Intrinsic::adjust_trampoline) 3819 // Allow any number of calls to adjust.trampoline. 3820 continue; 3821 return nullptr; 3822 } 3823 3824 // No call to init.trampoline found. 
3825 if (!InitTrampoline) 3826 return nullptr; 3827 3828 // Check that the alloca is being used in the expected way. 3829 if (InitTrampoline->getOperand(0) != TrampMem) 3830 return nullptr; 3831 3832 return InitTrampoline; 3833 } 3834 3835 static IntrinsicInst *findInitTrampolineFromBB(IntrinsicInst *AdjustTramp, 3836 Value *TrampMem) { 3837 // Visit all the previous instructions in the basic block, and try to find a 3838 // init.trampoline which has a direct path to the adjust.trampoline. 3839 for (BasicBlock::iterator I = AdjustTramp->getIterator(), 3840 E = AdjustTramp->getParent()->begin(); 3841 I != E;) { 3842 Instruction *Inst = &*--I; 3843 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) 3844 if (II->getIntrinsicID() == Intrinsic::init_trampoline && 3845 II->getOperand(0) == TrampMem) 3846 return II; 3847 if (Inst->mayWriteToMemory()) 3848 return nullptr; 3849 } 3850 return nullptr; 3851 } 3852 3853 // Given a call to llvm.adjust.trampoline, find and return the corresponding 3854 // call to llvm.init.trampoline if the call to the trampoline can be optimized 3855 // to a direct call to a function. Otherwise return NULL. 3856 // 3857 static IntrinsicInst *findInitTrampoline(Value *Callee) { 3858 Callee = Callee->stripPointerCasts(); 3859 IntrinsicInst *AdjustTramp = dyn_cast<IntrinsicInst>(Callee); 3860 if (!AdjustTramp || 3861 AdjustTramp->getIntrinsicID() != Intrinsic::adjust_trampoline) 3862 return nullptr; 3863 3864 Value *TrampMem = AdjustTramp->getOperand(0); 3865 3866 if (IntrinsicInst *IT = findInitTrampolineFromAlloca(TrampMem)) 3867 return IT; 3868 if (IntrinsicInst *IT = findInitTrampolineFromBB(AdjustTramp, TrampMem)) 3869 return IT; 3870 return nullptr; 3871 } 3872 3873 /// Improvements for call and invoke instructions. 
3874 Instruction *InstCombiner::visitCallSite(CallSite CS) { 3875 if (isAllocLikeFn(CS.getInstruction(), &TLI)) 3876 return visitAllocSite(*CS.getInstruction()); 3877 3878 bool Changed = false; 3879 3880 // Mark any parameters that are known to be non-null with the nonnull 3881 // attribute. This is helpful for inlining calls to functions with null 3882 // checks on their arguments. 3883 SmallVector<unsigned, 4> ArgNos; 3884 unsigned ArgNo = 0; 3885 3886 for (Value *V : CS.args()) { 3887 if (V->getType()->isPointerTy() && 3888 !CS.paramHasAttr(ArgNo, Attribute::NonNull) && 3889 isKnownNonNullAt(V, CS.getInstruction(), &DT)) 3890 ArgNos.push_back(ArgNo); 3891 ArgNo++; 3892 } 3893 3894 assert(ArgNo == CS.arg_size() && "sanity check"); 3895 3896 if (!ArgNos.empty()) { 3897 AttributeList AS = CS.getAttributes(); 3898 LLVMContext &Ctx = CS.getInstruction()->getContext(); 3899 AS = AS.addParamAttribute(Ctx, ArgNos, 3900 Attribute::get(Ctx, Attribute::NonNull)); 3901 CS.setAttributes(AS); 3902 Changed = true; 3903 } 3904 3905 // If the callee is a pointer to a function, attempt to move any casts to the 3906 // arguments of the call/invoke. 3907 Value *Callee = CS.getCalledValue(); 3908 if (!isa<Function>(Callee) && transformConstExprCastCall(CS)) 3909 return nullptr; 3910 3911 if (Function *CalleeF = dyn_cast<Function>(Callee)) { 3912 // Remove the convergent attr on calls when the callee is not convergent. 3913 if (CS.isConvergent() && !CalleeF->isConvergent() && 3914 !CalleeF->isIntrinsic()) { 3915 DEBUG(dbgs() << "Removing convergent attr from instr " 3916 << CS.getInstruction() << "\n"); 3917 CS.setNotConvergent(); 3918 return CS.getInstruction(); 3919 } 3920 3921 // If the call and callee calling conventions don't match, this call must 3922 // be unreachable, as the call is undefined. 3923 if (CalleeF->getCallingConv() != CS.getCallingConv() && 3924 // Only do this for calls to a function with a body. 
A prototype may 3925 // not actually end up matching the implementation's calling conv for a 3926 // variety of reasons (e.g. it may be written in assembly). 3927 !CalleeF->isDeclaration()) { 3928 Instruction *OldCall = CS.getInstruction(); 3929 new StoreInst(ConstantInt::getTrue(Callee->getContext()), 3930 UndefValue::get(Type::getInt1PtrTy(Callee->getContext())), 3931 OldCall); 3932 // If OldCall does not return void then replaceAllUsesWith undef. 3933 // This allows ValueHandlers and custom metadata to adjust itself. 3934 if (!OldCall->getType()->isVoidTy()) 3935 replaceInstUsesWith(*OldCall, UndefValue::get(OldCall->getType())); 3936 if (isa<CallInst>(OldCall)) 3937 return eraseInstFromFunction(*OldCall); 3938 3939 // We cannot remove an invoke, because it would change the CFG, just 3940 // change the callee to a null pointer. 3941 cast<InvokeInst>(OldCall)->setCalledFunction( 3942 Constant::getNullValue(CalleeF->getType())); 3943 return nullptr; 3944 } 3945 } 3946 3947 if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) { 3948 // If CS does not return void then replaceAllUsesWith undef. 3949 // This allows ValueHandlers and custom metadata to adjust itself. 3950 if (!CS.getInstruction()->getType()->isVoidTy()) 3951 replaceInstUsesWith(*CS.getInstruction(), 3952 UndefValue::get(CS.getInstruction()->getType())); 3953 3954 if (isa<InvokeInst>(CS.getInstruction())) { 3955 // Can't remove an invoke because we cannot change the CFG. 3956 return nullptr; 3957 } 3958 3959 // This instruction is not reachable, just remove it. We insert a store to 3960 // undef so that we know that this code is not reachable, despite the fact 3961 // that we can't modify the CFG here. 
3962 new StoreInst(ConstantInt::getTrue(Callee->getContext()), 3963 UndefValue::get(Type::getInt1PtrTy(Callee->getContext())), 3964 CS.getInstruction()); 3965 3966 return eraseInstFromFunction(*CS.getInstruction()); 3967 } 3968 3969 if (IntrinsicInst *II = findInitTrampoline(Callee)) 3970 return transformCallThroughTrampoline(CS, II); 3971 3972 PointerType *PTy = cast<PointerType>(Callee->getType()); 3973 FunctionType *FTy = cast<FunctionType>(PTy->getElementType()); 3974 if (FTy->isVarArg()) { 3975 int ix = FTy->getNumParams(); 3976 // See if we can optimize any arguments passed through the varargs area of 3977 // the call. 3978 for (CallSite::arg_iterator I = CS.arg_begin() + FTy->getNumParams(), 3979 E = CS.arg_end(); I != E; ++I, ++ix) { 3980 CastInst *CI = dyn_cast<CastInst>(*I); 3981 if (CI && isSafeToEliminateVarargsCast(CS, DL, CI, ix)) { 3982 *I = CI->getOperand(0); 3983 Changed = true; 3984 } 3985 } 3986 } 3987 3988 if (isa<InlineAsm>(Callee) && !CS.doesNotThrow()) { 3989 // Inline asm calls cannot throw - mark them 'nounwind'. 3990 CS.setDoesNotThrow(); 3991 Changed = true; 3992 } 3993 3994 // Try to optimize the call if possible, we require DataLayout for most of 3995 // this. None of these calls are seen as possibly dead so go ahead and 3996 // delete the instruction now. 3997 if (CallInst *CI = dyn_cast<CallInst>(CS.getInstruction())) { 3998 Instruction *I = tryOptimizeCall(CI); 3999 // If we changed something return the result, etc. Otherwise let 4000 // the fallthrough check. 4001 if (I) return eraseInstFromFunction(*I); 4002 } 4003 4004 return Changed ? CS.getInstruction() : nullptr; 4005 } 4006 4007 /// If the callee is a constexpr cast of a function, attempt to move the cast to 4008 /// the arguments of the call/invoke. 
4009 bool InstCombiner::transformConstExprCastCall(CallSite CS) { 4010 auto *Callee = dyn_cast<Function>(CS.getCalledValue()->stripPointerCasts()); 4011 if (!Callee) 4012 return false; 4013 4014 // The prototype of a thunk is a lie. Don't directly call such a function. 4015 if (Callee->hasFnAttribute("thunk")) 4016 return false; 4017 4018 Instruction *Caller = CS.getInstruction(); 4019 const AttributeList &CallerPAL = CS.getAttributes(); 4020 4021 // Okay, this is a cast from a function to a different type. Unless doing so 4022 // would cause a type conversion of one of our arguments, change this call to 4023 // be a direct call with arguments casted to the appropriate types. 4024 // 4025 FunctionType *FT = Callee->getFunctionType(); 4026 Type *OldRetTy = Caller->getType(); 4027 Type *NewRetTy = FT->getReturnType(); 4028 4029 // Check to see if we are changing the return type... 4030 if (OldRetTy != NewRetTy) { 4031 4032 if (NewRetTy->isStructTy()) 4033 return false; // TODO: Handle multiple return values. 4034 4035 if (!CastInst::isBitOrNoopPointerCastable(NewRetTy, OldRetTy, DL)) { 4036 if (Callee->isDeclaration()) 4037 return false; // Cannot transform this return value. 4038 4039 if (!Caller->use_empty() && 4040 // void -> non-void is handled specially 4041 !NewRetTy->isVoidTy()) 4042 return false; // Cannot transform this return value. 4043 } 4044 4045 if (!CallerPAL.isEmpty() && !Caller->use_empty()) { 4046 AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex); 4047 if (RAttrs.overlaps(AttributeFuncs::typeIncompatible(NewRetTy))) 4048 return false; // Attribute not compatible with transformed value. 4049 } 4050 4051 // If the callsite is an invoke instruction, and the return value is used by 4052 // a PHI node in a successor, we cannot change the return type of the call 4053 // because there is no place to put the cast instruction (without breaking 4054 // the critical edge). Bail out in this case. 
4055 if (!Caller->use_empty()) 4056 if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) 4057 for (User *U : II->users()) 4058 if (PHINode *PN = dyn_cast<PHINode>(U)) 4059 if (PN->getParent() == II->getNormalDest() || 4060 PN->getParent() == II->getUnwindDest()) 4061 return false; 4062 } 4063 4064 unsigned NumActualArgs = CS.arg_size(); 4065 unsigned NumCommonArgs = std::min(FT->getNumParams(), NumActualArgs); 4066 4067 // Prevent us turning: 4068 // declare void @takes_i32_inalloca(i32* inalloca) 4069 // call void bitcast (void (i32*)* @takes_i32_inalloca to void (i32)*)(i32 0) 4070 // 4071 // into: 4072 // call void @takes_i32_inalloca(i32* null) 4073 // 4074 // Similarly, avoid folding away bitcasts of byval calls. 4075 if (Callee->getAttributes().hasAttrSomewhere(Attribute::InAlloca) || 4076 Callee->getAttributes().hasAttrSomewhere(Attribute::ByVal)) 4077 return false; 4078 4079 CallSite::arg_iterator AI = CS.arg_begin(); 4080 for (unsigned i = 0, e = NumCommonArgs; i != e; ++i, ++AI) { 4081 Type *ParamTy = FT->getParamType(i); 4082 Type *ActTy = (*AI)->getType(); 4083 4084 if (!CastInst::isBitOrNoopPointerCastable(ActTy, ParamTy, DL)) 4085 return false; // Cannot transform this parameter value. 4086 4087 if (AttrBuilder(CallerPAL.getParamAttributes(i)) 4088 .overlaps(AttributeFuncs::typeIncompatible(ParamTy))) 4089 return false; // Attribute not compatible with transformed value. 4090 4091 if (CS.isInAllocaArgument(i)) 4092 return false; // Cannot transform to and from inalloca. 4093 4094 // If the parameter is passed as a byval argument, then we have to have a 4095 // sized type and the sized type has to have the same size as the old type. 
4096 if (ParamTy != ActTy && CallerPAL.hasParamAttribute(i, Attribute::ByVal)) { 4097 PointerType *ParamPTy = dyn_cast<PointerType>(ParamTy); 4098 if (!ParamPTy || !ParamPTy->getElementType()->isSized()) 4099 return false; 4100 4101 Type *CurElTy = ActTy->getPointerElementType(); 4102 if (DL.getTypeAllocSize(CurElTy) != 4103 DL.getTypeAllocSize(ParamPTy->getElementType())) 4104 return false; 4105 } 4106 } 4107 4108 if (Callee->isDeclaration()) { 4109 // Do not delete arguments unless we have a function body. 4110 if (FT->getNumParams() < NumActualArgs && !FT->isVarArg()) 4111 return false; 4112 4113 // If the callee is just a declaration, don't change the varargsness of the 4114 // call. We don't want to introduce a varargs call where one doesn't 4115 // already exist. 4116 PointerType *APTy = cast<PointerType>(CS.getCalledValue()->getType()); 4117 if (FT->isVarArg()!=cast<FunctionType>(APTy->getElementType())->isVarArg()) 4118 return false; 4119 4120 // If both the callee and the cast type are varargs, we still have to make 4121 // sure the number of fixed parameters are the same or we have the same 4122 // ABI issues as if we introduce a varargs call. 4123 if (FT->isVarArg() && 4124 cast<FunctionType>(APTy->getElementType())->isVarArg() && 4125 FT->getNumParams() != 4126 cast<FunctionType>(APTy->getElementType())->getNumParams()) 4127 return false; 4128 } 4129 4130 if (FT->getNumParams() < NumActualArgs && FT->isVarArg() && 4131 !CallerPAL.isEmpty()) { 4132 // In this case we have more arguments than the new function type, but we 4133 // won't be dropping them. Check that these extra arguments have attributes 4134 // that are compatible with being a vararg call argument. 4135 unsigned SRetIdx; 4136 if (CallerPAL.hasAttrSomewhere(Attribute::StructRet, &SRetIdx) && 4137 SRetIdx > FT->getNumParams()) 4138 return false; 4139 } 4140 4141 // Okay, we decided that this is a safe thing to do: go ahead and start 4142 // inserting cast instructions as necessary. 
4143 SmallVector<Value *, 8> Args; 4144 SmallVector<AttributeSet, 8> ArgAttrs; 4145 Args.reserve(NumActualArgs); 4146 ArgAttrs.reserve(NumActualArgs); 4147 4148 // Get any return attributes. 4149 AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex); 4150 4151 // If the return value is not being used, the type may not be compatible 4152 // with the existing attributes. Wipe out any problematic attributes. 4153 RAttrs.remove(AttributeFuncs::typeIncompatible(NewRetTy)); 4154 4155 AI = CS.arg_begin(); 4156 for (unsigned i = 0; i != NumCommonArgs; ++i, ++AI) { 4157 Type *ParamTy = FT->getParamType(i); 4158 4159 Value *NewArg = *AI; 4160 if ((*AI)->getType() != ParamTy) 4161 NewArg = Builder.CreateBitOrPointerCast(*AI, ParamTy); 4162 Args.push_back(NewArg); 4163 4164 // Add any parameter attributes. 4165 ArgAttrs.push_back(CallerPAL.getParamAttributes(i)); 4166 } 4167 4168 // If the function takes more arguments than the call was taking, add them 4169 // now. 4170 for (unsigned i = NumCommonArgs; i != FT->getNumParams(); ++i) { 4171 Args.push_back(Constant::getNullValue(FT->getParamType(i))); 4172 ArgAttrs.push_back(AttributeSet()); 4173 } 4174 4175 // If we are removing arguments to the function, emit an obnoxious warning. 4176 if (FT->getNumParams() < NumActualArgs) { 4177 // TODO: if (!FT->isVarArg()) this call may be unreachable. PR14722 4178 if (FT->isVarArg()) { 4179 // Add all of the arguments in their promoted form to the arg list. 4180 for (unsigned i = FT->getNumParams(); i != NumActualArgs; ++i, ++AI) { 4181 Type *PTy = getPromotedType((*AI)->getType()); 4182 Value *NewArg = *AI; 4183 if (PTy != (*AI)->getType()) { 4184 // Must promote to pass through va_arg area! 4185 Instruction::CastOps opcode = 4186 CastInst::getCastOpcode(*AI, false, PTy, false); 4187 NewArg = Builder.CreateCast(opcode, *AI, PTy); 4188 } 4189 Args.push_back(NewArg); 4190 4191 // Add any parameter attributes. 
4192 ArgAttrs.push_back(CallerPAL.getParamAttributes(i)); 4193 } 4194 } 4195 } 4196 4197 AttributeSet FnAttrs = CallerPAL.getFnAttributes(); 4198 4199 if (NewRetTy->isVoidTy()) 4200 Caller->setName(""); // Void type should not have a name. 4201 4202 assert((ArgAttrs.size() == FT->getNumParams() || FT->isVarArg()) && 4203 "missing argument attributes"); 4204 LLVMContext &Ctx = Callee->getContext(); 4205 AttributeList NewCallerPAL = AttributeList::get( 4206 Ctx, FnAttrs, AttributeSet::get(Ctx, RAttrs), ArgAttrs); 4207 4208 SmallVector<OperandBundleDef, 1> OpBundles; 4209 CS.getOperandBundlesAsDefs(OpBundles); 4210 4211 CallSite NewCS; 4212 if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) { 4213 NewCS = Builder.CreateInvoke(Callee, II->getNormalDest(), 4214 II->getUnwindDest(), Args, OpBundles); 4215 } else { 4216 NewCS = Builder.CreateCall(Callee, Args, OpBundles); 4217 cast<CallInst>(NewCS.getInstruction()) 4218 ->setTailCallKind(cast<CallInst>(Caller)->getTailCallKind()); 4219 } 4220 NewCS->takeName(Caller); 4221 NewCS.setCallingConv(CS.getCallingConv()); 4222 NewCS.setAttributes(NewCallerPAL); 4223 4224 // Preserve the weight metadata for the new call instruction. The metadata 4225 // is used by SamplePGO to check callsite's hotness. 4226 uint64_t W; 4227 if (Caller->extractProfTotalWeight(W)) 4228 NewCS->setProfWeight(W); 4229 4230 // Insert a cast of the return type as necessary. 4231 Instruction *NC = NewCS.getInstruction(); 4232 Value *NV = NC; 4233 if (OldRetTy != NV->getType() && !Caller->use_empty()) { 4234 if (!NV->getType()->isVoidTy()) { 4235 NV = NC = CastInst::CreateBitOrPointerCast(NC, OldRetTy); 4236 NC->setDebugLoc(Caller->getDebugLoc()); 4237 4238 // If this is an invoke instruction, we should insert it after the first 4239 // non-phi, instruction in the normal successor block. 
4240 if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) { 4241 BasicBlock::iterator I = II->getNormalDest()->getFirstInsertionPt(); 4242 InsertNewInstBefore(NC, *I); 4243 } else { 4244 // Otherwise, it's a call, just insert cast right after the call. 4245 InsertNewInstBefore(NC, *Caller); 4246 } 4247 Worklist.AddUsersToWorkList(*Caller); 4248 } else { 4249 NV = UndefValue::get(Caller->getType()); 4250 } 4251 } 4252 4253 if (!Caller->use_empty()) 4254 replaceInstUsesWith(*Caller, NV); 4255 else if (Caller->hasValueHandle()) { 4256 if (OldRetTy == NV->getType()) 4257 ValueHandleBase::ValueIsRAUWd(Caller, NV); 4258 else 4259 // We cannot call ValueIsRAUWd with a different type, and the 4260 // actual tracked value will disappear. 4261 ValueHandleBase::ValueIsDeleted(Caller); 4262 } 4263 4264 eraseInstFromFunction(*Caller); 4265 return true; 4266 } 4267 4268 /// Turn a call to a function created by init_trampoline / adjust_trampoline 4269 /// intrinsic pair into a direct call to the underlying function. 4270 Instruction * 4271 InstCombiner::transformCallThroughTrampoline(CallSite CS, 4272 IntrinsicInst *Tramp) { 4273 Value *Callee = CS.getCalledValue(); 4274 PointerType *PTy = cast<PointerType>(Callee->getType()); 4275 FunctionType *FTy = cast<FunctionType>(PTy->getElementType()); 4276 AttributeList Attrs = CS.getAttributes(); 4277 4278 // If the call already has the 'nest' attribute somewhere then give up - 4279 // otherwise 'nest' would occur twice after splicing in the chain. 
4280 if (Attrs.hasAttrSomewhere(Attribute::Nest)) 4281 return nullptr; 4282 4283 assert(Tramp && 4284 "transformCallThroughTrampoline called with incorrect CallSite."); 4285 4286 Function *NestF =cast<Function>(Tramp->getArgOperand(1)->stripPointerCasts()); 4287 FunctionType *NestFTy = cast<FunctionType>(NestF->getValueType()); 4288 4289 AttributeList NestAttrs = NestF->getAttributes(); 4290 if (!NestAttrs.isEmpty()) { 4291 unsigned NestArgNo = 0; 4292 Type *NestTy = nullptr; 4293 AttributeSet NestAttr; 4294 4295 // Look for a parameter marked with the 'nest' attribute. 4296 for (FunctionType::param_iterator I = NestFTy->param_begin(), 4297 E = NestFTy->param_end(); 4298 I != E; ++NestArgNo, ++I) { 4299 AttributeSet AS = NestAttrs.getParamAttributes(NestArgNo); 4300 if (AS.hasAttribute(Attribute::Nest)) { 4301 // Record the parameter type and any other attributes. 4302 NestTy = *I; 4303 NestAttr = AS; 4304 break; 4305 } 4306 } 4307 4308 if (NestTy) { 4309 Instruction *Caller = CS.getInstruction(); 4310 std::vector<Value*> NewArgs; 4311 std::vector<AttributeSet> NewArgAttrs; 4312 NewArgs.reserve(CS.arg_size() + 1); 4313 NewArgAttrs.reserve(CS.arg_size()); 4314 4315 // Insert the nest argument into the call argument list, which may 4316 // mean appending it. Likewise for attributes. 4317 4318 { 4319 unsigned ArgNo = 0; 4320 CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); 4321 do { 4322 if (ArgNo == NestArgNo) { 4323 // Add the chain argument and attributes. 4324 Value *NestVal = Tramp->getArgOperand(2); 4325 if (NestVal->getType() != NestTy) 4326 NestVal = Builder.CreateBitCast(NestVal, NestTy, "nest"); 4327 NewArgs.push_back(NestVal); 4328 NewArgAttrs.push_back(NestAttr); 4329 } 4330 4331 if (I == E) 4332 break; 4333 4334 // Add the original argument and attributes. 
4335 NewArgs.push_back(*I); 4336 NewArgAttrs.push_back(Attrs.getParamAttributes(ArgNo)); 4337 4338 ++ArgNo; 4339 ++I; 4340 } while (true); 4341 } 4342 4343 // The trampoline may have been bitcast to a bogus type (FTy). 4344 // Handle this by synthesizing a new function type, equal to FTy 4345 // with the chain parameter inserted. 4346 4347 std::vector<Type*> NewTypes; 4348 NewTypes.reserve(FTy->getNumParams()+1); 4349 4350 // Insert the chain's type into the list of parameter types, which may 4351 // mean appending it. 4352 { 4353 unsigned ArgNo = 0; 4354 FunctionType::param_iterator I = FTy->param_begin(), 4355 E = FTy->param_end(); 4356 4357 do { 4358 if (ArgNo == NestArgNo) 4359 // Add the chain's type. 4360 NewTypes.push_back(NestTy); 4361 4362 if (I == E) 4363 break; 4364 4365 // Add the original type. 4366 NewTypes.push_back(*I); 4367 4368 ++ArgNo; 4369 ++I; 4370 } while (true); 4371 } 4372 4373 // Replace the trampoline call with a direct call. Let the generic 4374 // code sort out any function type mismatches. 4375 FunctionType *NewFTy = FunctionType::get(FTy->getReturnType(), NewTypes, 4376 FTy->isVarArg()); 4377 Constant *NewCallee = 4378 NestF->getType() == PointerType::getUnqual(NewFTy) ? 
4379 NestF : ConstantExpr::getBitCast(NestF, 4380 PointerType::getUnqual(NewFTy)); 4381 AttributeList NewPAL = 4382 AttributeList::get(FTy->getContext(), Attrs.getFnAttributes(), 4383 Attrs.getRetAttributes(), NewArgAttrs); 4384 4385 SmallVector<OperandBundleDef, 1> OpBundles; 4386 CS.getOperandBundlesAsDefs(OpBundles); 4387 4388 Instruction *NewCaller; 4389 if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) { 4390 NewCaller = InvokeInst::Create(NewCallee, 4391 II->getNormalDest(), II->getUnwindDest(), 4392 NewArgs, OpBundles); 4393 cast<InvokeInst>(NewCaller)->setCallingConv(II->getCallingConv()); 4394 cast<InvokeInst>(NewCaller)->setAttributes(NewPAL); 4395 } else { 4396 NewCaller = CallInst::Create(NewCallee, NewArgs, OpBundles); 4397 cast<CallInst>(NewCaller)->setTailCallKind( 4398 cast<CallInst>(Caller)->getTailCallKind()); 4399 cast<CallInst>(NewCaller)->setCallingConv( 4400 cast<CallInst>(Caller)->getCallingConv()); 4401 cast<CallInst>(NewCaller)->setAttributes(NewPAL); 4402 } 4403 4404 return NewCaller; 4405 } 4406 } 4407 4408 // Replace the trampoline call with a direct call. Since there is no 'nest' 4409 // parameter, there is no need to adjust the argument list. Let the generic 4410 // code sort out any function type mismatches. 4411 Constant *NewCallee = 4412 NestF->getType() == PTy ? NestF : 4413 ConstantExpr::getBitCast(NestF, PTy); 4414 CS.setCalledFunction(NewCallee); 4415 return CS.getInstruction(); 4416 } 4417